//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions and thus the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
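//
// For example (illustrative only, ignoring legality checks and remainder
// handling), a scalar loop such as
//
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i] + c[i];
//
// is conceptually rewritten for a vector width of 4 as
//
//   for (int i = 0; i < n; i += 4)
//     a[i:i+3] = b[i:i+3] + c[i:i+3];   // one 'wide' iteration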
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to
// the VPlan infrastructure and to introduce outer loop vectorization support
// (see docs/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For
// this purpose, we temporarily introduced the VPlan-native vectorization
// path: an alternative vectorization path that is natively implemented on top
// of the VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
//  Data for SIMD.
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
//  Vectorizing Compilers.
//
//===----------------------------------------------------------------------===//
55
56#include "llvm/Transforms/Vectorize/LoopVectorize.h"
57#include "LoopVectorizationPlanner.h"
58#include "VPRecipeBuilder.h"
59#include "VPlan.h"
60#include "VPlanAnalysis.h"
61#include "VPlanHCFGBuilder.h"
62#include "VPlanTransforms.h"
63#include "llvm/ADT/APInt.h"
64#include "llvm/ADT/ArrayRef.h"
65#include "llvm/ADT/DenseMap.h"
66#include "llvm/ADT/DenseMapInfo.h"
67#include "llvm/ADT/Hashing.h"
68#include "llvm/ADT/MapVector.h"
69#include "llvm/ADT/STLExtras.h"
70#include "llvm/ADT/SmallPtrSet.h"
71#include "llvm/ADT/SmallSet.h"
72#include "llvm/ADT/SmallVector.h"
73#include "llvm/ADT/Statistic.h"
74#include "llvm/ADT/StringRef.h"
75#include "llvm/ADT/Twine.h"
76#include "llvm/ADT/iterator_range.h"
77#include "llvm/Analysis/AssumptionCache.h"
78#include "llvm/Analysis/BasicAliasAnalysis.h"
79#include "llvm/Analysis/BlockFrequencyInfo.h"
80#include "llvm/Analysis/CFG.h"
81#include "llvm/Analysis/CodeMetrics.h"
82#include "llvm/Analysis/DemandedBits.h"
83#include "llvm/Analysis/GlobalsModRef.h"
84#include "llvm/Analysis/LoopAccessAnalysis.h"
85#include "llvm/Analysis/LoopAnalysisManager.h"
86#include "llvm/Analysis/LoopInfo.h"
87#include "llvm/Analysis/LoopIterator.h"
88#include "llvm/Analysis/OptimizationRemarkEmitter.h"
89#include "llvm/Analysis/ProfileSummaryInfo.h"
90#include "llvm/Analysis/ScalarEvolution.h"
91#include "llvm/Analysis/ScalarEvolutionExpressions.h"
92#include "llvm/Analysis/TargetLibraryInfo.h"
93#include "llvm/Analysis/TargetTransformInfo.h"
94#include "llvm/Analysis/ValueTracking.h"
95#include "llvm/Analysis/VectorUtils.h"
96#include "llvm/IR/Attributes.h"
97#include "llvm/IR/BasicBlock.h"
98#include "llvm/IR/CFG.h"
99#include "llvm/IR/Constant.h"
100#include "llvm/IR/Constants.h"
101#include "llvm/IR/DataLayout.h"
102#include "llvm/IR/DebugInfo.h"
103#include "llvm/IR/DebugInfoMetadata.h"
104#include "llvm/IR/DebugLoc.h"
105#include "llvm/IR/DerivedTypes.h"
106#include "llvm/IR/DiagnosticInfo.h"
107#include "llvm/IR/Dominators.h"
108#include "llvm/IR/Function.h"
109#include "llvm/IR/IRBuilder.h"
110#include "llvm/IR/InstrTypes.h"
111#include "llvm/IR/Instruction.h"
112#include "llvm/IR/Instructions.h"
113#include "llvm/IR/IntrinsicInst.h"
114#include "llvm/IR/Intrinsics.h"
115#include "llvm/IR/MDBuilder.h"
116#include "llvm/IR/Metadata.h"
117#include "llvm/IR/Module.h"
118#include "llvm/IR/Operator.h"
119#include "llvm/IR/PatternMatch.h"
120#include "llvm/IR/ProfDataUtils.h"
121#include "llvm/IR/Type.h"
122#include "llvm/IR/Use.h"
123#include "llvm/IR/User.h"
124#include "llvm/IR/Value.h"
125#include "llvm/IR/ValueHandle.h"
126#include "llvm/IR/Verifier.h"
127#include "llvm/Support/Casting.h"
128#include "llvm/Support/CommandLine.h"
129#include "llvm/Support/Compiler.h"
130#include "llvm/Support/Debug.h"
131#include "llvm/Support/ErrorHandling.h"
132#include "llvm/Support/InstructionCost.h"
133#include "llvm/Support/MathExtras.h"
134#include "llvm/Support/raw_ostream.h"
135#include "llvm/Transforms/Utils/BasicBlockUtils.h"
136#include "llvm/Transforms/Utils/InjectTLIMappings.h"
137#include "llvm/Transforms/Utils/LoopSimplify.h"
138#include "llvm/Transforms/Utils/LoopUtils.h"
139#include "llvm/Transforms/Utils/LoopVersioning.h"
140#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
141#include "llvm/Transforms/Utils/SizeOpts.h"
142#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
143#include <algorithm>
144#include <cassert>
145#include <cmath>
146#include <cstdint>
147#include <functional>
148#include <iterator>
149#include <limits>
150#include <map>
151#include <memory>
152#include <string>
153#include <tuple>
154#include <utility>
155
156using namespace llvm;
157
158#define LV_NAME "loop-vectorize"
159#define DEBUG_TYPE LV_NAME
160
161#ifndef NDEBUG
162const char VerboseDebug[] = DEBUG_TYPE "-verbose";
163#endif
164
165/// @{
166/// Metadata attribute names
167const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
168const char LLVMLoopVectorizeFollowupVectorized[] =
169    "llvm.loop.vectorize.followup_vectorized";
170const char LLVMLoopVectorizeFollowupEpilogue[] =
171    "llvm.loop.vectorize.followup_epilogue";
172/// @}
173
174STATISTIC(LoopsVectorized, "Number of loops vectorized");
175STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
176STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
177
178static cl::opt<bool> EnableEpilogueVectorization(
179    "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
180    cl::desc("Enable vectorization of epilogue loops."));
181
182static cl::opt<unsigned> EpilogueVectorizationForceVF(
183    "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
184    cl::desc("When epilogue vectorization is enabled, and a value greater than "
185             "1 is specified, forces the given VF for all applicable epilogue "
186             "loops."));
187
188static cl::opt<unsigned> EpilogueVectorizationMinVF(
189    "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
190    cl::desc("Only loops with vectorization factor equal to or larger than "
191             "the specified value are considered for epilogue vectorization."));
192
193/// Loops with a known constant trip count below this number are vectorized only
194/// if no scalar iteration overheads are incurred.
195static cl::opt<unsigned> TinyTripCountVectorThreshold(
196    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
197    cl::desc("Loops with a constant trip count that is smaller than this "
198             "value are vectorized only if no scalar iteration overheads "
199             "are incurred."));
200
201static cl::opt<unsigned> VectorizeMemoryCheckThreshold(
202    "vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
203    cl::desc("The maximum allowed number of runtime memory checks"));
204
// Option prefer-predicate-over-epilogue indicates that an epilogue is
// undesired and that predication is preferred; this enum lists the available
// choices. I.e., the vectorizer will try to fold the tail loop (epilogue) into
// the vector body and predicate the instructions accordingly. If tail-folding
// fails, there are different fallback strategies depending on these values:
210namespace PreferPredicateTy {
211  enum Option {
212    ScalarEpilogue = 0,
213    PredicateElseScalarEpilogue,
214    PredicateOrDontVectorize
215  };
216} // namespace PreferPredicateTy
217
218static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
219    "prefer-predicate-over-epilogue",
220    cl::init(PreferPredicateTy::ScalarEpilogue),
221    cl::Hidden,
222    cl::desc("Tail-folding and predication preferences over creating a scalar "
223             "epilogue loop."),
224    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
225                         "scalar-epilogue",
226                         "Don't tail-predicate loops, create scalar epilogue"),
227              clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
228                         "predicate-else-scalar-epilogue",
229                         "prefer tail-folding, create scalar epilogue if tail "
230                         "folding fails."),
231              clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
232                         "predicate-dont-vectorize",
233                         "prefers tail-folding, don't attempt vectorization if "
234                         "tail-folding fails.")));
235
236static cl::opt<TailFoldingStyle> ForceTailFoldingStyle(
237    "force-tail-folding-style", cl::desc("Force the tail folding style"),
238    cl::init(TailFoldingStyle::None),
239    cl::values(
240        clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"),
241        clEnumValN(
242            TailFoldingStyle::Data, "data",
243            "Create lane mask for data only, using active.lane.mask intrinsic"),
244        clEnumValN(TailFoldingStyle::DataWithoutLaneMask,
245                   "data-without-lane-mask",
246                   "Create lane mask with compare/stepvector"),
247        clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control",
248                   "Create lane mask using active.lane.mask intrinsic, and use "
249                   "it for both data and control flow"),
250        clEnumValN(
251            TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck,
252            "data-and-control-without-rt-check",
253            "Similar to data-and-control, but remove the runtime check")));
254
255static cl::opt<bool> MaximizeBandwidth(
256    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
257    cl::desc("Maximize bandwidth when selecting vectorization factor which "
258             "will be determined by the smallest type in loop."));
259
260static cl::opt<bool> EnableInterleavedMemAccesses(
261    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
262    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
263
264/// An interleave-group may need masking if it resides in a block that needs
265/// predication, or in order to mask away gaps.
266static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
267    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc(
        "Enable vectorization on masked interleaved memory accesses in a loop"));
269
270static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
271    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
273             "below this number"));
274
275static cl::opt<unsigned> ForceTargetNumScalarRegs(
276    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
277    cl::desc("A flag that overrides the target's number of scalar registers."));
278
279static cl::opt<unsigned> ForceTargetNumVectorRegs(
280    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
281    cl::desc("A flag that overrides the target's number of vector registers."));
282
283static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
284    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
285    cl::desc("A flag that overrides the target's max interleave factor for "
286             "scalar loops."));
287
288static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
289    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
290    cl::desc("A flag that overrides the target's max interleave factor for "
291             "vectorized loops."));
292
293static cl::opt<unsigned> ForceTargetInstructionCost(
294    "force-target-instruction-cost", cl::init(0), cl::Hidden,
295    cl::desc("A flag that overrides the target's expected cost for "
296             "an instruction to a single constant value. Mostly "
297             "useful for getting consistent testing."));
298
299static cl::opt<bool> ForceTargetSupportsScalableVectors(
300    "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
301    cl::desc(
302        "Pretend that scalable vectors are supported, even if the target does "
303        "not support them. This flag should only be used for testing."));
304
305static cl::opt<unsigned> SmallLoopCost(
306    "small-loop-cost", cl::init(20), cl::Hidden,
307    cl::desc(
308        "The cost of a loop that is considered 'small' by the interleaver."));
309
310static cl::opt<bool> LoopVectorizeWithBlockFrequency(
311    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
312    cl::desc("Enable the use of the block frequency analysis to access PGO "
313             "heuristics minimizing code growth in cold regions and being more "
314             "aggressive in hot regions."));
315
316// Runtime interleave loops for load/store throughput.
317static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
318    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
319    cl::desc(
320        "Enable runtime interleaving until load/store ports are saturated"));
321
322/// Interleave small loops with scalar reductions.
323static cl::opt<bool> InterleaveSmallLoopScalarReduction(
324    "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
325    cl::desc("Enable interleaving for loops with small iteration counts that "
326             "contain scalar reductions to expose ILP."));
327
328/// The number of stores in a loop that are allowed to need predication.
329static cl::opt<unsigned> NumberOfStoresToPredicate(
330    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
331    cl::desc("Max number of stores to be predicated behind an if."));
332
333static cl::opt<bool> EnableIndVarRegisterHeur(
334    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
335    cl::desc("Count the induction variable only once when interleaving"));
336
337static cl::opt<bool> EnableCondStoresVectorization(
338    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
339    cl::desc("Enable if predication of stores during vectorization."));
340
341static cl::opt<unsigned> MaxNestedScalarReductionIC(
342    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
343    cl::desc("The maximum interleave count to use when interleaving a scalar "
344             "reduction in a nested loop."));
345
346static cl::opt<bool>
347    PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
348                           cl::Hidden,
                           cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the target's preference."));
351
352static cl::opt<bool> ForceOrderedReductions(
353    "force-ordered-reductions", cl::init(false), cl::Hidden,
    cl::desc("Enable the vectorization of loops with in-order (strict) "
355             "FP reductions"));
356
357static cl::opt<bool> PreferPredicatedReductionSelect(
358    "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
359    cl::desc(
360        "Prefer predicating a reduction operation over an after loop select."));
361
362namespace llvm {
363cl::opt<bool> EnableVPlanNativePath(
364    "enable-vplan-native-path", cl::Hidden,
365    cl::desc("Enable VPlan-native vectorization path with "
366             "support for outer loop vectorization."));
367}
368
// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
373static cl::opt<bool> VPlanBuildStressTest(
374    "vplan-build-stress-test", cl::init(false), cl::Hidden,
375    cl::desc(
376        "Build VPlan for every supported loop nest in the function and bail "
377        "out right after the build (stress test the VPlan H-CFG construction "
378        "in the VPlan-native vectorization path)."));
379
380cl::opt<bool> llvm::EnableLoopInterleaving(
381    "interleave-loops", cl::init(true), cl::Hidden,
382    cl::desc("Enable loop interleaving in Loop vectorization passes"));
383cl::opt<bool> llvm::EnableLoopVectorization(
384    "vectorize-loops", cl::init(true), cl::Hidden,
385    cl::desc("Run the Loop vectorization passes"));
386
387static cl::opt<bool> PrintVPlansInDotFormat(
388    "vplan-print-in-dot-format", cl::Hidden,
389    cl::desc("Use dot format instead of plain text when dumping VPlans"));
390
391static cl::opt<cl::boolOrDefault> ForceSafeDivisor(
392    "force-widen-divrem-via-safe-divisor", cl::Hidden,
393    cl::desc(
394        "Override cost based safe divisor widening for div/rem instructions"));
395
396static cl::opt<bool> UseWiderVFIfCallVariantsPresent(
397    "vectorizer-maximize-bandwidth-for-vector-calls", cl::init(true),
398    cl::Hidden,
399    cl::desc("Try wider VFs if they enable the use of vector variants"));
400
// Likelihood of bypassing the vectorized loop because assumptions about SCEV
// variables not overflowing do not hold. See `emitSCEVChecks`.
static constexpr uint32_t SCEVCheckBypassWeights[] = {1, 127};
// Likelihood of bypassing the vectorized loop because pointers overlap. See
// `emitMemRuntimeChecks`.
static constexpr uint32_t MemCheckBypassWeights[] = {1, 127};
// Likelihood of bypassing the vectorized loop because there are zero trips left
// after prolog. See `emitIterationCountCheck`.
static constexpr uint32_t MinItersBypassWeights[] = {1, 127};
410
411/// A helper function that returns true if the given type is irregular. The
412/// type is irregular if its allocated size doesn't equal the store size of an
413/// element of the corresponding vector type.
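/// For example (illustrative, target-dependent), x86_fp80 is typically
/// irregular: its value size is 80 bits, but its alloc size is padded to 96 or
/// 128 bits, so an array of x86_fp80 is not bitcast-compatible with a vector
/// of x86_fp80 elements.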
414static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
415  // Determine if an array of N elements of type Ty is "bitcast compatible"
416  // with a <N x Ty> vector.
417  // This is only true if there is no padding between the array elements.
418  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
419}
420
421/// A helper function that returns the reciprocal of the block probability of
422/// predicated blocks. If we return X, we are assuming the predicated block
423/// will execute once for every X iterations of the loop header.
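/// For example, with the current return value of 2, the cost model divides
/// the cost of a predicated block by 2, modeling the assumption that the
/// block executes on roughly half of the loop's iterations.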
424///
425/// TODO: We should use actual block probability here, if available. Currently,
426///       we always assume predicated blocks have a 50% chance of executing.
427static unsigned getReciprocalPredBlockProb() { return 2; }
428
429/// Returns "best known" trip count for the specified loop \p L as defined by
430/// the following procedure:
431///   1) Returns exact trip count if it is known.
432///   2) Returns expected trip count according to profile data if any.
433///   3) Returns upper bound estimate if it is known.
434///   4) Returns std::nullopt if all of the above failed.
435static std::optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE,
436                                                   Loop *L) {
437  // Check if exact trip count is known.
438  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
439    return ExpectedTC;
440
441  // Check if there is an expected trip count available from profile data.
442  if (LoopVectorizeWithBlockFrequency)
443    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
444      return *EstimatedTC;
445
446  // Check if upper bound estimate is known.
447  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
448    return ExpectedTC;
449
450  return std::nullopt;
451}
452
453/// Return a vector containing interleaved elements from multiple
454/// smaller input vectors.
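/// For example (fixed-width case), interleaving two <4 x i32> inputs
/// <a0,a1,a2,a3> and <b0,b1,b2,b3> yields the single wide vector
/// <a0,b0,a1,b1,a2,b2,a3,b3>.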
455static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals,
456                                const Twine &Name) {
457  unsigned Factor = Vals.size();
458  assert(Factor > 1 && "Tried to interleave invalid number of vectors");
459
460  VectorType *VecTy = cast<VectorType>(Vals[0]->getType());
461#ifndef NDEBUG
462  for (Value *Val : Vals)
463    assert(Val->getType() == VecTy && "Tried to interleave mismatched types");
464#endif
465
  // Scalable vectors cannot use arbitrary shufflevectors (only splats), so
  // we must use intrinsics to interleave.
468  if (VecTy->isScalableTy()) {
469    VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy);
470    return Builder.CreateIntrinsic(
471        WideVecTy, Intrinsic::experimental_vector_interleave2, Vals,
472        /*FMFSource=*/nullptr, Name);
473  }
474
475  // Fixed length. Start by concatenating all vectors into a wide vector.
476  Value *WideVec = concatenateVectors(Builder, Vals);
477
478  // Interleave the elements into the wide vector.
479  const unsigned NumElts = VecTy->getElementCount().getFixedValue();
480  return Builder.CreateShuffleVector(
481      WideVec, createInterleaveMask(NumElts, Factor), Name);
482}
483
484namespace {
485// Forward declare GeneratedRTChecks.
486class GeneratedRTChecks;
487
488using SCEV2ValueTy = DenseMap<const SCEV *, Value *>;
489} // namespace
490
491namespace llvm {
492
493AnalysisKey ShouldRunExtraVectorPasses::Key;
494
495/// InnerLoopVectorizer vectorizes loops which contain only one basic
496/// block to a specified vectorization factor (VF).
497/// This class performs the widening of scalars into vectors, or multiple
498/// scalars. This class also implements the following features:
499/// * It inserts an epilogue loop for handling loops that don't have iteration
500///   counts that are known to be a multiple of the vectorization factor.
501/// * It handles the code generation for reduction variables.
502/// * Scalarization (implementation using scalars) of un-vectorizable
503///   instructions.
504/// InnerLoopVectorizer does not perform any vectorization-legality
505/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found for a given vectorization factor.
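///
/// A rough sketch of the control flow this class creates (simplified; the
/// exact blocks depend on which checks are required):
///
///   [iteration count & runtime checks] --(bypass)--> [scalar preheader]
///                  |
///          [vector preheader]
///                  |
///            [vector loop]
///                  |
///           [middle block] ----(remainder)----> [scalar preheader]
///                  |                                    |
///                  |                        [scalar (epilogue) loop]
///                  |                                    |
///                  +------------> [exit] <--------------+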
509class InnerLoopVectorizer {
510public:
511  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
512                      LoopInfo *LI, DominatorTree *DT,
513                      const TargetLibraryInfo *TLI,
514                      const TargetTransformInfo *TTI, AssumptionCache *AC,
515                      OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
516                      ElementCount MinProfitableTripCount,
517                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
518                      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
519                      ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
520      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
521        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
522        Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
523        PSI(PSI), RTChecks(RTChecks) {
524    // Query this against the original loop and save it here because the profile
525    // of the original loop header may change as the transformation happens.
526    OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
527        OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
528
529    if (MinProfitableTripCount.isZero())
530      this->MinProfitableTripCount = VecWidth;
531    else
532      this->MinProfitableTripCount = MinProfitableTripCount;
533  }
534
535  virtual ~InnerLoopVectorizer() = default;
536
537  /// Create a new empty loop that will contain vectorized instructions later
538  /// on, while the old loop will be used as the scalar remainder. Control flow
539  /// is generated around the vectorized (and scalar epilogue) loops consisting
540  /// of various checks and bypasses. Return the pre-header block of the new
541  /// loop and the start value for the canonical induction, if it is != 0. The
542  /// latter is the case when vectorizing the epilogue loop. In the case of
  /// epilogue vectorization, this function is overridden to handle the more
544  /// complex control flow around the loops.  \p ExpandedSCEVs is used to
545  /// look up SCEV expansions for expressions needed during skeleton creation.
546  virtual std::pair<BasicBlock *, Value *>
547  createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs);
548
  /// Fix the vectorized code, taking care of header phis, live-outs, and more.
550  void fixVectorizedLoop(VPTransformState &State, VPlan &Plan);
551
552  // Return true if any runtime check is added.
553  bool areSafetyChecksAdded() { return AddedSafetyChecks; }
554
555  /// A type for vectorized values in the new loop. Each value from the
556  /// original loop, when vectorized, is represented by UF vector values in the
557  /// new unrolled loop, where UF is the unroll factor.
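  /// For example, with UF = 2 and VF = 4, an i32 value from the original loop
  /// is represented by two <4 x i32> values in the vector loop, one per
  /// unrolled part.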
558  using VectorParts = SmallVector<Value *, 2>;
559
560  /// A helper function to scalarize a single Instruction in the innermost loop.
561  /// Generates a sequence of scalar instances for each lane between \p MinLane
562  /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
563  /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p
564  /// Instr's operands.
565  void scalarizeInstruction(const Instruction *Instr,
566                            VPReplicateRecipe *RepRecipe,
567                            const VPIteration &Instance,
568                            VPTransformState &State);
569
570  /// Try to vectorize interleaved access group \p Group with the base address
571  /// given in \p Addr, optionally masking the vector operations if \p
572  /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
573  /// values in the vectorized loop.
574  void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
575                                ArrayRef<VPValue *> VPDefs,
576                                VPTransformState &State, VPValue *Addr,
577                                ArrayRef<VPValue *> StoredValues,
578                                VPValue *BlockInMask, bool NeedsMaskForGaps);
579
580  /// Fix the non-induction PHIs in \p Plan.
581  void fixNonInductionPHIs(VPlan &Plan, VPTransformState &State);
582
583  /// Returns true if the reordering of FP operations is not allowed, but we are
584  /// able to vectorize with strict in-order reductions for the given RdxDesc.
585  bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc);
586
587  /// Create a new phi node for the induction variable \p OrigPhi to resume
588  /// iteration count in the scalar epilogue, from where the vectorized loop
589  /// left off. \p Step is the SCEV-expanded induction step to use. In cases
590  /// where the loop skeleton is more complicated (i.e., epilogue vectorization)
591  /// and the resume values can come from an additional bypass block, the \p
592  /// AdditionalBypass pair provides information about the bypass block and the
593  /// end value on the edge from bypass to this loop.
594  PHINode *createInductionResumeValue(
595      PHINode *OrigPhi, const InductionDescriptor &ID, Value *Step,
596      ArrayRef<BasicBlock *> BypassBlocks,
597      std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
598
599  /// Returns the original loop trip count.
600  Value *getTripCount() const { return TripCount; }
601
602  /// Used to set the trip count after ILV's construction and after the
603  /// preheader block has been executed. Note that this always holds the trip
604  /// count of the original loop for both main loop and epilogue vectorization.
605  void setTripCount(Value *TC) { TripCount = TC; }
606
607protected:
608  friend class LoopVectorizationPlanner;
609
610  /// A small list of PHINodes.
611  using PhiVector = SmallVector<PHINode *, 4>;
612
613  /// A type for scalarized values in the new loop. Each value from the
614  /// original loop, when scalarized, is represented by UF x VF scalar values
615  /// in the new unrolled loop, where UF is the unroll factor and VF is the
616  /// vectorization factor.
617  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
618
619  /// Set up the values of the IVs correctly when exiting the vector loop.
620  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
621                    Value *VectorTripCount, Value *EndValue,
622                    BasicBlock *MiddleBlock, BasicBlock *VectorHeader,
623                    VPlan &Plan, VPTransformState &State);
624
625  /// Create the exit value of first order recurrences in the middle block and
626  /// update their users.
627  void fixFixedOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR,
628                               VPTransformState &State);
629
630  /// Create code for the loop exit value of the reduction.
631  void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State);
632
633  /// Iteratively sink the scalarized operands of a predicated instruction into
634  /// the block that was created for it.
635  void sinkScalarOperands(Instruction *PredInst);
636
637  /// Returns (and creates if needed) the trip count of the widened loop.
638  Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock);
639
640  /// Returns a bitcasted value to the requested vector type.
641  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
642  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
643                                const DataLayout &DL);
644
645  /// Emit a bypass check to see if the vector trip count is zero, including if
646  /// it overflows.
647  void emitIterationCountCheck(BasicBlock *Bypass);
648
649  /// Emit a bypass check to see if all of the SCEV assumptions we've
650  /// had to make are correct. Returns the block containing the checks or
651  /// nullptr if no checks have been added.
652  BasicBlock *emitSCEVChecks(BasicBlock *Bypass);
653
654  /// Emit bypass checks to check any memory assumptions we may have made.
655  /// Returns the block containing the checks or nullptr if no checks have been
656  /// added.
657  BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass);
658
659  /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
660  /// vector loop preheader, middle block and scalar preheader.
661  void createVectorLoopSkeleton(StringRef Prefix);
662
663  /// Create new phi nodes for the induction variables to resume iteration count
664  /// in the scalar epilogue, from where the vectorized loop left off.
665  /// In cases where the loop skeleton is more complicated (eg. epilogue
666  /// vectorization) and the resume values can come from an additional bypass
667  /// block, the \p AdditionalBypass pair provides information about the bypass
668  /// block and the end value on the edge from bypass to this loop.
669  void createInductionResumeValues(
670      const SCEV2ValueTy &ExpandedSCEVs,
671      std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
672
673  /// Complete the loop skeleton by adding debug MDs, creating appropriate
674  /// conditional branches in the middle block, preparing the builder and
675  /// running the verifier. Return the preheader of the completed vector loop.
676  BasicBlock *completeLoopSkeleton();
677
678  /// Collect poison-generating recipes that may generate a poison value that is
679  /// used after vectorization, even when their operands are not poison. Those
680  /// recipes meet the following conditions:
681  ///  * Contribute to the address computation of a recipe generating a widen
682  ///    memory load/store (VPWidenMemoryInstructionRecipe or
683  ///    VPInterleaveRecipe).
684  ///  * Such a widen memory load/store has at least one underlying Instruction
685  ///    that is in a basic block that needs predication and after vectorization
686  ///    the generated instruction won't be predicated.
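  ///
  /// For example (illustrative), a getelementptr with the inbounds flag that
  /// feeds the address of a consecutive masked load which originally executed
  /// only under a condition: after vectorization the address is computed
  /// unconditionally, so the inbounds flag must be dropped to avoid
  /// introducing poison.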
687  void collectPoisonGeneratingRecipes(VPTransformState &State);
688
689  /// Allow subclasses to override and print debug traces before/after vplan
690  /// execution, when trace information is requested.
691  virtual void printDebugTracesAtStart(){};
692  virtual void printDebugTracesAtEnd(){};
693
694  /// The original loop.
695  Loop *OrigLoop;
696
697  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
698  /// dynamic knowledge to simplify SCEV expressions and converts them to a
699  /// more usable form.
700  PredicatedScalarEvolution &PSE;
701
702  /// Loop Info.
703  LoopInfo *LI;
704
705  /// Dominator Tree.
706  DominatorTree *DT;
707
708  /// Target Library Info.
709  const TargetLibraryInfo *TLI;
710
711  /// Target Transform Info.
712  const TargetTransformInfo *TTI;
713
714  /// Assumption Cache.
715  AssumptionCache *AC;
716
717  /// Interface to emit optimization remarks.
718  OptimizationRemarkEmitter *ORE;
719
720  /// The vectorization SIMD factor to use. Each vector will have this many
721  /// vector elements.
722  ElementCount VF;
723
724  ElementCount MinProfitableTripCount;
725
726  /// The vectorization unroll factor to use. Each scalar is vectorized to this
727  /// many different vector instructions.
728  unsigned UF;
729
730  /// The builder that we use
731  IRBuilder<> Builder;
732
733  // --- Vectorization state ---
734
735  /// The vector-loop preheader.
736  BasicBlock *LoopVectorPreHeader;
737
738  /// The scalar-loop preheader.
739  BasicBlock *LoopScalarPreHeader;
740
741  /// Middle Block between the vector and the scalar.
742  BasicBlock *LoopMiddleBlock;
743
744  /// The unique ExitBlock of the scalar loop if one exists.  Note that
745  /// there can be multiple exiting edges reaching this block.
746  BasicBlock *LoopExitBlock;
747
748  /// The scalar loop body.
749  BasicBlock *LoopScalarBody;
750
751  /// A list of all bypass blocks. The first block is the entry of the loop.
752  SmallVector<BasicBlock *, 4> LoopBypassBlocks;
753
754  /// Store instructions that were predicated.
755  SmallVector<Instruction *, 4> PredicatedInstructions;
756
757  /// Trip count of the original loop.
758  Value *TripCount = nullptr;
759
760  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
761  Value *VectorTripCount = nullptr;
762
763  /// The legality analysis.
764  LoopVectorizationLegality *Legal;
765
  /// The profitability analysis.
767  LoopVectorizationCostModel *Cost;
768
769  // Record whether runtime checks are added.
770  bool AddedSafetyChecks = false;
771
772  // Holds the end values for each induction variable. We save the end values
773  // so we can later fix-up the external users of the induction variables.
774  DenseMap<PHINode *, Value *> IVEndValues;
775
776  /// BFI and PSI are used to check for profile guided size optimizations.
777  BlockFrequencyInfo *BFI;
778  ProfileSummaryInfo *PSI;
779
  // Whether this loop should be optimized for size based on profile-guided
  // size optimizations.
782  bool OptForSizeBasedOnProfile;
783
784  /// Structure to hold information about generated runtime checks, responsible
785  /// for cleaning the checks, if vectorization turns out unprofitable.
786  GeneratedRTChecks &RTChecks;
787
788  // Holds the resume values for reductions in the loops, used to set the
789  // correct start value of reduction PHIs when vectorizing the epilogue.
790  SmallMapVector<const RecurrenceDescriptor *, PHINode *, 4>
791      ReductionResumeValues;
792};
793
794class InnerLoopUnroller : public InnerLoopVectorizer {
795public:
796  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
797                    LoopInfo *LI, DominatorTree *DT,
798                    const TargetLibraryInfo *TLI,
799                    const TargetTransformInfo *TTI, AssumptionCache *AC,
800                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
801                    LoopVectorizationLegality *LVL,
802                    LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
803                    ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
804      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
805                            ElementCount::getFixed(1),
806                            ElementCount::getFixed(1), UnrollFactor, LVL, CM,
807                            BFI, PSI, Check) {}
808};
809
810/// Encapsulate information regarding vectorization of a loop and its epilogue.
811/// This information is meant to be updated and used across two stages of
812/// epilogue vectorization.
813struct EpilogueLoopVectorizationInfo {
814  ElementCount MainLoopVF = ElementCount::getFixed(0);
815  unsigned MainLoopUF = 0;
816  ElementCount EpilogueVF = ElementCount::getFixed(0);
817  unsigned EpilogueUF = 0;
818  BasicBlock *MainLoopIterationCountCheck = nullptr;
819  BasicBlock *EpilogueIterationCountCheck = nullptr;
820  BasicBlock *SCEVSafetyCheck = nullptr;
821  BasicBlock *MemSafetyCheck = nullptr;
822  Value *TripCount = nullptr;
823  Value *VectorTripCount = nullptr;
824
825  EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
826                                ElementCount EVF, unsigned EUF)
827      : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
828    assert(EUF == 1 &&
829           "A high UF for the epilogue loop is likely not beneficial.");
830  }
831};
832
833/// An extension of the inner loop vectorizer that creates a skeleton for a
834/// vectorized loop that has its epilogue (residual) also vectorized.
/// The idea is to run the vplan on a given loop twice, firstly to set up the
836/// skeleton and vectorize the main loop, and secondly to complete the skeleton
837/// from the first step and vectorize the epilogue.  This is achieved by
838/// deriving two concrete strategy classes from this base class and invoking
839/// them in succession from the loop vectorizer planner.
840class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
841public:
842  InnerLoopAndEpilogueVectorizer(
843      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
844      DominatorTree *DT, const TargetLibraryInfo *TLI,
845      const TargetTransformInfo *TTI, AssumptionCache *AC,
846      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
847      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
848      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
849      GeneratedRTChecks &Checks)
850      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
851                            EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, LVL,
852                            CM, BFI, PSI, Checks),
853        EPI(EPI) {}
854
855  // Override this function to handle the more complex control flow around the
856  // three loops.
857  std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton(
858      const SCEV2ValueTy &ExpandedSCEVs) final {
859    return createEpilogueVectorizedLoopSkeleton(ExpandedSCEVs);
860  }
861
862  /// The interface for creating a vectorized skeleton using one of two
863  /// different strategies, each corresponding to one execution of the vplan
864  /// as described above.
865  virtual std::pair<BasicBlock *, Value *>
866  createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) = 0;
867
868  /// Holds and updates state information required to vectorize the main loop
869  /// and its epilogue in two separate passes. This setup helps us avoid
870  /// regenerating and recomputing runtime safety checks. It also helps us to
871  /// shorten the iteration-count-check path length for the cases where the
872  /// iteration count of the loop is so small that the main vector loop is
873  /// completely skipped.
874  EpilogueLoopVectorizationInfo &EPI;
875};
876
877/// A specialized derived class of inner loop vectorizer that performs
878/// vectorization of *main* loops in the process of vectorizing loops and their
879/// epilogues.
880class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
881public:
882  EpilogueVectorizerMainLoop(
883      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
884      DominatorTree *DT, const TargetLibraryInfo *TLI,
885      const TargetTransformInfo *TTI, AssumptionCache *AC,
886      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
887      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
888      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
889      GeneratedRTChecks &Check)
890      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
891                                       EPI, LVL, CM, BFI, PSI, Check) {}
892  /// Implements the interface for creating a vectorized skeleton using the
  /// *main loop* strategy (i.e., the first pass of vplan execution).
894  std::pair<BasicBlock *, Value *>
895  createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;
896
897protected:
898  /// Emits an iteration count bypass check once for the main loop (when \p
899  /// ForEpilogue is false) and once for the epilogue loop (when \p
900  /// ForEpilogue is true).
901  BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue);
902  void printDebugTracesAtStart() override;
903  void printDebugTracesAtEnd() override;
904};
905
906// A specialized derived class of inner loop vectorizer that performs
907// vectorization of *epilogue* loops in the process of vectorizing loops and
908// their epilogues.
909class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
910public:
911  EpilogueVectorizerEpilogueLoop(
912      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
913      DominatorTree *DT, const TargetLibraryInfo *TLI,
914      const TargetTransformInfo *TTI, AssumptionCache *AC,
915      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
916      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
917      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
918      GeneratedRTChecks &Checks)
919      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
920                                       EPI, LVL, CM, BFI, PSI, Checks) {
921    TripCount = EPI.TripCount;
922  }
923  /// Implements the interface for creating a vectorized skeleton using the
  /// *epilogue loop* strategy (i.e., the second pass of vplan execution).
925  std::pair<BasicBlock *, Value *>
926  createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;
927
928protected:
929  /// Emits an iteration count bypass check after the main vector loop has
930  /// finished to see if there are any iterations left to execute by either
931  /// the vector epilogue or the scalar epilogue.
  BasicBlock *emitMinimumVectorEpilogueIterCountCheck(BasicBlock *Bypass,
                                                      BasicBlock *Insert);
935  void printDebugTracesAtStart() override;
936  void printDebugTracesAtEnd() override;
937};
938} // end namespace llvm
939
/// Look for a meaningful debug location on the instruction or its
/// operands.
942static DebugLoc getDebugLocFromInstOrOperands(Instruction *I) {
943  if (!I)
944    return DebugLoc();
945
946  DebugLoc Empty;
947  if (I->getDebugLoc() != Empty)
948    return I->getDebugLoc();
949
950  for (Use &Op : I->operands()) {
951    if (Instruction *OpInst = dyn_cast<Instruction>(Op))
952      if (OpInst->getDebugLoc() != Empty)
953        return OpInst->getDebugLoc();
954  }
955
956  return I->getDebugLoc();
957}
958
959/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
960/// is passed, the message relates to that particular instruction.
961#ifndef NDEBUG
962static void debugVectorizationMessage(const StringRef Prefix,
963                                      const StringRef DebugMsg,
964                                      Instruction *I) {
965  dbgs() << "LV: " << Prefix << DebugMsg;
966  if (I != nullptr)
967    dbgs() << " " << *I;
968  else
969    dbgs() << '.';
970  dbgs() << '\n';
971}
972#endif
973
974/// Create an analysis remark that explains why vectorization failed
975///
976/// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
977/// RemarkName is the identifier for the remark.  If \p I is passed it is an
978/// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
979/// the location of the remark.  \return the remark object that can be
980/// streamed to.
981static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
982    StringRef RemarkName, Loop *TheLoop, Instruction *I) {
983  Value *CodeRegion = TheLoop->getHeader();
984  DebugLoc DL = TheLoop->getStartLoc();
985
986  if (I) {
987    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, fall back to
    // using the loop's.
990    if (I->getDebugLoc())
991      DL = I->getDebugLoc();
992  }
993
994  return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
995}
996
997namespace llvm {
998
999/// Return a value for Step multiplied by VF.
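/// For example, with Step = 4 and a scalable VF of <vscale x 4>, the returned
/// value is equal to vscale * 16 at runtime; with a fixed VF of 8 it folds to
/// the constant 32.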
1000Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
1001                       int64_t Step) {
1002  assert(Ty->isIntegerTy() && "Expected an integer step");
1003  return B.CreateElementCount(Ty, VF.multiplyCoefficientBy(Step));
1004}
1005
1006/// Return the runtime value for VF.
1007Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) {
1008  return B.CreateElementCount(Ty, VF);
1009}
1010
1011const SCEV *createTripCountSCEV(Type *IdxTy, PredicatedScalarEvolution &PSE,
1012                                Loop *OrigLoop) {
1013  const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
1014  assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && "Invalid loop count");
1015
1016  ScalarEvolution &SE = *PSE.getSE();
1017  return SE.getTripCountFromExitCount(BackedgeTakenCount, IdxTy, OrigLoop);
1018}
1019
1020void reportVectorizationFailure(const StringRef DebugMsg,
1021                                const StringRef OREMsg, const StringRef ORETag,
1022                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
1023                                Instruction *I) {
1024  LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
1025  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1026  ORE->emit(
1027      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
1028      << "loop not vectorized: " << OREMsg);
1029}
1030
1031void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
1032                             OptimizationRemarkEmitter *ORE, Loop *TheLoop,
1033                             Instruction *I) {
1034  LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
1035  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1036  ORE->emit(
1037      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
1038      << Msg);
1039}
1040
1041/// Report successful vectorization of the loop. In case an outer loop is
1042/// vectorized, prepend "outer" to the vectorization remark.
1043static void reportVectorization(OptimizationRemarkEmitter *ORE, Loop *TheLoop,
1044                                VectorizationFactor VF, unsigned IC) {
1045  LLVM_DEBUG(debugVectorizationMessage(
1046      "Vectorizing: ", TheLoop->isInnermost() ? "innermost loop" : "outer loop",
1047      nullptr));
1048  StringRef LoopType = TheLoop->isInnermost() ? "" : "outer ";
1049  ORE->emit([&]() {
1050    return OptimizationRemark(LV_NAME, "Vectorized", TheLoop->getStartLoc(),
1051                              TheLoop->getHeader())
1052           << "vectorized " << LoopType << "loop (vectorization width: "
1053           << ore::NV("VectorizationFactor", VF.Width)
1054           << ", interleaved count: " << ore::NV("InterleaveCount", IC) << ")";
1055  });
1056}
1057
1058} // end namespace llvm
1059
1060#ifndef NDEBUG
1061/// \return string containing a file name and a line # for the given loop.
1062static std::string getDebugLocString(const Loop *L) {
1063  std::string Result;
1064  if (L) {
1065    raw_string_ostream OS(Result);
1066    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
1067      LoopDbgLoc.print(OS);
1068    else
1069      // Just print the module name.
1070      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
1071    OS.flush();
1072  }
1073  return Result;
1074}
1075#endif
1076
1077void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
1078    VPTransformState &State) {
1079
1080  // Collect recipes in the backward slice of `Root` that may generate a poison
1081  // value that is used after vectorization.
1082  SmallPtrSet<VPRecipeBase *, 16> Visited;
1083  auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
1084    SmallVector<VPRecipeBase *, 16> Worklist;
1085    Worklist.push_back(Root);
1086
1087    // Traverse the backward slice of Root through its use-def chain.
1088    while (!Worklist.empty()) {
1089      VPRecipeBase *CurRec = Worklist.back();
1090      Worklist.pop_back();
1091
1092      if (!Visited.insert(CurRec).second)
1093        continue;
1094
1095      // Prune search if we find another recipe generating a widen memory
1096      // instruction. Widen memory instructions involved in address computation
1097      // will lead to gather/scatter instructions, which don't need to be
1098      // handled.
1099      if (isa<VPWidenMemoryInstructionRecipe>(CurRec) ||
1100          isa<VPInterleaveRecipe>(CurRec) ||
1101          isa<VPScalarIVStepsRecipe>(CurRec) ||
1102          isa<VPCanonicalIVPHIRecipe>(CurRec) ||
1103          isa<VPActiveLaneMaskPHIRecipe>(CurRec))
1104        continue;
1105
1106      // This recipe contributes to the address computation of a widen
1107      // load/store. If the underlying instruction has poison-generating flags,
1108      // drop them directly.
1109      if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(CurRec)) {
1110        RecWithFlags->dropPoisonGeneratingFlags();
1111      } else {
1112        Instruction *Instr = dyn_cast_or_null<Instruction>(
1113            CurRec->getVPSingleValue()->getUnderlyingValue());
1114        (void)Instr;
1115        assert((!Instr || !Instr->hasPoisonGeneratingFlags()) &&
1116               "found instruction with poison generating flags not covered by "
1117               "VPRecipeWithIRFlags");
1118      }
1119
1120      // Add new definitions to the worklist.
1121      for (VPValue *operand : CurRec->operands())
1122        if (VPRecipeBase *OpDef = operand->getDefiningRecipe())
1123          Worklist.push_back(OpDef);
1124    }
1125  });
1126
  // Traverse all the recipes in the VPlan and collect the poison-generating
  // recipes in the backward slice starting at the address of a
  // VPWidenMemoryInstructionRecipe or VPInterleaveRecipe.
1130  auto Iter = vp_depth_first_deep(State.Plan->getEntry());
1131  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
1132    for (VPRecipeBase &Recipe : *VPBB) {
1133      if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) {
1134        Instruction &UnderlyingInstr = WidenRec->getIngredient();
1135        VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe();
1136        if (AddrDef && WidenRec->isConsecutive() &&
1137            Legal->blockNeedsPredication(UnderlyingInstr.getParent()))
1138          collectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
1139      } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
1140        VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe();
1141        if (AddrDef) {
1142          // Check if any member of the interleave group needs predication.
1143          const InterleaveGroup<Instruction> *InterGroup =
1144              InterleaveRec->getInterleaveGroup();
1145          bool NeedPredication = false;
1146          for (int I = 0, NumMembers = InterGroup->getNumMembers();
1147               I < NumMembers; ++I) {
1148            Instruction *Member = InterGroup->getMember(I);
1149            if (Member)
1150              NeedPredication |=
1151                  Legal->blockNeedsPredication(Member->getParent());
1152          }
1153
1154          if (NeedPredication)
1155            collectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
1156        }
1157      }
1158    }
1159  }
1160}
1161
1162namespace llvm {
1163
// Loop vectorization cost-model hint for how the scalar epilogue loop should
// be lowered.
1166enum ScalarEpilogueLowering {
1167
1168  // The default: allowing scalar epilogues.
1169  CM_ScalarEpilogueAllowed,
1170
1171  // Vectorization with OptForSize: don't allow epilogues.
1172  CM_ScalarEpilogueNotAllowedOptSize,
1173
  // A special case of vectorization with OptForSize: loops with a very small
1175  // trip count are considered for vectorization under OptForSize, thereby
1176  // making sure the cost of their loop body is dominant, free of runtime
1177  // guards and scalar iteration overheads.
1178  CM_ScalarEpilogueNotAllowedLowTripLoop,
1179
1180  // Loop hint predicate indicating an epilogue is undesired.
1181  CM_ScalarEpilogueNotNeededUsePredicate,
1182
1183  // Directive indicating we must either tail fold or not vectorize
1184  CM_ScalarEpilogueNotAllowedUsePredicate
1185};
1186
1187using InstructionVFPair = std::pair<Instruction *, ElementCount>;
1188
1189/// LoopVectorizationCostModel - estimates the expected speedups due to
1190/// vectorization.
1191/// In many cases vectorization is not profitable. This can happen because of
1192/// a number of reasons. In this class we mainly attempt to predict the
1193/// expected speedup/slowdowns due to the supported instruction set. We use the
1194/// TargetTransformInfo to query the different backends for the cost of
1195/// different operations.
1196class LoopVectorizationCostModel {
1197public:
1198  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
1199                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
1200                             LoopVectorizationLegality *Legal,
1201                             const TargetTransformInfo &TTI,
1202                             const TargetLibraryInfo *TLI, DemandedBits *DB,
1203                             AssumptionCache *AC,
1204                             OptimizationRemarkEmitter *ORE, const Function *F,
1205                             const LoopVectorizeHints *Hints,
1206                             InterleavedAccessInfo &IAI)
1207      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
1208        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
1209        Hints(Hints), InterleaveInfo(IAI) {}
1210
1211  /// \return An upper bound for the vectorization factors (both fixed and
1212  /// scalable). If the factors are 0, vectorization and interleaving should be
1213  /// avoided up front.
1214  FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);
1215
1216  /// \return True if runtime checks are required for vectorization, and false
1217  /// otherwise.
1218  bool runtimeChecksRequired();
1219
  /// Set up cost-based decisions for the user-provided vectorization factor.
1221  /// \return true if the UserVF is a feasible VF to be chosen.
1222  bool selectUserVectorizationFactor(ElementCount UserVF) {
1223    collectUniformsAndScalars(UserVF);
1224    collectInstsToScalarize(UserVF);
1225    return expectedCost(UserVF).first.isValid();
1226  }
1227
1228  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar, such as
  /// 64-bit loop indices.
1231  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1232
1233  /// \return The desired interleave count.
  /// If the interleave count has been specified by metadata, it will be returned.
1235  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1236  /// are the selected vectorization factor and the cost of the selected VF.
1237  unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost);
1238
  /// A memory access instruction may be vectorized in more than one way.
  /// The form of the instruction after vectorization depends on its cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decision map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
1244  /// The calculated cost is saved with widening decision in order to
1245  /// avoid redundant calculations.
1246  void setCostBasedWideningDecision(ElementCount VF);
1247
1248  /// A call may be vectorized in different ways depending on whether we have
1249  /// vectorized variants available and whether the target supports masking.
1250  /// This function analyzes all calls in the function at the supplied VF,
1251  /// makes a decision based on the costs of available options, and stores that
1252  /// decision in a map for use in planning and plan execution.
1253  void setVectorizedCallDecision(ElementCount VF);
1254
1255  /// A struct that represents some properties of the register usage
1256  /// of a loop.
1257  struct RegisterUsage {
1258    /// Holds the number of loop invariant values that are used in the loop.
1259    /// The key is ClassID of target-provided register class.
1260    SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1261    /// Holds the maximum number of concurrent live intervals in the loop.
1262    /// The key is ClassID of target-provided register class.
1263    SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1264  };
1265
  /// \return Information about the register usage of the loop for the given
  /// vectorization factors.
1268  SmallVector<RegisterUsage, 8>
1269  calculateRegisterUsage(ArrayRef<ElementCount> VFs);
1270
1271  /// Collect values we want to ignore in the cost model.
1272  void collectValuesToIgnore();
1273
1274  /// Collect all element types in the loop for which widening is needed.
1275  void collectElementTypesForWidening();
1276
  /// Split reductions into those that happen in the loop and those that happen
  /// outside. In-loop reductions are collected into InLoopReductions.
1279  void collectInLoopReductions();
1280
1281  /// Returns true if we should use strict in-order reductions for the given
1282  /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
1283  /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
1284  /// of FP operations.
1285  bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const {
1286    return !Hints->allowReordering() && RdxDesc.isOrdered();
1287  }
1288
1289  /// \returns The smallest bitwidth each instruction can be represented with.
1290  /// The vector equivalents of these instructions should be truncated to this
1291  /// type.
1292  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1293    return MinBWs;
1294  }
1295
1296  /// \returns True if it is more profitable to scalarize instruction \p I for
1297  /// vectorization factor \p VF.
1298  bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
1299    assert(VF.isVector() &&
1300           "Profitable to scalarize relevant only for VF > 1.");
1301
1302    // Cost model is not run in the VPlan-native path - return conservative
1303    // result until this changes.
1304    if (EnableVPlanNativePath)
1305      return false;
1306
1307    auto Scalars = InstsToScalarize.find(VF);
1308    assert(Scalars != InstsToScalarize.end() &&
1309           "VF not yet analyzed for scalarization profitability");
1310    return Scalars->second.contains(I);
1311  }
1312
1313  /// Returns true if \p I is known to be uniform after vectorization.
1314  bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
1315    // Pseudo probe needs to be duplicated for each unrolled iteration and
1316    // vector lane so that profiled loop trip count can be accurately
1317    // accumulated instead of being under counted.
1318    if (isa<PseudoProbeInst>(I))
1319      return false;
1320
1321    if (VF.isScalar())
1322      return true;
1323
1324    // Cost model is not run in the VPlan-native path - return conservative
1325    // result until this changes.
1326    if (EnableVPlanNativePath)
1327      return false;
1328
1329    auto UniformsPerVF = Uniforms.find(VF);
1330    assert(UniformsPerVF != Uniforms.end() &&
1331           "VF not yet analyzed for uniformity");
1332    return UniformsPerVF->second.count(I);
1333  }
1334
1335  /// Returns true if \p I is known to be scalar after vectorization.
1336  bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
1337    if (VF.isScalar())
1338      return true;
1339
1340    // Cost model is not run in the VPlan-native path - return conservative
1341    // result until this changes.
1342    if (EnableVPlanNativePath)
1343      return false;
1344
1345    auto ScalarsPerVF = Scalars.find(VF);
1346    assert(ScalarsPerVF != Scalars.end() &&
1347           "Scalar values are not calculated for VF");
1348    return ScalarsPerVF->second.count(I);
1349  }
1350
1351  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1352  /// for vectorization factor \p VF.
1353  bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1354    return VF.isVector() && MinBWs.contains(I) &&
1355           !isProfitableToScalarize(I, VF) &&
1356           !isScalarAfterVectorization(I, VF);
1357  }
1358
1359  /// Decision that was taken during cost calculation for memory instruction.
1360  enum InstWidening {
1361    CM_Unknown,
1362    CM_Widen,         // For consecutive accesses with stride +1.
1363    CM_Widen_Reverse, // For consecutive accesses with stride -1.
1364    CM_Interleave,
1365    CM_GatherScatter,
1366    CM_Scalarize,
1367    CM_VectorCall,
1368    CM_IntrinsicCall
1369  };
1370
1371  /// Save vectorization decision \p W and \p Cost taken by the cost model for
1372  /// instruction \p I and vector width \p VF.
1373  void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1374                           InstructionCost Cost) {
1375    assert(VF.isVector() && "Expected VF >=2");
1376    WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1377  }
1378
1379  /// Save vectorization decision \p W and \p Cost taken by the cost model for
1380  /// interleaving group \p Grp and vector width \p VF.
1381  void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1382                           ElementCount VF, InstWidening W,
1383                           InstructionCost Cost) {
1384    assert(VF.isVector() && "Expected VF >=2");
    // Broadcast this decision to all instructions inside the group, but
    // assign the cost to the group's insert position only.
1387    for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1388      if (auto *I = Grp->getMember(i)) {
1389        if (Grp->getInsertPos() == I)
1390          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1391        else
1392          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1393      }
1394    }
1395  }
1396
1397  /// Return the cost model decision for the given instruction \p I and vector
1398  /// width \p VF. Return CM_Unknown if this instruction did not pass
1399  /// through the cost modeling.
1400  InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
1401    assert(VF.isVector() && "Expected VF to be a vector VF");
1402    // Cost model is not run in the VPlan-native path - return conservative
1403    // result until this changes.
1404    if (EnableVPlanNativePath)
1405      return CM_GatherScatter;
1406
1407    std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1408    auto Itr = WideningDecisions.find(InstOnVF);
1409    if (Itr == WideningDecisions.end())
1410      return CM_Unknown;
1411    return Itr->second.first;
1412  }
1413
1414  /// Return the vectorization cost for the given instruction \p I and vector
1415  /// width \p VF.
1416  InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
1417    assert(VF.isVector() && "Expected VF >=2");
1418    std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1419    assert(WideningDecisions.contains(InstOnVF) &&
1420           "The cost is not calculated");
1421    return WideningDecisions[InstOnVF].second;
1422  }
1423
1424  struct CallWideningDecision {
1425    InstWidening Kind;
1426    Function *Variant;
1427    Intrinsic::ID IID;
1428    std::optional<unsigned> MaskPos;
1429    InstructionCost Cost;
1430  };
1431
1432  void setCallWideningDecision(CallInst *CI, ElementCount VF, InstWidening Kind,
1433                               Function *Variant, Intrinsic::ID IID,
1434                               std::optional<unsigned> MaskPos,
1435                               InstructionCost Cost) {
1436    assert(!VF.isScalar() && "Expected vector VF");
1437    CallWideningDecisions[std::make_pair(CI, VF)] = {Kind, Variant, IID,
1438                                                     MaskPos, Cost};
1439  }
1440
1441  CallWideningDecision getCallWideningDecision(CallInst *CI,
1442                                               ElementCount VF) const {
1443    assert(!VF.isScalar() && "Expected vector VF");
1444    return CallWideningDecisions.at(std::make_pair(CI, VF));
1445  }
1446
1447  /// Return True if instruction \p I is an optimizable truncate whose operand
1448  /// is an induction variable. Such a truncate will be removed by adding a new
1449  /// induction variable with the destination type.
1450  bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1451    // If the instruction is not a truncate, return false.
1452    auto *Trunc = dyn_cast<TruncInst>(I);
1453    if (!Trunc)
1454      return false;
1455
1456    // Get the source and destination types of the truncate.
1457    Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1458    Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1459
1460    // If the truncate is free for the given types, return false. Replacing a
1461    // free truncate with an induction variable would add an induction variable
1462    // update instruction to each iteration of the loop. We exclude from this
1463    // check the primary induction variable since it will need an update
1464    // instruction regardless.
1465    Value *Op = Trunc->getOperand(0);
1466    if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1467      return false;
1468
1469    // If the truncated value is not an induction variable, return false.
1470    return Legal->isInductionPhi(Op);
1471  }
1472
1473  /// Collects the instructions to scalarize for each predicated instruction in
1474  /// the loop.
1475  void collectInstsToScalarize(ElementCount VF);
1476
1477  /// Collect Uniform and Scalar values for the given \p VF.
1478  /// The sets depend on CM decision for Load/Store instructions
1479  /// that may be vectorized as interleave, gather-scatter or scalarized.
1480  /// Also make a decision on what to do about call instructions in the loop
1481  /// at that VF -- scalarize, call a known vector routine, or call a
1482  /// vector intrinsic.
1483  void collectUniformsAndScalars(ElementCount VF) {
1484    // Do the analysis once.
1485    if (VF.isScalar() || Uniforms.contains(VF))
1486      return;
1487    setCostBasedWideningDecision(VF);
1488    setVectorizedCallDecision(VF);
1489    collectLoopUniforms(VF);
1490    collectLoopScalars(VF);
1491  }
1492
1493  /// Returns true if the target machine supports masked store operation
1494  /// for the given \p DataType and kind of access to \p Ptr.
1495  bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const {
1496    return Legal->isConsecutivePtr(DataType, Ptr) &&
1497           TTI.isLegalMaskedStore(DataType, Alignment);
1498  }
1499
1500  /// Returns true if the target machine supports masked load operation
1501  /// for the given \p DataType and kind of access to \p Ptr.
1502  bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const {
1503    return Legal->isConsecutivePtr(DataType, Ptr) &&
1504           TTI.isLegalMaskedLoad(DataType, Alignment);
1505  }
1506
1507  /// Returns true if the target machine can represent \p V as a masked gather
1508  /// or scatter operation.
1509  bool isLegalGatherOrScatter(Value *V, ElementCount VF) {
1510    bool LI = isa<LoadInst>(V);
1511    bool SI = isa<StoreInst>(V);
1512    if (!LI && !SI)
1513      return false;
1514    auto *Ty = getLoadStoreType(V);
1515    Align Align = getLoadStoreAlignment(V);
1516    if (VF.isVector())
1517      Ty = VectorType::get(Ty, VF);
1518    return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
1519           (SI && TTI.isLegalMaskedScatter(Ty, Align));
1520  }
1521
1522  /// Returns true if the target machine supports all of the reduction
1523  /// variables found for the given VF.
1524  bool canVectorizeReductions(ElementCount VF) const {
1525    return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1526      const RecurrenceDescriptor &RdxDesc = Reduction.second;
1527      return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1528    }));
1529  }
1530
1531  /// Given costs for both strategies, return true if the scalar predication
1532  /// lowering should be used for div/rem.  This incorporates an override
1533  /// option so it is not simply a cost comparison.
1534  bool isDivRemScalarWithPredication(InstructionCost ScalarCost,
1535                                     InstructionCost SafeDivisorCost) const {
1536    switch (ForceSafeDivisor) {
1537    case cl::BOU_UNSET:
1538      return ScalarCost < SafeDivisorCost;
1539    case cl::BOU_TRUE:
1540      return false;
1541    case cl::BOU_FALSE:
1542      return true;
1543    };
1544    llvm_unreachable("impossible case value");
1545  }
1546
1547  /// Returns true if \p I is an instruction which requires predication and
1548  /// for which our chosen predication strategy is scalarization (i.e. we
1549  /// don't have an alternate strategy such as masking available).
1550  /// \p VF is the vectorization factor that will be used to vectorize \p I.
1551  bool isScalarWithPredication(Instruction *I, ElementCount VF) const;
1552
1553  /// Returns true if \p I is an instruction that needs to be predicated
1554  /// at runtime.  The result is independent of the predication mechanism.
1555  /// Superset of instructions that return true for isScalarWithPredication.
1556  bool isPredicatedInst(Instruction *I) const;
1557
1558  /// Return the costs for our two available strategies for lowering a
1559  /// div/rem operation which requires speculating at least one lane.
1560  /// First result is for scalarization (will be invalid for scalable
1561  /// vectors); second is for the safe-divisor strategy.
1562  std::pair<InstructionCost, InstructionCost>
1563  getDivRemSpeculationCost(Instruction *I,
1564                           ElementCount VF) const;
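  // To illustrate the two strategies for a predicated division (schematic IR,
  // not taken from generated code): scalarization executes the udiv only for
  // the active lanes behind per-lane branches, whereas the safe-divisor
  // strategy keeps the operation vectorized by substituting a harmless divisor
  // in the masked-off lanes, roughly
  //   %d.safe = select <VF x i1> %mask, <VF x i32> %d, <VF x i32> splat(1)
  //   %q      = udiv <VF x i32> %x, %d.safe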
1565
1566  /// Returns true if \p I is a memory instruction with consecutive memory
1567  /// access that can be widened.
1568  bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF);
1569
1570  /// Returns true if \p I is a memory instruction in an interleaved-group
1571  /// of memory accesses that can be vectorized with wide vector loads/stores
1572  /// and shuffles.
1573  bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF);
1574
1575  /// Check if \p Instr belongs to any interleaved access group.
1576  bool isAccessInterleaved(Instruction *Instr) {
1577    return InterleaveInfo.isInterleaved(Instr);
1578  }
1579
1580  /// Get the interleaved access group that \p Instr belongs to.
1581  const InterleaveGroup<Instruction> *
1582  getInterleavedAccessGroup(Instruction *Instr) {
1583    return InterleaveInfo.getInterleaveGroup(Instr);
1584  }
1585
1586  /// Returns true if we're required to use a scalar epilogue for at least
1587  /// the final iteration of the original loop.
1588  bool requiresScalarEpilogue(bool IsVectorizing) const {
1589    if (!isScalarEpilogueAllowed())
1590      return false;
1591    // If we might exit from anywhere but the latch, must run the exiting
1592    // iteration in scalar form.
1593    if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
1594      return true;
1595    return IsVectorizing && InterleaveInfo.requiresScalarEpilogue();
1596  }
1597
1598  /// Returns true if we're required to use a scalar epilogue for at least
1599  /// the final iteration of the original loop for all VFs in \p Range.
1600  /// A scalar epilogue must either be required for all VFs in \p Range or for
1601  /// none.
1602  bool requiresScalarEpilogue(VFRange Range) const {
1603    auto RequiresScalarEpilogue = [this](ElementCount VF) {
1604      return requiresScalarEpilogue(VF.isVector());
1605    };
1606    bool IsRequired = all_of(Range, RequiresScalarEpilogue);
1607    assert(
1608        (IsRequired || none_of(Range, RequiresScalarEpilogue)) &&
1609        "all VFs in range must agree on whether a scalar epilogue is required");
1610    return IsRequired;
1611  }
1612
  /// Returns true if a scalar epilogue is allowed, i.e. it has not been
  /// disallowed due to optsize or a loop hint annotation.
1615  bool isScalarEpilogueAllowed() const {
1616    return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1617  }
1618
1619  /// Returns the TailFoldingStyle that is best for the current loop.
1620  TailFoldingStyle
1621  getTailFoldingStyle(bool IVUpdateMayOverflow = true) const {
1622    if (!CanFoldTailByMasking)
1623      return TailFoldingStyle::None;
1624
1625    if (ForceTailFoldingStyle.getNumOccurrences())
1626      return ForceTailFoldingStyle;
1627
1628    return TTI.getPreferredTailFoldingStyle(IVUpdateMayOverflow);
1629  }
1630
1631  /// Returns true if all loop blocks should be masked to fold tail loop.
1632  bool foldTailByMasking() const {
1633    return getTailFoldingStyle() != TailFoldingStyle::None;
1634  }
1635
  /// Returns true if the instructions in this block require predication
1637  /// for any reason, e.g. because tail folding now requires a predicate
1638  /// or because the block in the original loop was predicated.
1639  bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const {
1640    return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1641  }
1642
1643  /// Returns true if the Phi is part of an inloop reduction.
1644  bool isInLoopReduction(PHINode *Phi) const {
1645    return InLoopReductions.contains(Phi);
1646  }
1647
1648  /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1649  /// with factor VF.  Return the cost of the instruction, including
1650  /// scalarization overhead if it's needed.
1651  InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;
1652
1653  /// Estimate cost of a call instruction CI if it were vectorized with factor
1654  /// VF. Return the cost of the instruction, including scalarization overhead
1655  /// if it's needed.
1656  InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF) const;
1657
1658  /// Invalidates decisions already taken by the cost model.
1659  void invalidateCostModelingDecisions() {
1660    WideningDecisions.clear();
1661    CallWideningDecisions.clear();
1662    Uniforms.clear();
1663    Scalars.clear();
1664  }
1665
1666  /// The vectorization cost is a combination of the cost itself and a boolean
1667  /// indicating whether any of the contributing operations will actually
1668  /// operate on vector values after type legalization in the backend. If this
1669  /// latter value is false, then all operations will be scalarized (i.e. no
1670  /// vectorization has actually taken place).
1671  using VectorizationCostTy = std::pair<InstructionCost, bool>;
1672
1673  /// Returns the expected execution cost. The unit of the cost does
1674  /// not matter because we use the 'cost' units to compare different
1675  /// vector widths. The cost that is returned is *not* normalized by
  /// the vectorization factor. If \p Invalid is not nullptr, this function
1677  /// will add a pair(Instruction*, ElementCount) to \p Invalid for
1678  /// each instruction that has an Invalid cost for the given VF.
1679  VectorizationCostTy
1680  expectedCost(ElementCount VF,
1681               SmallVectorImpl<InstructionVFPair> *Invalid = nullptr);
1682
1683  bool hasPredStores() const { return NumPredStores > 0; }
1684
1685  /// Returns true if epilogue vectorization is considered profitable, and
1686  /// false otherwise.
1687  /// \p VF is the vectorization factor chosen for the original loop.
1688  bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
1689
1690private:
1691  unsigned NumPredStores = 0;
1692
1693  /// \return An upper bound for the vectorization factors for both
1694  /// fixed and scalable vectorization, where the minimum-known number of
1695  /// elements is a power-of-2 larger than zero. If scalable vectorization is
1696  /// disabled or unsupported, then the scalable part will be equal to
1697  /// ElementCount::getScalable(0).
1698  FixedScalableVFPair computeFeasibleMaxVF(unsigned MaxTripCount,
1699                                           ElementCount UserVF,
1700                                           bool FoldTailByMasking);
1701
  /// \return the maximized element count based on the target's vector
  /// registers and the loop trip count, but limited to a maximum safe VF.
1704  /// This is a helper function of computeFeasibleMaxVF.
1705  ElementCount getMaximizedVFForTarget(unsigned MaxTripCount,
1706                                       unsigned SmallestType,
1707                                       unsigned WidestType,
1708                                       ElementCount MaxSafeVF,
1709                                       bool FoldTailByMasking);
1710
1711  /// \return the maximum legal scalable VF, based on the safe max number
1712  /// of elements.
1713  ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1714
1715  /// Returns the execution time cost of an instruction for a given vector
1716  /// width. Vector width of one means scalar.
1717  VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1718
1719  /// The cost-computation logic from getInstructionCost which provides
1720  /// the vector type as an output parameter.
1721  InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
1722                                     Type *&VectorTy);
1723
1724  /// Return the cost of instructions in an inloop reduction pattern, if I is
1725  /// part of that pattern.
1726  std::optional<InstructionCost>
1727  getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
1728                          TTI::TargetCostKind CostKind) const;
1729
1730  /// Calculate vectorization cost of memory instruction \p I.
1731  InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1732
1733  /// The cost computation for scalarized memory instruction.
1734  InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1735
1736  /// The cost computation for interleaving group of memory instructions.
1737  InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1738
1739  /// The cost computation for Gather/Scatter instruction.
1740  InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1741
1742  /// The cost computation for widening instruction \p I with consecutive
1743  /// memory access.
1744  InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1745
1746  /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1747  /// Load: scalar load + broadcast.
1748  /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1749  /// element)
1750  InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1751
1752  /// Estimate the overhead of scalarizing an instruction. This is a
1753  /// convenience wrapper for the type-based getScalarizationOverhead API.
1754  InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF,
1755                                           TTI::TargetCostKind CostKind) const;
1756
1757  /// Returns true if an artificially high cost for emulated masked memrefs
1758  /// should be used.
1759  bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);
1760
1761  /// Map of scalar integer values to the smallest bitwidth they can be legally
1762  /// represented as. The vector equivalents of these values should be truncated
1763  /// to this type.
1764  MapVector<Instruction *, uint64_t> MinBWs;
1765
1766  /// A type representing the costs for instructions if they were to be
1767  /// scalarized rather than vectorized. The entries are Instruction-Cost
1768  /// pairs.
1769  using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1770
  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as predicated blocks.
1773  DenseMap<ElementCount, SmallPtrSet<BasicBlock *, 4>>
1774      PredicatedBBsAfterVectorization;
1775
1776  /// Records whether it is allowed to have the original scalar loop execute at
1777  /// least once. This may be needed as a fallback loop in case runtime
1778  /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or not divisible by the VF,
1780  /// or as a peel-loop to handle gaps in interleave-groups.
1781  /// Under optsize and when the trip count is very small we don't allow any
1782  /// iterations to execute in the scalar loop.
1783  ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1784
1785  /// All blocks of loop are to be masked to fold tail of scalar iterations.
1786  bool CanFoldTailByMasking = false;
1787
1788  /// A map holding scalar costs for different vectorization factors. The
1789  /// presence of a cost for an instruction in the mapping indicates that the
1790  /// instruction will be scalarized when vectorizing with the associated
1791  /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1792  DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1793
1794  /// Holds the instructions known to be uniform after vectorization.
1795  /// The data is collected per VF.
1796  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1797
1798  /// Holds the instructions known to be scalar after vectorization.
1799  /// The data is collected per VF.
1800  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1801
1802  /// Holds the instructions (address computations) that are forced to be
1803  /// scalarized.
1804  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1805
1806  /// PHINodes of the reductions that should be expanded in-loop.
1807  SmallPtrSet<PHINode *, 4> InLoopReductions;
1808
  /// A map of in-loop reduction operations and their immediate chain operand.
1810  /// FIXME: This can be removed once reductions can be costed correctly in
1811  /// VPlan. This was added to allow quick lookup of the inloop operations.
1812  DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1813
1814  /// Returns the expected difference in cost from scalarizing the expression
1815  /// feeding a predicated instruction \p PredInst. The instructions to
1816  /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1817  /// non-negative return value implies the expression will be scalarized.
1818  /// Currently, only single-use chains are considered for scalarization.
1819  InstructionCost computePredInstDiscount(Instruction *PredInst,
1820                                          ScalarCostsTy &ScalarCosts,
1821                                          ElementCount VF);
1822
1823  /// Collect the instructions that are uniform after vectorization. An
1824  /// instruction is uniform if we represent it with a single scalar value in
1825  /// the vectorized loop corresponding to each vector iteration. Examples of
1826  /// uniform instructions include pointer operands of consecutive or
1827  /// interleaved memory accesses. Note that although uniformity implies an
1828  /// instruction will be scalar, the reverse is not true. In general, a
1829  /// scalarized instruction will be represented by VF scalar values in the
1830  /// vectorized loop, each corresponding to an iteration of the original
1831  /// scalar loop.
1832  void collectLoopUniforms(ElementCount VF);
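  // For instance (illustrative only): in
  //   %gep = getelementptr inbounds i32, ptr %A, i64 %iv
  //   %v   = load i32, ptr %gep
  // where the load is widened into a single wide load, %gep is uniform: one
  // scalar address per vector iteration suffices, even though the lanes of
  // that iteration correspond to VF consecutive values of %iv.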
1833
1834  /// Collect the instructions that are scalar after vectorization. An
1835  /// instruction is scalar if it is known to be uniform or will be scalarized
1836  /// during vectorization. collectLoopScalars should only add non-uniform nodes
1837  /// to the list if they are used by a load/store instruction that is marked as
1838  /// CM_Scalarize. Non-uniform scalarized instructions will be represented by
1839  /// VF values in the vectorized loop, each corresponding to an iteration of
1840  /// the original scalar loop.
1841  void collectLoopScalars(ElementCount VF);
1842
1843  /// Keeps cost model vectorization decision and cost for instructions.
1844  /// Right now it is used for memory instructions only.
1845  using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
1846                                std::pair<InstWidening, InstructionCost>>;
1847
1848  DecisionList WideningDecisions;
1849
1850  using CallDecisionList =
1851      DenseMap<std::pair<CallInst *, ElementCount>, CallWideningDecision>;
1852
1853  CallDecisionList CallWideningDecisions;
1854
1855  /// Returns true if \p V is expected to be vectorized and it needs to be
1856  /// extracted.
1857  bool needsExtract(Value *V, ElementCount VF) const {
1858    Instruction *I = dyn_cast<Instruction>(V);
1859    if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1860        TheLoop->isLoopInvariant(I))
1861      return false;
1862
1863    // Assume we can vectorize V (and hence we need extraction) if the
    // scalars are not computed yet. This can happen because it is called
1865    // via getScalarizationOverhead from setCostBasedWideningDecision, before
1866    // the scalars are collected. That should be a safe assumption in most
1867    // cases, because we check if the operands have vectorizable types
1868    // beforehand in LoopVectorizationLegality.
1869    return !Scalars.contains(VF) || !isScalarAfterVectorization(I, VF);
1870  };
1871
1872  /// Returns a range containing only operands needing to be extracted.
1873  SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1874                                                   ElementCount VF) const {
1875    return SmallVector<Value *, 4>(make_filter_range(
1876        Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1877  }
1878
1879public:
1880  /// The loop that we evaluate.
1881  Loop *TheLoop;
1882
1883  /// Predicated scalar evolution analysis.
1884  PredicatedScalarEvolution &PSE;
1885
1886  /// Loop Info analysis.
1887  LoopInfo *LI;
1888
1889  /// Vectorization legality.
1890  LoopVectorizationLegality *Legal;
1891
1892  /// Vector target information.
1893  const TargetTransformInfo &TTI;
1894
1895  /// Target Library Info.
1896  const TargetLibraryInfo *TLI;
1897
1898  /// Demanded bits analysis.
1899  DemandedBits *DB;
1900
1901  /// Assumption cache.
1902  AssumptionCache *AC;
1903
1904  /// Interface to emit optimization remarks.
1905  OptimizationRemarkEmitter *ORE;
1906
1907  const Function *TheFunction;
1908
1909  /// Loop Vectorize Hint.
1910  const LoopVectorizeHints *Hints;
1911
  /// The interleaved access information contains groups of interleaved accesses
  /// that have the same stride and are close to each other.
1914  InterleavedAccessInfo &InterleaveInfo;
1915
1916  /// Values to ignore in the cost model.
1917  SmallPtrSet<const Value *, 16> ValuesToIgnore;
1918
1919  /// Values to ignore in the cost model when VF > 1.
1920  SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1921
1922  /// All element types found in the loop.
1923  SmallPtrSet<Type *, 16> ElementTypesInLoop;
1924};
1925} // end namespace llvm
1926
1927namespace {
1928/// Helper struct to manage generating runtime checks for vectorization.
1929///
/// The runtime checks are created up-front in temporary blocks that are
/// un-linked from the existing IR, to allow their cost to be estimated more
/// accurately. After deciding to vectorize, the checks are moved back. If we
/// decide not to vectorize, the temporary blocks are completely removed.
1934class GeneratedRTChecks {
1935  /// Basic block which contains the generated SCEV checks, if any.
1936  BasicBlock *SCEVCheckBlock = nullptr;
1937
1938  /// The value representing the result of the generated SCEV checks. If it is
1939  /// nullptr, either no SCEV checks have been generated or they have been used.
1940  Value *SCEVCheckCond = nullptr;
1941
1942  /// Basic block which contains the generated memory runtime checks, if any.
1943  BasicBlock *MemCheckBlock = nullptr;
1944
1945  /// The value representing the result of the generated memory runtime checks.
1946  /// If it is nullptr, either no memory runtime checks have been generated or
1947  /// they have been used.
1948  Value *MemRuntimeCheckCond = nullptr;
1949
1950  DominatorTree *DT;
1951  LoopInfo *LI;
1952  TargetTransformInfo *TTI;
1953
1954  SCEVExpander SCEVExp;
1955  SCEVExpander MemCheckExp;
1956
1957  bool CostTooHigh = false;
1958  const bool AddBranchWeights;
1959
1960  Loop *OuterLoop = nullptr;
1961
1962public:
1963  GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
1964                    TargetTransformInfo *TTI, const DataLayout &DL,
1965                    bool AddBranchWeights)
1966      : DT(DT), LI(LI), TTI(TTI), SCEVExp(SE, DL, "scev.check"),
1967        MemCheckExp(SE, DL, "scev.check"), AddBranchWeights(AddBranchWeights) {}
1968
1969  /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
1970  /// accurately estimate the cost of the runtime checks. The blocks are
  /// un-linked from the IR and are added back during vector code generation. If
1972  /// there is no vector code generation, the check blocks are removed
1973  /// completely.
1974  void Create(Loop *L, const LoopAccessInfo &LAI,
1975              const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) {
1976
1977    // Hard cutoff to limit compile-time increase in case a very large number of
1978    // runtime checks needs to be generated.
1979    // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to
1980    // profile info.
1981    CostTooHigh =
1982        LAI.getNumRuntimePointerChecks() > VectorizeMemoryCheckThreshold;
1983    if (CostTooHigh)
1984      return;
1985
1986    BasicBlock *LoopHeader = L->getHeader();
1987    BasicBlock *Preheader = L->getLoopPreheader();
1988
1989    // Use SplitBlock to create blocks for SCEV & memory runtime checks to
1990    // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
1991    // may be used by SCEVExpander. The blocks will be un-linked from their
1992    // predecessors and removed from LI & DT at the end of the function.
1993    if (!UnionPred.isAlwaysTrue()) {
1994      SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
1995                                  nullptr, "vector.scevcheck");
1996
1997      SCEVCheckCond = SCEVExp.expandCodeForPredicate(
1998          &UnionPred, SCEVCheckBlock->getTerminator());
1999    }
2000
2001    const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
2002    if (RtPtrChecking.Need) {
2003      auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
2004      MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
2005                                 "vector.memcheck");
2006
2007      auto DiffChecks = RtPtrChecking.getDiffChecks();
2008      if (DiffChecks) {
2009        Value *RuntimeVF = nullptr;
2010        MemRuntimeCheckCond = addDiffRuntimeChecks(
2011            MemCheckBlock->getTerminator(), *DiffChecks, MemCheckExp,
2012            [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) {
2013              if (!RuntimeVF)
2014                RuntimeVF = getRuntimeVF(B, B.getIntNTy(Bits), VF);
2015              return RuntimeVF;
2016            },
2017            IC);
2018      } else {
2019        MemRuntimeCheckCond = addRuntimeChecks(
2020            MemCheckBlock->getTerminator(), L, RtPtrChecking.getChecks(),
2021            MemCheckExp, VectorizerParams::HoistRuntimeChecks);
2022      }
2023      assert(MemRuntimeCheckCond &&
2024             "no RT checks generated although RtPtrChecking "
2025             "claimed checks are required");
2026    }
2027
2028    if (!MemCheckBlock && !SCEVCheckBlock)
2029      return;
2030
    // Unhook the temporary blocks containing the checks and update various
    // places accordingly.
2033    if (SCEVCheckBlock)
2034      SCEVCheckBlock->replaceAllUsesWith(Preheader);
2035    if (MemCheckBlock)
2036      MemCheckBlock->replaceAllUsesWith(Preheader);
2037
2038    if (SCEVCheckBlock) {
2039      SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
2040      new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
2041      Preheader->getTerminator()->eraseFromParent();
2042    }
2043    if (MemCheckBlock) {
2044      MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
2045      new UnreachableInst(Preheader->getContext(), MemCheckBlock);
2046      Preheader->getTerminator()->eraseFromParent();
2047    }
2048
2049    DT->changeImmediateDominator(LoopHeader, Preheader);
2050    if (MemCheckBlock) {
2051      DT->eraseNode(MemCheckBlock);
2052      LI->removeBlock(MemCheckBlock);
2053    }
2054    if (SCEVCheckBlock) {
2055      DT->eraseNode(SCEVCheckBlock);
2056      LI->removeBlock(SCEVCheckBlock);
2057    }
2058
2059    // Outer loop is used as part of the later cost calculations.
2060    OuterLoop = L->getParentLoop();
2061  }
2062
2063  InstructionCost getCost() {
2064    if (SCEVCheckBlock || MemCheckBlock)
2065      LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n");
2066
2067    if (CostTooHigh) {
2068      InstructionCost Cost;
2069      Cost.setInvalid();
2070      LLVM_DEBUG(dbgs() << "  number of checks exceeded threshold\n");
2071      return Cost;
2072    }
2073
2074    InstructionCost RTCheckCost = 0;
2075    if (SCEVCheckBlock)
2076      for (Instruction &I : *SCEVCheckBlock) {
2077        if (SCEVCheckBlock->getTerminator() == &I)
2078          continue;
2079        InstructionCost C =
2080            TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
2081        LLVM_DEBUG(dbgs() << "  " << C << "  for " << I << "\n");
2082        RTCheckCost += C;
2083      }
2084    if (MemCheckBlock) {
2085      InstructionCost MemCheckCost = 0;
2086      for (Instruction &I : *MemCheckBlock) {
2087        if (MemCheckBlock->getTerminator() == &I)
2088          continue;
2089        InstructionCost C =
2090            TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
2091        LLVM_DEBUG(dbgs() << "  " << C << "  for " << I << "\n");
2092        MemCheckCost += C;
2093      }
2094
      // If the runtime memory checks are being created inside an outer loop,
      // we should find out whether these checks are outer-loop invariant. If
      // so, the checks will likely be hoisted out, and so the effective cost
      // is reduced according to the outer loop trip count.
2099      if (OuterLoop) {
2100        ScalarEvolution *SE = MemCheckExp.getSE();
        // TODO: If profitable, we could refine this further by analyzing every
2102        // individual memory check, since there could be a mixture of loop
2103        // variant and invariant checks that mean the final condition is
2104        // variant.
2105        const SCEV *Cond = SE->getSCEV(MemRuntimeCheckCond);
2106        if (SE->isLoopInvariant(Cond, OuterLoop)) {
2107          // It seems reasonable to assume that we can reduce the effective
2108          // cost of the checks even when we know nothing about the trip
2109          // count. Assume that the outer loop executes at least twice.
2110          unsigned BestTripCount = 2;
2111
          // If the exact trip count is known, use that.
2113          if (unsigned SmallTC = SE->getSmallConstantTripCount(OuterLoop))
2114            BestTripCount = SmallTC;
2115          else if (LoopVectorizeWithBlockFrequency) {
2116            // Else use profile data if available.
2117            if (auto EstimatedTC = getLoopEstimatedTripCount(OuterLoop))
2118              BestTripCount = *EstimatedTC;
2119          }
2120
2121          InstructionCost NewMemCheckCost = MemCheckCost / BestTripCount;
2122
2123          // Let's ensure the cost is always at least 1.
2124          NewMemCheckCost = std::max(*NewMemCheckCost.getValue(),
2125                                     (InstructionCost::CostType)1);
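          // Illustrative numbers: a MemCheckCost of 20 with BestTripCount 10
          // yields an effective cost of 2, while a cost of 3 with trip count 8
          // would divide to 0 and is clamped up to 1 here.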
2126
2127          LLVM_DEBUG(dbgs()
2128                     << "We expect runtime memory checks to be hoisted "
2129                     << "out of the outer loop. Cost reduced from "
2130                     << MemCheckCost << " to " << NewMemCheckCost << '\n');
2131
2132          MemCheckCost = NewMemCheckCost;
2133        }
2134      }
2135
2136      RTCheckCost += MemCheckCost;
2137    }
2138
2139    if (SCEVCheckBlock || MemCheckBlock)
2140      LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost
2141                        << "\n");
2142
2143    return RTCheckCost;
2144  }
2145
2146  /// Remove the created SCEV & memory runtime check blocks & instructions, if
2147  /// unused.
2148  ~GeneratedRTChecks() {
2149    SCEVExpanderCleaner SCEVCleaner(SCEVExp);
2150    SCEVExpanderCleaner MemCheckCleaner(MemCheckExp);
2151    if (!SCEVCheckCond)
2152      SCEVCleaner.markResultUsed();
2153
2154    if (!MemRuntimeCheckCond)
2155      MemCheckCleaner.markResultUsed();
2156
2157    if (MemRuntimeCheckCond) {
2158      auto &SE = *MemCheckExp.getSE();
2159      // Memory runtime check generation creates compares that use expanded
2160      // values. Remove them before running the SCEVExpanderCleaners.
2161      for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
2162        if (MemCheckExp.isInsertedInstruction(&I))
2163          continue;
2164        SE.forgetValue(&I);
2165        I.eraseFromParent();
2166      }
2167    }
2168    MemCheckCleaner.cleanup();
2169    SCEVCleaner.cleanup();
2170
2171    if (SCEVCheckCond)
2172      SCEVCheckBlock->eraseFromParent();
2173    if (MemRuntimeCheckCond)
2174      MemCheckBlock->eraseFromParent();
2175  }
2176
2177  /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
2178  /// adjusts the branches to branch to the vector preheader or \p Bypass,
2179  /// depending on the generated condition.
2180  BasicBlock *emitSCEVChecks(BasicBlock *Bypass,
2181                             BasicBlock *LoopVectorPreHeader,
2182                             BasicBlock *LoopExitBlock) {
2183    if (!SCEVCheckCond)
2184      return nullptr;
2185
2186    Value *Cond = SCEVCheckCond;
2187    // Mark the check as used, to prevent it from being removed during cleanup.
2188    SCEVCheckCond = nullptr;
2189    if (auto *C = dyn_cast<ConstantInt>(Cond))
2190      if (C->isZero())
2191        return nullptr;
2192
2193    auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2194
2195    BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
    // If the check block sits inside an outer loop, register it with LoopInfo.
2197    if (OuterLoop)
2198      OuterLoop->addBasicBlockToLoop(SCEVCheckBlock, *LI);
2199
2200    SCEVCheckBlock->getTerminator()->eraseFromParent();
2201    SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
2202    Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2203                                                SCEVCheckBlock);
2204
2205    DT->addNewBlock(SCEVCheckBlock, Pred);
2206    DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock);
2207
2208    BranchInst &BI = *BranchInst::Create(Bypass, LoopVectorPreHeader, Cond);
2209    if (AddBranchWeights)
2210      setBranchWeights(BI, SCEVCheckBypassWeights);
2211    ReplaceInstWithInst(SCEVCheckBlock->getTerminator(), &BI);
2212    return SCEVCheckBlock;
2213  }
2214
2215  /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
2216  /// the branches to branch to the vector preheader or \p Bypass, depending on
2217  /// the generated condition.
2218  BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass,
2219                                   BasicBlock *LoopVectorPreHeader) {
    // Check if we generated code that checks at runtime whether arrays overlap.
2221    if (!MemRuntimeCheckCond)
2222      return nullptr;
2223
2224    auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2225    Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2226                                                MemCheckBlock);
2227
2228    DT->addNewBlock(MemCheckBlock, Pred);
2229    DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock);
2230    MemCheckBlock->moveBefore(LoopVectorPreHeader);
2231
2232    if (OuterLoop)
2233      OuterLoop->addBasicBlockToLoop(MemCheckBlock, *LI);
2234
2235    BranchInst &BI =
2236        *BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond);
2237    if (AddBranchWeights) {
2238      setBranchWeights(BI, MemCheckBypassWeights);
2239    }
2240    ReplaceInstWithInst(MemCheckBlock->getTerminator(), &BI);
2241    MemCheckBlock->getTerminator()->setDebugLoc(
2242        Pred->getTerminator()->getDebugLoc());
2243
2244    // Mark the check as used, to prevent it from being removed during cleanup.
2245    MemRuntimeCheckCond = nullptr;
2246    return MemCheckBlock;
2247  }
2248};
2249} // namespace
2250
2251static bool useActiveLaneMask(TailFoldingStyle Style) {
2252  return Style == TailFoldingStyle::Data ||
2253         Style == TailFoldingStyle::DataAndControlFlow ||
2254         Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
2255}
2256
2257static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style) {
2258  return Style == TailFoldingStyle::DataAndControlFlow ||
2259         Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
2260}
2261
2262// Return true if \p OuterLp is an outer loop annotated with hints for explicit
2263// vectorization. The loop needs to be annotated with #pragma omp simd
// simdlen(#) or #pragma clang loop vectorize(enable) vectorize_width(#). If the
2265// vector length information is not provided, vectorization is not considered
2266// explicit. Interleave hints are not allowed either. These limitations will be
2267// relaxed in the future.
// Please note that we are currently forced to abuse the pragma 'clang
2269// vectorize' semantics. This pragma provides *auto-vectorization hints*
2270// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
2271// provides *explicit vectorization hints* (LV can bypass legal checks and
2272// assume that vectorization is legal). However, both hints are implemented
2273// using the same metadata (llvm.loop.vectorize, processed by
2274// LoopVectorizeHints). This will be fixed in the future when the native IR
2275// representation for pragma 'omp simd' is introduced.
2276static bool isExplicitVecOuterLoop(Loop *OuterLp,
2277                                   OptimizationRemarkEmitter *ORE) {
2278  assert(!OuterLp->isInnermost() && "This is not an outer loop");
2279  LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
2280
2281  // Only outer loops with an explicit vectorization hint are supported.
2282  // Unannotated outer loops are ignored.
2283  if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
2284    return false;
2285
2286  Function *Fn = OuterLp->getHeader()->getParent();
2287  if (!Hints.allowVectorization(Fn, OuterLp,
2288                                true /*VectorizeOnlyWhenForced*/)) {
2289    LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
2290    return false;
2291  }
2292
2293  if (Hints.getInterleave() > 1) {
2294    // TODO: Interleave support is future work.
2295    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
2296                         "outer loops.\n");
2297    Hints.emitRemarkWithHints();
2298    return false;
2299  }
2300
2301  return true;
2302}
2303
2304static void collectSupportedLoops(Loop &L, LoopInfo *LI,
2305                                  OptimizationRemarkEmitter *ORE,
2306                                  SmallVectorImpl<Loop *> &V) {
2307  // Collect inner loops and outer loops without irreducible control flow. For
2308  // now, only collect outer loops that have explicit vectorization hints. If we
2309  // are stress testing the VPlan H-CFG construction, we collect the outermost
2310  // loop of every loop nest.
2311  if (L.isInnermost() || VPlanBuildStressTest ||
2312      (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
2313    LoopBlocksRPO RPOT(&L);
2314    RPOT.perform(LI);
2315    if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
2316      V.push_back(&L);
2317      // TODO: Collect inner loops inside marked outer loops in case
2318      // vectorization fails for the outer loop. Do not invoke
2319      // 'containsIrreducibleCFG' again for inner loops when the outer loop is
2320      // already known to be reducible. We can use an inherited attribute for
2321      // that.
2322      return;
2323    }
2324  }
2325  for (Loop *InnerL : L)
2326    collectSupportedLoops(*InnerL, LI, ORE, V);
2327}
2328
2329//===----------------------------------------------------------------------===//
2330// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2331// LoopVectorizationCostModel and LoopVectorizationPlanner.
2332//===----------------------------------------------------------------------===//
2333
2334/// Compute the transformed value of Index at offset StartValue using step
2335/// StepValue.
2336/// For integer induction, returns StartValue + Index * StepValue.
2337/// For pointer induction, returns StartValue[Index * StepValue].
2338/// FIXME: The newly created binary instructions should contain nsw/nuw
2339/// flags, which can be found from the original scalar operations.
2340static Value *
2341emitTransformedIndex(IRBuilderBase &B, Value *Index, Value *StartValue,
2342                     Value *Step,
2343                     InductionDescriptor::InductionKind InductionKind,
2344                     const BinaryOperator *InductionBinOp) {
2345  Type *StepTy = Step->getType();
2346  Value *CastedIndex = StepTy->isIntegerTy()
2347                           ? B.CreateSExtOrTrunc(Index, StepTy)
2348                           : B.CreateCast(Instruction::SIToFP, Index, StepTy);
2349  if (CastedIndex != Index) {
2350    CastedIndex->setName(CastedIndex->getName() + ".cast");
2351    Index = CastedIndex;
2352  }
2353
2354  // Note: the IR at this point is broken. We cannot use SE to create any new
2355  // SCEV and then expand it, hoping that SCEV's simplification will give us
  // more optimal code. Unfortunately, attempting to do so on invalid IR may
  // lead to various SCEV crashes. So all we can do is use the builder and rely
2358  // on InstCombine for future simplifications. Here we handle some trivial
2359  // cases only.
2360  auto CreateAdd = [&B](Value *X, Value *Y) {
2361    assert(X->getType() == Y->getType() && "Types don't match!");
2362    if (auto *CX = dyn_cast<ConstantInt>(X))
2363      if (CX->isZero())
2364        return Y;
2365    if (auto *CY = dyn_cast<ConstantInt>(Y))
2366      if (CY->isZero())
2367        return X;
2368    return B.CreateAdd(X, Y);
2369  };
2370
2371  // We allow X to be a vector type, in which case Y will potentially be
2372  // splatted into a vector with the same element count.
2373  auto CreateMul = [&B](Value *X, Value *Y) {
2374    assert(X->getType()->getScalarType() == Y->getType() &&
2375           "Types don't match!");
2376    if (auto *CX = dyn_cast<ConstantInt>(X))
2377      if (CX->isOne())
2378        return Y;
2379    if (auto *CY = dyn_cast<ConstantInt>(Y))
2380      if (CY->isOne())
2381        return X;
2382    VectorType *XVTy = dyn_cast<VectorType>(X->getType());
2383    if (XVTy && !isa<VectorType>(Y->getType()))
2384      Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
2385    return B.CreateMul(X, Y);
2386  };
2387
2388  switch (InductionKind) {
2389  case InductionDescriptor::IK_IntInduction: {
2390    assert(!isa<VectorType>(Index->getType()) &&
2391           "Vector indices not supported for integer inductions yet");
2392    assert(Index->getType() == StartValue->getType() &&
2393           "Index type does not match StartValue type");
2394    if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne())
2395      return B.CreateSub(StartValue, Index);
2396    auto *Offset = CreateMul(Index, Step);
2397    return CreateAdd(StartValue, Offset);
2398  }
2399  case InductionDescriptor::IK_PtrInduction:
2400    return B.CreatePtrAdd(StartValue, CreateMul(Index, Step));
2401  case InductionDescriptor::IK_FpInduction: {
2402    assert(!isa<VectorType>(Index->getType()) &&
2403           "Vector indices not supported for FP inductions yet");
2404    assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2405    assert(InductionBinOp &&
2406           (InductionBinOp->getOpcode() == Instruction::FAdd ||
2407            InductionBinOp->getOpcode() == Instruction::FSub) &&
2408           "Original bin op should be defined for FP induction");
2409
2410    Value *MulExp = B.CreateFMul(Step, Index);
2411    return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2412                         "induction");
2413  }
2414  case InductionDescriptor::IK_NoInduction:
2415    return nullptr;
2416  }
2417  llvm_unreachable("invalid enum");
2418}
2419
2420std::optional<unsigned> getMaxVScale(const Function &F,
2421                                     const TargetTransformInfo &TTI) {
2422  if (std::optional<unsigned> MaxVScale = TTI.getMaxVScale())
2423    return MaxVScale;
2424
2425  if (F.hasFnAttribute(Attribute::VScaleRange))
2426    return F.getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
2427
2428  return std::nullopt;
2429}
2430
/// For the given VF and UF and maximum trip count computed for the loop, return
/// true if the induction variable cannot overflow in the vectorized loop, i.e.
/// the runtime overflow check is known to always evaluate to false and can be
/// removed.
2435static bool isIndvarOverflowCheckKnownFalse(
2436    const LoopVectorizationCostModel *Cost,
2437    ElementCount VF, std::optional<unsigned> UF = std::nullopt) {
2438  // Always be conservative if we don't know the exact unroll factor.
2439  unsigned MaxUF = UF ? *UF : Cost->TTI.getMaxInterleaveFactor(VF);
2440
2441  Type *IdxTy = Cost->Legal->getWidestInductionType();
2442  APInt MaxUIntTripCount = cast<IntegerType>(IdxTy)->getMask();
2443
  // The runtime overflow check is known to be false iff the (max) trip count
  // is known and (max) trip count + (VF * UF) does not overflow in the type of
  // the vector loop induction variable.
2447  if (unsigned TC =
2448          Cost->PSE.getSE()->getSmallConstantMaxTripCount(Cost->TheLoop)) {
2449    uint64_t MaxVF = VF.getKnownMinValue();
2450    if (VF.isScalable()) {
2451      std::optional<unsigned> MaxVScale =
2452          getMaxVScale(*Cost->TheFunction, Cost->TTI);
2453      if (!MaxVScale)
2454        return false;
2455      MaxVF *= *MaxVScale;
2456    }
2457
2458    return (MaxUIntTripCount - TC).ugt(MaxVF * MaxUF);
2459  }
2460
2461  return false;
2462}
2463
2464// Return whether we allow using masked interleave-groups (for dealing with
2465// strided loads/stores that reside in predicated blocks, or for dealing
2466// with gaps).
2467static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2468  // If an override option has been passed in for interleaved accesses, use it.
2469  if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2470    return EnableMaskedInterleavedMemAccesses;
2471
2472  return TTI.enableMaskedInterleavedAccessVectorization();
2473}
2474
2475// Try to vectorize the interleave group that \p Instr belongs to.
2476//
2477// E.g. Translate following interleaved load group (factor = 3):
2478//   for (i = 0; i < N; i+=3) {
2479//     R = Pic[i];             // Member of index 0
2480//     G = Pic[i+1];           // Member of index 1
2481//     B = Pic[i+2];           // Member of index 2
2482//     ... // do something to R, G, B
2483//   }
2484// To:
2485//   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2486//   %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9>   ; R elements
2487//   %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10>  ; G elements
2488//   %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11>  ; B elements
2489//
2490// Or translate following interleaved store group (factor = 3):
2491//   for (i = 0; i < N; i+=3) {
2492//     ... do something to R, G, B
2493//     Pic[i]   = R;           // Member of index 0
2494//     Pic[i+1] = G;           // Member of index 1
2495//     Pic[i+2] = B;           // Member of index 2
2496//   }
2497// To:
2498//   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2499//   %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
2500//   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2501//        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2502//   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2503void InnerLoopVectorizer::vectorizeInterleaveGroup(
2504    const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs,
2505    VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues,
2506    VPValue *BlockInMask, bool NeedsMaskForGaps) {
2507  Instruction *Instr = Group->getInsertPos();
2508  const DataLayout &DL = Instr->getModule()->getDataLayout();
2509
  // Prepare the vector type for the interleaved load/store.
2511  Type *ScalarTy = getLoadStoreType(Instr);
2512  unsigned InterleaveFactor = Group->getFactor();
2513  auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
2514
2515  // Prepare for the new pointers.
2516  SmallVector<Value *, 2> AddrParts;
2517  unsigned Index = Group->getIndex(Instr);
2518
2519  // TODO: extend the masked interleaved-group support to reversed access.
2520  assert((!BlockInMask || !Group->isReverse()) &&
2521         "Reversed masked interleave-group not supported.");
2522
2523  Value *Idx;
2524  // If the group is reverse, adjust the index to refer to the last vector lane
2525  // instead of the first. We adjust the index from the first vector lane,
2526  // rather than directly getting the pointer for lane VF - 1, because the
2527  // pointer operand of the interleaved access is supposed to be uniform. For
2528  // uniform instructions, we're only required to generate a value for the
2529  // first vector lane in each unroll iteration.
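  // For illustration only, assuming a fixed VF of 4 and an interleave factor
  // of 3: a reversed group member at index Index gets
  //   Idx = -((4 - 1) * 3 + Index) = -(9 + Index),
  // while a non-reversed member simply uses -Index.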
2530  if (Group->isReverse()) {
2531    Value *RuntimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), VF);
2532    Idx = Builder.CreateSub(RuntimeVF, Builder.getInt32(1));
2533    Idx = Builder.CreateMul(Idx, Builder.getInt32(Group->getFactor()));
2534    Idx = Builder.CreateAdd(Idx, Builder.getInt32(Index));
2535    Idx = Builder.CreateNeg(Idx);
2536  } else
2537    Idx = Builder.getInt32(-Index);
2538
2539  for (unsigned Part = 0; Part < UF; Part++) {
2540    Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
2541    if (auto *I = dyn_cast<Instruction>(AddrPart))
2542      State.setDebugLocFrom(I->getDebugLoc());
2543
    // Note that the current instruction could be at any member index; the
    // address needs to be adjusted to the member of index 0.
    //
    // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
    //       b = A[i];       // Member of index 0
    // The current pointer points to A[i+1]; adjust it to A[i].
    //
    // E.g.  A[i+1] = a;     // Member of index 1
    //       A[i]   = b;     // Member of index 0
    //       A[i+2] = c;     // Member of index 2 (Current instruction)
    // The current pointer points to A[i+2]; adjust it to A[i].
2555
2556    bool InBounds = false;
2557    if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2558      InBounds = gep->isInBounds();
2559    AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Idx, "", InBounds);
2560    AddrParts.push_back(AddrPart);
2561  }
2562
2563  State.setDebugLocFrom(Instr->getDebugLoc());
2564  Value *PoisonVec = PoisonValue::get(VecTy);
2565
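  // CreateGroupMask builds the per-part mask for the wide access. As an
  // illustration (fixed VF = 4, interleave factor = 3), a block-in mask
  // <m0, m1, m2, m3> is replicated element-wise to
  // <m0, m0, m0, m1, m1, m1, m2, m2, m2, m3, m3, m3> and, if needed, ANDed
  // with the mask for gaps. For scalable VFs (factor 2 only), two copies of
  // the block-in mask are combined via llvm.experimental.vector.interleave2.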
2566  auto CreateGroupMask = [this, &BlockInMask, &State, &InterleaveFactor](
2567                             unsigned Part, Value *MaskForGaps) -> Value * {
2568    if (VF.isScalable()) {
2569      assert(!MaskForGaps && "Interleaved groups with gaps are not supported.");
2570      assert(InterleaveFactor == 2 &&
2571             "Unsupported deinterleave factor for scalable vectors");
2572      auto *BlockInMaskPart = State.get(BlockInMask, Part);
2573      SmallVector<Value *, 2> Ops = {BlockInMaskPart, BlockInMaskPart};
2574      auto *MaskTy =
2575          VectorType::get(Builder.getInt1Ty(), VF.getKnownMinValue() * 2, true);
2576      return Builder.CreateIntrinsic(
2577          MaskTy, Intrinsic::experimental_vector_interleave2, Ops,
2578          /*FMFSource=*/nullptr, "interleaved.mask");
2579    }
2580
2581    if (!BlockInMask)
2582      return MaskForGaps;
2583
2584    Value *BlockInMaskPart = State.get(BlockInMask, Part);
2585    Value *ShuffledMask = Builder.CreateShuffleVector(
2586        BlockInMaskPart,
2587        createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2588        "interleaved.mask");
2589    return MaskForGaps ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2590                                             MaskForGaps)
2591                       : ShuffledMask;
2592  };
2593
2594  // Vectorize the interleaved load group.
2595  if (isa<LoadInst>(Instr)) {
2596    Value *MaskForGaps = nullptr;
2597    if (NeedsMaskForGaps) {
2598      MaskForGaps =
2599          createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2600      assert(MaskForGaps && "Mask for Gaps is required but it is null");
2601    }
2602
2603    // For each unroll part, create a wide load for the group.
2604    SmallVector<Value *, 2> NewLoads;
2605    for (unsigned Part = 0; Part < UF; Part++) {
2606      Instruction *NewLoad;
2607      if (BlockInMask || MaskForGaps) {
2608        assert(useMaskedInterleavedAccesses(*TTI) &&
2609               "masked interleaved groups are not allowed.");
2610        Value *GroupMask = CreateGroupMask(Part, MaskForGaps);
2611        NewLoad =
2612            Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(),
2613                                     GroupMask, PoisonVec, "wide.masked.vec");
      } else
2616        NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2617                                            Group->getAlign(), "wide.vec");
2618      Group->addMetadata(NewLoad);
2619      NewLoads.push_back(NewLoad);
2620    }
2621
2622    if (VecTy->isScalableTy()) {
2623      assert(InterleaveFactor == 2 &&
2624             "Unsupported deinterleave factor for scalable vectors");
2625
2626      for (unsigned Part = 0; Part < UF; ++Part) {
2627        // Scalable vectors cannot use arbitrary shufflevectors (only splats),
2628        // so must use intrinsics to deinterleave.
2629        Value *DI = Builder.CreateIntrinsic(
2630            Intrinsic::experimental_vector_deinterleave2, VecTy, NewLoads[Part],
2631            /*FMFSource=*/nullptr, "strided.vec");
2632        unsigned J = 0;
2633        for (unsigned I = 0; I < InterleaveFactor; ++I) {
2634          Instruction *Member = Group->getMember(I);
2635
2636          if (!Member)
2637            continue;
2638
2639          Value *StridedVec = Builder.CreateExtractValue(DI, I);
          // If this member has a different type, cast the result type.
2641          if (Member->getType() != ScalarTy) {
2642            VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2643            StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2644          }
2645
2646          if (Group->isReverse())
2647            StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse");
2648
2649          State.set(VPDefs[J], StridedVec, Part);
2650          ++J;
2651        }
2652      }
2653
2654      return;
2655    }
2656
2657    // For each member in the group, shuffle out the appropriate data from the
2658    // wide loads.
2659    unsigned J = 0;
2660    for (unsigned I = 0; I < InterleaveFactor; ++I) {
2661      Instruction *Member = Group->getMember(I);
2662
2663      // Skip the gaps in the group.
2664      if (!Member)
2665        continue;
2666
2667      auto StrideMask =
2668          createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
2669      for (unsigned Part = 0; Part < UF; Part++) {
2670        Value *StridedVec = Builder.CreateShuffleVector(
2671            NewLoads[Part], StrideMask, "strided.vec");
2672
        // If this member has a different type, cast the result type.
2674        if (Member->getType() != ScalarTy) {
2675          assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2676          VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2677          StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2678        }
2679
2680        if (Group->isReverse())
2681          StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse");
2682
2683        State.set(VPDefs[J], StridedVec, Part);
2684      }
2685      ++J;
2686    }
2687    return;
2688  }
2689
  // The subvector type for the current instruction.
2691  auto *SubVT = VectorType::get(ScalarTy, VF);
2692
2693  // Vectorize the interleaved store group.
2694  Value *MaskForGaps =
2695      createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2696  assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) &&
2697         "masked interleaved groups are not allowed.");
2698  assert((!MaskForGaps || !VF.isScalable()) &&
2699         "masking gaps for scalable vectors is not yet supported.");
2700  for (unsigned Part = 0; Part < UF; Part++) {
2701    // Collect the stored vector from each member.
2702    SmallVector<Value *, 4> StoredVecs;
2703    unsigned StoredIdx = 0;
2704    for (unsigned i = 0; i < InterleaveFactor; i++) {
2705      assert((Group->getMember(i) || MaskForGaps) &&
2706             "Fail to get a member from an interleaved store group");
2707      Instruction *Member = Group->getMember(i);
2708
2709      // Skip the gaps in the group.
2710      if (!Member) {
2711        Value *Undef = PoisonValue::get(SubVT);
2712        StoredVecs.push_back(Undef);
2713        continue;
2714      }
2715
2716      Value *StoredVec = State.get(StoredValues[StoredIdx], Part);
2717      ++StoredIdx;
2718
2719      if (Group->isReverse())
2720        StoredVec = Builder.CreateVectorReverse(StoredVec, "reverse");
2721
      // If this member has a different type, cast it to a unified type.
2724      if (StoredVec->getType() != SubVT)
2725        StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2726
2727      StoredVecs.push_back(StoredVec);
2728    }
2729
2730    // Interleave all the smaller vectors into one wider vector.
2731    Value *IVec = interleaveVectors(Builder, StoredVecs, "interleaved.vec");
2732    Instruction *NewStoreInstr;
2733    if (BlockInMask || MaskForGaps) {
2734      Value *GroupMask = CreateGroupMask(Part, MaskForGaps);
2735      NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part],
2736                                                Group->getAlign(), GroupMask);
2737    } else
2738      NewStoreInstr =
2739          Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2740
2741    Group->addMetadata(NewStoreInstr);
2742  }
2743}
2744
2745void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr,
2746                                               VPReplicateRecipe *RepRecipe,
2747                                               const VPIteration &Instance,
2748                                               VPTransformState &State) {
2749  assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2750
2751  // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for
2752  // the first lane and part.
2753  if (isa<NoAliasScopeDeclInst>(Instr))
2754    if (!Instance.isFirstIteration())
2755      return;
2756
  // Does this instruction return a value?
2758  bool IsVoidRetTy = Instr->getType()->isVoidTy();
2759
2760  Instruction *Cloned = Instr->clone();
2761  if (!IsVoidRetTy) {
2762    Cloned->setName(Instr->getName() + ".cloned");
2763#if !defined(NDEBUG)
2764    // Verify that VPlan type inference results agree with the type of the
2765    // generated values.
2766    assert(State.TypeAnalysis.inferScalarType(RepRecipe) == Cloned->getType() &&
2767           "inferred type and type from generated instructions do not match");
2768#endif
2769  }
2770
2771  RepRecipe->setFlags(Cloned);
2772
2773  if (auto DL = Instr->getDebugLoc())
2774    State.setDebugLocFrom(DL);
2775
2776  // Replace the operands of the cloned instructions with their scalar
2777  // equivalents in the new loop.
2778  for (const auto &I : enumerate(RepRecipe->operands())) {
2779    auto InputInstance = Instance;
2780    VPValue *Operand = I.value();
2781    if (vputils::isUniformAfterVectorization(Operand))
2782      InputInstance.Lane = VPLane::getFirstLane();
2783    Cloned->setOperand(I.index(), State.get(Operand, InputInstance));
2784  }
2785  State.addNewMetadata(Cloned, Instr);
2786
2787  // Place the cloned scalar in the new loop.
2788  State.Builder.Insert(Cloned);
2789
2790  State.set(RepRecipe, Cloned, Instance);
2791
  // If we just cloned a new assumption, add it to the assumption cache.
2793  if (auto *II = dyn_cast<AssumeInst>(Cloned))
2794    AC->registerAssumption(II);
2795
2796  // End if-block.
2797  bool IfPredicateInstr = RepRecipe->getParent()->getParent()->isReplicator();
2798  if (IfPredicateInstr)
2799    PredicatedInstructions.push_back(Cloned);
2800}
2801
2802Value *
2803InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) {
2804  if (VectorTripCount)
2805    return VectorTripCount;
2806
2807  Value *TC = getTripCount();
2808  IRBuilder<> Builder(InsertBlock->getTerminator());
2809
2810  Type *Ty = TC->getType();
2811  // This is where we can make the step a runtime constant.
2812  Value *Step = createStepForVF(Builder, Ty, VF, UF);
2813
2814  // If the tail is to be folded by masking, round the number of iterations N
2815  // up to a multiple of Step instead of rounding down. This is done by first
2816  // adding Step-1 and then rounding down. Note that it's ok if this addition
2817  // overflows: the vector induction variable will eventually wrap to zero given
2818  // that it starts at zero and its Step is a power of two; the loop will then
2819  // exit, with the last early-exit vector comparison also producing all-true.
2820  // For scalable vectors the VF is not guaranteed to be a power of 2, but this
2821  // is accounted for in emitIterationCountCheck that adds an overflow check.
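  // As a worked example (illustrative numbers only): for a trip count of 10
  // and a fixed Step of VF * UF = 8, the count is first rounded up to 17, the
  // remainder computed below becomes 17 % 8 = 1, and the vector trip count
  // ends up as 16, i.e. two masked vector iterations cover all 10 iterations.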
2822  if (Cost->foldTailByMasking()) {
2823    assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
2824           "VF*UF must be a power of 2 when folding tail by masking");
2825    Value *NumLanes = getRuntimeVF(Builder, Ty, VF * UF);
2826    TC = Builder.CreateAdd(
2827        TC, Builder.CreateSub(NumLanes, ConstantInt::get(Ty, 1)), "n.rnd.up");
2828  }
2829
2830  // Now we need to generate the expression for the part of the loop that the
2831  // vectorized body will execute. This is equal to N - (N % Step) if scalar
2832  // iterations are not required for correctness, or N - Step, otherwise. Step
2833  // is equal to the vectorization factor (number of SIMD elements) times the
2834  // unroll factor (number of SIMD instructions).
2835  Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2836
2837  // There are cases where we *must* run at least one iteration in the remainder
2838  // loop.  See the cost model for when this can happen.  If the step evenly
2839  // divides the trip count, we set the remainder to be equal to the step. If
2840  // the step does not evenly divide the trip count, no adjustment is necessary
2841  // since there will already be scalar iterations. Note that the minimum
2842  // iterations check ensures that N >= Step.
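  // For example (illustrative numbers only): with a trip count of 16, a Step
  // of 8 and a required scalar epilogue, the remainder 16 % 8 = 0 is bumped up
  // to 8, so the vector loop covers the first 8 iterations and the scalar
  // epilogue the remaining 8.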
2843  if (Cost->requiresScalarEpilogue(VF.isVector())) {
2844    auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2845    R = Builder.CreateSelect(IsZero, Step, R);
2846  }
2847
2848  VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2849
2850  return VectorTripCount;
2851}
2852
2853Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
2854                                                   const DataLayout &DL) {
  // Verify that V is a vector type with the same element count as DstVTy.
2856  auto *DstFVTy = cast<VectorType>(DstVTy);
2857  auto VF = DstFVTy->getElementCount();
2858  auto *SrcVecTy = cast<VectorType>(V->getType());
2859  assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match");
2860  Type *SrcElemTy = SrcVecTy->getElementType();
2861  Type *DstElemTy = DstFVTy->getElementType();
2862  assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2863         "Vector elements must have same size");
2864
2865  // Do a direct cast if element types are castable.
2866  if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2867    return Builder.CreateBitOrPointerCast(V, DstFVTy);
2868  }
  // V cannot be directly cast to the desired vector type.
2870  // May happen when V is a floating point vector but DstVTy is a vector of
2871  // pointers or vice-versa. Handle this using a two-step bitcast using an
2872  // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
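  // For illustration (assuming 64-bit pointers): casting <4 x ptr> to
  // <4 x double> is performed as <4 x ptr> -> <4 x i64> -> <4 x double>.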
2873  assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2874         "Only one type should be a pointer type");
2875  assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2876         "Only one type should be a floating point type");
2877  Type *IntTy =
2878      IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2879  auto *VecIntTy = VectorType::get(IntTy, VF);
2880  Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2881  return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
2882}
2883
2884void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
2885  Value *Count = getTripCount();
2886  // Reuse existing vector loop preheader for TC checks.
2887  // Note that new preheader block is generated for vector loop.
2888  BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
2889  IRBuilder<> Builder(TCCheckBlock->getTerminator());
2890
2891  // Generate code to check if the loop's trip count is less than VF * UF, or
2892  // equal to it in case a scalar epilogue is required; this implies that the
2893  // vector trip count is zero. This check also covers the case where adding one
  // to the backedge-taken count overflowed, leading to an incorrect trip count
2895  // of zero. In this case we will also jump to the scalar loop.
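  // For example (illustrative only, ignoring the minimum profitable trip
  // count): with VF = 4 and UF = 2, the bypass to the scalar loop is taken
  // when the trip count is <= 8 if a scalar epilogue is required, and when it
  // is < 8 otherwise.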
2896  auto P = Cost->requiresScalarEpilogue(VF.isVector()) ? ICmpInst::ICMP_ULE
2897                                                       : ICmpInst::ICMP_ULT;
2898
2899  // If tail is to be folded, vector loop takes care of all iterations.
2900  Type *CountTy = Count->getType();
2901  Value *CheckMinIters = Builder.getFalse();
2902  auto CreateStep = [&]() -> Value * {
    // Create step with max(MinProfitableTripCount, UF * VF).
2904    if (UF * VF.getKnownMinValue() >= MinProfitableTripCount.getKnownMinValue())
2905      return createStepForVF(Builder, CountTy, VF, UF);
2906
2907    Value *MinProfTC =
2908        createStepForVF(Builder, CountTy, MinProfitableTripCount, 1);
2909    if (!VF.isScalable())
2910      return MinProfTC;
2911    return Builder.CreateBinaryIntrinsic(
2912        Intrinsic::umax, MinProfTC, createStepForVF(Builder, CountTy, VF, UF));
2913  };
2914
2915  TailFoldingStyle Style = Cost->getTailFoldingStyle();
2916  if (Style == TailFoldingStyle::None)
2917    CheckMinIters =
2918        Builder.CreateICmp(P, Count, CreateStep(), "min.iters.check");
2919  else if (VF.isScalable() &&
2920           !isIndvarOverflowCheckKnownFalse(Cost, VF, UF) &&
2921           Style != TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) {
2922    // vscale is not necessarily a power-of-2, which means we cannot guarantee
2923    // an overflow to zero when updating induction variables and so an
2924    // additional overflow check is required before entering the vector loop.
2925
2926    // Get the maximum unsigned value for the type.
2927    Value *MaxUIntTripCount =
2928        ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask());
2929    Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count);
2930
2931    // Don't execute the vector loop if (UMax - n) < (VF * UF).
2932    CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, CreateStep());
2933  }
2934
2935  // Create new preheader for vector loop.
2936  LoopVectorPreHeader =
2937      SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
2938                 "vector.ph");
2939
2940  assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
2941                               DT->getNode(Bypass)->getIDom()) &&
2942         "TC check is expected to dominate Bypass");
2943
2944  // Update dominator for Bypass & LoopExit (if needed).
2945  DT->changeImmediateDominator(Bypass, TCCheckBlock);
2946  if (!Cost->requiresScalarEpilogue(VF.isVector()))
2947    // If there is an epilogue which must run, there's no edge from the
    // middle block to exit blocks and thus no need to update the immediate
2949    // dominator of the exit blocks.
2950    DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
2951
2952  BranchInst &BI =
2953      *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
2954  if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()))
2955    setBranchWeights(BI, MinItersBypassWeights);
2956  ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
2957  LoopBypassBlocks.push_back(TCCheckBlock);
2958}
2959
2960BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) {
2961  BasicBlock *const SCEVCheckBlock =
2962      RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader, LoopExitBlock);
2963  if (!SCEVCheckBlock)
2964    return nullptr;
2965
2966  assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
2967           (OptForSizeBasedOnProfile &&
2968            Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
2969         "Cannot SCEV check stride or overflow when optimizing for size");
2970
  // Update dominator only if this is the first RT check.
2973  if (LoopBypassBlocks.empty()) {
2974    DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
2975    if (!Cost->requiresScalarEpilogue(VF.isVector()))
2976      // If there is an epilogue which must run, there's no edge from the
      // middle block to exit blocks and thus no need to update the immediate
2978      // dominator of the exit blocks.
2979      DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
2980  }
2981
2982  LoopBypassBlocks.push_back(SCEVCheckBlock);
2983  AddedSafetyChecks = true;
2984  return SCEVCheckBlock;
2985}
2986
2987BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) {
2988  // VPlan-native path does not do any analysis for runtime checks currently.
2989  if (EnableVPlanNativePath)
2990    return nullptr;
2991
2992  BasicBlock *const MemCheckBlock =
2993      RTChecks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader);
2994
2995  // Check if we generated code that checks in runtime if arrays overlap. We put
2996  // the checks into a separate block to make the more common case of few
2997  // elements faster.
2998  if (!MemCheckBlock)
2999    return nullptr;
3000
3001  if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
3002    assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
3003           "Cannot emit memory checks when optimizing for size, unless forced "
3004           "to vectorize.");
3005    ORE->emit([&]() {
3006      return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
3007                                        OrigLoop->getStartLoc(),
3008                                        OrigLoop->getHeader())
3009             << "Code-size may be reduced by not forcing "
3010                "vectorization, or by source-code modifications "
3011                "eliminating the need for runtime checks "
3012                "(e.g., adding 'restrict').";
3013    });
3014  }
3015
3016  LoopBypassBlocks.push_back(MemCheckBlock);
3017
3018  AddedSafetyChecks = true;
3019
3020  return MemCheckBlock;
3021}
3022
3023void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
3024  LoopScalarBody = OrigLoop->getHeader();
3025  LoopVectorPreHeader = OrigLoop->getLoopPreheader();
3026  assert(LoopVectorPreHeader && "Invalid loop structure");
3027  LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
3028  assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF.isVector())) &&
3029         "multiple exit loop without required epilogue?");
3030
3031  LoopMiddleBlock =
3032      SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3033                 LI, nullptr, Twine(Prefix) + "middle.block");
3034  LoopScalarPreHeader =
3035      SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
3036                 nullptr, Twine(Prefix) + "scalar.ph");
3037
3038  auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3039
3040  // Set up the middle block terminator.  Two cases:
3041  // 1) If we know that we must execute the scalar epilogue, emit an
3042  //    unconditional branch.
3043  // 2) Otherwise, we must have a single unique exit block (due to how we
3044  //    implement the multiple exit case).  In this case, set up a conditional
3045  //    branch from the middle block to the loop scalar preheader, and the
3046  //    exit block.  completeLoopSkeleton will update the condition to use an
3047  //    iteration check, if required to decide whether to execute the remainder.
3048  BranchInst *BrInst =
3049      Cost->requiresScalarEpilogue(VF.isVector())
3050          ? BranchInst::Create(LoopScalarPreHeader)
3051          : BranchInst::Create(LoopExitBlock, LoopScalarPreHeader,
3052                               Builder.getTrue());
3053  BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3054  ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3055
3056  // Update dominator for loop exit. During skeleton creation, only the vector
3057  // pre-header and the middle block are created. The vector loop is entirely
  // created during VPlan execution.
3059  if (!Cost->requiresScalarEpilogue(VF.isVector()))
3060    // If there is an epilogue which must run, there's no edge from the
    // middle block to exit blocks and thus no need to update the immediate
3062    // dominator of the exit blocks.
3063    DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
3064}
3065
3066PHINode *InnerLoopVectorizer::createInductionResumeValue(
3067    PHINode *OrigPhi, const InductionDescriptor &II, Value *Step,
3068    ArrayRef<BasicBlock *> BypassBlocks,
3069    std::pair<BasicBlock *, Value *> AdditionalBypass) {
3070  Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
3071  assert(VectorTripCount && "Expected valid arguments");
3072
3073  Instruction *OldInduction = Legal->getPrimaryInduction();
3074  Value *&EndValue = IVEndValues[OrigPhi];
3075  Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
3076  if (OrigPhi == OldInduction) {
3077    // We know what the end value is.
3078    EndValue = VectorTripCount;
3079  } else {
3080    IRBuilder<> B(LoopVectorPreHeader->getTerminator());
3081
3082    // Fast-math-flags propagate from the original induction instruction.
3083    if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3084      B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3085
3086    EndValue = emitTransformedIndex(B, VectorTripCount, II.getStartValue(),
3087                                    Step, II.getKind(), II.getInductionBinOp());
3088    EndValue->setName("ind.end");
3089
3090    // Compute the end value for the additional bypass (if applicable).
3091    if (AdditionalBypass.first) {
3092      B.SetInsertPoint(AdditionalBypass.first,
3093                       AdditionalBypass.first->getFirstInsertionPt());
3094      EndValueFromAdditionalBypass =
3095          emitTransformedIndex(B, AdditionalBypass.second, II.getStartValue(),
3096                               Step, II.getKind(), II.getInductionBinOp());
3097      EndValueFromAdditionalBypass->setName("ind.end");
3098    }
3099  }
3100
  // Create phi nodes to merge from the backedge-taken check block.
3102  PHINode *BCResumeVal = PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3103                                         LoopScalarPreHeader->getTerminator());
3104  // Copy original phi DL over to the new one.
3105  BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3106
3107  // The new PHI merges the original incoming value, in case of a bypass,
3108  // or the value at the end of the vectorized loop.
3109  BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3110
3111  // Fix the scalar body counter (PHI node).
3112  // The old induction's phi node in the scalar body needs the truncated
3113  // value.
3114  for (BasicBlock *BB : BypassBlocks)
3115    BCResumeVal->addIncoming(II.getStartValue(), BB);
3116
3117  if (AdditionalBypass.first)
3118    BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
3119                                          EndValueFromAdditionalBypass);
3120  return BCResumeVal;
3121}
3122
3123/// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV
3124/// expansion results.
3125static Value *getExpandedStep(const InductionDescriptor &ID,
3126                              const SCEV2ValueTy &ExpandedSCEVs) {
3127  const SCEV *Step = ID.getStep();
3128  if (auto *C = dyn_cast<SCEVConstant>(Step))
3129    return C->getValue();
3130  if (auto *U = dyn_cast<SCEVUnknown>(Step))
3131    return U->getValue();
3132  auto I = ExpandedSCEVs.find(Step);
3133  assert(I != ExpandedSCEVs.end() && "SCEV must be expanded at this point");
3134  return I->second;
3135}
3136
3137void InnerLoopVectorizer::createInductionResumeValues(
3138    const SCEV2ValueTy &ExpandedSCEVs,
3139    std::pair<BasicBlock *, Value *> AdditionalBypass) {
3140  assert(((AdditionalBypass.first && AdditionalBypass.second) ||
3141          (!AdditionalBypass.first && !AdditionalBypass.second)) &&
3142         "Inconsistent information about additional bypass.");
3143  // We are going to resume the execution of the scalar loop.
3144  // Go over all of the induction variables that we found and fix the
3145  // PHIs that are left in the scalar version of the loop.
3146  // The starting values of PHI nodes depend on the counter of the last
3147  // iteration in the vectorized loop.
3148  // If we come from a bypass edge then we need to start from the original
3149  // start value.
3150  for (const auto &InductionEntry : Legal->getInductionVars()) {
3151    PHINode *OrigPhi = InductionEntry.first;
3152    const InductionDescriptor &II = InductionEntry.second;
3153    PHINode *BCResumeVal = createInductionResumeValue(
3154        OrigPhi, II, getExpandedStep(II, ExpandedSCEVs), LoopBypassBlocks,
3155        AdditionalBypass);
3156    OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3157  }
3158}
3159
3160BasicBlock *InnerLoopVectorizer::completeLoopSkeleton() {
3161  // The trip counts should be cached by now.
3162  Value *Count = getTripCount();
3163  Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
3164
3165  auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3166
3167  // Add a check in the middle block to see if we have completed
3168  // all of the iterations in the first vector loop.  Three cases:
3169  // 1) If we require a scalar epilogue, there is no conditional branch as
3170  //    we unconditionally branch to the scalar preheader.  Do nothing.
3171  // 2) If (N - N%VF) == N, then we *don't* need to run the remainder.
3172  //    Thus if tail is to be folded, we know we don't need to run the
3173  //    remainder and we can use the previous value for the condition (true).
3174  // 3) Otherwise, construct a runtime check.
3175  if (!Cost->requiresScalarEpilogue(VF.isVector()) &&
3176      !Cost->foldTailByMasking()) {
3177    // Here we use the same DebugLoc as the scalar loop latch terminator instead
3178    // of the corresponding compare because they may have ended up with
3179    // different line numbers and we want to avoid awkward line stepping while
    // debugging, e.g. if the compare has a line number inside the loop.
3181    // TODO: At the moment, CreateICmpEQ will simplify conditions with constant
3182    // operands. Perform simplification directly on VPlan once the branch is
3183    // modeled there.
3184    IRBuilder<> B(LoopMiddleBlock->getTerminator());
3185    B.SetCurrentDebugLocation(ScalarLatchTerm->getDebugLoc());
3186    Value *CmpN = B.CreateICmpEQ(Count, VectorTripCount, "cmp.n");
3187    BranchInst &BI = *cast<BranchInst>(LoopMiddleBlock->getTerminator());
3188    BI.setCondition(CmpN);
3189    if (hasBranchWeightMD(*ScalarLatchTerm)) {
3190      // Assume that `Count % VectorTripCount` is equally distributed.
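      // For example (illustrative only): with VF = 4 and UF = 2 the weights
      // are {1, 7}; the branch straight to the exit block is expected to be
      // taken only when the trip count is an exact multiple of 8, and the
      // scalar remainder is entered otherwise.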
3191      unsigned TripCount = UF * VF.getKnownMinValue();
3192      assert(TripCount > 0 && "trip count should not be zero");
3193      const uint32_t Weights[] = {1, TripCount - 1};
3194      setBranchWeights(BI, Weights);
3195    }
3196  }
3197
3198#ifdef EXPENSIVE_CHECKS
3199  assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3200#endif
3201
3202  return LoopVectorPreHeader;
3203}
3204
3205std::pair<BasicBlock *, Value *>
3206InnerLoopVectorizer::createVectorizedLoopSkeleton(
3207    const SCEV2ValueTy &ExpandedSCEVs) {
3208  /*
3209   In this function we generate a new loop. The new loop will contain
3210   the vectorized instructions while the old loop will continue to run the
3211   scalar remainder.
3212
3213       [ ] <-- old preheader - loop iteration number check and SCEVs in Plan's
3214     /  |      preheader are expanded here. Eventually all required SCEV
3215    /   |      expansion should happen here.
3216   /    v
3217  |    [ ] <-- vector loop bypass (may consist of multiple blocks).
3218  |  /  |
3219  | /   v
3220  ||   [ ]     <-- vector pre header.
3221  |/    |
3222  |     v
3223  |    [  ] \
3224  |    [  ]_|   <-- vector loop (created during VPlan execution).
3225  |     |
3226  |     v
3227  \   -[ ]   <--- middle-block.
3228   \/   |
3229   /\   v
3230   | ->[ ]     <--- new preheader.
3231   |    |
3232 (opt)  v      <-- edge from middle to exit iff epilogue is not required.
3233   |   [ ] \
3234   |   [ ]_|   <-- old scalar loop to handle remainder (scalar epilogue).
3235    \   |
3236     \  v
3237      >[ ]     <-- exit block(s).
3238   ...
3239   */
3240
3241  // Create an empty vector loop, and prepare basic blocks for the runtime
3242  // checks.
3243  createVectorLoopSkeleton("");
3244
3245  // Now, compare the new count to zero. If it is zero skip the vector loop and
3246  // jump to the scalar loop. This check also covers the case where the
3247  // backedge-taken count is uint##_max: adding one to it will overflow leading
3248  // to an incorrect trip count of zero. In this (rare) case we will also jump
3249  // to the scalar loop.
3250  emitIterationCountCheck(LoopScalarPreHeader);
3251
3252  // Generate the code to check any assumptions that we've made for SCEV
3253  // expressions.
3254  emitSCEVChecks(LoopScalarPreHeader);
3255
3256  // Generate the code that checks in runtime if arrays overlap. We put the
3257  // checks into a separate block to make the more common case of few elements
3258  // faster.
3259  emitMemRuntimeChecks(LoopScalarPreHeader);
3260
3261  // Emit phis for the new starting index of the scalar loop.
3262  createInductionResumeValues(ExpandedSCEVs);
3263
3264  return {completeLoopSkeleton(), nullptr};
3265}
3266
3267// Fix up external users of the induction variable. At this point, we are
3268// in LCSSA form, with all external PHIs that use the IV having one input value,
3269// coming from the remainder loop. We need those PHIs to also have a correct
3270// value for the IV when arriving directly from the middle block.
3271void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3272                                       const InductionDescriptor &II,
3273                                       Value *VectorTripCount, Value *EndValue,
3274                                       BasicBlock *MiddleBlock,
3275                                       BasicBlock *VectorHeader, VPlan &Plan,
3276                                       VPTransformState &State) {
3277  // There are two kinds of external IV usages - those that use the value
3278  // computed in the last iteration (the PHI) and those that use the penultimate
3279  // value (the value that feeds into the phi from the loop latch).
3280  // We allow both, but they, obviously, have different values.
3281
3282  assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");
3283
3284  DenseMap<Value *, Value *> MissingVals;
3285
3286  // An external user of the last iteration's value should see the value that
3287  // the remainder loop uses to initialize its own IV.
3288  Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3289  for (User *U : PostInc->users()) {
3290    Instruction *UI = cast<Instruction>(U);
3291    if (!OrigLoop->contains(UI)) {
3292      assert(isa<PHINode>(UI) && "Expected LCSSA form");
3293      MissingVals[UI] = EndValue;
3294    }
3295  }
3296
  // An external user of the penultimate value needs to see EndValue - Step.
3298  // The simplest way to get this is to recompute it from the constituent SCEVs,
3299  // that is Start + (Step * (CRD - 1)).
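  // For example (illustrative only): for an induction starting at 0 with a
  // step of 2 and a vector trip count of 8, the escaping penultimate value
  // computed here is 0 + 2 * (8 - 1) = 14, while EndValue would be 16.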
3300  for (User *U : OrigPhi->users()) {
3301    auto *UI = cast<Instruction>(U);
3302    if (!OrigLoop->contains(UI)) {
3303      assert(isa<PHINode>(UI) && "Expected LCSSA form");
3304      IRBuilder<> B(MiddleBlock->getTerminator());
3305
3306      // Fast-math-flags propagate from the original induction instruction.
3307      if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3308        B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3309
3310      Value *CountMinusOne = B.CreateSub(
3311          VectorTripCount, ConstantInt::get(VectorTripCount->getType(), 1));
3312      CountMinusOne->setName("cmo");
3313
3314      VPValue *StepVPV = Plan.getSCEVExpansion(II.getStep());
3315      assert(StepVPV && "step must have been expanded during VPlan execution");
3316      Value *Step = StepVPV->isLiveIn() ? StepVPV->getLiveInIRValue()
3317                                        : State.get(StepVPV, {0, 0});
3318      Value *Escape =
3319          emitTransformedIndex(B, CountMinusOne, II.getStartValue(), Step,
3320                               II.getKind(), II.getInductionBinOp());
3321      Escape->setName("ind.escape");
3322      MissingVals[UI] = Escape;
3323    }
3324  }
3325
3326  for (auto &I : MissingVals) {
3327    PHINode *PHI = cast<PHINode>(I.first);
    // One corner case we have to handle is two IVs "chasing" each other,
3329    // that is %IV2 = phi [...], [ %IV1, %latch ]
3330    // In this case, if IV1 has an external use, we need to avoid adding both
3331    // "last value of IV1" and "penultimate value of IV2". So, verify that we
3332    // don't already have an incoming value for the middle block.
3333    if (PHI->getBasicBlockIndex(MiddleBlock) == -1) {
3334      PHI->addIncoming(I.second, MiddleBlock);
3335      Plan.removeLiveOut(PHI);
3336    }
3337  }
3338}
3339
3340namespace {
3341
3342struct CSEDenseMapInfo {
3343  static bool canHandle(const Instruction *I) {
3344    return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3345           isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3346  }
3347
3348  static inline Instruction *getEmptyKey() {
3349    return DenseMapInfo<Instruction *>::getEmptyKey();
3350  }
3351
3352  static inline Instruction *getTombstoneKey() {
3353    return DenseMapInfo<Instruction *>::getTombstoneKey();
3354  }
3355
3356  static unsigned getHashValue(const Instruction *I) {
3357    assert(canHandle(I) && "Unknown instruction!");
3358    return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3359                                                           I->value_op_end()));
3360  }
3361
3362  static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3363    if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3364        LHS == getTombstoneKey() || RHS == getTombstoneKey())
3365      return LHS == RHS;
3366    return LHS->isIdenticalTo(RHS);
3367  }
3368};
3369
3370} // end anonymous namespace
3371
/// Perform CSE of induction variable instructions.
3373static void cse(BasicBlock *BB) {
3374  // Perform simple cse.
3375  SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3376  for (Instruction &In : llvm::make_early_inc_range(*BB)) {
3377    if (!CSEDenseMapInfo::canHandle(&In))
3378      continue;
3379
3380    // Check if we can replace this instruction with any of the
3381    // visited instructions.
3382    if (Instruction *V = CSEMap.lookup(&In)) {
3383      In.replaceAllUsesWith(V);
3384      In.eraseFromParent();
3385      continue;
3386    }
3387
3388    CSEMap[&In] = &In;
3389  }
3390}
3391
3392InstructionCost
3393LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
3394                                              ElementCount VF) const {
3395  // We only need to calculate a cost if the VF is scalar; for actual vectors
3396  // we should already have a pre-calculated cost at each VF.
3397  if (!VF.isScalar())
3398    return CallWideningDecisions.at(std::make_pair(CI, VF)).Cost;
3399
3400  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
3401  Type *RetTy = CI->getType();
3402  if (RecurrenceDescriptor::isFMulAddIntrinsic(CI))
3403    if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind))
3404      return *RedCost;
3405
3406  SmallVector<Type *, 4> Tys;
3407  for (auto &ArgOp : CI->args())
3408    Tys.push_back(ArgOp->getType());
3409
3410  InstructionCost ScalarCallCost =
3411      TTI.getCallInstrCost(CI->getCalledFunction(), RetTy, Tys, CostKind);
3412
3413  // If this is an intrinsic we may have a lower cost for it.
3414  if (getVectorIntrinsicIDForCall(CI, TLI)) {
3415    InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
3416    return std::min(ScalarCallCost, IntrinsicCost);
3417  }
3418  return ScalarCallCost;
3419}
3420
3421static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) {
3422  if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
3423    return Elt;
3424  return VectorType::get(Elt, VF);
3425}
3426
3427InstructionCost
3428LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3429                                                   ElementCount VF) const {
3430  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3431  assert(ID && "Expected intrinsic call!");
3432  Type *RetTy = MaybeVectorizeType(CI->getType(), VF);
3433  FastMathFlags FMF;
3434  if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3435    FMF = FPMO->getFastMathFlags();
3436
3437  SmallVector<const Value *> Arguments(CI->args());
3438  FunctionType *FTy = CI->getCalledFunction()->getFunctionType();
3439  SmallVector<Type *> ParamTys;
3440  std::transform(FTy->param_begin(), FTy->param_end(),
3441                 std::back_inserter(ParamTys),
3442                 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); });
3443
3444  IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
3445                                    dyn_cast<IntrinsicInst>(CI));
3446  return TTI.getIntrinsicInstrCost(CostAttrs,
3447                                   TargetTransformInfo::TCK_RecipThroughput);
3448}
3449
3450static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3451  auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3452  auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3453  return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3454}
3455
3456static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3457  auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3458  auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3459  return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3460}
3461
3462void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
3463                                            VPlan &Plan) {
3464  // Fix widened non-induction PHIs by setting up the PHI operands.
3465  if (EnableVPlanNativePath)
3466    fixNonInductionPHIs(Plan, State);
3467
3468  // At this point every instruction in the original loop is widened to a
3469  // vector form. Now we need to fix the recurrences in the loop. These PHI
3470  // nodes are currently empty because we did not want to introduce cycles.
3471  // This is the second stage of vectorizing recurrences. Note that fixing
  // reduction phis is already modeled in VPlan.
3473  // TODO: Also model fixing fixed-order recurrence phis in VPlan.
3474  VPRegionBlock *VectorRegion = State.Plan->getVectorLoopRegion();
3475  VPBasicBlock *HeaderVPBB = VectorRegion->getEntryBasicBlock();
3476  for (VPRecipeBase &R : HeaderVPBB->phis()) {
3477    if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
3478      fixFixedOrderRecurrence(FOR, State);
3479  }
3480
3481  // Forget the original basic block.
3482  PSE.getSE()->forgetLoop(OrigLoop);
3483  PSE.getSE()->forgetBlockAndLoopDispositions();
3484
3485  // After vectorization, the exit blocks of the original loop will have
3486  // additional predecessors. Invalidate SCEVs for the exit phis in case SE
3487  // looked through single-entry phis.
3488  SmallVector<BasicBlock *> ExitBlocks;
3489  OrigLoop->getExitBlocks(ExitBlocks);
3490  for (BasicBlock *Exit : ExitBlocks)
3491    for (PHINode &PN : Exit->phis())
3492      PSE.getSE()->forgetLcssaPhiWithNewPredecessor(OrigLoop, &PN);
3493
3494  VPBasicBlock *LatchVPBB = VectorRegion->getExitingBasicBlock();
3495  Loop *VectorLoop = LI->getLoopFor(State.CFG.VPBB2IRBB[LatchVPBB]);
3496  if (Cost->requiresScalarEpilogue(VF.isVector())) {
3497    // No edge from the middle block to the unique exit block has been inserted
3498    // and there is nothing to fix from vector loop; phis should have incoming
3499    // from scalar loop only.
3500  } else {
3501    // TODO: Check VPLiveOuts to see if IV users need fixing instead of checking
3502    // the cost model.
3503
3504    // If we inserted an edge from the middle block to the unique exit block,
3505    // update uses outside the loop (phis) to account for the newly inserted
3506    // edge.
3507
3508    // Fix-up external users of the induction variables.
3509    for (const auto &Entry : Legal->getInductionVars())
3510      fixupIVUsers(Entry.first, Entry.second,
3511                   getOrCreateVectorTripCount(VectorLoop->getLoopPreheader()),
3512                   IVEndValues[Entry.first], LoopMiddleBlock,
3513                   VectorLoop->getHeader(), Plan, State);
3514  }
3515
3516  // Fix LCSSA phis not already fixed earlier. Extracts may need to be generated
3517  // in the exit block, so update the builder.
3518  State.Builder.SetInsertPoint(State.CFG.ExitBB,
3519                               State.CFG.ExitBB->getFirstNonPHIIt());
3520  for (const auto &KV : Plan.getLiveOuts())
3521    KV.second->fixPhi(Plan, State);
3522
3523  for (Instruction *PI : PredicatedInstructions)
3524    sinkScalarOperands(&*PI);
3525
3526  // Remove redundant induction instructions.
3527  cse(VectorLoop->getHeader());
3528
3529  // Set/update profile weights for the vector and remainder loops as original
3530  // loop iterations are now distributed among them. Note that original loop
3531  // represented by LoopScalarBody becomes remainder loop after vectorization.
3532  //
  // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
  // end up with a slightly roughened result but that should be OK, since the
  // profile is not inherently precise anyway. Note also that a possible bypass
  // of vector code caused by legality checks is ignored, optimistically
  // assigning all the weight to the vector loop.
3538  //
  // For scalable vectorization we can't know at compile time how many
  // iterations of the loop are handled in one vector iteration, so instead
  // assume a pessimistic vscale of '1'.
3542  setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody), VectorLoop,
3543                               LI->getLoopFor(LoopScalarBody),
3544                               VF.getKnownMinValue() * UF);
3545}
3546
3547void InnerLoopVectorizer::fixFixedOrderRecurrence(
3548    VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State) {
3549  // This is the second phase of vectorizing first-order recurrences. An
3550  // overview of the transformation is described below. Suppose we have the
3551  // following loop.
3552  //
3553  //   for (int i = 0; i < n; ++i)
3554  //     b[i] = a[i] - a[i - 1];
3555  //
3556  // There is a first-order recurrence on "a". For this loop, the shorthand
3557  // scalar IR looks like:
3558  //
3559  //   scalar.ph:
3560  //     s_init = a[-1]
3561  //     br scalar.body
3562  //
3563  //   scalar.body:
3564  //     i = phi [0, scalar.ph], [i+1, scalar.body]
3565  //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3566  //     s2 = a[i]
3567  //     b[i] = s2 - s1
3568  //     br cond, scalar.body, ...
3569  //
  // In this example, s1 is a recurrence because its value depends on the
3571  // previous iteration. In the first phase of vectorization, we created a
3572  // vector phi v1 for s1. We now complete the vectorization and produce the
3573  // shorthand vector IR shown below (for VF = 4, UF = 1).
3574  //
3575  //   vector.ph:
3576  //     v_init = vector(..., ..., ..., a[-1])
3577  //     br vector.body
3578  //
3579  //   vector.body
3580  //     i = phi [0, vector.ph], [i+4, vector.body]
3581  //     v1 = phi [v_init, vector.ph], [v2, vector.body]
3582  //     v2 = a[i, i+1, i+2, i+3];
3583  //     v3 = vector(v1(3), v2(0, 1, 2))
3584  //     b[i, i+1, i+2, i+3] = v2 - v3
3585  //     br cond, vector.body, middle.block
3586  //
3587  //   middle.block:
3588  //     x = v2(3)
3589  //     br scalar.ph
3590  //
3591  //   scalar.ph:
3592  //     s_init = phi [x, middle.block], [a[-1], otherwise]
3593  //     br scalar.body
3594  //
  // After the vector loop completes execution, we extract the next value of
3596  // the recurrence (x) to use as the initial value in the scalar loop.
3597
3598  // Extract the last vector element in the middle block. This will be the
3599  // initial value for the recurrence when jumping to the scalar loop.
3600  VPValue *PreviousDef = PhiR->getBackedgeValue();
3601  Value *Incoming = State.get(PreviousDef, UF - 1);
3602  auto *ExtractForScalar = Incoming;
3603  auto *IdxTy = Builder.getInt32Ty();
3604  Value *RuntimeVF = nullptr;
3605  if (VF.isVector()) {
3606    auto *One = ConstantInt::get(IdxTy, 1);
3607    Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3608    RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
3609    auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
3610    ExtractForScalar =
3611        Builder.CreateExtractElement(Incoming, LastIdx, "vector.recur.extract");
3612  }
3613
3614  auto RecurSplice = cast<VPInstruction>(*PhiR->user_begin());
3615  assert(PhiR->getNumUsers() == 1 &&
3616         RecurSplice->getOpcode() ==
3617             VPInstruction::FirstOrderRecurrenceSplice &&
3618         "recurrence phi must have a single user: FirstOrderRecurrenceSplice");
3619  SmallVector<VPLiveOut *> LiveOuts;
3620  for (VPUser *U : RecurSplice->users())
3621    if (auto *LiveOut = dyn_cast<VPLiveOut>(U))
3622      LiveOuts.push_back(LiveOut);
3623
3624  if (!LiveOuts.empty()) {
3625    // Extract the second last element in the middle block if the
3626    // Phi is used outside the loop. We need to extract the phi itself
3627    // and not the last element (the phi update in the current iteration). This
3628    // will be the value when jumping to the exit block from the
3629    // LoopMiddleBlock, when the scalar loop is not run at all.
3630    Value *ExtractForPhiUsedOutsideLoop = nullptr;
3631    if (VF.isVector()) {
3632      auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2));
3633      ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3634          Incoming, Idx, "vector.recur.extract.for.phi");
3635    } else {
3636      assert(UF > 1 && "VF and UF cannot both be 1");
      // When the loop is unrolled without vectorizing, initialize
      // ExtractForPhiUsedOutsideLoop with the unrolled part just prior to the
      // last part of `Incoming`. This is analogous to the vectorized case
      // above: extracting the second last element when VF > 1.
3641      ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);
3642    }
3643
3644    for (VPLiveOut *LiveOut : LiveOuts) {
3645      assert(!Cost->requiresScalarEpilogue(VF.isVector()));
3646      PHINode *LCSSAPhi = LiveOut->getPhi();
3647      LCSSAPhi->addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3648      State.Plan->removeLiveOut(LCSSAPhi);
3649    }
3650  }
3651
3652  // Fix the initial value of the original recurrence in the scalar loop.
3653  Builder.SetInsertPoint(LoopScalarPreHeader, LoopScalarPreHeader->begin());
3654  PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
3655  auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3656  auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue();
3657  for (auto *BB : predecessors(LoopScalarPreHeader)) {
3658    auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3659    Start->addIncoming(Incoming, BB);
3660  }
3661
3662  Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3663  Phi->setName("scalar.recur");
3664}
3665
3666void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
3667  // The basic block and loop containing the predicated instruction.
3668  auto *PredBB = PredInst->getParent();
3669  auto *VectorLoop = LI->getLoopFor(PredBB);
3670
3671  // Initialize a worklist with the operands of the predicated instruction.
3672  SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
3673
3674  // Holds instructions that we need to analyze again. An instruction may be
3675  // reanalyzed if we don't yet know if we can sink it or not.
3676  SmallVector<Instruction *, 8> InstsToReanalyze;
3677
3678  // Returns true if a given use occurs in the predicated block. Phi nodes use
3679  // their operands in their corresponding predecessor blocks.
3680  auto isBlockOfUsePredicated = [&](Use &U) -> bool {
3681    auto *I = cast<Instruction>(U.getUser());
3682    BasicBlock *BB = I->getParent();
3683    if (auto *Phi = dyn_cast<PHINode>(I))
3684      BB = Phi->getIncomingBlock(
3685          PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
3686    return BB == PredBB;
3687  };
3688
  // Iteratively sink the scalarized operands of the predicated instruction
  // into the block we created for it. When an instruction is sunk, its
  // operands are then added to the worklist. The algorithm ends once a full
  // pass through the worklist doesn't sink a single instruction.
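  // A minimal illustration (hypothetical IR): if the predicated block holds
  // %m = mul i32 %a, 2 and %a = add i32 %x, 1 still sits in the vector loop
  // body with %m as its only user, %a can be moved into the predicated block;
  // %x is then reconsidered on the next pass, until a pass sinks nothing.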
3693  bool Changed;
3694  do {
3695    // Add the instructions that need to be reanalyzed to the worklist, and
3696    // reset the changed indicator.
3697    Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
3698    InstsToReanalyze.clear();
3699    Changed = false;
3700
3701    while (!Worklist.empty()) {
3702      auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
3703
3704      // We can't sink an instruction if it is a phi node, is not in the loop,
3705      // may have side effects or may read from memory.
      // TODO: Could do more granular checking to allow sinking a load past
      // non-store instructions.
3707      if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
3708          I->mayHaveSideEffects() || I->mayReadFromMemory())
        continue;
3710
3711      // If the instruction is already in PredBB, check if we can sink its
3712      // operands. In that case, VPlan's sinkScalarOperands() succeeded in
3713      // sinking the scalar instruction I, hence it appears in PredBB; but it
3714      // may have failed to sink I's operands (recursively), which we try
3715      // (again) here.
3716      if (I->getParent() == PredBB) {
3717        Worklist.insert(I->op_begin(), I->op_end());
3718        continue;
3719      }
3720
3721      // It's legal to sink the instruction if all its uses occur in the
3722      // predicated block. Otherwise, there's nothing to do yet, and we may
3723      // need to reanalyze the instruction.
3724      if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
3725        InstsToReanalyze.push_back(I);
3726        continue;
3727      }
3728
      // Move the instruction to the beginning of the predicated block, and add
      // its operands to the worklist.
3731      I->moveBefore(&*PredBB->getFirstInsertionPt());
3732      Worklist.insert(I->op_begin(), I->op_end());
3733
3734      // The sinking may have enabled other instructions to be sunk, so we will
3735      // need to iterate.
3736      Changed = true;
3737    }
3738  } while (Changed);
3739}
3740
3741void InnerLoopVectorizer::fixNonInductionPHIs(VPlan &Plan,
3742                                              VPTransformState &State) {
3743  auto Iter = vp_depth_first_deep(Plan.getEntry());
3744  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
3745    for (VPRecipeBase &P : VPBB->phis()) {
3746      VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(&P);
3747      if (!VPPhi)
3748        continue;
3749      PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0));
3750      // Make sure the builder has a valid insert point.
3751      Builder.SetInsertPoint(NewPhi);
3752      for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) {
3753        VPValue *Inc = VPPhi->getIncomingValue(i);
3754        VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i);
3755        NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]);
3756      }
3757    }
3758  }
3759}
3760
3761bool InnerLoopVectorizer::useOrderedReductions(
3762    const RecurrenceDescriptor &RdxDesc) {
3763  return Cost->useOrderedReductions(RdxDesc);
3764}
3765
3766void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
3767  // We should not collect Scalars more than once per VF. Right now, this
3768  // function is called from collectUniformsAndScalars(), which already does
3769  // this check. Collecting Scalars for VF=1 does not make any sense.
3770  assert(VF.isVector() && !Scalars.contains(VF) &&
3771         "This function should not be visited twice for the same VF");
3772
3773  // This avoids any chances of creating a REPLICATE recipe during planning
3774  // since that would result in generation of scalarized code during execution,
3775  // which is not supported for scalable vectors.
3776  if (VF.isScalable()) {
3777    Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end());
3778    return;
3779  }
3780
3781  SmallSetVector<Instruction *, 8> Worklist;
3782
3783  // These sets are used to seed the analysis with pointers used by memory
3784  // accesses that will remain scalar.
3785  SmallSetVector<Instruction *, 8> ScalarPtrs;
3786  SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
3787  auto *Latch = TheLoop->getLoopLatch();
3788
3789  // A helper that returns true if the use of Ptr by MemAccess will be scalar.
3790  // The pointer operands of loads and stores will be scalar as long as the
3791  // memory access is not a gather or scatter operation. The value operand of a
3792  // store will remain scalar if the store is scalarized.
3793  auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
3794    InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
3795    assert(WideningDecision != CM_Unknown &&
3796           "Widening decision should be ready at this moment");
3797    if (auto *Store = dyn_cast<StoreInst>(MemAccess))
3798      if (Ptr == Store->getValueOperand())
3799        return WideningDecision == CM_Scalarize;
3800    assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
3801           "Ptr is neither a value or pointer operand");
3802    return WideningDecision != CM_GatherScatter;
3803  };
3804
3805  // A helper that returns true if the given value is a bitcast or
3806  // getelementptr instruction contained in the loop.
3807  auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
3808    return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
3809            isa<GetElementPtrInst>(V)) &&
3810           !TheLoop->isLoopInvariant(V);
3811  };
3812
3813  // A helper that evaluates a memory access's use of a pointer. If the use will
3814  // be a scalar use and the pointer is only used by memory accesses, we place
3815  // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
3816  // PossibleNonScalarPtrs.
3817  auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
3818    // We only care about bitcast and getelementptr instructions contained in
3819    // the loop.
3820    if (!isLoopVaryingBitCastOrGEP(Ptr))
3821      return;
3822
3823    // If the pointer has already been identified as scalar (e.g., if it was
3824    // also identified as uniform), there's nothing to do.
3825    auto *I = cast<Instruction>(Ptr);
3826    if (Worklist.count(I))
3827      return;
3828
3829    // If the use of the pointer will be a scalar use, and all users of the
3830    // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
3831    // place the pointer in PossibleNonScalarPtrs.
3832    if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
3833          return isa<LoadInst>(U) || isa<StoreInst>(U);
3834        }))
3835      ScalarPtrs.insert(I);
3836    else
3837      PossibleNonScalarPtrs.insert(I);
3838  };
3839
  // We seed the scalars analysis with three classes of instructions: (1)
  // instructions marked uniform-after-vectorization, (2) bitcast,
  // getelementptr and (pointer) phi instructions used by memory accesses
  // requiring a scalar use, and (3) instructions forced to remain scalar by
  // the cost model.
3844  //
3845  // (1) Add to the worklist all instructions that have been identified as
3846  // uniform-after-vectorization.
3847  Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
3848
3849  // (2) Add to the worklist all bitcast and getelementptr instructions used by
3850  // memory accesses requiring a scalar use. The pointer operands of loads and
  // stores will be scalar as long as the memory access is not a gather or
3852  // scatter operation. The value operand of a store will remain scalar if the
3853  // store is scalarized.
3854  for (auto *BB : TheLoop->blocks())
3855    for (auto &I : *BB) {
3856      if (auto *Load = dyn_cast<LoadInst>(&I)) {
3857        evaluatePtrUse(Load, Load->getPointerOperand());
3858      } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
3859        evaluatePtrUse(Store, Store->getPointerOperand());
3860        evaluatePtrUse(Store, Store->getValueOperand());
3861      }
3862    }
3863  for (auto *I : ScalarPtrs)
3864    if (!PossibleNonScalarPtrs.count(I)) {
3865      LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
3866      Worklist.insert(I);
3867    }
3868
3869  // Insert the forced scalars.
3870  // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector
3871  // induction variable when the PHI user is scalarized.
3872  auto ForcedScalar = ForcedScalars.find(VF);
3873  if (ForcedScalar != ForcedScalars.end())
3874    for (auto *I : ForcedScalar->second) {
3875      LLVM_DEBUG(dbgs() << "LV: Found (forced) scalar instruction: " << *I << "\n");
3876      Worklist.insert(I);
3877    }
3878
3879  // Expand the worklist by looking through any bitcasts and getelementptr
3880  // instructions we've already identified as scalar. This is similar to the
3881  // expansion step in collectLoopUniforms(); however, here we're only
3882  // expanding to include additional bitcasts and getelementptr instructions.
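  // For example (hypothetical IR), if a getelementptr already in the worklist
  // takes its pointer from another loop-varying GEP %src, and every user of
  // %src is either outside the loop, already in the worklist, or a load/store
  // using %src as a scalar address, then %src is added to the worklist too.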
3883  unsigned Idx = 0;
3884  while (Idx != Worklist.size()) {
3885    Instruction *Dst = Worklist[Idx++];
3886    if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
3887      continue;
3888    auto *Src = cast<Instruction>(Dst->getOperand(0));
3889    if (llvm::all_of(Src->users(), [&](User *U) -> bool {
3890          auto *J = cast<Instruction>(U);
3891          return !TheLoop->contains(J) || Worklist.count(J) ||
3892                 ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
3893                  isScalarUse(J, Src));
3894        })) {
3895      Worklist.insert(Src);
3896      LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
3897    }
3898  }
3899
3900  // An induction variable will remain scalar if all users of the induction
3901  // variable and induction variable update remain scalar.
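  // For example, an IV %i remains scalar when its only users are its own
  // update %i.next and instructions already in the worklist (such as a
  // scalarized address computation or the uniform latch compare), and the
  // same holds for the users of %i.next.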
3902  for (const auto &Induction : Legal->getInductionVars()) {
3903    auto *Ind = Induction.first;
3904    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
3905
3906    // If tail-folding is applied, the primary induction variable will be used
3907    // to feed a vector compare.
3908    if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
3909      continue;
3910
3911    // Returns true if \p Indvar is a pointer induction that is used directly by
3912    // load/store instruction \p I.
3913    auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
3914                                              Instruction *I) {
3915      return Induction.second.getKind() ==
3916                 InductionDescriptor::IK_PtrInduction &&
3917             (isa<LoadInst>(I) || isa<StoreInst>(I)) &&
3918             Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar);
3919    };
3920
3921    // Determine if all users of the induction variable are scalar after
3922    // vectorization.
3923    auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
3924      auto *I = cast<Instruction>(U);
3925      return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
3926             IsDirectLoadStoreFromPtrIndvar(Ind, I);
3927    });
3928    if (!ScalarInd)
3929      continue;
3930
3931    // Determine if all users of the induction variable update instruction are
3932    // scalar after vectorization.
3933    auto ScalarIndUpdate =
3934        llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
3935          auto *I = cast<Instruction>(U);
3936          return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
3937                 IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
3938        });
3939    if (!ScalarIndUpdate)
3940      continue;
3941
3942    // The induction variable and its update instruction will remain scalar.
3943    Worklist.insert(Ind);
3944    Worklist.insert(IndUpdate);
3945    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
3946    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
3947                      << "\n");
3948  }
3949
3950  Scalars[VF].insert(Worklist.begin(), Worklist.end());
3951}
3952
3953bool LoopVectorizationCostModel::isScalarWithPredication(
3954    Instruction *I, ElementCount VF) const {
3955  if (!isPredicatedInst(I))
3956    return false;
3957
  // If we do not have a non-scalar lowering for this predicated instruction,
  // it is scalar with predication.
  switch (I->getOpcode()) {
3961  default:
3962    return true;
3963  case Instruction::Call:
3964    if (VF.isScalar())
3965      return true;
3966    return CallWideningDecisions.at(std::make_pair(cast<CallInst>(I), VF))
3967               .Kind == CM_Scalarize;
3968  case Instruction::Load:
3969  case Instruction::Store: {
3970    auto *Ptr = getLoadStorePointerOperand(I);
3971    auto *Ty = getLoadStoreType(I);
3972    Type *VTy = Ty;
3973    if (VF.isVector())
3974      VTy = VectorType::get(Ty, VF);
3975    const Align Alignment = getLoadStoreAlignment(I);
3976    return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
3977                                TTI.isLegalMaskedGather(VTy, Alignment))
3978                            : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
3979                                TTI.isLegalMaskedScatter(VTy, Alignment));
3980  }
3981  case Instruction::UDiv:
3982  case Instruction::SDiv:
3983  case Instruction::SRem:
3984  case Instruction::URem: {
3985    // We have the option to use the safe-divisor idiom to avoid predication.
3986    // The cost based decision here will always select safe-divisor for
3987    // scalable vectors as scalarization isn't legal.
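    // Roughly, the safe-divisor idiom keeps the divide unconditional but feeds
    // it a divisor of select(mask, %d, 1), so inactive lanes divide by a
    // harmless constant instead of a potentially zero (trapping) divisor.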
3988    const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
3989    return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost);
3990  }
3991  }
3992}
3993
3994bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const {
3995  if (!blockNeedsPredicationForAnyReason(I->getParent()))
3996    return false;
3997
3998  // Can we prove this instruction is safe to unconditionally execute?
3999  // If not, we must use some form of predication.
  switch (I->getOpcode()) {
4001  default:
4002    return false;
4003  case Instruction::Load:
4004  case Instruction::Store: {
4005    if (!Legal->isMaskRequired(I))
4006      return false;
    // When we know the load's address is loop invariant and the instruction
    // in the original scalar loop was unconditionally executed then we
    // don't need to mark it as a predicated instruction. Tail folding may
    // introduce additional predication, but we're guaranteed to always have
    // at least one active lane.  We call Legal->blockNeedsPredication here
    // because it doesn't query tail-folding.  For stores, we need to prove
    // both speculation safety (which follows from the same argument as for
    // loads) and that the value being stored is correct.  The easiest form
    // of the latter is to require that all values stored are the same.
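    // For example, a load from a loop-invariant address that was executed
    // unconditionally in the original loop only becomes "predicated" here
    // because of tail folding; since at least one lane is always active, it
    // can safely execute without a mask.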
4016    if (Legal->isInvariant(getLoadStorePointerOperand(I)) &&
4017        (isa<LoadInst>(I) ||
4018         (isa<StoreInst>(I) &&
4019          TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()))) &&
4020        !Legal->blockNeedsPredication(I->getParent()))
4021      return false;
4022    return true;
4023  }
4024  case Instruction::UDiv:
4025  case Instruction::SDiv:
4026  case Instruction::SRem:
4027  case Instruction::URem:
    // TODO: We can use the loop-preheader as context point here and get
    // context-sensitive reasoning.
4030    return !isSafeToSpeculativelyExecute(I);
4031  case Instruction::Call:
4032    return Legal->isMaskRequired(I);
4033  }
4034}
4035
4036std::pair<InstructionCost, InstructionCost>
4037LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
4038                                                    ElementCount VF) const {
4039  assert(I->getOpcode() == Instruction::UDiv ||
4040         I->getOpcode() == Instruction::SDiv ||
4041         I->getOpcode() == Instruction::SRem ||
4042         I->getOpcode() == Instruction::URem);
4043  assert(!isSafeToSpeculativelyExecute(I));
4044
4045  const TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
4046
4047  // Scalarization isn't legal for scalable vector types
4048  InstructionCost ScalarizationCost = InstructionCost::getInvalid();
4049  if (!VF.isScalable()) {
4050    // Get the scalarization cost and scale this amount by the probability of
4051    // executing the predicated block. If the instruction is not predicated,
4052    // we fall through to the next case.
4053    ScalarizationCost = 0;
4054
4055    // These instructions have a non-void type, so account for the phi nodes
4056    // that we will create. This cost is likely to be zero. The phi node
4057    // cost, if any, should be scaled by the block probability because it
4058    // models a copy at the end of each predicated block.
4059    ScalarizationCost += VF.getKnownMinValue() *
4060      TTI.getCFInstrCost(Instruction::PHI, CostKind);
4061
4062    // The cost of the non-predicated instruction.
4063    ScalarizationCost += VF.getKnownMinValue() *
4064      TTI.getArithmeticInstrCost(I->getOpcode(), I->getType(), CostKind);
4065
4066    // The cost of insertelement and extractelement instructions needed for
4067    // scalarization.
4068    ScalarizationCost += getScalarizationOverhead(I, VF, CostKind);
4069
4070    // Scale the cost by the probability of executing the predicated blocks.
4071    // This assumes the predicated block for each vector lane is equally
4072    // likely.
4073    ScalarizationCost = ScalarizationCost / getReciprocalPredBlockProb();
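    // Rough illustration (hypothetical costs): with VF=4, a per-lane divide
    // cost of 4, zero PHI cost, an insert/extract overhead of 8, and a
    // predicated-block probability of 1/2, this yields
    // (4*0 + 4*4 + 8) / 2 = 12.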
4074  }
4075  InstructionCost SafeDivisorCost = 0;
4076
4077  auto *VecTy = ToVectorTy(I->getType(), VF);
4078
4079  // The cost of the select guard to ensure all lanes are well defined
4080  // after we speculate above any internal control flow.
4081  SafeDivisorCost += TTI.getCmpSelInstrCost(
4082    Instruction::Select, VecTy,
4083    ToVectorTy(Type::getInt1Ty(I->getContext()), VF),
4084    CmpInst::BAD_ICMP_PREDICATE, CostKind);
4085
4086  // Certain instructions can be cheaper to vectorize if they have a constant
4087  // second vector operand. One example of this are shifts on x86.
4088  Value *Op2 = I->getOperand(1);
4089  auto Op2Info = TTI.getOperandInfo(Op2);
4090  if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
4091      Legal->isInvariant(Op2))
4092    Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
4093
4094  SmallVector<const Value *, 4> Operands(I->operand_values());
4095  SafeDivisorCost += TTI.getArithmeticInstrCost(
4096    I->getOpcode(), VecTy, CostKind,
4097    {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
4098    Op2Info, Operands, I);
4099  return {ScalarizationCost, SafeDivisorCost};
4100}
4101
4102bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
4103    Instruction *I, ElementCount VF) {
4104  assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4105  assert(getWideningDecision(I, VF) == CM_Unknown &&
4106         "Decision should not be set yet.");
4107  auto *Group = getInterleavedAccessGroup(I);
4108  assert(Group && "Must have a group.");
4109
  // If the instruction's allocated size doesn't equal its type size, it
  // requires padding and will be scalarized.
4112  auto &DL = I->getModule()->getDataLayout();
4113  auto *ScalarTy = getLoadStoreType(I);
4114  if (hasIrregularType(ScalarTy, DL))
4115    return false;
4116
4117  // If the group involves a non-integral pointer, we may not be able to
4118  // losslessly cast all values to a common type.
4119  unsigned InterleaveFactor = Group->getFactor();
4120  bool ScalarNI = DL.isNonIntegralPointerType(ScalarTy);
4121  for (unsigned i = 0; i < InterleaveFactor; i++) {
4122    Instruction *Member = Group->getMember(i);
4123    if (!Member)
4124      continue;
4125    auto *MemberTy = getLoadStoreType(Member);
4126    bool MemberNI = DL.isNonIntegralPointerType(MemberTy);
4127    // Don't coerce non-integral pointers to integers or vice versa.
4128    if (MemberNI != ScalarNI) {
4129      // TODO: Consider adding special nullptr value case here
4130      return false;
4131    } else if (MemberNI && ScalarNI &&
4132               ScalarTy->getPointerAddressSpace() !=
4133               MemberTy->getPointerAddressSpace()) {
4134      return false;
4135    }
4136  }
4137
4138  // Check if masking is required.
4139  // A Group may need masking for one of two reasons: it resides in a block that
4140  // needs predication, or it was decided to use masking to deal with gaps
4141  // (either a gap at the end of a load-access that may result in a speculative
4142  // load, or any gaps in a store-access).
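  // For example, a factor-3 load group that only has members at indices 0 and
  // 1 leaves a gap at the end; its last wide load could speculatively read
  // past the final scalar access, so it needs either a scalar epilogue or a
  // masked interleaved load.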
4143  bool PredicatedAccessRequiresMasking =
4144      blockNeedsPredicationForAnyReason(I->getParent()) &&
4145      Legal->isMaskRequired(I);
4146  bool LoadAccessWithGapsRequiresEpilogMasking =
4147      isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
4148      !isScalarEpilogueAllowed();
4149  bool StoreAccessWithGapsRequiresMasking =
4150      isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor());
4151  if (!PredicatedAccessRequiresMasking &&
4152      !LoadAccessWithGapsRequiresEpilogMasking &&
4153      !StoreAccessWithGapsRequiresMasking)
4154    return true;
4155
4156  // If masked interleaving is required, we expect that the user/target had
4157  // enabled it, because otherwise it either wouldn't have been created or
4158  // it should have been invalidated by the CostModel.
4159  assert(useMaskedInterleavedAccesses(TTI) &&
4160         "Masked interleave-groups for predicated accesses are not enabled.");
4161
4162  if (Group->isReverse())
4163    return false;
4164
4165  auto *Ty = getLoadStoreType(I);
4166  const Align Alignment = getLoadStoreAlignment(I);
4167  return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
4168                          : TTI.isLegalMaskedStore(Ty, Alignment);
4169}
4170
4171bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
4172    Instruction *I, ElementCount VF) {
4173  // Get and ensure we have a valid memory instruction.
4174  assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
4175
4176  auto *Ptr = getLoadStorePointerOperand(I);
4177  auto *ScalarTy = getLoadStoreType(I);
4178
4179  // In order to be widened, the pointer should be consecutive, first of all.
4180  if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
4181    return false;
4182
4183  // If the instruction is a store located in a predicated block, it will be
4184  // scalarized.
4185  if (isScalarWithPredication(I, VF))
4186    return false;
4187
  // If the instruction's allocated size doesn't equal its type size, it
  // requires padding and will be scalarized.
4190  auto &DL = I->getModule()->getDataLayout();
4191  if (hasIrregularType(ScalarTy, DL))
4192    return false;
4193
4194  return true;
4195}
4196
4197void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
4198  // We should not collect Uniforms more than once per VF. Right now,
4199  // this function is called from collectUniformsAndScalars(), which
4200  // already does this check. Collecting Uniforms for VF=1 does not make any
4201  // sense.
4202
4203  assert(VF.isVector() && !Uniforms.contains(VF) &&
4204         "This function should not be visited twice for the same VF");
4205
  // Create the entry for this VF up front so that, even if we find no uniform
  // values, we won't analyze it again; Uniforms.count(VF) will return 1.
4208  Uniforms[VF].clear();
4209
4210  // We now know that the loop is vectorizable!
4211  // Collect instructions inside the loop that will remain uniform after
4212  // vectorization.
4213
  // Global values, params and instructions outside of the current loop are
  // out of scope.
4216  auto isOutOfScope = [&](Value *V) -> bool {
4217    Instruction *I = dyn_cast<Instruction>(V);
4218    return (!I || !TheLoop->contains(I));
4219  };
4220
4221  // Worklist containing uniform instructions demanding lane 0.
4222  SetVector<Instruction *> Worklist;
4223  BasicBlock *Latch = TheLoop->getLoopLatch();
4224
4225  // Add uniform instructions demanding lane 0 to the worklist. Instructions
4226  // that are scalar with predication must not be considered uniform after
4227  // vectorization, because that would create an erroneous replicating region
4228  // where only a single instance out of VF should be formed.
4229  // TODO: optimize such seldom cases if found important, see PR40816.
4230  auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
4231    if (isOutOfScope(I)) {
4232      LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
4233                        << *I << "\n");
4234      return;
4235    }
4236    if (isScalarWithPredication(I, VF)) {
4237      LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
4238                        << *I << "\n");
4239      return;
4240    }
4241    LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
4242    Worklist.insert(I);
4243  };
4244
4245  // Start with the conditional branch. If the branch condition is an
4246  // instruction contained in the loop that is only used by the branch, it is
4247  // uniform.
4248  auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
4249  if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
4250    addToWorklistIfAllowed(Cmp);
4251
4252  auto PrevVF = VF.divideCoefficientBy(2);
4253  // Return true if all lanes perform the same memory operation, and we can
  // thus choose to execute only one.
4255  auto isUniformMemOpUse = [&](Instruction *I) {
4256    // If the value was already known to not be uniform for the previous
4257    // (smaller VF), it cannot be uniform for the larger VF.
4258    if (PrevVF.isVector()) {
4259      auto Iter = Uniforms.find(PrevVF);
4260      if (Iter != Uniforms.end() && !Iter->second.contains(I))
4261        return false;
4262    }
4263    if (!Legal->isUniformMemOp(*I, VF))
4264      return false;
4265    if (isa<LoadInst>(I))
4266      // Loading the same address always produces the same result - at least
4267      // assuming aliasing and ordering which have already been checked.
4268      return true;
4269    // Storing the same value on every iteration.
4270    return TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand());
4271  };
4272
4273  auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
4274    InstWidening WideningDecision = getWideningDecision(I, VF);
4275    assert(WideningDecision != CM_Unknown &&
4276           "Widening decision should be ready at this moment");
4277
4278    if (isUniformMemOpUse(I))
4279      return true;
4280
4281    return (WideningDecision == CM_Widen ||
4282            WideningDecision == CM_Widen_Reverse ||
4283            WideningDecision == CM_Interleave);
4284  };
4285
4286  // Returns true if Ptr is the pointer operand of a memory access instruction
4287  // I, I is known to not require scalarization, and the pointer is not also
4288  // stored.
4289  auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
4290    if (isa<StoreInst>(I) && I->getOperand(0) == Ptr)
4291      return false;
4292    return getLoadStorePointerOperand(I) == Ptr &&
4293           (isUniformDecision(I, VF) || Legal->isInvariant(Ptr));
4294  };
4295
4296  // Holds a list of values which are known to have at least one uniform use.
4297  // Note that there may be other uses which aren't uniform.  A "uniform use"
4298  // here is something which only demands lane 0 of the unrolled iterations;
4299  // it does not imply that all lanes produce the same value (e.g. this is not
  // the usual meaning of uniform).
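  // For example, the address of a consecutive (widened) load is only demanded
  // for lane 0, since the wide load is formed from the first lane's pointer,
  // even though the GEP feeding it yields a different address for every lane.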
4301  SetVector<Value *> HasUniformUse;
4302
4303  // Scan the loop for instructions which are either a) known to have only
4304  // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
4305  for (auto *BB : TheLoop->blocks())
4306    for (auto &I : *BB) {
4307      if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
4308        switch (II->getIntrinsicID()) {
4309        case Intrinsic::sideeffect:
4310        case Intrinsic::experimental_noalias_scope_decl:
4311        case Intrinsic::assume:
4312        case Intrinsic::lifetime_start:
4313        case Intrinsic::lifetime_end:
4314          if (TheLoop->hasLoopInvariantOperands(&I))
4315            addToWorklistIfAllowed(&I);
4316          break;
4317        default:
4318          break;
4319        }
4320      }
4321
4322      // ExtractValue instructions must be uniform, because the operands are
4323      // known to be loop-invariant.
4324      if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) {
4325        assert(isOutOfScope(EVI->getAggregateOperand()) &&
4326               "Expected aggregate value to be loop invariant");
4327        addToWorklistIfAllowed(EVI);
4328        continue;
4329      }
4330
4331      // If there's no pointer operand, there's nothing to do.
4332      auto *Ptr = getLoadStorePointerOperand(&I);
4333      if (!Ptr)
4334        continue;
4335
4336      if (isUniformMemOpUse(&I))
4337        addToWorklistIfAllowed(&I);
4338
4339      if (isVectorizedMemAccessUse(&I, Ptr))
4340        HasUniformUse.insert(Ptr);
4341    }
4342
4343  // Add to the worklist any operands which have *only* uniform (e.g. lane 0
4344  // demanding) users.  Since loops are assumed to be in LCSSA form, this
4345  // disallows uses outside the loop as well.
4346  for (auto *V : HasUniformUse) {
4347    if (isOutOfScope(V))
4348      continue;
4349    auto *I = cast<Instruction>(V);
4350    auto UsersAreMemAccesses =
4351      llvm::all_of(I->users(), [&](User *U) -> bool {
4352        return isVectorizedMemAccessUse(cast<Instruction>(U), V);
4353      });
4354    if (UsersAreMemAccesses)
4355      addToWorklistIfAllowed(I);
4356  }
4357
  // Expand Worklist in topological order: whenever a new instruction
  // is added, its users should already be inside Worklist.  This ensures
  // a uniform instruction will only be used by uniform instructions.
4361  unsigned idx = 0;
4362  while (idx != Worklist.size()) {
4363    Instruction *I = Worklist[idx++];
4364
4365    for (auto *OV : I->operand_values()) {
4366      // isOutOfScope operands cannot be uniform instructions.
4367      if (isOutOfScope(OV))
4368        continue;
      // First-order recurrence phis should typically be considered
      // non-uniform.
4371      auto *OP = dyn_cast<PHINode>(OV);
4372      if (OP && Legal->isFixedOrderRecurrence(OP))
4373        continue;
4374      // If all the users of the operand are uniform, then add the
4375      // operand into the uniform worklist.
4376      auto *OI = cast<Instruction>(OV);
4377      if (llvm::all_of(OI->users(), [&](User *U) -> bool {
4378            auto *J = cast<Instruction>(U);
4379            return Worklist.count(J) || isVectorizedMemAccessUse(J, OI);
4380          }))
4381        addToWorklistIfAllowed(OI);
4382    }
4383  }
4384
4385  // For an instruction to be added into Worklist above, all its users inside
4386  // the loop should also be in Worklist. However, this condition cannot be
4387  // true for phi nodes that form a cyclic dependence. We must process phi
4388  // nodes separately. An induction variable will remain uniform if all users
4389  // of the induction variable and induction variable update remain uniform.
4390  // The code below handles both pointer and non-pointer induction variables.
4391  for (const auto &Induction : Legal->getInductionVars()) {
4392    auto *Ind = Induction.first;
4393    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4394
4395    // Determine if all users of the induction variable are uniform after
4396    // vectorization.
4397    auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4398      auto *I = cast<Instruction>(U);
4399      return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4400             isVectorizedMemAccessUse(I, Ind);
4401    });
4402    if (!UniformInd)
4403      continue;
4404
4405    // Determine if all users of the induction variable update instruction are
4406    // uniform after vectorization.
4407    auto UniformIndUpdate =
4408        llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4409          auto *I = cast<Instruction>(U);
4410          return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4411                 isVectorizedMemAccessUse(I, IndUpdate);
4412        });
4413    if (!UniformIndUpdate)
4414      continue;
4415
4416    // The induction variable and its update instruction will remain uniform.
4417    addToWorklistIfAllowed(Ind);
4418    addToWorklistIfAllowed(IndUpdate);
4419  }
4420
4421  Uniforms[VF].insert(Worklist.begin(), Worklist.end());
4422}
4423
4424bool LoopVectorizationCostModel::runtimeChecksRequired() {
4425  LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
4426
4427  if (Legal->getRuntimePointerChecking()->Need) {
4428    reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
4429        "runtime pointer checks needed. Enable vectorization of this "
4430        "loop with '#pragma clang loop vectorize(enable)' when "
4431        "compiling with -Os/-Oz",
4432        "CantVersionLoopWithOptForSize", ORE, TheLoop);
4433    return true;
4434  }
4435
4436  if (!PSE.getPredicate().isAlwaysTrue()) {
4437    reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
4438        "runtime SCEV checks needed. Enable vectorization of this "
4439        "loop with '#pragma clang loop vectorize(enable)' when "
4440        "compiling with -Os/-Oz",
4441        "CantVersionLoopWithOptForSize", ORE, TheLoop);
4442    return true;
4443  }
4444
4445  // FIXME: Avoid specializing for stride==1 instead of bailing out.
4446  if (!Legal->getLAI()->getSymbolicStrides().empty()) {
4447    reportVectorizationFailure("Runtime stride check for small trip count",
4448        "runtime stride == 1 checks needed. Enable vectorization of "
4449        "this loop without such check by compiling with -Os/-Oz",
4450        "CantVersionLoopWithOptForSize", ORE, TheLoop);
4451    return true;
4452  }
4453
4454  return false;
4455}
4456
4457ElementCount
4458LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
4459  if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors)
4460    return ElementCount::getScalable(0);
4461
4462  if (Hints->isScalableVectorizationDisabled()) {
4463    reportVectorizationInfo("Scalable vectorization is explicitly disabled",
4464                            "ScalableVectorizationDisabled", ORE, TheLoop);
4465    return ElementCount::getScalable(0);
4466  }
4467
4468  LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");
4469
4470  auto MaxScalableVF = ElementCount::getScalable(
4471      std::numeric_limits<ElementCount::ScalarTy>::max());
4472
4473  // Test that the loop-vectorizer can legalize all operations for this MaxVF.
4474  // FIXME: While for scalable vectors this is currently sufficient, this should
4475  // be replaced by a more detailed mechanism that filters out specific VFs,
4476  // instead of invalidating vectorization for a whole set of VFs based on the
4477  // MaxVF.
4478
4479  // Disable scalable vectorization if the loop contains unsupported reductions.
4480  if (!canVectorizeReductions(MaxScalableVF)) {
4481    reportVectorizationInfo(
4482        "Scalable vectorization not supported for the reduction "
4483        "operations found in this loop.",
4484        "ScalableVFUnfeasible", ORE, TheLoop);
4485    return ElementCount::getScalable(0);
4486  }
4487
4488  // Disable scalable vectorization if the loop contains any instructions
4489  // with element types not supported for scalable vectors.
4490  if (any_of(ElementTypesInLoop, [&](Type *Ty) {
4491        return !Ty->isVoidTy() &&
4492               !this->TTI.isElementTypeLegalForScalableVector(Ty);
4493      })) {
4494    reportVectorizationInfo("Scalable vectorization is not supported "
4495                            "for all element types found in this loop.",
4496                            "ScalableVFUnfeasible", ORE, TheLoop);
4497    return ElementCount::getScalable(0);
4498  }
4499
4500  if (Legal->isSafeForAnyVectorWidth())
4501    return MaxScalableVF;
4502
4503  // Limit MaxScalableVF by the maximum safe dependence distance.
4504  if (std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI))
4505    MaxScalableVF = ElementCount::getScalable(MaxSafeElements / *MaxVScale);
4506  else
4507    MaxScalableVF = ElementCount::getScalable(0);
4508
4509  if (!MaxScalableVF)
4510    reportVectorizationInfo(
4511        "Max legal vector width too small, scalable vectorization "
4512        "unfeasible.",
4513        "ScalableVFUnfeasible", ORE, TheLoop);
4514
4515  return MaxScalableVF;
4516}
4517
4518FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
4519    unsigned MaxTripCount, ElementCount UserVF, bool FoldTailByMasking) {
4520  MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
4521  unsigned SmallestType, WidestType;
4522  std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
4523
4524  // Get the maximum safe dependence distance in bits computed by LAA.
4525  // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
4526  // the memory accesses that is most restrictive (involved in the smallest
4527  // dependence distance).
4528  unsigned MaxSafeElements =
4529      llvm::bit_floor(Legal->getMaxSafeVectorWidthInBits() / WidestType);
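  // For example, a maximum safe dependence distance of 256 bits and a widest
  // type of 32 bits give MaxSafeElements = bit_floor(256 / 32) = 8.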
4530
4531  auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
4532  auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);
4533
4534  LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
4535                    << ".\n");
4536  LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
4537                    << ".\n");
4538
4539  // First analyze the UserVF, fall back if the UserVF should be ignored.
4540  if (UserVF) {
4541    auto MaxSafeUserVF =
4542        UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
4543
4544    if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) {
4545      // If `VF=vscale x N` is safe, then so is `VF=N`
4546      if (UserVF.isScalable())
4547        return FixedScalableVFPair(
4548            ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF);
4549      else
4550        return UserVF;
4551    }
4552
4553    assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));
4554
4555    // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
4556    // is better to ignore the hint and let the compiler choose a suitable VF.
4557    if (!UserVF.isScalable()) {
4558      LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4559                        << " is unsafe, clamping to max safe VF="
4560                        << MaxSafeFixedVF << ".\n");
4561      ORE->emit([&]() {
4562        return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4563                                          TheLoop->getStartLoc(),
4564                                          TheLoop->getHeader())
4565               << "User-specified vectorization factor "
4566               << ore::NV("UserVectorizationFactor", UserVF)
4567               << " is unsafe, clamping to maximum safe vectorization factor "
4568               << ore::NV("VectorizationFactor", MaxSafeFixedVF);
4569      });
4570      return MaxSafeFixedVF;
4571    }
4572
4573    if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) {
4574      LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4575                        << " is ignored because scalable vectors are not "
4576                           "available.\n");
4577      ORE->emit([&]() {
4578        return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4579                                          TheLoop->getStartLoc(),
4580                                          TheLoop->getHeader())
4581               << "User-specified vectorization factor "
4582               << ore::NV("UserVectorizationFactor", UserVF)
4583               << " is ignored because the target does not support scalable "
4584                  "vectors. The compiler will pick a more suitable value.";
4585      });
4586    } else {
4587      LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4588                        << " is unsafe. Ignoring scalable UserVF.\n");
4589      ORE->emit([&]() {
4590        return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4591                                          TheLoop->getStartLoc(),
4592                                          TheLoop->getHeader())
4593               << "User-specified vectorization factor "
4594               << ore::NV("UserVectorizationFactor", UserVF)
4595               << " is unsafe. Ignoring the hint to let the compiler pick a "
4596                  "more suitable value.";
4597      });
4598    }
4599  }
4600
4601  LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
4602                    << " / " << WidestType << " bits.\n");
4603
4604  FixedScalableVFPair Result(ElementCount::getFixed(1),
4605                             ElementCount::getScalable(0));
4606  if (auto MaxVF =
4607          getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
4608                                  MaxSafeFixedVF, FoldTailByMasking))
4609    Result.FixedVF = MaxVF;
4610
4611  if (auto MaxVF =
4612          getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
4613                                  MaxSafeScalableVF, FoldTailByMasking))
4614    if (MaxVF.isScalable()) {
4615      Result.ScalableVF = MaxVF;
4616      LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
4617                        << "\n");
4618    }
4619
4620  return Result;
4621}
4622
4623FixedScalableVFPair
4624LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
4625  if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
    // TODO: It may be useful to do since it's still likely to be dynamically
4627    // uniform if the target can skip.
4628    reportVectorizationFailure(
4629        "Not inserting runtime ptr check for divergent target",
4630        "runtime pointer checks needed. Not enabled for divergent target",
4631        "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
4632    return FixedScalableVFPair::getNone();
4633  }
4634
4635  unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4636  unsigned MaxTC = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop);
4637  LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
4638  if (TC == 1) {
4639    reportVectorizationFailure("Single iteration (non) loop",
4640        "loop trip count is one, irrelevant for vectorization",
4641        "SingleIterationLoop", ORE, TheLoop);
4642    return FixedScalableVFPair::getNone();
4643  }
4644
4645  switch (ScalarEpilogueStatus) {
4646  case CM_ScalarEpilogueAllowed:
4647    return computeFeasibleMaxVF(MaxTC, UserVF, false);
4648  case CM_ScalarEpilogueNotAllowedUsePredicate:
4649    [[fallthrough]];
4650  case CM_ScalarEpilogueNotNeededUsePredicate:
4651    LLVM_DEBUG(
4652        dbgs() << "LV: vector predicate hint/switch found.\n"
4653               << "LV: Not allowing scalar epilogue, creating predicated "
4654               << "vector loop.\n");
4655    break;
4656  case CM_ScalarEpilogueNotAllowedLowTripLoop:
4657    // fallthrough as a special case of OptForSize
4658  case CM_ScalarEpilogueNotAllowedOptSize:
4659    if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
4660      LLVM_DEBUG(
4661          dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
4662    else
4663      LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
4664                        << "count.\n");
4665
4666    // Bail if runtime checks are required, which are not good when optimising
4667    // for size.
4668    if (runtimeChecksRequired())
4669      return FixedScalableVFPair::getNone();
4670
4671    break;
4672  }
4673
4674  // The only loops we can vectorize without a scalar epilogue, are loops with
4675  // a bottom-test and a single exiting block. We'd have to handle the fact
4676  // that not every instruction executes on the last iteration.  This will
4677  // require a lane mask which varies through the vector loop body.  (TODO)
4678  if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
4679    // If there was a tail-folding hint/switch, but we can't fold the tail by
4680    // masking, fallback to a vectorization with a scalar epilogue.
4681    if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
4682      LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
4683                           "scalar epilogue instead.\n");
4684      ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
4685      return computeFeasibleMaxVF(MaxTC, UserVF, false);
4686    }
4687    return FixedScalableVFPair::getNone();
4688  }
4689
4690  // Now try the tail folding
4691
4692  // Invalidate interleave groups that require an epilogue if we can't mask
4693  // the interleave-group.
4694  if (!useMaskedInterleavedAccesses(TTI)) {
4695    assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
4696           "No decisions should have been taken at this point");
    // Note: There is no need to invalidate any cost modeling decisions here,
    // as none were taken so far.
4699    InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
4700  }
4701
4702  FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(MaxTC, UserVF, true);
4703
4704  // Avoid tail folding if the trip count is known to be a multiple of any VF
4705  // we choose.
4706  std::optional<unsigned> MaxPowerOf2RuntimeVF =
4707      MaxFactors.FixedVF.getFixedValue();
4708  if (MaxFactors.ScalableVF) {
4709    std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI);
4710    if (MaxVScale && TTI.isVScaleKnownToBeAPowerOfTwo()) {
4711      MaxPowerOf2RuntimeVF = std::max<unsigned>(
4712          *MaxPowerOf2RuntimeVF,
4713          *MaxVScale * MaxFactors.ScalableVF.getKnownMinValue());
4714    } else
4715      MaxPowerOf2RuntimeVF = std::nullopt; // Stick with tail-folding for now.
4716  }
4717
4718  if (MaxPowerOf2RuntimeVF && *MaxPowerOf2RuntimeVF > 0) {
4719    assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) &&
4720           "MaxFixedVF must be a power of 2");
4721    unsigned MaxVFtimesIC =
4722        UserIC ? *MaxPowerOf2RuntimeVF * UserIC : *MaxPowerOf2RuntimeVF;
4723    ScalarEvolution *SE = PSE.getSE();
4724    const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
4725    const SCEV *ExitCount = SE->getAddExpr(
4726        BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
4727    const SCEV *Rem = SE->getURemExpr(
4728        SE->applyLoopGuards(ExitCount, TheLoop),
4729        SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
4730    if (Rem->isZero()) {
4731      // Accept MaxFixedVF if we do not have a tail.
4732      LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
4733      return MaxFactors;
4734    }
4735  }
4736
4737  // If we don't know the precise trip count, or if the trip count that we
4738  // found modulo the vectorization factor is not zero, try to fold the tail
4739  // by masking.
4740  // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
4741  if (Legal->prepareToFoldTailByMasking()) {
4742    CanFoldTailByMasking = true;
4743    return MaxFactors;
4744  }
4745
4746  // If there was a tail-folding hint/switch, but we can't fold the tail by
4747  // masking, fallback to a vectorization with a scalar epilogue.
4748  if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
4749    LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
4750                         "scalar epilogue instead.\n");
4751    ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
4752    return MaxFactors;
4753  }
4754
4755  if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
4756    LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
4757    return FixedScalableVFPair::getNone();
4758  }
4759
4760  if (TC == 0) {
4761    reportVectorizationFailure(
4762        "Unable to calculate the loop count due to complex control flow",
4763        "unable to calculate the loop count due to complex control flow",
4764        "UnknownLoopCountComplexCFG", ORE, TheLoop);
4765    return FixedScalableVFPair::getNone();
4766  }
4767
4768  reportVectorizationFailure(
4769      "Cannot optimize for size and vectorize at the same time.",
4770      "cannot optimize for size and vectorize at the same time. "
4771      "Enable vectorization of this loop with '#pragma clang loop "
4772      "vectorize(enable)' when compiling with -Os/-Oz",
4773      "NoTailLoopWithOptForSize", ORE, TheLoop);
4774  return FixedScalableVFPair::getNone();
4775}
4776
4777ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
4778    unsigned MaxTripCount, unsigned SmallestType, unsigned WidestType,
4779    ElementCount MaxSafeVF, bool FoldTailByMasking) {
4780  bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
4781  const TypeSize WidestRegister = TTI.getRegisterBitWidth(
4782      ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
4783                           : TargetTransformInfo::RGK_FixedWidthVector);
4784
4785  // Convenience function to return the minimum of two ElementCounts.
4786  auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
4787    assert((LHS.isScalable() == RHS.isScalable()) &&
4788           "Scalable flags must match");
4789    return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
4790  };
4791
4792  // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
  // Note that both WidestRegister and WidestType may not be powers of 2.
4794  auto MaxVectorElementCount = ElementCount::get(
4795      llvm::bit_floor(WidestRegister.getKnownMinValue() / WidestType),
4796      ComputeScalableMaxVF);
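  // For example, a 256-bit widest register and a 64-bit widest type give
  // bit_floor(256 / 64) = 4 lanes (fixed or scalable, depending on
  // ComputeScalableMaxVF).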
4797  MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
4798  LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
4799                    << (MaxVectorElementCount * WidestType) << " bits.\n");
4800
4801  if (!MaxVectorElementCount) {
4802    LLVM_DEBUG(dbgs() << "LV: The target has no "
4803                      << (ComputeScalableMaxVF ? "scalable" : "fixed")
4804                      << " vector registers.\n");
4805    return ElementCount::getFixed(1);
4806  }
4807
4808  unsigned WidestRegisterMinEC = MaxVectorElementCount.getKnownMinValue();
4809  if (MaxVectorElementCount.isScalable() &&
4810      TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
4811    auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
4812    auto Min = Attr.getVScaleRangeMin();
4813    WidestRegisterMinEC *= Min;
4814  }
4815
4816  // When a scalar epilogue is required, at least one iteration of the scalar
4817  // loop has to execute. Adjust MaxTripCount accordingly to avoid picking a
4818  // max VF that results in a dead vector loop.
4819  if (MaxTripCount > 0 && requiresScalarEpilogue(true))
4820    MaxTripCount -= 1;
4821
4822  if (MaxTripCount && MaxTripCount <= WidestRegisterMinEC &&
4823      (!FoldTailByMasking || isPowerOf2_32(MaxTripCount))) {
    // If the upper bound on the loop trip count (TC) is known at compile time,
    // there is no point in choosing a VF greater than TC (as done in the loop
    // below). Select the maximum power of two which doesn't exceed TC. If
    // MaxVectorElementCount is scalable, we only fall back on a fixed VF when
    // the TC is less than or equal to the known number of lanes.
4829    auto ClampedUpperTripCount = llvm::bit_floor(MaxTripCount);
4830    LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
4831                         "exceeding the constant trip count: "
4832                      << ClampedUpperTripCount << "\n");
4833    return ElementCount::get(
4834        ClampedUpperTripCount,
4835        FoldTailByMasking ? MaxVectorElementCount.isScalable() : false);
4836  }
4837
4838  TargetTransformInfo::RegisterKind RegKind =
4839      ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
4840                           : TargetTransformInfo::RGK_FixedWidthVector;
4841  ElementCount MaxVF = MaxVectorElementCount;
4842  if (MaximizeBandwidth ||
4843      (MaximizeBandwidth.getNumOccurrences() == 0 &&
4844       (TTI.shouldMaximizeVectorBandwidth(RegKind) ||
4845        (UseWiderVFIfCallVariantsPresent && Legal->hasVectorCallVariants())))) {
4846    auto MaxVectorElementCountMaxBW = ElementCount::get(
4847        llvm::bit_floor(WidestRegister.getKnownMinValue() / SmallestType),
4848        ComputeScalableMaxVF);
4849    MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
4850
4851    // Collect all viable vectorization factors larger than the default MaxVF
4852    // (i.e. MaxVectorElementCount).
4853    SmallVector<ElementCount, 8> VFs;
4854    for (ElementCount VS = MaxVectorElementCount * 2;
4855         ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2)
4856      VFs.push_back(VS);
4857
4858    // For each VF calculate its register usage.
4859    auto RUs = calculateRegisterUsage(VFs);
4860
4861    // Select the largest VF which doesn't require more registers than existing
4862    // ones.
4863    for (int i = RUs.size() - 1; i >= 0; --i) {
4864      bool Selected = true;
4865      for (auto &pair : RUs[i].MaxLocalUsers) {
4866        unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
4867        if (pair.second > TargetNumRegisters)
4868          Selected = false;
4869      }
4870      if (Selected) {
4871        MaxVF = VFs[i];
4872        break;
4873      }
4874    }
4875    if (ElementCount MinVF =
4876            TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
4877      if (ElementCount::isKnownLT(MaxVF, MinVF)) {
4878        LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
4879                          << ") with target's minimum: " << MinVF << '\n');
4880        MaxVF = MinVF;
4881      }
4882    }
4883
    // Invalidate any widening decisions we might have made, in case the loop
    // requires predication (decided later), but we have already made some
    // load/store widening decisions.
4887    invalidateCostModelingDecisions();
4888  }
4889  return MaxVF;
4890}
4891
4892/// Convenience function that returns the value of vscale_range iff
4893/// vscale_range.min == vscale_range.max or otherwise returns the value
4894/// returned by the corresponding TTI method.
4895static std::optional<unsigned>
4896getVScaleForTuning(const Loop *L, const TargetTransformInfo &TTI) {
4897  const Function *Fn = L->getHeader()->getParent();
4898  if (Fn->hasFnAttribute(Attribute::VScaleRange)) {
4899    auto Attr = Fn->getFnAttribute(Attribute::VScaleRange);
4900    auto Min = Attr.getVScaleRangeMin();
4901    auto Max = Attr.getVScaleRangeMax();
4902    if (Max && Min == Max)
4903      return Max;
4904  }
4905
4906  return TTI.getVScaleForTuning();
4907}
4908
4909bool LoopVectorizationPlanner::isMoreProfitable(
4910    const VectorizationFactor &A, const VectorizationFactor &B) const {
4911  InstructionCost CostA = A.Cost;
4912  InstructionCost CostB = B.Cost;
4913
4914  unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(OrigLoop);
4915
4916  if (!A.Width.isScalable() && !B.Width.isScalable() && MaxTripCount) {
4917    // If the trip count is a known (possibly small) constant, the trip count
4918    // will be rounded up to an integer number of iterations under
4919    // FoldTailByMasking. The total cost in that case will be
4920    // VecCost*ceil(TripCount/VF). When not folding the tail, the total
4921    // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be
4922    // some extra overheads, but for the purpose of comparing the costs of
4923    // different VFs we can use this to compare the total loop-body cost
4924    // expected after vectorization.
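    //
    // For example (illustrative numbers): with MaxTripCount = 10, VF = 4,
    // VectorCost = 20 and ScalarCost = 4, folding the tail costs
    // 20 * ceil(10 / 4) = 60, whereas not folding costs
    // 20 * floor(10 / 4) + 4 * (10 % 4) = 40 + 8 = 48.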
4925    auto GetCostForTC = [MaxTripCount, this](unsigned VF,
4926                                             InstructionCost VectorCost,
4927                                             InstructionCost ScalarCost) {
4928      return CM.foldTailByMasking() ? VectorCost * divideCeil(MaxTripCount, VF)
4929                                    : VectorCost * (MaxTripCount / VF) +
4930                                          ScalarCost * (MaxTripCount % VF);
4931    };
4932    auto RTCostA = GetCostForTC(A.Width.getFixedValue(), CostA, A.ScalarCost);
4933    auto RTCostB = GetCostForTC(B.Width.getFixedValue(), CostB, B.ScalarCost);
4934
4935    return RTCostA < RTCostB;
4936  }
4937
4938  // Improve estimate for the vector width if it is scalable.
4939  unsigned EstimatedWidthA = A.Width.getKnownMinValue();
4940  unsigned EstimatedWidthB = B.Width.getKnownMinValue();
4941  if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI)) {
4942    if (A.Width.isScalable())
4943      EstimatedWidthA *= *VScale;
4944    if (B.Width.isScalable())
4945      EstimatedWidthB *= *VScale;
4946  }
4947
4948  // Assume vscale may be larger than 1 (or the value being tuned for),
4949  // so that scalable vectorization is slightly favorable over fixed-width
4950  // vectorization.
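  // For example (illustrative costs): comparing VF = vscale x 4 (cost 10)
  // against VF = 8 (cost 10) with an assumed tuning vscale of 2 gives
  // 10 * 8 <= 10 * (4 * 2), so the scalable factor wins the tie.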
4951  if (A.Width.isScalable() && !B.Width.isScalable())
4952    return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA);
4953
4954  // To avoid the need for FP division:
4955  //      (CostA / A.Width) < (CostB / B.Width)
4956  // <=>  (CostA * B.Width) < (CostB * A.Width)
4957  return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA);
4958}
4959
4960static void emitInvalidCostRemarks(SmallVector<InstructionVFPair> InvalidCosts,
4961                                   OptimizationRemarkEmitter *ORE,
4962                                   Loop *TheLoop) {
4963  if (InvalidCosts.empty())
4964    return;
4965
4966  // Emit a report of VFs with invalid costs in the loop.
4967
4968  // Group the remarks per instruction, keeping the instruction order from
4969  // InvalidCosts.
4970  std::map<Instruction *, unsigned> Numbering;
4971  unsigned I = 0;
4972  for (auto &Pair : InvalidCosts)
4973    if (!Numbering.count(Pair.first))
4974      Numbering[Pair.first] = I++;
4975
4976  // Sort the list, first on instruction(number) then on VF.
4977  sort(InvalidCosts, [&Numbering](InstructionVFPair &A, InstructionVFPair &B) {
4978    if (Numbering[A.first] != Numbering[B.first])
4979      return Numbering[A.first] < Numbering[B.first];
4980    ElementCountComparator ECC;
4981    return ECC(A.second, B.second);
4982  });
4983
4984  // For a list of ordered instruction-vf pairs:
4985  //   [(load, vf1), (load, vf2), (store, vf1)]
4986  // Group the instructions together to emit separate remarks for:
4987  //   load  (vf1, vf2)
4988  //   store (vf1)
4989  auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts);
4990  auto Subset = ArrayRef<InstructionVFPair>();
4991  do {
4992    if (Subset.empty())
4993      Subset = Tail.take_front(1);
4994
4995    Instruction *I = Subset.front().first;
4996
4997    // If the next instruction is different, or if there are no other pairs,
4998    // emit a remark for the collated subset. e.g.
4999    //   [(load, vf1), (load, vf2)]
5000    // to emit:
5001    //  remark: invalid costs for 'load' at VF=(vf1, vf2)
5002    if (Subset == Tail || Tail[Subset.size()].first != I) {
5003      std::string OutString;
5004      raw_string_ostream OS(OutString);
5005      assert(!Subset.empty() && "Unexpected empty range");
5006      OS << "Instruction with invalid costs prevented vectorization at VF=(";
5007      for (const auto &Pair : Subset)
5008        OS << (Pair.second == Subset.front().second ? "" : ", ") << Pair.second;
5009      OS << "):";
5010      if (auto *CI = dyn_cast<CallInst>(I))
5011        OS << " call to " << CI->getCalledFunction()->getName();
5012      else
5013        OS << " " << I->getOpcodeName();
5014      OS.flush();
5015      reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I);
5016      Tail = Tail.drop_front(Subset.size());
5017      Subset = {};
5018    } else
5019      // Grow the subset by one element
5020      Subset = Tail.take_front(Subset.size() + 1);
5021  } while (!Tail.empty());
5022}
5023
5024VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor(
5025    const ElementCountSet &VFCandidates) {
5026  InstructionCost ExpectedCost =
5027      CM.expectedCost(ElementCount::getFixed(1)).first;
5028  LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
5029  assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
5030  assert(VFCandidates.count(ElementCount::getFixed(1)) &&
5031         "Expected Scalar VF to be a candidate");
5032
5033  const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost,
5034                                       ExpectedCost);
5035  VectorizationFactor ChosenFactor = ScalarCost;
5036
5037  bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
5038  if (ForceVectorization && VFCandidates.size() > 1) {
5039    // Ignore scalar width, because the user explicitly wants vectorization.
5040    // Initialize the cost to the maximum so that at least VF = 2 is chosen
5041    // during cost evaluation.
5042    ChosenFactor.Cost = InstructionCost::getMax();
5043  }
5044
5045  SmallVector<InstructionVFPair> InvalidCosts;
5046  for (const auto &i : VFCandidates) {
5047    // The cost for scalar VF=1 is already calculated, so ignore it.
5048    if (i.isScalar())
5049      continue;
5050
5051    LoopVectorizationCostModel::VectorizationCostTy C =
5052        CM.expectedCost(i, &InvalidCosts);
5053    VectorizationFactor Candidate(i, C.first, ScalarCost.ScalarCost);
5054
5055#ifndef NDEBUG
5056    unsigned AssumedMinimumVscale =
5057        getVScaleForTuning(OrigLoop, TTI).value_or(1);
5058    unsigned Width =
5059        Candidate.Width.isScalable()
5060            ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale
5061            : Candidate.Width.getFixedValue();
5062    LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5063                      << " costs: " << (Candidate.Cost / Width));
5064    if (i.isScalable())
5065      LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
5066                        << AssumedMinimumVscale << ")");
5067    LLVM_DEBUG(dbgs() << ".\n");
5068#endif
5069
5070    if (!C.second && !ForceVectorization) {
5071      LLVM_DEBUG(
5072          dbgs() << "LV: Not considering vector loop of width " << i
5073                 << " because it will not generate any vector instructions.\n");
5074      continue;
5075    }
5076
5077    // If profitable, add it to the ProfitableVFs list.
5078    if (isMoreProfitable(Candidate, ScalarCost))
5079      ProfitableVFs.push_back(Candidate);
5080
5081    if (isMoreProfitable(Candidate, ChosenFactor))
5082      ChosenFactor = Candidate;
5083  }
5084
5085  emitInvalidCostRemarks(InvalidCosts, ORE, OrigLoop);
5086
5087  if (!EnableCondStoresVectorization && CM.hasPredStores()) {
5088    reportVectorizationFailure(
5089        "There are conditional stores.",
5090        "store that is conditionally executed prevents vectorization",
5091        "ConditionalStore", ORE, OrigLoop);
5092    ChosenFactor = ScalarCost;
5093  }
5094
5095  LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
5096                 !isMoreProfitable(ChosenFactor, ScalarCost)) dbgs()
5097             << "LV: Vectorization seems to be not beneficial, "
5098             << "but was forced by a user.\n");
5099  LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n");
5100  return ChosenFactor;
5101}
5102
5103bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
5104    ElementCount VF) const {
5105  // Cross-iteration phis such as fixed-order recurrences need special
5106  // handling and are currently unsupported.
5107  if (any_of(OrigLoop->getHeader()->phis(),
5108             [&](PHINode &Phi) { return Legal->isFixedOrderRecurrence(&Phi); }))
5109    return false;
5110
5111  // Phis with uses outside of the loop require special handling and are
5112  // currently unsupported.
5113  for (const auto &Entry : Legal->getInductionVars()) {
5114    // Look for uses of the value of the induction at the last iteration.
5115    Value *PostInc =
5116        Entry.first->getIncomingValueForBlock(OrigLoop->getLoopLatch());
5117    for (User *U : PostInc->users())
5118      if (!OrigLoop->contains(cast<Instruction>(U)))
5119        return false;
5120    // Look for uses of the penultimate value of the induction.
5121    for (User *U : Entry.first->users())
5122      if (!OrigLoop->contains(cast<Instruction>(U)))
5123        return false;
5124  }
5125
5126  // Epilogue vectorization code has not been audited to ensure it handles
5127  // non-latch exits properly. It may be fine, but it needs to be audited and
5128  // tested.
5129  if (OrigLoop->getExitingBlock() != OrigLoop->getLoopLatch())
5130    return false;
5131
5132  return true;
5133}
5134
5135bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
5136    const ElementCount VF) const {
5137  // FIXME: We need a much better cost-model to take different parameters such
5138  // as register pressure, code size increase and cost of extra branches into
5139  // account. For now we apply a very crude heuristic and only consider loops
5140  // with vectorization factors larger than a certain value.
5141
5142  // Allow the target to opt out entirely.
5143  if (!TTI.preferEpilogueVectorization())
5144    return false;
5145
5146  // We also consider epilogue vectorization unprofitable for targets that
5147  // don't consider interleaving beneficial (e.g. MVE).
5148  if (TTI.getMaxInterleaveFactor(VF) <= 1)
5149    return false;
5150
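  // For example (illustrative threshold of 16): a main-loop VF of vscale x 4
  // with a tuning vscale of 4 gives an effective VF of 16, which meets the
  // threshold, whereas a fixed VF of 8 does not.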
5151  unsigned Multiplier = 1;
5152  if (VF.isScalable())
5153    Multiplier = getVScaleForTuning(TheLoop, TTI).value_or(1);
5154  if ((Multiplier * VF.getKnownMinValue()) >= EpilogueVectorizationMinVF)
5155    return true;
5156  return false;
5157}
5158
5159VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
5160    const ElementCount MainLoopVF, unsigned IC) {
5161  VectorizationFactor Result = VectorizationFactor::Disabled();
5162  if (!EnableEpilogueVectorization) {
5163    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n");
5164    return Result;
5165  }
5166
5167  if (!CM.isScalarEpilogueAllowed()) {
5168    LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because no "
5169                         "epilogue is allowed.\n");
5170    return Result;
5171  }
5172
5173  // Not really a cost consideration, but check for unsupported cases here to
5174  // simplify the logic.
5175  if (!isCandidateForEpilogueVectorization(MainLoopVF)) {
5176    LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because the loop "
5177                         "is not a supported candidate.\n");
5178    return Result;
5179  }
5180
5181  if (EpilogueVectorizationForceVF > 1) {
5182    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n");
5183    ElementCount ForcedEC =
5184        ElementCount::getFixed(EpilogueVectorizationForceVF);
5185    if (hasPlanWithVF(ForcedEC))
5186      return {ForcedEC, 0, 0};
5187
5188    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization forced factor is not "
5189                         "viable.\n");
5190    return Result;
5191  }
5192
5193  if (OrigLoop->getHeader()->getParent()->hasOptSize() ||
5194      OrigLoop->getHeader()->getParent()->hasMinSize()) {
5195    LLVM_DEBUG(
5196        dbgs() << "LEV: Epilogue vectorization skipped due to opt for size.\n");
5197    return Result;
5198  }
5199
5200  if (!CM.isEpilogueVectorizationProfitable(MainLoopVF)) {
5201    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
5202                         "this loop\n");
5203    return Result;
5204  }
5205
5206  // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
5207  // the main loop handles 8 lanes per iteration. We could still benefit from
5208  // vectorizing the epilogue loop with VF=4.
5209  ElementCount EstimatedRuntimeVF = MainLoopVF;
5210  if (MainLoopVF.isScalable()) {
5211    EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue());
5212    if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI))
5213      EstimatedRuntimeVF *= *VScale;
5214  }
5215
5216  ScalarEvolution &SE = *PSE.getSE();
5217  Type *TCType = Legal->getWidestInductionType();
5218  const SCEV *RemainingIterations = nullptr;
5219  for (auto &NextVF : ProfitableVFs) {
5220    // Skip candidate VFs without a corresponding VPlan.
5221    if (!hasPlanWithVF(NextVF.Width))
5222      continue;
5223
5224    // Skip candidate VFs with widths >= the estimated runtime VF (scalable
5225    // vectors) or the VF of the main loop (fixed vectors).
5226    if ((!NextVF.Width.isScalable() && MainLoopVF.isScalable() &&
5227         ElementCount::isKnownGE(NextVF.Width, EstimatedRuntimeVF)) ||
5228        ElementCount::isKnownGE(NextVF.Width, MainLoopVF))
5229      continue;
5230
5231    // If NextVF is greater than the number of remaining iterations, the
5232    // epilogue loop would be dead. Skip such factors.
5233    if (!MainLoopVF.isScalable() && !NextVF.Width.isScalable()) {
5234      // TODO: extend to support scalable VFs.
5235      if (!RemainingIterations) {
5236        const SCEV *TC = createTripCountSCEV(TCType, PSE, OrigLoop);
5237        RemainingIterations = SE.getURemExpr(
5238            TC, SE.getConstant(TCType, MainLoopVF.getKnownMinValue() * IC));
5239      }
5240      if (SE.isKnownPredicate(
5241              CmpInst::ICMP_UGT,
5242              SE.getConstant(TCType, NextVF.Width.getKnownMinValue()),
5243              RemainingIterations))
5244        continue;
5245    }
5246
5247    if (Result.Width.isScalar() || isMoreProfitable(NextVF, Result))
5248      Result = NextVF;
5249  }
5250
5251  if (Result != VectorizationFactor::Disabled())
5252    LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
5253                      << Result.Width << "\n");
5254  return Result;
5255}
5256
5257std::pair<unsigned, unsigned>
5258LoopVectorizationCostModel::getSmallestAndWidestTypes() {
5259  unsigned MinWidth = -1U;
5260  unsigned MaxWidth = 8;
5261  const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5262  // For in-loop reductions, no element types are added to ElementTypesInLoop
5263  // if there are no loads/stores in the loop. In this case, check through the
5264  // reduction variables to determine the maximum width.
5265  if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) {
5266    // Reset MaxWidth so that we can find the smallest type used by recurrences
5267    // in the loop.
5268    MaxWidth = -1U;
5269    for (const auto &PhiDescriptorPair : Legal->getReductionVars()) {
5270      const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second;
5271      // When finding the min width used by the recurrence we need to account
5272      // for casts on the input operands of the recurrence.
5273      MaxWidth = std::min<unsigned>(
5274          MaxWidth, std::min<unsigned>(
5275                        RdxDesc.getMinWidthCastToRecurrenceTypeInBits(),
5276                        RdxDesc.getRecurrenceType()->getScalarSizeInBits()));
5277    }
5278  } else {
5279    for (Type *T : ElementTypesInLoop) {
5280      MinWidth = std::min<unsigned>(
5281          MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
5282      MaxWidth = std::max<unsigned>(
5283          MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
5284    }
5285  }
5286  return {MinWidth, MaxWidth};
5287}
5288
5289void LoopVectorizationCostModel::collectElementTypesForWidening() {
5290  ElementTypesInLoop.clear();
5291  // For each block.
5292  for (BasicBlock *BB : TheLoop->blocks()) {
5293    // For each instruction in the loop.
5294    for (Instruction &I : BB->instructionsWithoutDebug()) {
5295      Type *T = I.getType();
5296
5297      // Skip ignored values.
5298      if (ValuesToIgnore.count(&I))
5299        continue;
5300
5301      // Only examine Loads, Stores and PHINodes.
5302      if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
5303        continue;
5304
5305      // Examine PHI nodes that are reduction variables. Update the type to
5306      // account for the recurrence type.
5307      if (auto *PN = dyn_cast<PHINode>(&I)) {
5308        if (!Legal->isReductionVariable(PN))
5309          continue;
5310        const RecurrenceDescriptor &RdxDesc =
5311            Legal->getReductionVars().find(PN)->second;
5312        if (PreferInLoopReductions || useOrderedReductions(RdxDesc) ||
5313            TTI.preferInLoopReduction(RdxDesc.getOpcode(),
5314                                      RdxDesc.getRecurrenceType(),
5315                                      TargetTransformInfo::ReductionFlags()))
5316          continue;
5317        T = RdxDesc.getRecurrenceType();
5318      }
5319
5320      // Examine the stored values.
5321      if (auto *ST = dyn_cast<StoreInst>(&I))
5322        T = ST->getValueOperand()->getType();
5323
5324      assert(T->isSized() &&
5325             "Expected the load/store/recurrence type to be sized");
5326
5327      ElementTypesInLoop.insert(T);
5328    }
5329  }
5330}
5331
5332unsigned
5333LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
5334                                                  InstructionCost LoopCost) {
5335  // -- The interleave heuristics --
5336  // We interleave the loop in order to expose ILP and reduce the loop overhead.
5337  // There are many micro-architectural considerations that we can't predict
5338  // at this level. For example, frontend pressure (on decode or fetch) due to
5339  // code size, or the number and capabilities of the execution ports.
5340  //
5341  // We use the following heuristics to select the interleave count:
5342  // 1. If the code has reductions, then we interleave to break the cross
5343  // iteration dependency.
5344  // 2. If the loop is really small, then we interleave to reduce the loop
5345  // overhead.
5346  // 3. We don't interleave if we think that we will spill registers to memory
5347  // due to the increased register pressure.
5348
5349  if (!isScalarEpilogueAllowed())
5350    return 1;
5351
5352  // The maximum safe dependence distance was already used to cap the VF.
5353  if (!Legal->isSafeForAnyVectorWidth())
5354    return 1;
5355
5356  auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
5357  const bool HasReductions = !Legal->getReductionVars().empty();
5358  // Do not interleave loops with a relatively small known or estimated trip
5359  // count. But we will interleave when InterleaveSmallLoopScalarReduction is
5360  // enabled, and the code has scalar reductions (HasReductions && VF == 1),
5361  // because with the above conditions interleaving can expose ILP and break
5362  // cross iteration dependences for reductions.
5363  if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) &&
5364      !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar()))
5365    return 1;
5366
5367  // If we did not calculate the cost for VF (because the user selected the VF)
5368  // then we calculate the cost of VF here.
5369  if (LoopCost == 0) {
5370    LoopCost = expectedCost(VF).first;
5371    assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost");
5372
5373    // Loop body is free and there is no need for interleaving.
5374    if (LoopCost == 0)
5375      return 1;
5376  }
5377
5378  RegisterUsage R = calculateRegisterUsage({VF})[0];
5379  // We divide by these constants so assume that we have at least one
5380  // instruction that uses at least one register.
5381  for (auto& pair : R.MaxLocalUsers) {
5382    pair.second = std::max(pair.second, 1U);
5383  }
5384
5385  // We calculate the interleave count using the following formula.
5386  // Subtract the number of loop invariants from the number of available
5387  // registers. These registers are used by all of the interleaved instances.
5388  // Next, divide the remaining registers by the number of registers that is
5389  // required by the loop, in order to estimate how many parallel instances
5390  // fit without causing spills. All of this is rounded down if necessary to be
5391  // a power of two. We want power of two interleave count to simplify any
5392  // addressing operations or alignment considerations.
5393  // We also want power of two interleave counts to ensure that the induction
5394  // variable of the vector loop wraps to zero, when tail is folded by masking;
5395  // this currently happens when OptForSize, where IC is set to 1 above.
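  //
  // For example (illustrative numbers, not from any particular target): with
  // 32 available registers, 2 loop-invariant values and a maximum local usage
  // of 6 registers, bit_floor((32 - 2) / 6) = bit_floor(5) = 4 interleaved
  // instances fit.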
5396  unsigned IC = UINT_MAX;
5397
5398  for (auto& pair : R.MaxLocalUsers) {
5399    unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5400    LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
5401                      << " registers of "
5402                      << TTI.getRegisterClassName(pair.first) << " register class\n");
5403    if (VF.isScalar()) {
5404      if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
5405        TargetNumRegisters = ForceTargetNumScalarRegs;
5406    } else {
5407      if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
5408        TargetNumRegisters = ForceTargetNumVectorRegs;
5409    }
5410    unsigned MaxLocalUsers = pair.second;
5411    unsigned LoopInvariantRegs = 0;
5412    if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
5413      LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
5414
5415    unsigned TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs) /
5416                                     MaxLocalUsers);
5417    // Don't count the induction variable as interleaved.
5418    if (EnableIndVarRegisterHeur) {
5419      TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs - 1) /
5420                              std::max(1U, (MaxLocalUsers - 1)));
5421    }
5422
5423    IC = std::min(IC, TmpIC);
5424  }
5425
5426  // Clamp the interleave ranges to reasonable counts.
5427  unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
5428
5429  // Check if the user has overridden the max.
5430  if (VF.isScalar()) {
5431    if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
5432      MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
5433  } else {
5434    if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
5435      MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
5436  }
5437
5438  unsigned EstimatedVF = VF.getKnownMinValue();
5439  if (VF.isScalable()) {
5440    if (std::optional<unsigned> VScale = getVScaleForTuning(TheLoop, TTI))
5441      EstimatedVF *= *VScale;
5442  }
5443  assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");
5444
5445  unsigned KnownTC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5446  if (KnownTC) {
5447    // If trip count is known we select between two prospective ICs, where
5448    // 1) the aggressive IC is capped by the trip count divided by VF
5449    // 2) the conservative IC is capped by the trip count divided by (VF * 2)
5450    // The final IC is selected in a way that the epilogue loop trip count is
5451    // minimized while maximizing the IC itself, so that we either run the
5452    // vector loop at least once if it generates a small epilogue loop, or else
5453    // we run the vector loop at least twice.
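    //
    // For example (illustrative numbers): with KnownTC = 24, EstimatedVF = 4
    // and MaxInterleaveCount = 8, the aggressive bound is
    // bit_floor(min(24 / 4, 8)) = 4 and the conservative bound is
    // bit_floor(min(24 / 8, 8)) = 2. The scalar tails are 24 % 16 = 8 and
    // 24 % 8 = 0 respectively, so the conservative bound of 2 is kept.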
5454
5455    unsigned InterleaveCountUB = bit_floor(
5456        std::max(1u, std::min(KnownTC / EstimatedVF, MaxInterleaveCount)));
5457    unsigned InterleaveCountLB = bit_floor(std::max(
5458        1u, std::min(KnownTC / (EstimatedVF * 2), MaxInterleaveCount)));
5459    MaxInterleaveCount = InterleaveCountLB;
5460
5461    if (InterleaveCountUB != InterleaveCountLB) {
5462      unsigned TailTripCountUB = (KnownTC % (EstimatedVF * InterleaveCountUB));
5463      unsigned TailTripCountLB = (KnownTC % (EstimatedVF * InterleaveCountLB));
5464      // If both produce the same scalar tail, maximize the IC to do the same
5465      // work in fewer vector loop iterations.
5466      if (TailTripCountUB == TailTripCountLB)
5467        MaxInterleaveCount = InterleaveCountUB;
5468    }
5469  } else if (BestKnownTC) {
5470    // If the trip count is an estimated compile-time constant, cap the IC at
5471    // the trip count divided by (VF * 2), so that the vector loop runs at
5472    // least twice. This makes interleaving seem profitable when an epilogue
5473    // loop is present. Since the exact trip count is not known, we choose to
5474    // be conservative in our IC estimate.
5475    MaxInterleaveCount = bit_floor(std::max(
5476        1u, std::min(*BestKnownTC / (EstimatedVF * 2), MaxInterleaveCount)));
5477  }
5478
5479  assert(MaxInterleaveCount > 0 &&
5480         "Maximum interleave count must be greater than 0");
5481
5482  // Clamp the calculated IC to be between 1 and the max interleave count
5483  // that the target and trip count allows.
5484  if (IC > MaxInterleaveCount)
5485    IC = MaxInterleaveCount;
5486  else
5487    // Make sure IC is greater than 0.
5488    IC = std::max(1u, IC);
5489
5490  assert(IC > 0 && "Interleave count must be greater than 0.");
5491
5492  // Interleave if we vectorized this loop and there is a reduction that could
5493  // benefit from interleaving.
5494  if (VF.isVector() && HasReductions) {
5495    LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5496    return IC;
5497  }
5498
5499  // For any scalar loop that either requires runtime checks or predication we
5500  // are better off leaving this to the unroller. Note that if we've already
5501  // vectorized the loop we will have done the runtime check and so interleaving
5502  // won't require further checks.
5503  bool ScalarInterleavingRequiresPredication =
5504      (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) {
5505         return Legal->blockNeedsPredication(BB);
5506       }));
5507  bool ScalarInterleavingRequiresRuntimePointerCheck =
5508      (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
5509
5510  // We want to interleave small loops in order to reduce the loop overhead and
5511  // potentially expose ILP opportunities.
5512  LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
5513                    << "LV: IC is " << IC << '\n'
5514                    << "LV: VF is " << VF << '\n');
5515  const bool AggressivelyInterleaveReductions =
5516      TTI.enableAggressiveInterleaving(HasReductions);
5517  if (!ScalarInterleavingRequiresRuntimePointerCheck &&
5518      !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) {
5519    // We assume that the cost overhead is 1 and we use the cost model
5520    // to estimate the cost of the loop and interleave until the cost of the
5521    // loop overhead is about 5% of the cost of the loop.
5522    unsigned SmallIC = std::min(IC, (unsigned)llvm::bit_floor<uint64_t>(
5523                                        SmallLoopCost / *LoopCost.getValue()));
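    // For example (illustrative costs): with SmallLoopCost = 20 and a loop
    // cost of 7, SmallIC = min(IC, bit_floor(20 / 7)) = min(IC, 2).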
5524
5525    // Interleave until store/load ports (estimated by max interleave count) are
5526    // saturated.
5527    unsigned NumStores = Legal->getNumStores();
5528    unsigned NumLoads = Legal->getNumLoads();
5529    unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5530    unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
5531
5532    // There is little point in interleaving for reductions containing selects
5533    // and compares when VF=1 since it may just create more overhead than it's
5534    // worth for loops with small trip counts. This is because we still have to
5535    // do the final reduction after the loop.
5536    bool HasSelectCmpReductions =
5537        HasReductions &&
5538        any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5539          const RecurrenceDescriptor &RdxDesc = Reduction.second;
5540          return RecurrenceDescriptor::isAnyOfRecurrenceKind(
5541              RdxDesc.getRecurrenceKind());
5542        });
5543    if (HasSelectCmpReductions) {
5544      LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
5545      return 1;
5546    }
5547
5548    // If we have a scalar reduction (vector reductions are already dealt with
5549    // by this point), we can increase the critical path length if the loop
5550    // we're interleaving is inside another loop. For tree-wise reductions
5551    // set the limit to 2, and for ordered reductions it's best to disable
5552    // interleaving entirely.
5553    if (HasReductions && TheLoop->getLoopDepth() > 1) {
5554      bool HasOrderedReductions =
5555          any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5556            const RecurrenceDescriptor &RdxDesc = Reduction.second;
5557            return RdxDesc.isOrdered();
5558          });
5559      if (HasOrderedReductions) {
5560        LLVM_DEBUG(
5561            dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
5562        return 1;
5563      }
5564
5565      unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5566      SmallIC = std::min(SmallIC, F);
5567      StoresIC = std::min(StoresIC, F);
5568      LoadsIC = std::min(LoadsIC, F);
5569    }
5570
5571    if (EnableLoadStoreRuntimeInterleave &&
5572        std::max(StoresIC, LoadsIC) > SmallIC) {
5573      LLVM_DEBUG(
5574          dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5575      return std::max(StoresIC, LoadsIC);
5576    }
5577
5578    // If there are scalar reductions and TTI has enabled aggressive
5579    // interleaving for reductions, we will interleave to expose ILP.
5580    if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
5581        AggressivelyInterleaveReductions) {
5582      LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5583      // Interleave no less than SmallIC but not as aggressive as the normal IC
5584      // to satisfy the rare situation when resources are too limited.
5585      return std::max(IC / 2, SmallIC);
5586    }
5587
5588    LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5589    return SmallIC;
5590  }
5591
5592  // Interleave if this is a large loop (small loops are already dealt with by
5593  // this point) that could benefit from interleaving.
5594  if (AggressivelyInterleaveReductions) {
5595    LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5596    return IC;
5597  }
5598
5599  LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5600  return 1;
5601}
5602
5603SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5604LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
5605  // This function calculates the register usage by measuring the highest
5606  // number of values that are alive at a single location. Obviously, this is
5607  // a very rough estimation. We scan the loop in topological order and assign
5608  // a number to each instruction. We use RPO to ensure that defs are
5609  // met before their users. We assume that each instruction that has in-loop
5610  // users starts an interval. We record every time that an in-loop value is
5611  // used, so we have a list of the first and last occurrences of each
5612  // instruction. Next, we transpose this data structure into a multi map that
5613  // holds the list of intervals that *end* at a specific location. This multi
5614  // map allows us to perform a linear search. We scan the instructions linearly
5615  // and record each time that a new interval starts, by placing it in a set.
5616  // If we find this value in the multi-map then we remove it from the set.
5617  // The max register usage is the maximum size of the set.
5618  // We also search for instructions that are defined outside the loop, but are
5619  // used inside the loop. We need this number separately from the max-interval
5620  // usage number because when we unroll, loop-invariant values do not take
5621  // more registers.
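  //
  // For example, in a body where %a is defined first and then used by both %b
  // and %c, and %b is used only by %c, the intervals of %a and %b both remain
  // open until %c is reached, so two values (and thus two registers of the
  // corresponding classes) are live at that point.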
5622  LoopBlocksDFS DFS(TheLoop);
5623  DFS.perform(LI);
5624
5625  RegisterUsage RU;
5626
5627  // Each 'key' in the map opens a new interval. The values
5628  // of the map are the index of the 'last seen' usage of the
5629  // instruction that is the key.
5630  using IntervalMap = DenseMap<Instruction *, unsigned>;
5631
5632  // Maps instruction to its index.
5633  SmallVector<Instruction *, 64> IdxToInstr;
5634  // Marks the end of each interval.
5635  IntervalMap EndPoint;
5636  // Saves the set of instructions that are used in the loop.
5637  SmallPtrSet<Instruction *, 8> Ends;
5638  // Saves the list of values that are used in the loop but are defined outside
5639  // the loop (not including non-instruction values such as arguments and
5640  // constants).
5641  SmallSetVector<Instruction *, 8> LoopInvariants;
5642
5643  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5644    for (Instruction &I : BB->instructionsWithoutDebug()) {
5645      IdxToInstr.push_back(&I);
5646
5647      // Save the end location of each USE.
5648      for (Value *U : I.operands()) {
5649        auto *Instr = dyn_cast<Instruction>(U);
5650
5651        // Ignore non-instruction values such as arguments, constants, etc.
5652        // FIXME: Might need some motivation why these values are ignored. If
5653        // for example an argument is used inside the loop it will increase the
5654        // register pressure (so shouldn't we add it to LoopInvariants?).
5655        if (!Instr)
5656          continue;
5657
5658        // If this instruction is outside the loop then record it and continue.
5659        if (!TheLoop->contains(Instr)) {
5660          LoopInvariants.insert(Instr);
5661          continue;
5662        }
5663
5664        // Overwrite previous end points.
5665        EndPoint[Instr] = IdxToInstr.size();
5666        Ends.insert(Instr);
5667      }
5668    }
5669  }
5670
5671  // Saves the list of intervals that end with the index in 'key'.
5672  using InstrList = SmallVector<Instruction *, 2>;
5673  DenseMap<unsigned, InstrList> TransposeEnds;
5674
5675  // Transpose the EndPoints to a list of values that end at each index.
5676  for (auto &Interval : EndPoint)
5677    TransposeEnds[Interval.second].push_back(Interval.first);
5678
5679  SmallPtrSet<Instruction *, 8> OpenIntervals;
5680  SmallVector<RegisterUsage, 8> RUs(VFs.size());
5681  SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
5682
5683  LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5684
5685  const auto &TTICapture = TTI;
5686  auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
5687    if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty))
5688      return 0;
5689    return TTICapture.getRegUsageForType(VectorType::get(Ty, VF));
5690  };
5691
5692  for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
5693    Instruction *I = IdxToInstr[i];
5694
5695    // Remove all of the instructions that end at this location.
5696    InstrList &List = TransposeEnds[i];
5697    for (Instruction *ToRemove : List)
5698      OpenIntervals.erase(ToRemove);
5699
5700    // Ignore instructions that are never used within the loop.
5701    if (!Ends.count(I))
5702      continue;
5703
5704    // Skip ignored values.
5705    if (ValuesToIgnore.count(I))
5706      continue;
5707
5708    collectInLoopReductions();
5709
5710    // For each VF find the maximum usage of registers.
5711    for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
5712      // Count the number of registers used, per register class, given all open
5713      // intervals.
5714      // Note that elements in this SmallMapVector will be default constructed
5715      // as 0. So we can use "RegUsage[ClassID] += n" in the code below even if
5716      // there is no previous entry for ClassID.
5717      SmallMapVector<unsigned, unsigned, 4> RegUsage;
5718
5719      if (VFs[j].isScalar()) {
5720        for (auto *Inst : OpenIntervals) {
5721          unsigned ClassID =
5722              TTI.getRegisterClassForType(false, Inst->getType());
5723          // FIXME: The target might use more than one register for the type
5724          // even in the scalar case.
5725          RegUsage[ClassID] += 1;
5726        }
5727      } else {
5728        collectUniformsAndScalars(VFs[j]);
5729        for (auto *Inst : OpenIntervals) {
5730          // Skip ignored values for VF > 1.
5731          if (VecValuesToIgnore.count(Inst))
5732            continue;
5733          if (isScalarAfterVectorization(Inst, VFs[j])) {
5734            unsigned ClassID =
5735                TTI.getRegisterClassForType(false, Inst->getType());
5736            // FIXME: The target might use more than one register for the type
5737            // even in the scalar case.
5738            RegUsage[ClassID] += 1;
5739          } else {
5740            unsigned ClassID =
5741                TTI.getRegisterClassForType(true, Inst->getType());
5742            RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
5743          }
5744        }
5745      }
5746
5747      for (auto& pair : RegUsage) {
5748        auto &Entry = MaxUsages[j][pair.first];
5749        Entry = std::max(Entry, pair.second);
5750      }
5751    }
5752
5753    LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
5754                      << OpenIntervals.size() << '\n');
5755
5756    // Add the current instruction to the list of open intervals.
5757    OpenIntervals.insert(I);
5758  }
5759
5760  for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
5761    // Note that elements in this SmallMapVector will be default constructed
5762    // as 0. So we can use "Invariant[ClassID] += n" in the code below even if
5763    // there is no previous entry for ClassID.
5764    SmallMapVector<unsigned, unsigned, 4> Invariant;
5765
5766    for (auto *Inst : LoopInvariants) {
5767      // FIXME: The target might use more than one register for the type
5768      // even in the scalar case.
5769      bool IsScalar = all_of(Inst->users(), [&](User *U) {
5770        auto *I = cast<Instruction>(U);
5771        return TheLoop != LI->getLoopFor(I->getParent()) ||
5772               isScalarAfterVectorization(I, VFs[i]);
5773      });
5774
5775      ElementCount VF = IsScalar ? ElementCount::getFixed(1) : VFs[i];
5776      unsigned ClassID =
5777          TTI.getRegisterClassForType(VF.isVector(), Inst->getType());
5778      Invariant[ClassID] += GetRegUsage(Inst->getType(), VF);
5779    }
5780
5781    LLVM_DEBUG({
5782      dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
5783      dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
5784             << " item\n";
5785      for (const auto &pair : MaxUsages[i]) {
5786        dbgs() << "LV(REG): RegisterClass: "
5787               << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5788               << " registers\n";
5789      }
5790      dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
5791             << " item\n";
5792      for (const auto &pair : Invariant) {
5793        dbgs() << "LV(REG): RegisterClass: "
5794               << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5795               << " registers\n";
5796      }
5797    });
5798
5799    RU.LoopInvariantRegs = Invariant;
5800    RU.MaxLocalUsers = MaxUsages[i];
5801    RUs[i] = RU;
5802  }
5803
5804  return RUs;
5805}
5806
5807bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
5808                                                           ElementCount VF) {
5809  // TODO: Cost model for emulated masked load/store is completely
5810  // broken. This hack guides the cost model to use an artificially
5811  // high enough value to practically disable vectorization with such
5812  // operations, except where previously deployed legality hack allowed
5813  // using very low cost values. This is to avoid regressions coming simply
5814  // from moving "masked load/store" check from legality to cost model.
5815  // Masked Load/Gather emulation was previously never allowed.
5816  // Limited number of Masked Store/Scatter emulation was allowed.
5817  assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
5819  return isa<LoadInst>(I) ||
5820         (isa<StoreInst>(I) &&
5821          NumPredStores > NumberOfStoresToPredicate);
5822}
5823
5824void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
5825  // If we aren't vectorizing the loop, or if we've already collected the
5826  // instructions to scalarize, there's nothing to do. Collection may already
5827  // have occurred if we have a user-selected VF and are now computing the
5828  // expected cost for interleaving.
5829  if (VF.isScalar() || VF.isZero() || InstsToScalarize.contains(VF))
5830    return;
5831
5832  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
5833  // not profitable to scalarize any instructions, the presence of VF in the
5834  // map will indicate that we've analyzed it already.
5835  ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5836
5837  PredicatedBBsAfterVectorization[VF].clear();
5838
5839  // Find all the instructions that are scalar with predication in the loop and
5840  // determine if it would be better to not if-convert the blocks they are in.
5841  // If so, we also record the instructions to scalarize.
5842  for (BasicBlock *BB : TheLoop->blocks()) {
5843    if (!blockNeedsPredicationForAnyReason(BB))
5844      continue;
5845    for (Instruction &I : *BB)
5846      if (isScalarWithPredication(&I, VF)) {
5847        ScalarCostsTy ScalarCosts;
5848        // Do not apply discount if scalable, because that would lead to
5849        // invalid scalarization costs.
5850        // Do not apply discount logic if hacked cost is needed
5851        // for emulated masked memrefs.
5852        if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I, VF) &&
5853            computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
5854          ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
5855        // Remember that BB will remain after vectorization.
5856        PredicatedBBsAfterVectorization[VF].insert(BB);
5857      }
5858  }
5859}
5860
5861InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
5862    Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
5863  assert(!isUniformAfterVectorization(PredInst, VF) &&
5864         "Instruction marked uniform-after-vectorization will be predicated");
5865
5866  // Initialize the discount to zero, meaning that the scalar version and the
5867  // vector version cost the same.
5868  InstructionCost Discount = 0;
5869
5870  // Holds instructions to analyze. The instructions we visit are mapped in
5871  // ScalarCosts. Those instructions are the ones that would be scalarized if
5872  // we find that the scalar version costs less.
5873  SmallVector<Instruction *, 8> Worklist;
5874
5875  // Returns true if the given instruction can be scalarized.
5876  auto canBeScalarized = [&](Instruction *I) -> bool {
5877    // We only attempt to scalarize instructions forming a single-use chain
5878    // from the original predicated block that would otherwise be vectorized.
5879    // Although not strictly necessary, we give up on instructions we know will
5880    // already be scalar to avoid traversing chains that are unlikely to be
5881    // beneficial.
5882    if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
5883        isScalarAfterVectorization(I, VF))
5884      return false;
5885
5886    // If the instruction is scalar with predication, it will be analyzed
5887    // separately. We ignore it within the context of PredInst.
5888    if (isScalarWithPredication(I, VF))
5889      return false;
5890
5891    // If any of the instruction's operands are uniform after vectorization,
5892    // the instruction cannot be scalarized. This prevents, for example, a
5893    // masked load from being scalarized.
5894    //
5895    // We assume we will only emit a value for lane zero of an instruction
5896    // marked uniform after vectorization, rather than VF identical values.
5897    // Thus, if we scalarize an instruction that uses a uniform, we would
5898    // create uses of values corresponding to the lanes we aren't emitting code
5899    // for. This behavior can be changed by allowing getScalarValue to clone
5900    // the lane zero values for uniforms rather than asserting.
5901    for (Use &U : I->operands())
5902      if (auto *J = dyn_cast<Instruction>(U.get()))
5903        if (isUniformAfterVectorization(J, VF))
5904          return false;
5905
5906    // Otherwise, we can scalarize the instruction.
5907    return true;
5908  };
5909
5910  // Compute the expected cost discount from scalarizing the entire expression
5911  // feeding the predicated instruction. We currently only consider expressions
5912  // that are single-use instruction chains.
5913  Worklist.push_back(PredInst);
5914  while (!Worklist.empty()) {
5915    Instruction *I = Worklist.pop_back_val();
5916
5917    // If we've already analyzed the instruction, there's nothing to do.
5918    if (ScalarCosts.contains(I))
5919      continue;
5920
5921    // Compute the cost of the vector instruction. Note that this cost already
5922    // includes the scalarization overhead of the predicated instruction.
5923    InstructionCost VectorCost = getInstructionCost(I, VF).first;
5924
5925    // Compute the cost of the scalarized instruction. This cost is the cost of
5926    // the instruction as if it wasn't if-converted and instead remained in the
5927    // predicated block. We will scale this cost by block probability after
5928    // computing the scalarization overhead.
5929    InstructionCost ScalarCost =
5930        VF.getFixedValue() *
5931        getInstructionCost(I, ElementCount::getFixed(1)).first;
5932
5933    // Compute the scalarization overhead of needed insertelement instructions
5934    // and phi nodes.
5935    TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
5936    if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
5937      ScalarCost += TTI.getScalarizationOverhead(
5938          cast<VectorType>(ToVectorTy(I->getType(), VF)),
5939          APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ true,
5940          /*Extract*/ false, CostKind);
5941      ScalarCost +=
5942          VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind);
5943    }
5944
5945    // Compute the scalarization overhead of needed extractelement
5946    // instructions. For each of the instruction's operands, if the operand can
5947    // be scalarized, add it to the worklist; otherwise, account for the
5948    // overhead.
5949    for (Use &U : I->operands())
5950      if (auto *J = dyn_cast<Instruction>(U.get())) {
5951        assert(VectorType::isValidElementType(J->getType()) &&
5952               "Instruction has non-scalar type");
5953        if (canBeScalarized(J))
5954          Worklist.push_back(J);
5955        else if (needsExtract(J, VF)) {
5956          ScalarCost += TTI.getScalarizationOverhead(
5957              cast<VectorType>(ToVectorTy(J->getType(), VF)),
5958              APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ false,
5959              /*Extract*/ true, CostKind);
5960        }
5961      }
5962
5963    // Scale the total scalar cost by block probability.
5964    ScalarCost /= getReciprocalPredBlockProb();
5965
5966    // Compute the discount. A non-negative discount means the vector version
5967    // of the instruction costs more, and scalarizing would be beneficial.
5968    Discount += VectorCost - ScalarCost;
5969    ScalarCosts[I] = ScalarCost;
5970  }
5971
5972  return Discount;
5973}
5974
5975LoopVectorizationCostModel::VectorizationCostTy
5976LoopVectorizationCostModel::expectedCost(
5977    ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) {
5978  VectorizationCostTy Cost;
5979
5980  // For each block.
5981  for (BasicBlock *BB : TheLoop->blocks()) {
5982    VectorizationCostTy BlockCost;
5983
5984    // For each instruction in the old loop.
5985    for (Instruction &I : BB->instructionsWithoutDebug()) {
5986      // Skip ignored values.
5987      if (ValuesToIgnore.count(&I) ||
5988          (VF.isVector() && VecValuesToIgnore.count(&I)))
5989        continue;
5990
5991      VectorizationCostTy C = getInstructionCost(&I, VF);
5992
5993      // Check if we should override the cost.
5994      if (C.first.isValid() &&
5995          ForceTargetInstructionCost.getNumOccurrences() > 0)
5996        C.first = InstructionCost(ForceTargetInstructionCost);
5997
5998      // Keep a list of instructions with invalid costs.
5999      if (Invalid && !C.first.isValid())
6000        Invalid->emplace_back(&I, VF);
6001
6002      BlockCost.first += C.first;
6003      BlockCost.second |= C.second;
6004      LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
6005                        << " for VF " << VF << " For instruction: " << I
6006                        << '\n');
6007    }
6008
6009    // If we are vectorizing a predicated block, it will have been
6010    // if-converted. This means that the block's instructions (aside from
6011    // stores and instructions that may divide by zero) will now be
6012    // unconditionally executed. For the scalar case, we may not always execute
6013    // the predicated block, if it is an if-else block. Thus, scale the block's
6014    // cost by the probability of executing it. blockNeedsPredication from
6015    // Legal is used so as to not include all blocks in tail folded loops.
6016    if (VF.isScalar() && Legal->blockNeedsPredication(BB))
6017      BlockCost.first /= getReciprocalPredBlockProb();
6018
6019    Cost.first += BlockCost.first;
6020    Cost.second |= BlockCost.second;
6021  }
6022
6023  return Cost;
6024}
6025
6026/// Gets Address Access SCEV after verifying that the access pattern
6027/// is loop invariant except for the induction variable dependence.
6028///
6029/// This SCEV can be sent to the Target in order to estimate the address
6030/// calculation cost.
6031static const SCEV *getAddressAccessSCEV(Value *Ptr,
6032                                        LoopVectorizationLegality *Legal,
6033                                        PredicatedScalarEvolution &PSE,
6034                                        const Loop *TheLoop) {
6037  auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
6038  if (!Gep)
6039    return nullptr;
6040
6041  // We are looking for a gep with all loop invariant indices except for one
6042  // which should be an induction variable.
6043  auto SE = PSE.getSE();
6044  unsigned NumOperands = Gep->getNumOperands();
6045  for (unsigned i = 1; i < NumOperands; ++i) {
6046    Value *Opd = Gep->getOperand(i);
6047    if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
6048        !Legal->isInductionVariable(Opd))
6049      return nullptr;
6050  }
6051
6052  // Now we know we have a GEP ptr, %inv, %ind, %inv. Return the Ptr SCEV.
6053  return PSE.getSCEV(Ptr);
6054}
6055
6056InstructionCost
6057LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
6058                                                        ElementCount VF) {
6059  assert(VF.isVector() &&
6060         "Scalarization cost of instruction implies vectorization.");
6061  if (VF.isScalable())
6062    return InstructionCost::getInvalid();
6063
6064  Type *ValTy = getLoadStoreType(I);
6065  auto SE = PSE.getSE();
6066
6067  unsigned AS = getLoadStoreAddressSpace(I);
6068  Value *Ptr = getLoadStorePointerOperand(I);
6069  Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
6070  // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost`
6071  //       that it is being called from this specific place.
6072
6073  // Figure out whether the access is strided and get the stride value
6074  // if it's known at compile time.
6075  const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
6076
6077  // Get the cost of the scalar memory instruction and address computation.
6078  InstructionCost Cost =
6079      VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
6080
6081  // Don't pass *I here, since it is scalar but will actually be part of a
6082  // vectorized loop where the user of it is a vectorized instruction.
6083  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6084  const Align Alignment = getLoadStoreAlignment(I);
6085  Cost += VF.getKnownMinValue() * TTI.getMemoryOpCost(I->getOpcode(),
6086                                                      ValTy->getScalarType(),
6087                                                      Alignment, AS, CostKind);
6088
6089  // Get the overhead of the extractelement and insertelement instructions
6090  // we might create due to scalarization.
6091  Cost += getScalarizationOverhead(I, VF, CostKind);
6092
6093  // If we have a predicated load/store, it will need extra i1 extracts and
6094  // conditional branches, but may not be executed for each vector lane. Scale
6095  // the cost by the probability of executing the predicated block.
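  // For example, assuming a reciprocal block probability of 2 (i.e. the
  // predicated block is estimated to execute half the time), the scalarized
  // cost computed above is halved before the extract/branch overhead below is
  // added.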
6096  if (isPredicatedInst(I)) {
6097    Cost /= getReciprocalPredBlockProb();
6098
6099    // Add the cost of an i1 extract and a branch
6100    auto *Vec_i1Ty =
6101        VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF);
6102    Cost += TTI.getScalarizationOverhead(
6103        Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()),
6104        /*Insert=*/false, /*Extract=*/true, CostKind);
6105    Cost += TTI.getCFInstrCost(Instruction::Br, CostKind);
6106
6107    if (useEmulatedMaskMemRefHack(I, VF))
6108      // Artificially setting to a high enough value to practically disable
6109      // vectorization with such operations.
6110      Cost = 3000000;
6111  }
6112
6113  return Cost;
6114}
6115
6116InstructionCost
6117LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
6118                                                    ElementCount VF) {
6119  Type *ValTy = getLoadStoreType(I);
6120  auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6121  Value *Ptr = getLoadStorePointerOperand(I);
6122  unsigned AS = getLoadStoreAddressSpace(I);
6123  int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr);
6124  enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6125
6126  assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6127         "Stride should be 1 or -1 for consecutive memory access");
6128  const Align Alignment = getLoadStoreAlignment(I);
6129  InstructionCost Cost = 0;
6130  if (Legal->isMaskRequired(I)) {
6131    Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6132                                      CostKind);
6133  } else {
6134    TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
6135    Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6136                                CostKind, OpInfo, I);
6137  }
6138
6139  bool Reverse = ConsecutiveStride < 0;
6140  if (Reverse)
6141    Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy,
6142                               std::nullopt, CostKind, 0);
6143  return Cost;
6144}
6145
6146InstructionCost
6147LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
6148                                                ElementCount VF) {
6149  assert(Legal->isUniformMemOp(*I, VF));
6150
6151  Type *ValTy = getLoadStoreType(I);
6152  auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6153  const Align Alignment = getLoadStoreAlignment(I);
6154  unsigned AS = getLoadStoreAddressSpace(I);
6155  enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6156  if (isa<LoadInst>(I)) {
6157    return TTI.getAddressComputationCost(ValTy) +
6158           TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
6159                               CostKind) +
6160           TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
6161  }
6162  StoreInst *SI = cast<StoreInst>(I);
6163
6164  bool isLoopInvariantStoreValue = Legal->isInvariant(SI->getValueOperand());
6165  return TTI.getAddressComputationCost(ValTy) +
6166         TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
6167                             CostKind) +
6168         (isLoopInvariantStoreValue
6169              ? 0
6170              : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
6171                                       CostKind, VF.getKnownMinValue() - 1));
6172}
6173
6174InstructionCost
6175LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
6176                                                 ElementCount VF) {
6177  Type *ValTy = getLoadStoreType(I);
6178  auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6179  const Align Alignment = getLoadStoreAlignment(I);
6180  const Value *Ptr = getLoadStorePointerOperand(I);
6181
6182  return TTI.getAddressComputationCost(VectorTy) +
6183         TTI.getGatherScatterOpCost(
6184             I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
6185             TargetTransformInfo::TCK_RecipThroughput, I);
6186}
6187
6188InstructionCost
6189LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
6190                                                   ElementCount VF) {
6191  Type *ValTy = getLoadStoreType(I);
6192  auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6193  unsigned AS = getLoadStoreAddressSpace(I);
6194  enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6195
6196  auto Group = getInterleavedAccessGroup(I);
  assert(Group && "Failed to get an interleaved access group.");
6198
6199  unsigned InterleaveFactor = Group->getFactor();
6200  auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
6201
6202  // Holds the indices of existing members in the interleaved group.
6203  SmallVector<unsigned, 4> Indices;
6204  for (unsigned IF = 0; IF < InterleaveFactor; IF++)
6205    if (Group->getMember(IF))
6206      Indices.push_back(IF);
6207
6208  // Calculate the cost of the whole interleaved group.
6209  bool UseMaskForGaps =
6210      (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
6211      (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
6212  InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
6213      I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
6214      AS, CostKind, Legal->isMaskRequired(I), UseMaskForGaps);
6215
6216  if (Group->isReverse()) {
6217    // TODO: Add support for reversed masked interleaved access.
6218    assert(!Legal->isMaskRequired(I) &&
6219           "Reverse masked interleaved access not supported.");
6220    Cost += Group->getNumMembers() *
6221            TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy,
6222                               std::nullopt, CostKind, 0);
6223  }
6224  return Cost;
6225}
6226
6227std::optional<InstructionCost>
6228LoopVectorizationCostModel::getReductionPatternCost(
6229    Instruction *I, ElementCount VF, Type *Ty,
6230    TTI::TargetCostKind CostKind) const {
6231  using namespace llvm::PatternMatch;
  // Early exit if there are no in-loop reductions.
6233  if (InLoopReductions.empty() || VF.isScalar() || !isa<VectorType>(Ty))
6234    return std::nullopt;
6235  auto *VectorTy = cast<VectorType>(Ty);
6236
  // We are looking for one of the following patterns, returning the minimal
  // acceptable cost:
  //  reduce(mul(ext(A), ext(B))) or
  //  reduce(mul(A, B)) or
  //  reduce(ext(A)) or
  //  reduce(A).
  // The basic idea is to walk down the use tree, finding the root reduction
  // instruction in InLoopReductionImmediateChains. From there we identify the
  // mul/ext pattern and compare the cost of the entire pattern against the
  // cost of its components. If the reduction cost is lower, we return it for
  // the reduction instruction and 0 for the other instructions in the pattern.
  // If it is not, we return an invalid cost, indicating that the original cost
  // model should be used instead.
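  //
  // As an illustrative example (assumed source, not from a particular test),
  // a loop such as
  //   int s = 0;
  //   for (int i = 0; i < n; ++i)
  //     s += (int)a[i] * (int)b[i];   // a, b of type int8_t
  // maps to reduce.add(mul(sext(A), sext(B))), which is costed as a single
  // multiply-accumulate reduction via getMulAccReductionCost below when that
  // is cheaper than the sum of the individual ext/mul/reduce costs.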
6249  Instruction *RetI = I;
6250  if (match(RetI, m_ZExtOrSExt(m_Value()))) {
6251    if (!RetI->hasOneUser())
6252      return std::nullopt;
6253    RetI = RetI->user_back();
6254  }
6255
6256  if (match(RetI, m_OneUse(m_Mul(m_Value(), m_Value()))) &&
6257      RetI->user_back()->getOpcode() == Instruction::Add) {
6258    RetI = RetI->user_back();
6259  }
6260
  // Test if the found instruction is a reduction; if not, return an invalid
  // cost so that the caller falls back to the original cost modelling.
6263  if (!InLoopReductionImmediateChains.count(RetI))
6264    return std::nullopt;
6265
6266  // Find the reduction this chain is a part of and calculate the basic cost of
6267  // the reduction on its own.
6268  Instruction *LastChain = InLoopReductionImmediateChains.at(RetI);
6269  Instruction *ReductionPhi = LastChain;
6270  while (!isa<PHINode>(ReductionPhi))
6271    ReductionPhi = InLoopReductionImmediateChains.at(ReductionPhi);
6272
6273  const RecurrenceDescriptor &RdxDesc =
6274      Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second;
6275
6276  InstructionCost BaseCost = TTI.getArithmeticReductionCost(
6277      RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);
6278
6279  // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
6280  // normal fmul instruction to the cost of the fadd reduction.
6281  if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd)
6282    BaseCost +=
6283        TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind);
6284
6285  // If we're using ordered reductions then we can just return the base cost
6286  // here, since getArithmeticReductionCost calculates the full ordered
6287  // reduction cost when FP reassociation is not allowed.
6288  if (useOrderedReductions(RdxDesc))
6289    return BaseCost;
6290
6291  // Get the operand that was not the reduction chain and match it to one of the
6292  // patterns, returning the better cost if it is found.
6293  Instruction *RedOp = RetI->getOperand(1) == LastChain
6294                           ? dyn_cast<Instruction>(RetI->getOperand(0))
6295                           : dyn_cast<Instruction>(RetI->getOperand(1));
6296
6297  VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);
6298
6299  Instruction *Op0, *Op1;
6300  if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
6301      match(RedOp,
6302            m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) &&
6303      match(Op0, m_ZExtOrSExt(m_Value())) &&
6304      Op0->getOpcode() == Op1->getOpcode() &&
6305      Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
6306      !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) &&
6307      (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
6308
    // Matched reduce.add(ext(mul(ext(A), ext(B))))
    // Note that the extend opcodes all need to match, or, if A==B, they will
    // have been converted to zext(mul(sext(A), sext(A))) as it is known to be
    // positive, which is equally fine.
6313    bool IsUnsigned = isa<ZExtInst>(Op0);
6314    auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
6315    auto *MulType = VectorType::get(Op0->getType(), VectorTy);
6316
6317    InstructionCost ExtCost =
6318        TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
6319                             TTI::CastContextHint::None, CostKind, Op0);
6320    InstructionCost MulCost =
6321        TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
6322    InstructionCost Ext2Cost =
6323        TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType,
6324                             TTI::CastContextHint::None, CostKind, RedOp);
6325
6326    InstructionCost RedCost = TTI.getMulAccReductionCost(
6327        IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
6328
6329    if (RedCost.isValid() &&
6330        RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
6331      return I == RetI ? RedCost : 0;
6332  } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) &&
6333             !TheLoop->isLoopInvariant(RedOp)) {
6334    // Matched reduce(ext(A))
6335    bool IsUnsigned = isa<ZExtInst>(RedOp);
6336    auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy);
6337    InstructionCost RedCost = TTI.getExtendedReductionCost(
6338        RdxDesc.getOpcode(), IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
6339        RdxDesc.getFastMathFlags(), CostKind);
6340
6341    InstructionCost ExtCost =
6342        TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType,
6343                             TTI::CastContextHint::None, CostKind, RedOp);
6344    if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
6345      return I == RetI ? RedCost : 0;
6346  } else if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
6347             match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) {
6348    if (match(Op0, m_ZExtOrSExt(m_Value())) &&
6349        Op0->getOpcode() == Op1->getOpcode() &&
6350        !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) {
6351      bool IsUnsigned = isa<ZExtInst>(Op0);
6352      Type *Op0Ty = Op0->getOperand(0)->getType();
6353      Type *Op1Ty = Op1->getOperand(0)->getType();
6354      Type *LargestOpTy =
6355          Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty
6356                                                                    : Op0Ty;
6357      auto *ExtType = VectorType::get(LargestOpTy, VectorTy);
6358
      // Matched reduce.add(mul(ext(A), ext(B))), where the two exts may have
      // different source sizes. We take the largest type as the extension for
      // the reduction and add the remaining cost, e.g. as for
      // reduce(mul(ext(ext(A)), ext(B))).
6362      InstructionCost ExtCost0 = TTI.getCastInstrCost(
6363          Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy),
6364          TTI::CastContextHint::None, CostKind, Op0);
6365      InstructionCost ExtCost1 = TTI.getCastInstrCost(
6366          Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy),
6367          TTI::CastContextHint::None, CostKind, Op1);
6368      InstructionCost MulCost =
6369          TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6370
6371      InstructionCost RedCost = TTI.getMulAccReductionCost(
6372          IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
6373      InstructionCost ExtraExtCost = 0;
6374      if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) {
6375        Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1;
6376        ExtraExtCost = TTI.getCastInstrCost(
6377            ExtraExtOp->getOpcode(), ExtType,
6378            VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy),
6379            TTI::CastContextHint::None, CostKind, ExtraExtOp);
6380      }
6381
6382      if (RedCost.isValid() &&
6383          (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost))
6384        return I == RetI ? RedCost : 0;
6385    } else if (!match(I, m_ZExtOrSExt(m_Value()))) {
6386      // Matched reduce.add(mul())
6387      InstructionCost MulCost =
6388          TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6389
6390      InstructionCost RedCost = TTI.getMulAccReductionCost(
6391          true, RdxDesc.getRecurrenceType(), VectorTy, CostKind);
6392
6393      if (RedCost.isValid() && RedCost < MulCost + BaseCost)
6394        return I == RetI ? RedCost : 0;
6395    }
6396  }
6397
6398  return I == RetI ? std::optional<InstructionCost>(BaseCost) : std::nullopt;
6399}
6400
6401InstructionCost
6402LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
6403                                                     ElementCount VF) {
6404  // Calculate scalar cost only. Vectorization cost should be ready at this
6405  // moment.
6406  if (VF.isScalar()) {
6407    Type *ValTy = getLoadStoreType(I);
6408    const Align Alignment = getLoadStoreAlignment(I);
6409    unsigned AS = getLoadStoreAddressSpace(I);
6410
6411    TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
6412    return TTI.getAddressComputationCost(ValTy) +
6413           TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
6414                               TTI::TCK_RecipThroughput, OpInfo, I);
6415  }
6416  return getWideningCost(I, VF);
6417}
6418
6419LoopVectorizationCostModel::VectorizationCostTy
6420LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6421                                               ElementCount VF) {
6422  // If we know that this instruction will remain uniform, check the cost of
6423  // the scalar version.
6424  if (isUniformAfterVectorization(I, VF))
6425    VF = ElementCount::getFixed(1);
6426
6427  if (VF.isVector() && isProfitableToScalarize(I, VF))
6428    return VectorizationCostTy(InstsToScalarize[VF][I], false);
6429
6430  // Forced scalars do not have any scalarization overhead.
6431  auto ForcedScalar = ForcedScalars.find(VF);
6432  if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
6433    auto InstSet = ForcedScalar->second;
6434    if (InstSet.count(I))
6435      return VectorizationCostTy(
6436          (getInstructionCost(I, ElementCount::getFixed(1)).first *
6437           VF.getKnownMinValue()),
6438          false);
6439  }
6440
6441  Type *VectorTy;
6442  InstructionCost C = getInstructionCost(I, VF, VectorTy);
6443
6444  bool TypeNotScalarized = false;
6445  if (VF.isVector() && VectorTy->isVectorTy()) {
6446    if (unsigned NumParts = TTI.getNumberOfParts(VectorTy)) {
6447      if (VF.isScalable())
6448        // <vscale x 1 x iN> is assumed to be profitable over iN because
6449        // scalable registers are a distinct register class from scalar ones.
6450        // If we ever find a target which wants to lower scalable vectors
6451        // back to scalars, we'll need to update this code to explicitly
6452        // ask TTI about the register class uses for each part.
6453        TypeNotScalarized = NumParts <= VF.getKnownMinValue();
6454      else
6455        TypeNotScalarized = NumParts < VF.getKnownMinValue();
6456    } else
6457      C = InstructionCost::getInvalid();
6458  }
6459  return VectorizationCostTy(C, TypeNotScalarized);
6460}
6461
6462InstructionCost LoopVectorizationCostModel::getScalarizationOverhead(
6463    Instruction *I, ElementCount VF, TTI::TargetCostKind CostKind) const {
6464
6465  // There is no mechanism yet to create a scalable scalarization loop,
6466  // so this is currently Invalid.
6467  if (VF.isScalable())
6468    return InstructionCost::getInvalid();
6469
6470  if (VF.isScalar())
6471    return 0;
6472
6473  InstructionCost Cost = 0;
6474  Type *RetTy = ToVectorTy(I->getType(), VF);
6475  if (!RetTy->isVoidTy() &&
6476      (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
6477    Cost += TTI.getScalarizationOverhead(
6478        cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()),
6479        /*Insert*/ true,
6480        /*Extract*/ false, CostKind);
6481
6482  // Some targets keep addresses scalar.
6483  if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
6484    return Cost;
6485
6486  // Some targets support efficient element stores.
6487  if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
6488    return Cost;
6489
6490  // Collect operands to consider.
6491  CallInst *CI = dyn_cast<CallInst>(I);
6492  Instruction::op_range Ops = CI ? CI->args() : I->operands();
6493
6494  // Skip operands that do not require extraction/scalarization and do not incur
6495  // any overhead.
6496  SmallVector<Type *> Tys;
6497  for (auto *V : filterExtractingOperands(Ops, VF))
6498    Tys.push_back(MaybeVectorizeType(V->getType(), VF));
6499  return Cost + TTI.getOperandsScalarizationOverhead(
6500                    filterExtractingOperands(Ops, VF), Tys, CostKind);
6501}
6502
6503void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
6504  if (VF.isScalar())
6505    return;
6506  NumPredStores = 0;
6507  for (BasicBlock *BB : TheLoop->blocks()) {
6508    // For each instruction in the old loop.
6509    for (Instruction &I : *BB) {
      Value *Ptr = getLoadStorePointerOperand(&I);
6511      if (!Ptr)
6512        continue;
6513
6514      // TODO: We should generate better code and update the cost model for
6515      // predicated uniform stores. Today they are treated as any other
6516      // predicated store (see added test cases in
6517      // invariant-store-vectorization.ll).
6518      if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF))
6519        NumPredStores++;
6520
6521      if (Legal->isUniformMemOp(I, VF)) {
6522        auto isLegalToScalarize = [&]() {
6523          if (!VF.isScalable())
6524            // Scalarization of fixed length vectors "just works".
6525            return true;
6526
6527          // We have dedicated lowering for unpredicated uniform loads and
6528          // stores.  Note that even with tail folding we know that at least
6529          // one lane is active (i.e. generalized predication is not possible
6530          // here), and the logic below depends on this fact.
6531          if (!foldTailByMasking())
6532            return true;
6533
          // For scalable vectors, a uniform memop load is always
          // uniform-by-parts and we know how to scalarize that.
6536          if (isa<LoadInst>(I))
6537            return true;
6538
          // A uniform store isn't necessarily uniform-by-parts,
          // so we can't assume scalarization.
6541          auto &SI = cast<StoreInst>(I);
6542          return TheLoop->isLoopInvariant(SI.getValueOperand());
6543        };
6544
6545        const InstructionCost GatherScatterCost =
6546          isLegalGatherOrScatter(&I, VF) ?
6547          getGatherScatterCost(&I, VF) : InstructionCost::getInvalid();
6548
6549        // Load: Scalar load + broadcast
6550        // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
6551        // FIXME: This cost is a significant under-estimate for tail folded
6552        // memory ops.
6553        const InstructionCost ScalarizationCost = isLegalToScalarize() ?
6554          getUniformMemOpCost(&I, VF) : InstructionCost::getInvalid();
6555
        // Choose the better solution for the current VF. Note that invalid
        // costs compare as maximally large. If both are invalid, the chosen
        // cost stays invalid, which signals a failure and a vectorization
        // abort.
6559        if (GatherScatterCost < ScalarizationCost)
6560          setWideningDecision(&I, VF, CM_GatherScatter, GatherScatterCost);
6561        else
6562          setWideningDecision(&I, VF, CM_Scalarize, ScalarizationCost);
6563        continue;
6564      }
6565
6566      // We assume that widening is the best solution when possible.
6567      if (memoryInstructionCanBeWidened(&I, VF)) {
6568        InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
6569        int ConsecutiveStride = Legal->isConsecutivePtr(
6570            getLoadStoreType(&I), getLoadStorePointerOperand(&I));
6571        assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6572               "Expected consecutive stride.");
6573        InstWidening Decision =
6574            ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
6575        setWideningDecision(&I, VF, Decision, Cost);
6576        continue;
6577      }
6578
6579      // Choose between Interleaving, Gather/Scatter or Scalarization.
6580      InstructionCost InterleaveCost = InstructionCost::getInvalid();
6581      unsigned NumAccesses = 1;
6582      if (isAccessInterleaved(&I)) {
6583        auto Group = getInterleavedAccessGroup(&I);
        assert(Group && "Failed to get an interleaved access group.");
6585
6586        // Make one decision for the whole group.
6587        if (getWideningDecision(&I, VF) != CM_Unknown)
6588          continue;
6589
6590        NumAccesses = Group->getNumMembers();
6591        if (interleavedAccessCanBeWidened(&I, VF))
6592          InterleaveCost = getInterleaveGroupCost(&I, VF);
6593      }
6594
6595      InstructionCost GatherScatterCost =
6596          isLegalGatherOrScatter(&I, VF)
6597              ? getGatherScatterCost(&I, VF) * NumAccesses
6598              : InstructionCost::getInvalid();
6599
6600      InstructionCost ScalarizationCost =
6601          getMemInstScalarizationCost(&I, VF) * NumAccesses;
6602
      // Choose the better solution for the current VF, record this decision,
      // and use it during vectorization.
6605      InstructionCost Cost;
6606      InstWidening Decision;
6607      if (InterleaveCost <= GatherScatterCost &&
6608          InterleaveCost < ScalarizationCost) {
6609        Decision = CM_Interleave;
6610        Cost = InterleaveCost;
6611      } else if (GatherScatterCost < ScalarizationCost) {
6612        Decision = CM_GatherScatter;
6613        Cost = GatherScatterCost;
6614      } else {
6615        Decision = CM_Scalarize;
6616        Cost = ScalarizationCost;
6617      }
      // If the instruction belongs to an interleave group, the whole group
      // receives the same decision. The cost is computed for the whole group,
      // but it will actually be assigned to a single instruction.
6621      if (auto Group = getInterleavedAccessGroup(&I))
6622        setWideningDecision(Group, VF, Decision, Cost);
6623      else
6624        setWideningDecision(&I, VF, Decision, Cost);
6625    }
6626  }
6627
  // Make sure that any load of an address and any other address computation
  // remain scalar unless there is gather/scatter support. This avoids
6630  // inevitable extracts into address registers, and also has the benefit of
6631  // activating LSR more, since that pass can't optimize vectorized
6632  // addresses.
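  //
  // For example (illustrative only): in a loop doing x += *ptrs[i], the load
  // of ptrs[i] only feeds an address. Widening that load would force an
  // extractelement per lane to form each scalar address, so unless the target
  // prefers vectorized addressing we force such loads (and the instructions
  // feeding them) to remain scalar below.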
6633  if (TTI.prefersVectorizedAddressing())
6634    return;
6635
6636  // Start with all scalar pointer uses.
6637  SmallPtrSet<Instruction *, 8> AddrDefs;
6638  for (BasicBlock *BB : TheLoop->blocks())
6639    for (Instruction &I : *BB) {
6640      Instruction *PtrDef =
6641        dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6642      if (PtrDef && TheLoop->contains(PtrDef) &&
6643          getWideningDecision(&I, VF) != CM_GatherScatter)
6644        AddrDefs.insert(PtrDef);
6645    }
6646
6647  // Add all instructions used to generate the addresses.
6648  SmallVector<Instruction *, 4> Worklist;
6649  append_range(Worklist, AddrDefs);
6650  while (!Worklist.empty()) {
6651    Instruction *I = Worklist.pop_back_val();
6652    for (auto &Op : I->operands())
6653      if (auto *InstOp = dyn_cast<Instruction>(Op))
6654        if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6655            AddrDefs.insert(InstOp).second)
6656          Worklist.push_back(InstOp);
6657  }
6658
6659  for (auto *I : AddrDefs) {
6660    if (isa<LoadInst>(I)) {
      // Setting the desired widening decision should ideally be handled by
      // the cost functions, but since that involves finding out whether the
      // loaded register is used in an address computation, the decision is
      // instead changed here, where we know this is the case.
6665      InstWidening Decision = getWideningDecision(I, VF);
6666      if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6667        // Scalarize a widened load of address.
6668        setWideningDecision(
6669            I, VF, CM_Scalarize,
6670            (VF.getKnownMinValue() *
6671             getMemoryInstructionCost(I, ElementCount::getFixed(1))));
6672      else if (auto Group = getInterleavedAccessGroup(I)) {
6673        // Scalarize an interleave group of address loads.
6674        for (unsigned I = 0; I < Group->getFactor(); ++I) {
6675          if (Instruction *Member = Group->getMember(I))
6676            setWideningDecision(
6677                Member, VF, CM_Scalarize,
6678                (VF.getKnownMinValue() *
6679                 getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
6680        }
6681      }
6682    } else
6683      // Make sure I gets scalarized and a cost estimate without
6684      // scalarization overhead.
6685      ForcedScalars[VF].insert(I);
6686  }
6687}
6688
6689void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
6690  assert(!VF.isScalar() &&
6691         "Trying to set a vectorization decision for a scalar VF");
6692
6693  for (BasicBlock *BB : TheLoop->blocks()) {
6694    // For each instruction in the old loop.
6695    for (Instruction &I : *BB) {
6696      CallInst *CI = dyn_cast<CallInst>(&I);
6697
6698      if (!CI)
6699        continue;
6700
6701      InstructionCost ScalarCost = InstructionCost::getInvalid();
6702      InstructionCost VectorCost = InstructionCost::getInvalid();
6703      InstructionCost IntrinsicCost = InstructionCost::getInvalid();
6704      TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6705
6706      Function *ScalarFunc = CI->getCalledFunction();
6707      Type *ScalarRetTy = CI->getType();
6708      SmallVector<Type *, 4> Tys, ScalarTys;
6709      bool MaskRequired = Legal->isMaskRequired(CI);
6710      for (auto &ArgOp : CI->args())
6711        ScalarTys.push_back(ArgOp->getType());
6712
6713      // Compute corresponding vector type for return value and arguments.
6714      Type *RetTy = ToVectorTy(ScalarRetTy, VF);
6715      for (Type *ScalarTy : ScalarTys)
6716        Tys.push_back(ToVectorTy(ScalarTy, VF));
6717
6718      // An in-loop reduction using an fmuladd intrinsic is a special case;
6719      // we don't want the normal cost for that intrinsic.
6720      if (RecurrenceDescriptor::isFMulAddIntrinsic(CI))
6721        if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind)) {
6722          setCallWideningDecision(CI, VF, CM_IntrinsicCall, nullptr,
6723                                  getVectorIntrinsicIDForCall(CI, TLI),
6724                                  std::nullopt, *RedCost);
6725          continue;
6726        }
6727
      // Estimate the cost of a scalarized vector call. The source operands
      // are assumed to be vectors, so we need to extract the individual
      // elements from them, execute VF scalar calls, and then gather the
      // results into the vector return value.
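      //
      // Illustrative example (assumed scalar prototype): for VF = 4 and a call
      // to "float foo(float)", this models 4 lane extracts of the argument,
      // 4 scalar calls to foo, and 4 inserts to rebuild the result vector; the
      // extract/insert portion is what getScalarizationOverhead computes.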
6732      InstructionCost ScalarCallCost =
6733          TTI.getCallInstrCost(ScalarFunc, ScalarRetTy, ScalarTys, CostKind);
6734
6735      // Compute costs of unpacking argument values for the scalar calls and
6736      // packing the return values to a vector.
6737      InstructionCost ScalarizationCost =
6738          getScalarizationOverhead(CI, VF, CostKind);
6739
6740      ScalarCost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
6741
6742      // Find the cost of vectorizing the call, if we can find a suitable
6743      // vector variant of the function.
6744      bool UsesMask = false;
6745      VFInfo FuncInfo;
6746      Function *VecFunc = nullptr;
6747      // Search through any available variants for one we can use at this VF.
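      // These mappings typically come from a "vector-function-abi-variant"
      // attribute on the call (e.g. produced for OpenMP "declare simd"
      // declarations); an illustrative entry might map foo at VF 4 to a masked
      // vector variant taking <4 x float> arguments plus an i1 mask vector.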
6748      for (VFInfo &Info : VFDatabase::getMappings(*CI)) {
6749        // Must match requested VF.
6750        if (Info.Shape.VF != VF)
6751          continue;
6752
6753        // Must take a mask argument if one is required
6754        if (MaskRequired && !Info.isMasked())
6755          continue;
6756
6757        // Check that all parameter kinds are supported
6758        bool ParamsOk = true;
6759        for (VFParameter Param : Info.Shape.Parameters) {
6760          switch (Param.ParamKind) {
6761          case VFParamKind::Vector:
6762            break;
6763          case VFParamKind::OMP_Uniform: {
6764            Value *ScalarParam = CI->getArgOperand(Param.ParamPos);
6765            // Make sure the scalar parameter in the loop is invariant.
6766            if (!PSE.getSE()->isLoopInvariant(PSE.getSCEV(ScalarParam),
6767                                              TheLoop))
6768              ParamsOk = false;
6769            break;
6770          }
6771          case VFParamKind::OMP_Linear: {
6772            Value *ScalarParam = CI->getArgOperand(Param.ParamPos);
6773            // Find the stride for the scalar parameter in this loop and see if
6774            // it matches the stride for the variant.
6775            // TODO: do we need to figure out the cost of an extract to get the
6776            // first lane? Or do we hope that it will be folded away?
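            // Illustrative example (assumption): a variant declared with
            // linear(p:4) is only usable if the corresponding scalar argument
            // is an add-recurrence of this loop with a constant step of 4.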
6777            ScalarEvolution *SE = PSE.getSE();
6778            const auto *SAR =
6779                dyn_cast<SCEVAddRecExpr>(SE->getSCEV(ScalarParam));
6780
6781            if (!SAR || SAR->getLoop() != TheLoop) {
6782              ParamsOk = false;
6783              break;
6784            }
6785
6786            const SCEVConstant *Step =
6787                dyn_cast<SCEVConstant>(SAR->getStepRecurrence(*SE));
6788
6789            if (!Step ||
6790                Step->getAPInt().getSExtValue() != Param.LinearStepOrPos)
6791              ParamsOk = false;
6792
6793            break;
6794          }
6795          case VFParamKind::GlobalPredicate:
6796            UsesMask = true;
6797            break;
6798          default:
6799            ParamsOk = false;
6800            break;
6801          }
6802        }
6803
6804        if (!ParamsOk)
6805          continue;
6806
6807        // Found a suitable candidate, stop here.
6808        VecFunc = CI->getModule()->getFunction(Info.VectorName);
6809        FuncInfo = Info;
6810        break;
6811      }
6812
6813      // Add in the cost of synthesizing a mask if one wasn't required.
6814      InstructionCost MaskCost = 0;
6815      if (VecFunc && UsesMask && !MaskRequired)
6816        MaskCost = TTI.getShuffleCost(
6817            TargetTransformInfo::SK_Broadcast,
6818            VectorType::get(IntegerType::getInt1Ty(
6819                                VecFunc->getFunctionType()->getContext()),
6820                            VF));
6821
6822      if (TLI && VecFunc && !CI->isNoBuiltin())
6823        VectorCost =
6824            TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind) + MaskCost;
6825
6826      // Find the cost of an intrinsic; some targets may have instructions that
6827      // perform the operation without needing an actual call.
6828      Intrinsic::ID IID = getVectorIntrinsicIDForCall(CI, TLI);
6829      if (IID != Intrinsic::not_intrinsic)
6830        IntrinsicCost = getVectorIntrinsicCost(CI, VF);
6831
6832      InstructionCost Cost = ScalarCost;
6833      InstWidening Decision = CM_Scalarize;
6834
6835      if (VectorCost <= Cost) {
6836        Cost = VectorCost;
6837        Decision = CM_VectorCall;
6838      }
6839
6840      if (IntrinsicCost <= Cost) {
6841        Cost = IntrinsicCost;
6842        Decision = CM_IntrinsicCall;
6843      }
6844
6845      setCallWideningDecision(CI, VF, Decision, VecFunc, IID,
6846                              FuncInfo.getParamIndexForOptionalMask(), Cost);
6847    }
6848  }
6849}
6850
6851InstructionCost
6852LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
6853                                               Type *&VectorTy) {
6854  Type *RetTy = I->getType();
6855  if (canTruncateToMinimalBitwidth(I, VF))
6856    RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
6857  auto SE = PSE.getSE();
6858  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6859
6860  auto hasSingleCopyAfterVectorization = [this](Instruction *I,
6861                                                ElementCount VF) -> bool {
6862    if (VF.isScalar())
6863      return true;
6864
6865    auto Scalarized = InstsToScalarize.find(VF);
6866    assert(Scalarized != InstsToScalarize.end() &&
6867           "VF not yet analyzed for scalarization profitability");
6868    return !Scalarized->second.count(I) &&
6869           llvm::all_of(I->users(), [&](User *U) {
6870             auto *UI = cast<Instruction>(U);
6871             return !Scalarized->second.count(UI);
6872           });
6873  };
6874  (void) hasSingleCopyAfterVectorization;
6875
6876  if (isScalarAfterVectorization(I, VF)) {
    // With the exception of GEPs and PHIs, after scalarization there should
    // only be one copy of the instruction generated in the loop. This is
    // because the VF is either 1, or any instructions that need scalarizing
    // have already been dealt with by the time we get here. As a result, we
    // don't have to multiply the instruction cost by VF.
6882    assert(I->getOpcode() == Instruction::GetElementPtr ||
6883           I->getOpcode() == Instruction::PHI ||
6884           (I->getOpcode() == Instruction::BitCast &&
6885            I->getType()->isPointerTy()) ||
6886           hasSingleCopyAfterVectorization(I, VF));
6887    VectorTy = RetTy;
6888  } else
6889    VectorTy = ToVectorTy(RetTy, VF);
6890
6891  // TODO: We need to estimate the cost of intrinsic calls.
6892  switch (I->getOpcode()) {
6893  case Instruction::GetElementPtr:
6894    // We mark this instruction as zero-cost because the cost of GEPs in
6895    // vectorized code depends on whether the corresponding memory instruction
6896    // is scalarized or not. Therefore, we handle GEPs with the memory
6897    // instruction cost.
6898    return 0;
6899  case Instruction::Br: {
    // In cases of scalarized and predicated instructions, there will be VF
    // predicated blocks in the vectorized loop. Each branch around these
    // blocks also requires an extract of its vector compare i1 element.
6903    bool ScalarPredicatedBB = false;
6904    BranchInst *BI = cast<BranchInst>(I);
6905    if (VF.isVector() && BI->isConditional() &&
6906        (PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(0)) ||
6907         PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1))))
6908      ScalarPredicatedBB = true;
6909
6910    if (ScalarPredicatedBB) {
      // It is not possible to scalarize a scalable vector with predicated
      // instructions.
6912      if (VF.isScalable())
6913        return InstructionCost::getInvalid();
6914      // Return cost for branches around scalarized and predicated blocks.
6915      auto *Vec_i1Ty =
6916          VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
6917      return (
6918          TTI.getScalarizationOverhead(
6919              Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()),
6920              /*Insert*/ false, /*Extract*/ true, CostKind) +
6921          (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue()));
6922    } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
6923      // The back-edge branch will remain, as will all scalar branches.
6924      return TTI.getCFInstrCost(Instruction::Br, CostKind);
6925    else
6926      // This branch will be eliminated by if-conversion.
6927      return 0;
6928    // Note: We currently assume zero cost for an unconditional branch inside
6929    // a predicated block since it will become a fall-through, although we
6930    // may decide in the future to call TTI for all branches.
6931  }
6932  case Instruction::PHI: {
6933    auto *Phi = cast<PHINode>(I);
6934
6935    // First-order recurrences are replaced by vector shuffles inside the loop.
6936    if (VF.isVector() && Legal->isFixedOrderRecurrence(Phi)) {
6937      SmallVector<int> Mask(VF.getKnownMinValue());
6938      std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1);
6939      return TTI.getShuffleCost(TargetTransformInfo::SK_Splice,
6940                                cast<VectorType>(VectorTy), Mask, CostKind,
6941                                VF.getKnownMinValue() - 1);
6942    }
6943
6944    // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
6945    // converted into select instructions. We require N - 1 selects per phi
6946    // node, where N is the number of incoming values.
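    // For example (illustrative), a phi merging three if-converted incoming
    // values is lowered to two vector selects chained on the edge masks.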
6947    if (VF.isVector() && Phi->getParent() != TheLoop->getHeader())
6948      return (Phi->getNumIncomingValues() - 1) *
6949             TTI.getCmpSelInstrCost(
6950                 Instruction::Select, ToVectorTy(Phi->getType(), VF),
6951                 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
6952                 CmpInst::BAD_ICMP_PREDICATE, CostKind);
6953
6954    return TTI.getCFInstrCost(Instruction::PHI, CostKind);
6955  }
6956  case Instruction::UDiv:
6957  case Instruction::SDiv:
6958  case Instruction::URem:
6959  case Instruction::SRem:
6960    if (VF.isVector() && isPredicatedInst(I)) {
6961      const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
6962      return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost) ?
6963        ScalarCost : SafeDivisorCost;
6964    }
6965    // We've proven all lanes safe to speculate, fall through.
6966    [[fallthrough]];
6967  case Instruction::Add:
6968  case Instruction::FAdd:
6969  case Instruction::Sub:
6970  case Instruction::FSub:
6971  case Instruction::Mul:
6972  case Instruction::FMul:
6973  case Instruction::FDiv:
6974  case Instruction::FRem:
6975  case Instruction::Shl:
6976  case Instruction::LShr:
6977  case Instruction::AShr:
6978  case Instruction::And:
6979  case Instruction::Or:
6980  case Instruction::Xor: {
6981    // If we're speculating on the stride being 1, the multiplication may
6982    // fold away.  We can generalize this for all operations using the notion
6983    // of neutral elements.  (TODO)
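    // Illustrative example: a multiply such as
    //   %offset = mul i64 %i, %stride
    // where %stride is speculated (via the predicated SCEV) to be 1 folds to
    // %i, so it is costed as free here.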
6984    if (I->getOpcode() == Instruction::Mul &&
6985        (PSE.getSCEV(I->getOperand(0))->isOne() ||
6986         PSE.getSCEV(I->getOperand(1))->isOne()))
6987      return 0;
6988
6989    // Detect reduction patterns
6990    if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
6991      return *RedCost;
6992
    // Certain instructions can be cheaper to vectorize if they have a constant
    // second vector operand. One such example is shifts on x86.
6995    Value *Op2 = I->getOperand(1);
6996    auto Op2Info = TTI.getOperandInfo(Op2);
6997    if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
6998        Legal->isInvariant(Op2))
6999      Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
7000
7001    SmallVector<const Value *, 4> Operands(I->operand_values());
7002    auto InstrCost = TTI.getArithmeticInstrCost(
7003        I->getOpcode(), VectorTy, CostKind,
7004        {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
7005        Op2Info, Operands, I);
7006
7007    // Some targets can replace frem with vector library calls.
7008    InstructionCost VecCallCost = InstructionCost::getInvalid();
7009    if (I->getOpcode() == Instruction::FRem) {
7010      LibFunc Func;
7011      if (TLI->getLibFunc(I->getOpcode(), I->getType(), Func) &&
7012          TLI->isFunctionVectorizable(TLI->getName(Func), VF)) {
7013        SmallVector<Type *, 4> OpTypes;
7014        for (auto &Op : I->operands())
7015          OpTypes.push_back(Op->getType());
7016        VecCallCost =
7017            TTI.getCallInstrCost(nullptr, VectorTy, OpTypes, CostKind);
7018      }
7019    }
7020    return std::min(InstrCost, VecCallCost);
7021  }
7022  case Instruction::FNeg: {
7023    return TTI.getArithmeticInstrCost(
7024        I->getOpcode(), VectorTy, CostKind,
7025        {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
7026        {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
7027        I->getOperand(0), I);
7028  }
7029  case Instruction::Select: {
7030    SelectInst *SI = cast<SelectInst>(I);
7031    const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
7032    bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
7033
7034    const Value *Op0, *Op1;
7035    using namespace llvm::PatternMatch;
7036    if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) ||
7037                        match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) {
7038      // select x, y, false --> x & y
7039      // select x, true, y --> x | y
7040      const auto [Op1VK, Op1VP] = TTI::getOperandInfo(Op0);
7041      const auto [Op2VK, Op2VP] = TTI::getOperandInfo(Op1);
      assert(Op0->getType()->getScalarSizeInBits() == 1 &&
             Op1->getType()->getScalarSizeInBits() == 1);
7044
7045      SmallVector<const Value *, 2> Operands{Op0, Op1};
7046      return TTI.getArithmeticInstrCost(
7047          match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And, VectorTy,
7048          CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, I);
7049    }
7050
7051    Type *CondTy = SI->getCondition()->getType();
7052    if (!ScalarCond)
7053      CondTy = VectorType::get(CondTy, VF);
7054
7055    CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
7056    if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition()))
7057      Pred = Cmp->getPredicate();
7058    return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred,
7059                                  CostKind, I);
7060  }
7061  case Instruction::ICmp:
7062  case Instruction::FCmp: {
7063    Type *ValTy = I->getOperand(0)->getType();
7064    Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
7065    if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
7066      ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
7067    VectorTy = ToVectorTy(ValTy, VF);
7068    return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr,
7069                                  cast<CmpInst>(I)->getPredicate(), CostKind,
7070                                  I);
7071  }
7072  case Instruction::Store:
7073  case Instruction::Load: {
7074    ElementCount Width = VF;
7075    if (Width.isVector()) {
7076      InstWidening Decision = getWideningDecision(I, Width);
7077      assert(Decision != CM_Unknown &&
7078             "CM decision should be taken at this point");
7079      if (getWideningCost(I, VF) == InstructionCost::getInvalid())
7080        return InstructionCost::getInvalid();
7081      if (Decision == CM_Scalarize)
7082        Width = ElementCount::getFixed(1);
7083    }
7084    VectorTy = ToVectorTy(getLoadStoreType(I), Width);
7085    return getMemoryInstructionCost(I, VF);
7086  }
7087  case Instruction::BitCast:
7088    if (I->getType()->isPointerTy())
7089      return 0;
7090    [[fallthrough]];
7091  case Instruction::ZExt:
7092  case Instruction::SExt:
7093  case Instruction::FPToUI:
7094  case Instruction::FPToSI:
7095  case Instruction::FPExt:
7096  case Instruction::PtrToInt:
7097  case Instruction::IntToPtr:
7098  case Instruction::SIToFP:
7099  case Instruction::UIToFP:
7100  case Instruction::Trunc:
7101  case Instruction::FPTrunc: {
7102    // Computes the CastContextHint from a Load/Store instruction.
7103    auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
7104      assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
7105             "Expected a load or a store!");
7106
7107      if (VF.isScalar() || !TheLoop->contains(I))
7108        return TTI::CastContextHint::Normal;
7109
7110      switch (getWideningDecision(I, VF)) {
7111      case LoopVectorizationCostModel::CM_GatherScatter:
7112        return TTI::CastContextHint::GatherScatter;
7113      case LoopVectorizationCostModel::CM_Interleave:
7114        return TTI::CastContextHint::Interleave;
7115      case LoopVectorizationCostModel::CM_Scalarize:
7116      case LoopVectorizationCostModel::CM_Widen:
7117        return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked
7118                                        : TTI::CastContextHint::Normal;
7119      case LoopVectorizationCostModel::CM_Widen_Reverse:
7120        return TTI::CastContextHint::Reversed;
7121      case LoopVectorizationCostModel::CM_Unknown:
7122        llvm_unreachable("Instr did not go through cost modelling?");
7123      case LoopVectorizationCostModel::CM_VectorCall:
7124      case LoopVectorizationCostModel::CM_IntrinsicCall:
7125        llvm_unreachable_internal("Instr has invalid widening decision");
7126      }
7127
7128      llvm_unreachable("Unhandled case!");
7129    };
7130
7131    unsigned Opcode = I->getOpcode();
7132    TTI::CastContextHint CCH = TTI::CastContextHint::None;
7133    // For Trunc, the context is the only user, which must be a StoreInst.
7134    if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
7135      if (I->hasOneUse())
7136        if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
7137          CCH = ComputeCCH(Store);
7138    }
7139    // For Z/Sext, the context is the operand, which must be a LoadInst.
7140    else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
7141             Opcode == Instruction::FPExt) {
7142      if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
7143        CCH = ComputeCCH(Load);
7144    }
7145
7146    // We optimize the truncation of induction variables having constant
7147    // integer steps. The cost of these truncations is the same as the scalar
7148    // operation.
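    // Illustrative example: "%t = trunc i64 %iv to i32" where %iv is an
    // induction with a constant step can be generated directly as a narrower
    // induction, so only the scalar truncation cost is charged.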
7149    if (isOptimizableIVTruncate(I, VF)) {
7150      auto *Trunc = cast<TruncInst>(I);
7151      return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
7152                                  Trunc->getSrcTy(), CCH, CostKind, Trunc);
7153    }
7154
7155    // Detect reduction patterns
7156    if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7157      return *RedCost;
7158
7159    Type *SrcScalarTy = I->getOperand(0)->getType();
7160    Type *SrcVecTy =
7161        VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
7162    if (canTruncateToMinimalBitwidth(I, VF)) {
      // This cast is going to be shrunk. This may remove the cast or turn it
      // into a slightly different cast. For example, if MinBW == 16,
7165      // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
7166      //
7167      // Calculate the modified src and dest types.
7168      Type *MinVecTy = VectorTy;
7169      if (Opcode == Instruction::Trunc) {
7170        SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
7171        VectorTy =
7172            largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7173      } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
7174        // Leave SrcVecTy unchanged - we only shrink the destination element
7175        // type.
7176        VectorTy =
7177            smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7178      }
7179    }
7180
7181    return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
7182  }
7183  case Instruction::Call:
7184    return getVectorCallCost(cast<CallInst>(I), VF);
7185  case Instruction::ExtractValue:
7186    return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput);
7187  case Instruction::Alloca:
7188    // We cannot easily widen alloca to a scalable alloca, as
7189    // the result would need to be a vector of pointers.
7190    if (VF.isScalable())
7191      return InstructionCost::getInvalid();
7192    [[fallthrough]];
7193  default:
7194    // This opcode is unknown. Assume that it is the same as 'mul'.
7195    return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
7196  } // end of switch.
7197}
7198
7199void LoopVectorizationCostModel::collectValuesToIgnore() {
7200  // Ignore ephemeral values.
7201  CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
7202
  // Find all stores to invariant variables. Since they are going to be sunk
  // outside the loop, we do not need to calculate a cost for them.
7205  for (BasicBlock *BB : TheLoop->blocks())
7206    for (Instruction &I : *BB) {
7207      StoreInst *SI;
7208      if ((SI = dyn_cast<StoreInst>(&I)) &&
7209          Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
7210        ValuesToIgnore.insert(&I);
7211    }
7212
7213  // Ignore type-promoting instructions we identified during reduction
7214  // detection.
7215  for (const auto &Reduction : Legal->getReductionVars()) {
7216    const RecurrenceDescriptor &RedDes = Reduction.second;
7217    const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
7218    VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7219  }
7220  // Ignore type-casting instructions we identified during induction
7221  // detection.
7222  for (const auto &Induction : Legal->getInductionVars()) {
7223    const InductionDescriptor &IndDes = Induction.second;
7224    const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
7225    VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7226  }
7227}
7228
7229void LoopVectorizationCostModel::collectInLoopReductions() {
7230  for (const auto &Reduction : Legal->getReductionVars()) {
7231    PHINode *Phi = Reduction.first;
7232    const RecurrenceDescriptor &RdxDesc = Reduction.second;
7233
7234    // We don't collect reductions that are type promoted (yet).
7235    if (RdxDesc.getRecurrenceType() != Phi->getType())
7236      continue;
7237
7238    // If the target would prefer this reduction to happen "in-loop", then we
7239    // want to record it as such.
7240    unsigned Opcode = RdxDesc.getOpcode();
7241    if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) &&
7242        !TTI.preferInLoopReduction(Opcode, Phi->getType(),
7243                                   TargetTransformInfo::ReductionFlags()))
7244      continue;
7245
7246    // Check that we can correctly put the reductions into the loop, by
7247    // finding the chain of operations that leads from the phi to the loop
7248    // exit value.
7249    SmallVector<Instruction *, 4> ReductionOperations =
7250        RdxDesc.getReductionOpChain(Phi, TheLoop);
7251    bool InLoop = !ReductionOperations.empty();
7252
7253    if (InLoop) {
7254      InLoopReductions.insert(Phi);
7255      // Add the elements to InLoopReductionImmediateChains for cost modelling.
7256      Instruction *LastChain = Phi;
7257      for (auto *I : ReductionOperations) {
7258        InLoopReductionImmediateChains[I] = LastChain;
7259        LastChain = I;
7260      }
7261    }
7262    LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
7263                      << " reduction for phi: " << *Phi << "\n");
7264  }
7265}
7266
7267VPValue *VPBuilder::createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B,
7268                               DebugLoc DL, const Twine &Name) {
7269  assert(Pred >= CmpInst::FIRST_ICMP_PREDICATE &&
7270         Pred <= CmpInst::LAST_ICMP_PREDICATE && "invalid predicate");
7271  return tryInsertInstruction(
7272      new VPInstruction(Instruction::ICmp, Pred, A, B, DL, Name));
7273}
7274
7275// This function will select a scalable VF if the target supports scalable
7276// vectors and a fixed one otherwise.
// TODO: we could return a pair of values that specify the max VF and
// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
// `buildVPlans(VF, VF)`. We cannot do this yet because VPlan currently
// doesn't have a cost model that can choose which plan to execute if
// more than one is generated.
7282static ElementCount determineVPlanVF(const TargetTransformInfo &TTI,
7283                                     LoopVectorizationCostModel &CM) {
7284  unsigned WidestType;
7285  std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
7286
7287  TargetTransformInfo::RegisterKind RegKind =
7288      TTI.enableScalableVectorization()
7289          ? TargetTransformInfo::RGK_ScalableVector
7290          : TargetTransformInfo::RGK_FixedWidthVector;
7291
7292  TypeSize RegSize = TTI.getRegisterBitWidth(RegKind);
7293  unsigned N = RegSize.getKnownMinValue() / WidestType;
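  // For example (illustrative): with 128-bit vector registers and a widest
  // scalar type of 32 bits this yields N = 4, i.e. a VF of 4 (scalable on
  // targets that enable scalable vectorization, fixed otherwise).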
7294  return ElementCount::get(N, RegSize.isScalable());
7295}
7296
7297VectorizationFactor
7298LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
7299  ElementCount VF = UserVF;
  // Outer loop handling: outer loops may require CFG and instruction-level
  // transformations before we can even evaluate whether vectorization is
  // profitable.
7302  // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7303  // the vectorization pipeline.
7304  if (!OrigLoop->isInnermost()) {
7305    // If the user doesn't provide a vectorization factor, determine a
7306    // reasonable one.
7307    if (UserVF.isZero()) {
7308      VF = determineVPlanVF(TTI, CM);
7309      LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
7310
7311      // Make sure we have a VF > 1 for stress testing.
7312      if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
7313        LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
7314                          << "overriding computed VF.\n");
7315        VF = ElementCount::getFixed(4);
7316      }
7317    } else if (UserVF.isScalable() && !TTI.supportsScalableVectors() &&
7318               !ForceTargetSupportsScalableVectors) {
7319      LLVM_DEBUG(dbgs() << "LV: Not vectorizing. Scalable VF requested, but "
7320                        << "not supported by the target.\n");
7321      reportVectorizationFailure(
7322          "Scalable vectorization requested but not supported by the target",
7323          "the scalable user-specified vectorization width for outer-loop "
7324          "vectorization cannot be used because the target does not support "
7325          "scalable vectors.",
7326          "ScalableVFUnfeasible", ORE, OrigLoop);
7327      return VectorizationFactor::Disabled();
7328    }
7329    assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7330    assert(isPowerOf2_32(VF.getKnownMinValue()) &&
7331           "VF needs to be a power of two");
7332    LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
7333                      << "VF " << VF << " to build VPlans.\n");
7334    buildVPlans(VF, VF);
7335
7336    // For VPlan build stress testing, we bail out after VPlan construction.
7337    if (VPlanBuildStressTest)
7338      return VectorizationFactor::Disabled();
7339
7340    return {VF, 0 /*Cost*/, 0 /* ScalarCost */};
7341  }
7342
7343  LLVM_DEBUG(
7344      dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
7345                "VPlan-native path.\n");
7346  return VectorizationFactor::Disabled();
7347}
7348
7349std::optional<VectorizationFactor>
7350LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
7351  assert(OrigLoop->isInnermost() && "Inner loop expected.");
7352  CM.collectValuesToIgnore();
7353  CM.collectElementTypesForWidening();
7354
7355  FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
  if (!MaxFactors) // Cases that should not be vectorized nor interleaved.
7357    return std::nullopt;
7358
  // Invalidate interleave groups if all blocks of the loop will be predicated.
7360  if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
7361      !useMaskedInterleavedAccesses(TTI)) {
7362    LLVM_DEBUG(
7363        dbgs()
7364        << "LV: Invalidate all interleaved groups due to fold-tail by masking "
7365           "which requires masked-interleaved support.\n");
7366    if (CM.InterleaveInfo.invalidateGroups())
7367      // Invalidating interleave groups also requires invalidating all decisions
7368      // based on them, which includes widening decisions and uniform and scalar
7369      // values.
7370      CM.invalidateCostModelingDecisions();
7371  }
7372
7373  ElementCount MaxUserVF =
7374      UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
7375  bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF);
7376  if (!UserVF.isZero() && UserVFIsLegal) {
7377    assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
7378           "VF needs to be a power of two");
7379    // Collect the instructions (and their associated costs) that will be more
7380    // profitable to scalarize.
7381    CM.collectInLoopReductions();
7382    if (CM.selectUserVectorizationFactor(UserVF)) {
7383      LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
7384      buildVPlansWithVPRecipes(UserVF, UserVF);
7385      if (!hasPlanWithVF(UserVF)) {
7386        LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << UserVF
7387                          << ".\n");
7388        return std::nullopt;
7389      }
7390
7391      LLVM_DEBUG(printPlans(dbgs()));
7392      return {{UserVF, 0, 0}};
7393    } else
7394      reportVectorizationInfo("UserVF ignored because of invalid costs.",
7395                              "InvalidCost", ORE, OrigLoop);
7396  }
7397
7398  // Populate the set of Vectorization Factor Candidates.
7399  ElementCountSet VFCandidates;
7400  for (auto VF = ElementCount::getFixed(1);
7401       ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2)
7402    VFCandidates.insert(VF);
7403  for (auto VF = ElementCount::getScalable(1);
7404       ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2)
7405    VFCandidates.insert(VF);
7406
7407  CM.collectInLoopReductions();
7408  for (const auto &VF : VFCandidates) {
7409    // Collect Uniform and Scalar instructions after vectorization with VF.
7410    CM.collectUniformsAndScalars(VF);
7411
7412    // Collect the instructions (and their associated costs) that will be more
7413    // profitable to scalarize.
7414    if (VF.isVector())
7415      CM.collectInstsToScalarize(VF);
7416  }
7417
7418  buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
7419  buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);
7420
7421  LLVM_DEBUG(printPlans(dbgs()));
7422  if (!MaxFactors.hasVector())
7423    return VectorizationFactor::Disabled();
7424
7425  // Select the optimal vectorization factor.
7426  VectorizationFactor VF = selectVectorizationFactor(VFCandidates);
  assert((VF.Width.isScalar() || VF.ScalarCost > 0) &&
         "when vectorizing, the scalar cost must be non-zero.");
7428  if (!hasPlanWithVF(VF.Width)) {
7429    LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << VF.Width
7430                      << ".\n");
7431    return std::nullopt;
7432  }
7433  return VF;
7434}
7435
7436VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const {
7437  assert(count_if(VPlans,
7438                  [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) ==
7439             1 &&
7440         "Best VF has not a single VPlan.");
7441
7442  for (const VPlanPtr &Plan : VPlans) {
7443    if (Plan->hasVF(VF))
7444      return *Plan.get();
7445  }
7446  llvm_unreachable("No plan found!");
7447}
7448
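// Add "llvm.loop.unroll.runtime.disable" metadata to \p L, unless unrolling
// is already disabled via existing "llvm.loop.unroll.disable" metadata.
// Operand 0 of the new loop ID is a self-reference, per the loop metadata
// convention.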
7449static void AddRuntimeUnrollDisableMetaData(Loop *L) {
7450  SmallVector<Metadata *, 4> MDs;
7451  // Reserve first location for self reference to the LoopID metadata node.
7452  MDs.push_back(nullptr);
7453  bool IsUnrollMetadata = false;
7454  MDNode *LoopID = L->getLoopID();
7455  if (LoopID) {
7456    // First find existing loop unrolling disable metadata.
7457    for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
7458      auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
7459      if (MD) {
7460        const auto *S = dyn_cast<MDString>(MD->getOperand(0));
        IsUnrollMetadata |=
            S && S->getString().starts_with("llvm.loop.unroll.disable");
7463      }
7464      MDs.push_back(LoopID->getOperand(i));
7465    }
7466  }
7467
7468  if (!IsUnrollMetadata) {
7469    // Add runtime unroll disable metadata.
7470    LLVMContext &Context = L->getHeader()->getContext();
7471    SmallVector<Metadata *, 1> DisableOperands;
7472    DisableOperands.push_back(
7473        MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
7474    MDNode *DisableNode = MDNode::get(Context, DisableOperands);
7475    MDs.push_back(DisableNode);
7476    MDNode *NewLoopID = MDNode::get(Context, MDs);
7477    // Set operand 0 to refer to the loop id itself.
7478    NewLoopID->replaceOperandWith(0, NewLoopID);
7479    L->setLoopID(NewLoopID);
7480  }
7481}
7482
// Check if \p RedResult is a ComputeReductionResult instruction, and if it
// is, create a merge phi node for it and add it to \p ReductionResumeValues.
7485static void createAndCollectMergePhiForReduction(
7486    VPInstruction *RedResult,
7487    DenseMap<const RecurrenceDescriptor *, Value *> &ReductionResumeValues,
7488    VPTransformState &State, Loop *OrigLoop, BasicBlock *LoopMiddleBlock) {
7489  if (!RedResult ||
7490      RedResult->getOpcode() != VPInstruction::ComputeReductionResult)
7491    return;
7492
7493  auto *PhiR = cast<VPReductionPHIRecipe>(RedResult->getOperand(0));
7494  const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
7495
7496  TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
7497  Value *FinalValue =
7498      State.get(RedResult, VPIteration(State.UF - 1, VPLane::getFirstLane()));
7499  auto *ResumePhi =
7500      dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue());
7501
  // TODO: bc.merge.rdx should not be created here; instead it should be
  // modeled in VPlan.
7504  BasicBlock *LoopScalarPreHeader = OrigLoop->getLoopPreheader();
7505  // Create a phi node that merges control-flow from the backedge-taken check
7506  // block and the middle block.
7507  auto *BCBlockPhi = PHINode::Create(FinalValue->getType(), 2, "bc.merge.rdx",
7508                                     LoopScalarPreHeader->getTerminator());
7509
7510  // If we are fixing reductions in the epilogue loop then we should already
7511  // have created a bc.merge.rdx Phi after the main vector body. Ensure that
7512  // we carry over the incoming values correctly.
7513  for (auto *Incoming : predecessors(LoopScalarPreHeader)) {
7514    if (Incoming == LoopMiddleBlock)
7515      BCBlockPhi->addIncoming(FinalValue, Incoming);
7516    else if (ResumePhi && is_contained(ResumePhi->blocks(), Incoming))
7517      BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming),
7518                              Incoming);
7519    else
7520      BCBlockPhi->addIncoming(ReductionStartValue, Incoming);
7521  }
7522
7523  auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
7524  // TODO: This fixup should instead be modeled in VPlan.
7525  // Fix the scalar loop reduction variable with the incoming reduction sum
7526  // from the vector body and from the backedge value.
7527  int IncomingEdgeBlockIdx =
7528      OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
7529  assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
7530  // Pick the other block.
7531  int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
7532  OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
7533  Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
7534  OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
7535
7536  ReductionResumeValues[&RdxDesc] = BCBlockPhi;
7537}
7538
7539std::pair<DenseMap<const SCEV *, Value *>,
7540          DenseMap<const RecurrenceDescriptor *, Value *>>
7541LoopVectorizationPlanner::executePlan(
7542    ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan,
7543    InnerLoopVectorizer &ILV, DominatorTree *DT, bool IsEpilogueVectorization,
7544    const DenseMap<const SCEV *, Value *> *ExpandedSCEVs) {
7545  assert(BestVPlan.hasVF(BestVF) &&
7546         "Trying to execute plan with unsupported VF");
7547  assert(BestVPlan.hasUF(BestUF) &&
7548         "Trying to execute plan with unsupported UF");
7549  assert(
7550      (IsEpilogueVectorization || !ExpandedSCEVs) &&
7551      "expanded SCEVs to reuse can only be used during epilogue vectorization");
7552
  LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF
                    << ", UF=" << BestUF << '\n');
7555
7556  if (!IsEpilogueVectorization)
7557    VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
7558
7559  // Perform the actual loop transformation.
7560  VPTransformState State(BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan,
7561                         OrigLoop->getHeader()->getContext());
7562
7563  // 0. Generate SCEV-dependent code into the preheader, including TripCount,
7564  // before making any changes to the CFG.
7565  if (!BestVPlan.getPreheader()->empty()) {
7566    State.CFG.PrevBB = OrigLoop->getLoopPreheader();
7567    State.Builder.SetInsertPoint(OrigLoop->getLoopPreheader()->getTerminator());
7568    BestVPlan.getPreheader()->execute(&State);
7569  }
7570  if (!ILV.getTripCount())
7571    ILV.setTripCount(State.get(BestVPlan.getTripCount(), {0, 0}));
7572  else
7573    assert(IsEpilogueVectorization && "should only re-use the existing trip "
7574                                      "count during epilogue vectorization");
7575
7576  // 1. Set up the skeleton for vectorization, including vector pre-header and
7577  // middle block. The vector loop is created during VPlan execution.
7578  Value *CanonicalIVStartValue;
7579  std::tie(State.CFG.PrevBB, CanonicalIVStartValue) =
7580      ILV.createVectorizedLoopSkeleton(ExpandedSCEVs ? *ExpandedSCEVs
7581                                                     : State.ExpandedSCEVs);
7582
7583  // Only use noalias metadata when using memory checks guaranteeing no overlap
7584  // across all iterations.
7585  const LoopAccessInfo *LAI = ILV.Legal->getLAI();
7586  std::unique_ptr<LoopVersioning> LVer = nullptr;
7587  if (LAI && !LAI->getRuntimePointerChecking()->getChecks().empty() &&
7588      !LAI->getRuntimePointerChecking()->getDiffChecks()) {
7589
7590    //  We currently don't use LoopVersioning for the actual loop cloning but we
7591    //  still use it to add the noalias metadata.
7592    //  TODO: Find a better way to re-use LoopVersioning functionality to add
7593    //        metadata.
7594    LVer = std::make_unique<LoopVersioning>(
7595        *LAI, LAI->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, DT,
7596        PSE.getSE());
7597    State.LVer = &*LVer;
7598    State.LVer->prepareNoAliasMetadata();
7599  }
7600
7601  ILV.collectPoisonGeneratingRecipes(State);
7602
7603  ILV.printDebugTracesAtStart();
7604
7605  //===------------------------------------------------===//
7606  //
  // Notice: any optimization or new instruction that goes
7608  // into the code below should also be implemented in
7609  // the cost-model.
7610  //
7611  //===------------------------------------------------===//
7612
7613  // 2. Copy and widen instructions from the old loop into the new loop.
7614  BestVPlan.prepareToExecute(ILV.getTripCount(),
7615                             ILV.getOrCreateVectorTripCount(nullptr),
7616                             CanonicalIVStartValue, State);
7617
7618  BestVPlan.execute(&State);
7619
7620  // 2.5 Collect reduction resume values.
7621  DenseMap<const RecurrenceDescriptor *, Value *> ReductionResumeValues;
7622  auto *ExitVPBB =
7623      cast<VPBasicBlock>(BestVPlan.getVectorLoopRegion()->getSingleSuccessor());
7624  for (VPRecipeBase &R : *ExitVPBB) {
7625    createAndCollectMergePhiForReduction(dyn_cast<VPInstruction>(&R),
7626                                         ReductionResumeValues, State, OrigLoop,
7627                                         State.CFG.VPBB2IRBB[ExitVPBB]);
7628  }
7629
7630  // 2.6. Maintain Loop Hints
7631  // Keep all loop hints from the original loop on the vector loop (we'll
7632  // replace the vectorizer-specific hints below).
7633  MDNode *OrigLoopID = OrigLoop->getLoopID();
7634
7635  std::optional<MDNode *> VectorizedLoopID =
7636      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
7637                                      LLVMLoopVectorizeFollowupVectorized});
7638
7639  VPBasicBlock *HeaderVPBB =
7640      BestVPlan.getVectorLoopRegion()->getEntryBasicBlock();
7641  Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]);
7642  if (VectorizedLoopID)
7643    L->setLoopID(*VectorizedLoopID);
7644  else {
7647    if (MDNode *LID = OrigLoop->getLoopID())
7648      L->setLoopID(LID);
7649
7650    LoopVectorizeHints Hints(L, true, *ORE);
7651    Hints.setAlreadyVectorized();
7652  }
7653  TargetTransformInfo::UnrollingPreferences UP;
7654  TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE);
7655  if (!UP.UnrollVectorizedLoop || CanonicalIVStartValue)
7656    AddRuntimeUnrollDisableMetaData(L);
7657
7658  // 3. Fix the vectorized code: take care of header phi's, live-outs,
7659  //    predication, updating analyses.
7660  ILV.fixVectorizedLoop(State, BestVPlan);
7661
7662  ILV.printDebugTracesAtEnd();
7663
7664  return {State.ExpandedSCEVs, ReductionResumeValues};
7665}
7666
7667#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
7668void LoopVectorizationPlanner::printPlans(raw_ostream &O) {
7669  for (const auto &Plan : VPlans)
7670    if (PrintVPlansInDotFormat)
7671      Plan->printDOT(O);
7672    else
7673      Plan->print(O);
7674}
7675#endif
7676
7677//===--------------------------------------------------------------------===//
7678// EpilogueVectorizerMainLoop
7679//===--------------------------------------------------------------------===//
7680
7681/// This function is partially responsible for generating the control flow
7682/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7683std::pair<BasicBlock *, Value *>
7684EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton(
7685    const SCEV2ValueTy &ExpandedSCEVs) {
7686  createVectorLoopSkeleton("");
7687
7688  // Generate the code to check the minimum iteration count of the vector
7689  // epilogue (see below).
7690  EPI.EpilogueIterationCountCheck =
7691      emitIterationCountCheck(LoopScalarPreHeader, true);
7692  EPI.EpilogueIterationCountCheck->setName("iter.check");
7693
7694  // Generate the code to check any assumptions that we've made for SCEV
7695  // expressions.
7696  EPI.SCEVSafetyCheck = emitSCEVChecks(LoopScalarPreHeader);
7697
7698  // Generate the code that checks at runtime if arrays overlap. We put the
7699  // checks into a separate block to make the more common case of few elements
7700  // faster.
7701  EPI.MemSafetyCheck = emitMemRuntimeChecks(LoopScalarPreHeader);
7702
7703  // Generate the iteration count check for the main loop, *after* the check
7704  // for the epilogue loop, so that the path-length is shorter for the case
7705  // that goes directly through the vector epilogue. The longer-path length for
7706  // the main loop is compensated for, by the gain from vectorizing the larger
7707  // trip count. Note: the branch will get updated later on when we vectorize
7708  // the epilogue.
7709  EPI.MainLoopIterationCountCheck =
7710      emitIterationCountCheck(LoopScalarPreHeader, false);
7711
7712  // Generate the induction variable.
7713  EPI.VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
7714
7715  // Skip induction resume value creation here because they will be created in
7716  // the second pass for the scalar loop. The induction resume values for the
7717  // inductions in the epilogue loop are created before executing the plan for
7718  // the epilogue loop.
7719
7720  return {completeLoopSkeleton(), nullptr};
7721}
7722
7723void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
7724  LLVM_DEBUG({
7725    dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
7726           << "Main Loop VF:" << EPI.MainLoopVF
7727           << ", Main Loop UF:" << EPI.MainLoopUF
7728           << ", Epilogue Loop VF:" << EPI.EpilogueVF
7729           << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7730  });
7731}
7732
7733void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
7734  DEBUG_WITH_TYPE(VerboseDebug, {
7735    dbgs() << "intermediate fn:\n"
7736           << *OrigLoop->getHeader()->getParent() << "\n";
7737  });
7738}
7739
7740BasicBlock *
7741EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
7742                                                    bool ForEpilogue) {
7743  assert(Bypass && "Expected valid bypass basic block.");
7744  ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF;
7745  unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
7746  Value *Count = getTripCount();
7747  // Reuse existing vector loop preheader for TC checks.
  // Note that a new preheader block is generated for the vector loop.
7749  BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
7750  IRBuilder<> Builder(TCCheckBlock->getTerminator());
7751
7752  // Generate code to check if the loop's trip count is less than VF * UF of the
7753  // main vector loop.
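  // E.g., with a fixed main-loop VF of 4, UF of 2, an i64 trip count and no
  // required scalar epilogue, this emits roughly:
  //   %min.iters.check = icmp ult i64 <trip count>, 8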
7754  auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF.isVector()
7755                                                    : VF.isVector())
7756               ? ICmpInst::ICMP_ULE
7757               : ICmpInst::ICMP_ULT;
7758
7759  Value *CheckMinIters = Builder.CreateICmp(
7760      P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor),
7761      "min.iters.check");
7762
7763  if (!ForEpilogue)
7764    TCCheckBlock->setName("vector.main.loop.iter.check");
7765
7766  // Create new preheader for vector loop.
7767  LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
7768                                   DT, LI, nullptr, "vector.ph");
7769
7770  if (ForEpilogue) {
7771    assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
7772                                 DT->getNode(Bypass)->getIDom()) &&
7773           "TC check is expected to dominate Bypass");
7774
7775    // Update dominator for Bypass & LoopExit.
7776    DT->changeImmediateDominator(Bypass, TCCheckBlock);
7777    if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector()))
7778      // For loops with multiple exits, there's no edge from the middle block
7779      // to exit blocks (as the epilogue must run) and thus no need to update
7780      // the immediate dominator of the exit blocks.
7781      DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
7782
7783    LoopBypassBlocks.push_back(TCCheckBlock);
7784
7785    // Save the trip count so we don't have to regenerate it in the
7786    // vec.epilog.iter.check. This is safe to do because the trip count
7787    // generated here dominates the vector epilog iter check.
7788    EPI.TripCount = Count;
7789  }
7790
7791  BranchInst &BI =
7792      *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
7793  if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()))
7794    setBranchWeights(BI, MinItersBypassWeights);
7795  ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
7796
7797  return TCCheckBlock;
7798}
7799
7800//===--------------------------------------------------------------------===//
7801// EpilogueVectorizerEpilogueLoop
7802//===--------------------------------------------------------------------===//
7803
7804/// This function is partially responsible for generating the control flow
7805/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7806std::pair<BasicBlock *, Value *>
7807EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton(
7808    const SCEV2ValueTy &ExpandedSCEVs) {
7809  createVectorLoopSkeleton("vec.epilog.");
7810
  // Now, compare the remaining count and, if there aren't enough iterations
  // to execute the vectorized epilogue, skip to the scalar part.
7813  BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader;
7814  VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check");
7815  LoopVectorPreHeader =
7816      SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
7817                 LI, nullptr, "vec.epilog.ph");
7818  emitMinimumVectorEpilogueIterCountCheck(LoopScalarPreHeader,
7819                                          VecEpilogueIterationCountCheck);
7820
7821  // Adjust the control flow taking the state info from the main loop
7822  // vectorization into account.
7823  assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck &&
7824         "expected this to be saved from the previous pass.");
7825  EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith(
7826      VecEpilogueIterationCountCheck, LoopVectorPreHeader);
7827
7828  DT->changeImmediateDominator(LoopVectorPreHeader,
7829                               EPI.MainLoopIterationCountCheck);
7830
7831  EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
7832      VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7833
7834  if (EPI.SCEVSafetyCheck)
7835    EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith(
7836        VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7837  if (EPI.MemSafetyCheck)
7838    EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith(
7839        VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7840
7841  DT->changeImmediateDominator(
7842      VecEpilogueIterationCountCheck,
7843      VecEpilogueIterationCountCheck->getSinglePredecessor());
7844
7845  DT->changeImmediateDominator(LoopScalarPreHeader,
7846                               EPI.EpilogueIterationCountCheck);
7847  if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector()))
7848    // If there is an epilogue which must run, there's no edge from the
    // middle block to exit blocks and thus no need to update the immediate
7850    // dominator of the exit blocks.
7851    DT->changeImmediateDominator(LoopExitBlock,
7852                                 EPI.EpilogueIterationCountCheck);
7853
7854  // Keep track of bypass blocks, as they feed start values to the induction and
7855  // reduction phis in the scalar loop preheader.
7856  if (EPI.SCEVSafetyCheck)
7857    LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
7858  if (EPI.MemSafetyCheck)
7859    LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
7860  LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);
7861
7862  // The vec.epilog.iter.check block may contain Phi nodes from inductions or
7863  // reductions which merge control-flow from the latch block and the middle
7864  // block. Update the incoming values here and move the Phi into the preheader.
7865  SmallVector<PHINode *, 4> PhisInBlock;
7866  for (PHINode &Phi : VecEpilogueIterationCountCheck->phis())
7867    PhisInBlock.push_back(&Phi);
7868
7869  for (PHINode *Phi : PhisInBlock) {
7870    Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHI());
7871    Phi->replaceIncomingBlockWith(
7872        VecEpilogueIterationCountCheck->getSinglePredecessor(),
7873        VecEpilogueIterationCountCheck);
7874
7875    // If the phi doesn't have an incoming value from the
7876    // EpilogueIterationCountCheck, we are done. Otherwise remove the incoming
7877    // value and also those from other check blocks. This is needed for
7878    // reduction phis only.
7879    if (none_of(Phi->blocks(), [&](BasicBlock *IncB) {
7880          return EPI.EpilogueIterationCountCheck == IncB;
7881        }))
7882      continue;
7883    Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck);
7884    if (EPI.SCEVSafetyCheck)
7885      Phi->removeIncomingValue(EPI.SCEVSafetyCheck);
7886    if (EPI.MemSafetyCheck)
7887      Phi->removeIncomingValue(EPI.MemSafetyCheck);
7888  }
7889
7890  // Generate a resume induction for the vector epilogue and put it in the
  // vector epilogue preheader.
7892  Type *IdxTy = Legal->getWidestInductionType();
7893  PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val");
7894  EPResumeVal->insertBefore(LoopVectorPreHeader->getFirstNonPHIIt());
7895  EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
7896  EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
7897                           EPI.MainLoopIterationCountCheck);
7898
7899  // Generate induction resume values. These variables save the new starting
7900  // indexes for the scalar loop. They are used to test if there are any tail
7901  // iterations left once the vector loop has completed.
7902  // Note that when the vectorized epilogue is skipped due to iteration count
7903  // check, then the resume value for the induction variable comes from
7904  // the trip count of the main vector loop, hence passing the AdditionalBypass
7905  // argument.
7906  createInductionResumeValues(ExpandedSCEVs,
7907                              {VecEpilogueIterationCountCheck,
7908                               EPI.VectorTripCount} /* AdditionalBypass */);
7909
7910  return {completeLoopSkeleton(), EPResumeVal};
7911}
7912
7913BasicBlock *
7914EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
7915    BasicBlock *Bypass, BasicBlock *Insert) {
7916
7917  assert(EPI.TripCount &&
7918         "Expected trip count to have been safed in the first pass.");
7919  assert(
7920      (!isa<Instruction>(EPI.TripCount) ||
7921       DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
7922      "saved trip count does not dominate insertion point.");
7923  Value *TC = EPI.TripCount;
7924  IRBuilder<> Builder(Insert->getTerminator());
7925  Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
7926
7927  // Generate code to check if the loop's trip count is less than VF * UF of the
7928  // vector epilogue loop.
7929  auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector())
7930               ? ICmpInst::ICMP_ULE
7931               : ICmpInst::ICMP_ULT;
7932
7933  Value *CheckMinIters =
7934      Builder.CreateICmp(P, Count,
7935                         createStepForVF(Builder, Count->getType(),
7936                                         EPI.EpilogueVF, EPI.EpilogueUF),
7937                         "min.epilog.iters.check");
7938
7939  BranchInst &BI =
7940      *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
7941  if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
7942    unsigned MainLoopStep = UF * VF.getKnownMinValue();
7943    unsigned EpilogueLoopStep =
7944        EPI.EpilogueUF * EPI.EpilogueVF.getKnownMinValue();
7945    // We assume the remaining `Count` is equally distributed in
7946    // [0, MainLoopStep)
7947    // So the probability for `Count < EpilogueLoopStep` should be
7948    // min(MainLoopStep, EpilogueLoopStep) / MainLoopStep
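    // E.g., if MainLoopStep is 8 and EpilogueLoopStep is 2, the estimated
    // probability of skipping the epilogue is 2/8, giving weights {2, 6}.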
7949    unsigned EstimatedSkipCount = std::min(MainLoopStep, EpilogueLoopStep);
7950    const uint32_t Weights[] = {EstimatedSkipCount,
7951                                MainLoopStep - EstimatedSkipCount};
7952    setBranchWeights(BI, Weights);
7953  }
7954  ReplaceInstWithInst(Insert->getTerminator(), &BI);
7955
7956  LoopBypassBlocks.push_back(Insert);
7957  return Insert;
7958}
7959
7960void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
7961  LLVM_DEBUG({
7962    dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
7963           << "Epilogue Loop VF:" << EPI.EpilogueVF
7964           << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7965  });
7966}
7967
7968void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
7969  DEBUG_WITH_TYPE(VerboseDebug, {
7970    dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n";
7971  });
7972}
7973
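// Evaluate \p Predicate at Range.Start and clamp Range.End down to the first
// power-of-two VF at which the predicate's answer changes, so that every VF
// remaining in the range shares the same decision. Returns the predicate's
// value at Range.Start. E.g., if the predicate holds for VF=2 and VF=4 but
// not for VF=8, a range [2, 16) is clamped to [2, 8) and true is returned.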
7974bool LoopVectorizationPlanner::getDecisionAndClampRange(
7975    const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
7976  assert(!Range.isEmpty() && "Trying to test an empty VF range.");
7977  bool PredicateAtRangeStart = Predicate(Range.Start);
7978
7979  for (ElementCount TmpVF : VFRange(Range.Start * 2, Range.End))
7980    if (Predicate(TmpVF) != PredicateAtRangeStart) {
7981      Range.End = TmpVF;
7982      break;
7983    }
7984
7985  return PredicateAtRangeStart;
7986}
7987
7988/// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
7989/// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
7990/// of VF's starting at a given VF and extending it as much as possible. Each
7991/// vectorization decision can potentially shorten this sub-range during
7992/// buildVPlan().
7993void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
7994                                           ElementCount MaxVF) {
7995  auto MaxVFTimes2 = MaxVF * 2;
7996  for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
7997    VFRange SubRange = {VF, MaxVFTimes2};
7998    VPlans.push_back(buildVPlan(SubRange));
7999    VF = SubRange.End;
8000  }
8001}
8002
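// Compute, cache and return the mask for the CFG edge from \p Src to \p Dst:
// the source block's mask combined with the branch condition (negated when
// \p Dst is the false successor), using a select so that a poison condition
// is not observed when the source mask is false. A nullptr result represents
// an all-true mask.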
8003VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
8004                                         VPlan &Plan) {
8005  assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
8006
8007  // Look for cached value.
8008  std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
8009  EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
8010  if (ECEntryIt != EdgeMaskCache.end())
8011    return ECEntryIt->second;
8012
8013  VPValue *SrcMask = getBlockInMask(Src);
8014
8015  // The terminator has to be a branch inst!
8016  BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
8017  assert(BI && "Unexpected terminator found");
8018
8019  if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
8020    return EdgeMaskCache[Edge] = SrcMask;
8021
8022  // If source is an exiting block, we know the exit edge is dynamically dead
8023  // in the vector loop, and thus we don't need to restrict the mask.  Avoid
8024  // adding uses of an otherwise potentially dead instruction.
8025  if (OrigLoop->isLoopExiting(Src))
8026    return EdgeMaskCache[Edge] = SrcMask;
8027
8028  VPValue *EdgeMask = Plan.getVPValueOrAddLiveIn(BI->getCondition());
8029  assert(EdgeMask && "No Edge Mask found for condition");
8030
8031  if (BI->getSuccessor(0) != Dst)
8032    EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc());
8033
8034  if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
8035    // The condition is 'SrcMask && EdgeMask', which is equivalent to
8036    // 'select i1 SrcMask, i1 EdgeMask, i1 false'.
    // The select version does not introduce new UB if SrcMask is false and
    // EdgeMask is poison; a plain 'and' would propagate that poison instead.
8039    VPValue *False = Plan.getVPValueOrAddLiveIn(
8040        ConstantInt::getFalse(BI->getCondition()->getType()));
8041    EdgeMask =
8042        Builder.createSelect(SrcMask, EdgeMask, False, BI->getDebugLoc());
8043  }
8044
8045  return EdgeMaskCache[Edge] = EdgeMask;
8046}
8047
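// Compute and cache the mask for the loop header block: nullptr (all-true)
// when the tail is not folded by masking, otherwise a compare of the widened
// canonical IV against the backedge-taken count.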
8048void VPRecipeBuilder::createHeaderMask(VPlan &Plan) {
8049  BasicBlock *Header = OrigLoop->getHeader();
8050
8051  // When not folding the tail, use nullptr to model all-true mask.
8052  if (!CM.foldTailByMasking()) {
8053    BlockMaskCache[Header] = nullptr;
8054    return;
8055  }
8056
8057  // Introduce the early-exit compare IV <= BTC to form header block mask.
8058  // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by
8059  // constructing the desired canonical IV in the header block as its first
  // non-phi instruction.
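  // E.g., for a fixed VF of 4, each part of the mask is conceptually
  //   <IV, IV+1, IV+2, IV+3> ule splat(BTC).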
8061
8062  VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
8063  auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi();
8064  auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV());
8065  HeaderVPBB->insert(IV, NewInsertionPoint);
8066
8067  VPBuilder::InsertPointGuard Guard(Builder);
8068  Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
8069  VPValue *BlockMask = nullptr;
8070  VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
8071  BlockMask = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC);
8072  BlockMaskCache[Header] = BlockMask;
8073}
8074
8075VPValue *VPRecipeBuilder::getBlockInMask(BasicBlock *BB) const {
8076  // Return the cached value.
8077  BlockMaskCacheTy::const_iterator BCEntryIt = BlockMaskCache.find(BB);
8078  assert(BCEntryIt != BlockMaskCache.end() &&
8079         "Trying to access mask for block without one.");
8080  return BCEntryIt->second;
8081}
8082
8083void VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlan &Plan) {
8084  assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
8085  assert(BlockMaskCache.count(BB) == 0 && "Mask for block already computed");
8086  assert(OrigLoop->getHeader() != BB &&
8087         "Loop header must have cached block mask");
8088
8089  // All-one mask is modelled as no-mask following the convention for masked
8090  // load/store/gather/scatter. Initialize BlockMask to no-mask.
8091  VPValue *BlockMask = nullptr;
8092  // This is the block mask. We OR all incoming edges.
8093  for (auto *Predecessor : predecessors(BB)) {
8094    VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
8095    if (!EdgeMask) { // Mask of predecessor is all-one so mask of block is too.
8096      BlockMaskCache[BB] = EdgeMask;
8097      return;
8098    }
8099
    if (!BlockMask) { // BlockMask still has its initial nullptr value.
8101      BlockMask = EdgeMask;
8102      continue;
8103    }
8104
8105    BlockMask = Builder.createOr(BlockMask, EdgeMask, {});
8106  }
8107
8108  BlockMaskCache[BB] = BlockMask;
8109}
8110
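// Check if the load or store \p I should be widened for the VFs in \p Range
// (clamping the range so that all remaining VFs share the same decision) and,
// if so, build a VPWidenMemoryInstructionRecipe carrying the consecutive /
// reverse flags and the block-in mask it needs. Returns nullptr if the access
// will not be widened.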
8111VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
8112                                                ArrayRef<VPValue *> Operands,
8113                                                VFRange &Range,
8114                                                VPlanPtr &Plan) {
8115  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
8116         "Must be called with either a load or store");
8117
8118  auto willWiden = [&](ElementCount VF) -> bool {
8119    LoopVectorizationCostModel::InstWidening Decision =
8120        CM.getWideningDecision(I, VF);
8121    assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
8122           "CM decision should be taken at this point.");
8123    if (Decision == LoopVectorizationCostModel::CM_Interleave)
8124      return true;
8125    if (CM.isScalarAfterVectorization(I, VF) ||
8126        CM.isProfitableToScalarize(I, VF))
8127      return false;
8128    return Decision != LoopVectorizationCostModel::CM_Scalarize;
8129  };
8130
8131  if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
8132    return nullptr;
8133
8134  VPValue *Mask = nullptr;
8135  if (Legal->isMaskRequired(I))
8136    Mask = getBlockInMask(I->getParent());
8137
8138  // Determine if the pointer operand of the access is either consecutive or
8139  // reverse consecutive.
8140  LoopVectorizationCostModel::InstWidening Decision =
8141      CM.getWideningDecision(I, Range.Start);
8142  bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
8143  bool Consecutive =
8144      Reverse || Decision == LoopVectorizationCostModel::CM_Widen;
8145
8146  VPValue *Ptr = isa<LoadInst>(I) ? Operands[0] : Operands[1];
8147  if (Consecutive) {
8148    auto *GEP = dyn_cast<GetElementPtrInst>(
8149        Ptr->getUnderlyingValue()->stripPointerCasts());
8150    auto *VectorPtr = new VPVectorPointerRecipe(
8151        Ptr, getLoadStoreType(I), Reverse, GEP ? GEP->isInBounds() : false,
8152        I->getDebugLoc());
8153    Builder.getInsertBlock()->appendRecipe(VectorPtr);
8154    Ptr = VectorPtr;
8155  }
8156  if (LoadInst *Load = dyn_cast<LoadInst>(I))
8157    return new VPWidenMemoryInstructionRecipe(*Load, Ptr, Mask, Consecutive,
8158                                              Reverse);
8159
8160  StoreInst *Store = cast<StoreInst>(I);
8161  return new VPWidenMemoryInstructionRecipe(*Store, Ptr, Operands[0], Mask,
8162                                            Consecutive, Reverse);
8163}
8164
/// Creates a VPWidenIntOrFpInductionRecipe for \p Phi. If needed, it will also
8166/// insert a recipe to expand the step for the induction recipe.
8167static VPWidenIntOrFpInductionRecipe *
8168createWidenInductionRecipes(PHINode *Phi, Instruction *PhiOrTrunc,
8169                            VPValue *Start, const InductionDescriptor &IndDesc,
8170                            VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop,
8171                            VFRange &Range) {
8172  assert(IndDesc.getStartValue() ==
8173         Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader()));
8174  assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) &&
8175         "step must be loop invariant");
8176
8177  VPValue *Step =
8178      vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep(), SE);
8179  if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) {
8180    return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, TruncI);
8181  }
8182  assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here");
8183  return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc);
8184}
8185
8186VPRecipeBase *VPRecipeBuilder::tryToOptimizeInductionPHI(
8187    PHINode *Phi, ArrayRef<VPValue *> Operands, VPlan &Plan, VFRange &Range) {
8188
8189  // Check if this is an integer or fp induction. If so, build the recipe that
8190  // produces its scalar and vector values.
8191  if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi))
8192    return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, Plan,
8193                                       *PSE.getSE(), *OrigLoop, Range);
8194
8195  // Check if this is pointer induction. If so, build the recipe for it.
8196  if (auto *II = Legal->getPointerInductionDescriptor(Phi)) {
8197    VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, II->getStep(),
8198                                                           *PSE.getSE());
8199    return new VPWidenPointerInductionRecipe(
8200        Phi, Operands[0], Step, *II,
8201        LoopVectorizationPlanner::getDecisionAndClampRange(
8202            [&](ElementCount VF) {
8203              return CM.isScalarAfterVectorization(Phi, VF);
8204            },
8205            Range));
8206  }
8207  return nullptr;
8208}
8209
8210VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
8211    TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range, VPlan &Plan) {
8212  // Optimize the special case where the source is a constant integer
8213  // induction variable. Notice that we can only optimize the 'trunc' case
8214  // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
8215  // (c) other casts depend on pointer size.
8216
8217  // Determine whether \p K is a truncation based on an induction variable that
8218  // can be optimized.
8219  auto isOptimizableIVTruncate =
8220      [&](Instruction *K) -> std::function<bool(ElementCount)> {
8221    return [=](ElementCount VF) -> bool {
8222      return CM.isOptimizableIVTruncate(K, VF);
8223    };
8224  };
8225
8226  if (LoopVectorizationPlanner::getDecisionAndClampRange(
8227          isOptimizableIVTruncate(I), Range)) {
8228
8229    auto *Phi = cast<PHINode>(I->getOperand(0));
8230    const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi);
8231    VPValue *Start = Plan.getVPValueOrAddLiveIn(II.getStartValue());
8232    return createWidenInductionRecipes(Phi, I, Start, II, Plan, *PSE.getSE(),
8233                                       *OrigLoop, Range);
8234  }
8235  return nullptr;
8236}
8237
8238VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi,
8239                                                ArrayRef<VPValue *> Operands,
8240                                                VPlanPtr &Plan) {
8241  // If all incoming values are equal, the incoming VPValue can be used directly
8242  // instead of creating a new VPBlendRecipe.
8243  if (llvm::all_equal(Operands))
8244    return Operands[0];
8245
8246  unsigned NumIncoming = Phi->getNumIncomingValues();
8247  // For in-loop reductions, we do not need to create an additional select.
8248  VPValue *InLoopVal = nullptr;
8249  for (unsigned In = 0; In < NumIncoming; In++) {
8250    PHINode *PhiOp =
8251        dyn_cast_or_null<PHINode>(Operands[In]->getUnderlyingValue());
8252    if (PhiOp && CM.isInLoopReduction(PhiOp)) {
8253      assert(!InLoopVal && "Found more than one in-loop reduction!");
8254      InLoopVal = Operands[In];
8255    }
8256  }
8257
8258  assert((!InLoopVal || NumIncoming == 2) &&
8259         "Found an in-loop reduction for PHI with unexpected number of "
8260         "incoming values");
8261  if (InLoopVal)
8262    return Operands[Operands[0] == InLoopVal ? 1 : 0];
8263
8264  // We know that all PHIs in non-header blocks are converted into selects, so
8265  // we don't have to worry about the insertion order and we can just use the
8266  // builder. At this point we generate the predication tree. There may be
8267  // duplications since this is a simple recursive scan, but future
8268  // optimizations will clean it up.
8269  SmallVector<VPValue *, 2> OperandsWithMask;
8270
8271  for (unsigned In = 0; In < NumIncoming; In++) {
8272    VPValue *EdgeMask =
8273        createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), *Plan);
8274    assert((EdgeMask || NumIncoming == 1) &&
8275           "Multiple predecessors with one having a full mask");
8276    OperandsWithMask.push_back(Operands[In]);
8277    if (EdgeMask)
8278      OperandsWithMask.push_back(EdgeMask);
8279  }
8280  return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask));
8281}
8282
8283VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
8284                                                   ArrayRef<VPValue *> Operands,
8285                                                   VFRange &Range,
8286                                                   VPlanPtr &Plan) {
8287  bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8288      [this, CI](ElementCount VF) {
8289        return CM.isScalarWithPredication(CI, VF);
8290      },
8291      Range);
8292
8293  if (IsPredicated)
8294    return nullptr;
8295
8296  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8297  if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
8298             ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
8299             ID == Intrinsic::pseudoprobe ||
8300             ID == Intrinsic::experimental_noalias_scope_decl))
8301    return nullptr;
8302
8303  SmallVector<VPValue *, 4> Ops(Operands.take_front(CI->arg_size()));
8304
  // Is it beneficial to perform an intrinsic call rather than a lib call?
8306  bool ShouldUseVectorIntrinsic =
8307      ID && LoopVectorizationPlanner::getDecisionAndClampRange(
8308                [&](ElementCount VF) -> bool {
8309                  return CM.getCallWideningDecision(CI, VF).Kind ==
8310                         LoopVectorizationCostModel::CM_IntrinsicCall;
8311                },
8312                Range);
8313  if (ShouldUseVectorIntrinsic)
8314    return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()), ID,
8315                                 CI->getDebugLoc());
8316
8317  Function *Variant = nullptr;
8318  std::optional<unsigned> MaskPos;
  // Is it better to call a vectorized version of the function than to
  // scalarize the call?
8321  auto ShouldUseVectorCall = LoopVectorizationPlanner::getDecisionAndClampRange(
8322      [&](ElementCount VF) -> bool {
8323        // The following case may be scalarized depending on the VF.
        // The flag shows whether we can use a plain call for the vectorized
        // version of the instruction.
8326
8327        // If we've found a variant at a previous VF, then stop looking. A
8328        // vectorized variant of a function expects input in a certain shape
8329        // -- basically the number of input registers, the number of lanes
8330        // per register, and whether there's a mask required.
8331        // We store a pointer to the variant in the VPWidenCallRecipe, so
8332        // once we have an appropriate variant it's only valid for that VF.
8333        // This will force a different vplan to be generated for each VF that
8334        // finds a valid variant.
8335        if (Variant)
8336          return false;
8337        LoopVectorizationCostModel::CallWideningDecision Decision =
8338            CM.getCallWideningDecision(CI, VF);
8339        if (Decision.Kind == LoopVectorizationCostModel::CM_VectorCall) {
8340          Variant = Decision.Variant;
8341          MaskPos = Decision.MaskPos;
8342          return true;
8343        }
8344
8345        return false;
8346      },
8347      Range);
8348  if (ShouldUseVectorCall) {
8349    if (MaskPos.has_value()) {
8350      // We have 2 cases that would require a mask:
8351      //   1) The block needs to be predicated, either due to a conditional
8352      //      in the scalar loop or use of an active lane mask with
8353      //      tail-folding, and we use the appropriate mask for the block.
8354      //   2) No mask is required for the block, but the only available
8355      //      vector variant at this VF requires a mask, so we synthesize an
8356      //      all-true mask.
8357      VPValue *Mask = nullptr;
8358      if (Legal->isMaskRequired(CI))
8359        Mask = getBlockInMask(CI->getParent());
8360      else
8361        Mask = Plan->getVPValueOrAddLiveIn(ConstantInt::getTrue(
8362            IntegerType::getInt1Ty(Variant->getFunctionType()->getContext())));
8363
8364      Ops.insert(Ops.begin() + *MaskPos, Mask);
8365    }
8366
8367    return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()),
8368                                 Intrinsic::not_intrinsic, CI->getDebugLoc(),
8369                                 Variant);
8370  }
8371
8372  return nullptr;
8373}
8374
8375bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
8376  assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
8377         !isa<StoreInst>(I) && "Instruction should have been handled earlier");
8378  // Instruction should be widened, unless it is scalar after vectorization,
8379  // scalarization is profitable or it is predicated.
8380  auto WillScalarize = [this, I](ElementCount VF) -> bool {
8381    return CM.isScalarAfterVectorization(I, VF) ||
8382           CM.isProfitableToScalarize(I, VF) ||
8383           CM.isScalarWithPredication(I, VF);
8384  };
8385  return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
8386                                                             Range);
8387}
8388
8389VPRecipeBase *VPRecipeBuilder::tryToWiden(Instruction *I,
8390                                          ArrayRef<VPValue *> Operands,
8391                                          VPBasicBlock *VPBB, VPlanPtr &Plan) {
8392  switch (I->getOpcode()) {
8393  default:
8394    return nullptr;
8395  case Instruction::SDiv:
8396  case Instruction::UDiv:
8397  case Instruction::SRem:
8398  case Instruction::URem: {
    // If not provably safe, use a select to form a safe divisor before
    // widening the div/rem operation itself. Otherwise fall through to
    // general handling below.
8401    if (CM.isPredicatedInst(I)) {
8402      SmallVector<VPValue *> Ops(Operands.begin(), Operands.end());
8403      VPValue *Mask = getBlockInMask(I->getParent());
8404      VPValue *One = Plan->getVPValueOrAddLiveIn(
8405          ConstantInt::get(I->getType(), 1u, false));
8406      auto *SafeRHS =
8407         new VPInstruction(Instruction::Select, {Mask, Ops[1], One},
8408                           I->getDebugLoc());
8409      VPBB->appendRecipe(SafeRHS);
8410      Ops[1] = SafeRHS;
8411      return new VPWidenRecipe(*I, make_range(Ops.begin(), Ops.end()));
8412    }
8413    [[fallthrough]];
8414  }
8415  case Instruction::Add:
8416  case Instruction::And:
8417  case Instruction::AShr:
8418  case Instruction::FAdd:
8419  case Instruction::FCmp:
8420  case Instruction::FDiv:
8421  case Instruction::FMul:
8422  case Instruction::FNeg:
8423  case Instruction::FRem:
8424  case Instruction::FSub:
8425  case Instruction::ICmp:
8426  case Instruction::LShr:
8427  case Instruction::Mul:
8428  case Instruction::Or:
8429  case Instruction::Select:
8430  case Instruction::Shl:
8431  case Instruction::Sub:
8432  case Instruction::Xor:
8433  case Instruction::Freeze:
8434    return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end()));
  }
8436}
8437
8438void VPRecipeBuilder::fixHeaderPhis() {
8439  BasicBlock *OrigLatch = OrigLoop->getLoopLatch();
8440  for (VPHeaderPHIRecipe *R : PhisToFix) {
8441    auto *PN = cast<PHINode>(R->getUnderlyingValue());
8442    VPRecipeBase *IncR =
8443        getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch)));
8444    R->addOperand(IncR->getVPSingleValue());
8445  }
8446}
8447
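// Build a VPReplicateRecipe that executes \p I once per vector lane, or only
// once per unrolled part when \p I is uniform after vectorization for all VFs
// in \p Range; the block-in mask is attached when the instruction requires
// predication.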
8448VPRecipeOrVPValueTy VPRecipeBuilder::handleReplication(Instruction *I,
8449                                                       VFRange &Range,
8450                                                       VPlan &Plan) {
8451  bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
8452      [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
8453      Range);
8454
8455  bool IsPredicated = CM.isPredicatedInst(I);
8456
8457  // Even if the instruction is not marked as uniform, there are certain
8458  // intrinsic calls that can be effectively treated as such, so we check for
8459  // them here. Conservatively, we only do this for scalable vectors, since
8460  // for fixed-width VFs we can always fall back on full scalarization.
8461  if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) {
8462    switch (cast<IntrinsicInst>(I)->getIntrinsicID()) {
8463    case Intrinsic::assume:
8464    case Intrinsic::lifetime_start:
8465    case Intrinsic::lifetime_end:
8466      // For scalable vectors if one of the operands is variant then we still
8467      // want to mark as uniform, which will generate one instruction for just
8468      // the first lane of the vector. We can't scalarize the call in the same
8469      // way as for fixed-width vectors because we don't know how many lanes
8470      // there are.
8471      //
8472      // The reasons for doing it this way for scalable vectors are:
      //   1. For the assume intrinsic, generating the instruction for the
      //      first lane is still better than not generating any at all. For
8475      //      example, the input may be a splat across all lanes.
8476      //   2. For the lifetime start/end intrinsics the pointer operand only
8477      //      does anything useful when the input comes from a stack object,
8478      //      which suggests it should always be uniform. For non-stack objects
8479      //      the effect is to poison the object, which still allows us to
8480      //      remove the call.
8481      IsUniform = true;
8482      break;
8483    default:
8484      break;
8485    }
8486  }
8487  VPValue *BlockInMask = nullptr;
8488  if (!IsPredicated) {
8489    // Finalize the recipe for Instr, first if it is not predicated.
8490    LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
8491  } else {
8492    LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
8493    // Instructions marked for predication are replicated and a mask operand is
8494    // added initially. Masked replicate recipes will later be placed under an
8495    // if-then construct to prevent side-effects. Generate recipes to compute
8496    // the block mask for this region.
8497    BlockInMask = getBlockInMask(I->getParent());
8498  }
8499
8500  auto *Recipe = new VPReplicateRecipe(I, Plan.mapToVPValues(I->operands()),
8501                                       IsUniform, BlockInMask);
8502  return toVPRecipeResult(Recipe);
8503}
8504
8505VPRecipeOrVPValueTy
8506VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
8507                                        ArrayRef<VPValue *> Operands,
8508                                        VFRange &Range, VPBasicBlock *VPBB,
8509                                        VPlanPtr &Plan) {
8510  // First, check for specific widening recipes that deal with inductions, Phi
8511  // nodes, calls and memory operations.
8512  VPRecipeBase *Recipe;
8513  if (auto Phi = dyn_cast<PHINode>(Instr)) {
8514    if (Phi->getParent() != OrigLoop->getHeader())
8515      return tryToBlend(Phi, Operands, Plan);
8516
8517    // Always record recipes for header phis. Later first-order recurrence phis
8518    // can have earlier phis as incoming values.
8519    recordRecipeOf(Phi);
8520
8521    if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, *Plan, Range)))
8522      return toVPRecipeResult(Recipe);
8523
8524    VPHeaderPHIRecipe *PhiRecipe = nullptr;
8525    assert((Legal->isReductionVariable(Phi) ||
8526            Legal->isFixedOrderRecurrence(Phi)) &&
8527           "can only widen reductions and fixed-order recurrences here");
8528    VPValue *StartV = Operands[0];
8529    if (Legal->isReductionVariable(Phi)) {
8530      const RecurrenceDescriptor &RdxDesc =
8531          Legal->getReductionVars().find(Phi)->second;
8532      assert(RdxDesc.getRecurrenceStartValue() ==
8533             Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
8534      PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV,
8535                                           CM.isInLoopReduction(Phi),
8536                                           CM.useOrderedReductions(RdxDesc));
8537    } else {
8538      // TODO: Currently fixed-order recurrences are modeled as chains of
8539      // first-order recurrences. If there are no users of the intermediate
8540      // recurrences in the chain, the fixed order recurrence should be modeled
8541      // directly, enabling more efficient codegen.
8542      PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
8543    }
8544
8545    // Record the incoming value from the backedge, so we can add the incoming
8546    // value from the backedge after all recipes have been created.
8547    auto *Inc = cast<Instruction>(
8548        Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
8549    auto RecipeIter = Ingredient2Recipe.find(Inc);
8550    if (RecipeIter == Ingredient2Recipe.end())
8551      recordRecipeOf(Inc);
8552
8553    PhisToFix.push_back(PhiRecipe);
8554    return toVPRecipeResult(PhiRecipe);
8555  }
8556
8557  if (isa<TruncInst>(Instr) &&
8558      (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands,
8559                                               Range, *Plan)))
8560    return toVPRecipeResult(Recipe);
8561
8562  // All widen recipes below deal only with VF > 1.
8563  if (LoopVectorizationPlanner::getDecisionAndClampRange(
8564          [&](ElementCount VF) { return VF.isScalar(); }, Range))
8565    return nullptr;
8566
8567  if (auto *CI = dyn_cast<CallInst>(Instr))
8568    return toVPRecipeResult(tryToWidenCall(CI, Operands, Range, Plan));
8569
8570  if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
8571    return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan));
8572
8573  if (!shouldWiden(Instr, Range))
8574    return nullptr;
8575
8576  if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
8577    return toVPRecipeResult(new VPWidenGEPRecipe(
8578        GEP, make_range(Operands.begin(), Operands.end())));
8579
8580  if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8581    return toVPRecipeResult(new VPWidenSelectRecipe(
8582        *SI, make_range(Operands.begin(), Operands.end())));
8583  }
8584
8585  if (auto *CI = dyn_cast<CastInst>(Instr)) {
8586    return toVPRecipeResult(new VPWidenCastRecipe(CI->getOpcode(), Operands[0],
8587                                                  CI->getType(), *CI));
8588  }
8589
8590  return toVPRecipeResult(tryToWiden(Instr, Operands, VPBB, Plan));
8591}
8592
8593void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8594                                                        ElementCount MaxVF) {
8595  assert(OrigLoop->isInnermost() && "Inner loop expected.");
8596
8597  auto MaxVFTimes2 = MaxVF * 2;
8598  for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
8599    VFRange SubRange = {VF, MaxVFTimes2};
8600    if (auto Plan = tryToBuildVPlanWithVPRecipes(SubRange)) {
8601      // Now optimize the initial VPlan.
8602      if (!Plan->hasVF(ElementCount::getFixed(1)))
8603        VPlanTransforms::truncateToMinimalBitwidths(
8604            *Plan, CM.getMinimalBitwidths(), PSE.getSE()->getContext());
8605      VPlanTransforms::optimize(*Plan, *PSE.getSE());
8606      assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid");
8607      VPlans.push_back(std::move(Plan));
8608    }
8609    VF = SubRange.End;
8610  }
8611}
8612
8613// Add the necessary canonical IV and branch recipes required to control the
8614// loop.
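// Schematically, the vector loop region is then controlled by:
//   header:  canonical-IV = phi [ 0 ], [ index.next ]
//   ...
//   latch:   index.next = canonical-IV + VF * UF
//            branch-on-count index.next, vector-trip-count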
8615static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
8616                                  DebugLoc DL) {
8617  Value *StartIdx = ConstantInt::get(IdxTy, 0);
8618  auto *StartV = Plan.getVPValueOrAddLiveIn(StartIdx);
8619
8620  // Add a VPCanonicalIVPHIRecipe starting at 0 to the header.
8621  auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL);
8622  VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
8623  VPBasicBlock *Header = TopRegion->getEntryBasicBlock();
8624  Header->insert(CanonicalIVPHI, Header->begin());
8625
8626  // Add a CanonicalIVIncrement{NUW} VPInstruction to increment the scalar
8627  // IV by VF * UF.
8628  auto *CanonicalIVIncrement =
8629      new VPInstruction(Instruction::Add, {CanonicalIVPHI, &Plan.getVFxUF()},
8630                        {HasNUW, false}, DL, "index.next");
8631  CanonicalIVPHI->addOperand(CanonicalIVIncrement);
8632
8633  VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
8634  EB->appendRecipe(CanonicalIVIncrement);
8635
8636  // Add the BranchOnCount VPInstruction to the latch.
8637  VPInstruction *BranchBack =
8638      new VPInstruction(VPInstruction::BranchOnCount,
8639                        {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
8640  EB->appendRecipe(BranchBack);
8641}
8642
8643// Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the
8644// original exit block.
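//
// For illustration only: given an LCSSA phi in the exit block such as
//   %res.lcssa = phi i32 [ %res, %loop ]
// a VPLiveOut is added that ties %res.lcssa to the VPValue modeling %res.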
8645static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB, Loop *OrigLoop,
8646                                VPlan &Plan) {
8647  BasicBlock *ExitBB = OrigLoop->getUniqueExitBlock();
8648  BasicBlock *ExitingBB = OrigLoop->getExitingBlock();
8649  // Only handle single-exit loops with unique exit blocks for now.
8650  if (!ExitBB || !ExitBB->getSinglePredecessor() || !ExitingBB)
8651    return;
8652
8653  // Introduce VPUsers modeling the exit values.
8654  for (PHINode &ExitPhi : ExitBB->phis()) {
8655    Value *IncomingValue =
8656        ExitPhi.getIncomingValueForBlock(ExitingBB);
8657    VPValue *V = Plan.getVPValueOrAddLiveIn(IncomingValue);
8658    Plan.addLiveOut(&ExitPhi, V);
8659  }
8660}
8661
8662VPlanPtr
8663LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
8664
8665  SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
8666
8667  VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
8668
8669  // ---------------------------------------------------------------------------
8670  // Pre-construction: record ingredients whose recipes we'll need to further
8671  // process after constructing the initial VPlan.
8672  // ---------------------------------------------------------------------------
8673
8674  // For each interleave group which is relevant for this (possibly trimmed)
8675  // Range, add it to the set of groups to be later applied to the VPlan and add
8676  // placeholders for its members' Recipes which we'll be replacing with a
8677  // single VPInterleaveRecipe.
8678  for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
8679    auto applyIG = [IG, this](ElementCount VF) -> bool {
8680      bool Result = (VF.isVector() && // Query is illegal for VF == 1
8681                     CM.getWideningDecision(IG->getInsertPos(), VF) ==
8682                         LoopVectorizationCostModel::CM_Interleave);
8683      // For scalable vectors, the only interleave factor currently supported
8684      // is 2 since we require the (de)interleave2 intrinsics instead of
8685      // shufflevectors.
8686      assert((!Result || !VF.isScalable() || IG->getFactor() == 2) &&
8687             "Unsupported interleave factor for scalable vectors");
8688      return Result;
8689    };
8690    if (!getDecisionAndClampRange(applyIG, Range))
8691      continue;
8692    InterleaveGroups.insert(IG);
8693    for (unsigned i = 0; i < IG->getFactor(); i++)
8694      if (Instruction *Member = IG->getMember(i))
8695        RecipeBuilder.recordRecipeOf(Member);
  }
8697
8698  // ---------------------------------------------------------------------------
8699  // Build initial VPlan: Scan the body of the loop in a topological order to
8700  // visit each basic block after having visited its predecessor basic blocks.
8701  // ---------------------------------------------------------------------------
8702
8703  // Create initial VPlan skeleton, having a basic block for the pre-header
8704  // which contains SCEV expansions that need to happen before the CFG is
8705  // modified; a basic block for the vector pre-header, followed by a region for
8706  // the vector loop, followed by the middle basic block. The skeleton vector
8707  // loop region contains a header and latch basic blocks.
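  // Roughly (an illustrative sketch of the shape described above):
  //   preheader (SCEV expansion) -> vector.ph
  //     -> { vector.body -> ... -> vector.latch } -> middle.block
  // where the braces denote the vector loop region.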
8708  VPlanPtr Plan = VPlan::createInitialVPlan(
8709      createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop),
8710      *PSE.getSE());
8711  VPBasicBlock *HeaderVPBB = new VPBasicBlock("vector.body");
8712  VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch");
8713  VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB);
8714  Plan->getVectorLoopRegion()->setEntry(HeaderVPBB);
8715  Plan->getVectorLoopRegion()->setExiting(LatchVPBB);
8716
  // Don't use getDecisionAndClampRange here, because we don't know the UF yet,
  // so it is better to be conservative here rather than to split the range up
  // into different VPlans.
8720  // TODO: Consider using getDecisionAndClampRange here to split up VPlans.
8721  bool IVUpdateMayOverflow = false;
8722  for (ElementCount VF : Range)
8723    IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(&CM, VF);
8724
8725  DebugLoc DL = getDebugLocFromInstOrOperands(Legal->getPrimaryInduction());
8726  TailFoldingStyle Style = CM.getTailFoldingStyle(IVUpdateMayOverflow);
8727  // When not folding the tail, we know that the induction increment will not
8728  // overflow.
8729  bool HasNUW = Style == TailFoldingStyle::None;
8730  addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DL);
8731
8732  // Scan the body of the loop in a topological order to visit each basic block
8733  // after having visited its predecessor basic blocks.
8734  LoopBlocksDFS DFS(OrigLoop);
8735  DFS.perform(LI);
8736
8737  VPBasicBlock *VPBB = HeaderVPBB;
8738  bool NeedsMasks = CM.foldTailByMasking() ||
8739                    any_of(OrigLoop->blocks(), [this](BasicBlock *BB) {
8740                      return Legal->blockNeedsPredication(BB);
8741                    });
8742  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
    // Relevant instructions from basic block BB will be grouped into VPRecipe
    // ingredients and used to fill a new VPBasicBlock.
8745    if (VPBB != HeaderVPBB)
8746      VPBB->setName(BB->getName());
8747    Builder.setInsertPoint(VPBB);
8748
8749    if (VPBB == HeaderVPBB)
8750      RecipeBuilder.createHeaderMask(*Plan);
8751    else if (NeedsMasks)
8752      RecipeBuilder.createBlockInMask(BB, *Plan);
8753
8754    // Introduce each ingredient into VPlan.
8755    // TODO: Model and preserve debug intrinsics in VPlan.
8756    for (Instruction &I : drop_end(BB->instructionsWithoutDebug(false))) {
8757      Instruction *Instr = &I;
8758      SmallVector<VPValue *, 4> Operands;
8759      auto *Phi = dyn_cast<PHINode>(Instr);
8760      if (Phi && Phi->getParent() == OrigLoop->getHeader()) {
8761        Operands.push_back(Plan->getVPValueOrAddLiveIn(
8762            Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())));
8763      } else {
8764        auto OpRange = Plan->mapToVPValues(Instr->operands());
8765        Operands = {OpRange.begin(), OpRange.end()};
8766      }
8767
      // Invariant stores inside the loop will be deleted, and a single store
      // with the final reduction value will be added to the exit block.
8770      StoreInst *SI;
8771      if ((SI = dyn_cast<StoreInst>(&I)) &&
8772          Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
8773        continue;
8774
8775      auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe(
8776          Instr, Operands, Range, VPBB, Plan);
8777      if (!RecipeOrValue)
8778        RecipeOrValue = RecipeBuilder.handleReplication(Instr, Range, *Plan);
8779      // If Instr can be simplified to an existing VPValue, use it.
8780      if (isa<VPValue *>(RecipeOrValue)) {
8781        auto *VPV = cast<VPValue *>(RecipeOrValue);
8782        Plan->addVPValue(Instr, VPV);
8783        // If the re-used value is a recipe, register the recipe for the
8784        // instruction, in case the recipe for Instr needs to be recorded.
8785        if (VPRecipeBase *R = VPV->getDefiningRecipe())
8786          RecipeBuilder.setRecipe(Instr, R);
8787        continue;
8788      }
8789      // Otherwise, add the new recipe.
8790      VPRecipeBase *Recipe = cast<VPRecipeBase *>(RecipeOrValue);
8791      for (auto *Def : Recipe->definedValues()) {
8792        auto *UV = Def->getUnderlyingValue();
8793        Plan->addVPValue(UV, Def);
8794      }
8795
8796      RecipeBuilder.setRecipe(Instr, Recipe);
8797      if (isa<VPHeaderPHIRecipe>(Recipe)) {
8798        // VPHeaderPHIRecipes must be kept in the phi section of HeaderVPBB. In
8799        // the following cases, VPHeaderPHIRecipes may be created after non-phi
8800        // recipes and need to be moved to the phi section of HeaderVPBB:
8801        // * tail-folding (non-phi recipes computing the header mask are
8802        // introduced earlier than regular header phi recipes, and should appear
8803        // after them)
8804        // * Optimizing truncates to VPWidenIntOrFpInductionRecipe.
8805
8806        assert((HeaderVPBB->getFirstNonPhi() == VPBB->end() ||
8807                CM.foldTailByMasking() || isa<TruncInst>(Instr)) &&
8808               "unexpected recipe needs moving");
8809        Recipe->insertBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi());
8810      } else
8811        VPBB->appendRecipe(Recipe);
8812    }
8813
8814    VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB);
8815    VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor());
8816  }
8817
8818  // After here, VPBB should not be used.
8819  VPBB = nullptr;
8820
8821  if (CM.requiresScalarEpilogue(Range)) {
    // No edge from the middle block to the unique exit block has been inserted,
    // and there is nothing to fix from the vector loop; phis should have
    // incoming values from the scalar loop only.
8825  } else
8826    addUsersInExitBlock(HeaderVPBB, OrigLoop, *Plan);
8827
8828  assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) &&
8829         !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() &&
8830         "entry block must be set to a VPRegionBlock having a non-empty entry "
8831         "VPBasicBlock");
8832  RecipeBuilder.fixHeaderPhis();
8833
8834  // ---------------------------------------------------------------------------
8835  // Transform initial VPlan: Apply previously taken decisions, in order, to
8836  // bring the VPlan to its final state.
8837  // ---------------------------------------------------------------------------
8838
8839  // Adjust the recipes for any inloop reductions.
8840  adjustRecipesForReductions(LatchVPBB, Plan, RecipeBuilder, Range.Start);
8841
8842  // Interleave memory: for each Interleave Group we marked earlier as relevant
8843  // for this VPlan, replace the Recipes widening its memory instructions with a
8844  // single VPInterleaveRecipe at its insertion point.
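  // For example (illustrative only), for a factor-2 group of widened loads
  //   WIDEN ir<%a> = load ...
  //   WIDEN ir<%b> = load ...
  // a single
  //   INTERLEAVE-GROUP with factor 2
  // recipe is inserted at the insert position; its results take over all uses
  // of the member recipes, which are then erased.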
8845  for (const auto *IG : InterleaveGroups) {
8846    auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
8847        RecipeBuilder.getRecipe(IG->getInsertPos()));
8848    SmallVector<VPValue *, 4> StoredValues;
8849    for (unsigned i = 0; i < IG->getFactor(); ++i)
8850      if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) {
8851        auto *StoreR =
8852            cast<VPWidenMemoryInstructionRecipe>(RecipeBuilder.getRecipe(SI));
8853        StoredValues.push_back(StoreR->getStoredValue());
8854      }
8855
8856    bool NeedsMaskForGaps =
8857        IG->requiresScalarEpilogue() && !CM.isScalarEpilogueAllowed();
8858    auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues,
8859                                        Recipe->getMask(), NeedsMaskForGaps);
8860    VPIG->insertBefore(Recipe);
8861    unsigned J = 0;
8862    for (unsigned i = 0; i < IG->getFactor(); ++i)
8863      if (Instruction *Member = IG->getMember(i)) {
8864        VPRecipeBase *MemberR = RecipeBuilder.getRecipe(Member);
8865        if (!Member->getType()->isVoidTy()) {
8866          VPValue *OriginalV = MemberR->getVPSingleValue();
8867          OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
8868          J++;
8869        }
8870        MemberR->eraseFromParent();
8871      }
8872  }
8873
8874  for (ElementCount VF : Range)
8875    Plan->addVF(VF);
8876  Plan->setName("Initial VPlan");
8877
  // Replace VPValues for known constant strides guaranteed by predicated
  // scalar evolution.
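  // For example (illustration): if the versioned predicate guarantees that a
  // symbolic stride %stride is 1, all uses of the VPValue modeling %stride are
  // replaced with the live-in constant 1.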
8880  for (auto [_, Stride] : Legal->getLAI()->getSymbolicStrides()) {
8881    auto *StrideV = cast<SCEVUnknown>(Stride)->getValue();
8882    auto *ScevStride = dyn_cast<SCEVConstant>(PSE.getSCEV(StrideV));
8883    // Only handle constant strides for now.
8884    if (!ScevStride)
8885      continue;
8886    Constant *CI = ConstantInt::get(Stride->getType(), ScevStride->getAPInt());
8887
8888    auto *ConstVPV = Plan->getVPValueOrAddLiveIn(CI);
8889    // The versioned value may not be used in the loop directly, so just add a
8890    // new live-in in those cases.
8891    Plan->getVPValueOrAddLiveIn(StrideV)->replaceAllUsesWith(ConstVPV);
8892  }
8893
  // From this point onwards, VPlan-to-VPlan transformations may change the plan
  // in ways that make accessing values via the original IR values incorrect.
8896  Plan->disableValue2VPValue();
8897
8898  // Sink users of fixed-order recurrence past the recipe defining the previous
8899  // value and introduce FirstOrderRecurrenceSplice VPInstructions.
8900  if (!VPlanTransforms::adjustFixedOrderRecurrences(*Plan, Builder))
8901    return nullptr;
8902
8903  if (useActiveLaneMask(Style)) {
8904    // TODO: Move checks to VPlanTransforms::addActiveLaneMask once
8905    // TailFoldingStyle is visible there.
8906    bool ForControlFlow = useActiveLaneMaskForControlFlow(Style);
8907    bool WithoutRuntimeCheck =
8908        Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
8909    VPlanTransforms::addActiveLaneMask(*Plan, ForControlFlow,
8910                                       WithoutRuntimeCheck);
8911  }
8912  return Plan;
8913}
8914
8915VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
  // Outer loop handling: outer loops may require CFG and instruction level
  // transformations before even evaluating whether vectorization is profitable.
8918  // Since we cannot modify the incoming IR, we need to build VPlan upfront in
8919  // the vectorization pipeline.
8920  assert(!OrigLoop->isInnermost());
8921  assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
8922
8923  // Create new empty VPlan
8924  auto Plan = VPlan::createInitialVPlan(
8925      createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop),
8926      *PSE.getSE());
8927
8928  // Build hierarchical CFG
8929  VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
8930  HCFGBuilder.buildHierarchicalCFG();
8931
8932  for (ElementCount VF : Range)
8933    Plan->addVF(VF);
8934
8935  VPlanTransforms::VPInstructionsToVPRecipes(
8936      Plan,
8937      [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); },
8938      *PSE.getSE(), *TLI);
8939
8940  // Remove the existing terminator of the exiting block of the top-most region.
8941  // A BranchOnCount will be added instead when adding the canonical IV recipes.
8942  auto *Term =
8943      Plan->getVectorLoopRegion()->getExitingBasicBlock()->getTerminator();
8944  Term->eraseFromParent();
8945
8946  // Tail folding is not supported for outer loops, so the induction increment
8947  // is guaranteed to not wrap.
8948  bool HasNUW = true;
8949  addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW,
8950                        DebugLoc());
8951  return Plan;
8952}
8953
// Adjust the recipes for reductions. For in-loop reductions, the chain of
// instructions leading from the loop exit instruction to the phi needs to be
// converted to reductions, with one operand being vector and the other being
// the scalar reduction chain. For other reductions, a select is introduced
// between the phi and live-out recipes when folding the tail.
8959//
8960// A ComputeReductionResult recipe is added to the middle block, also for
8961// in-loop reductions which compute their result in-loop, because generating
8962// the subsequent bc.merge.rdx phi is driven by ComputeReductionResult recipes.
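//
// Schematically (an illustrative sketch, not the exact VPlan dump), an in-loop
// integer add reduction link such as
//   WIDEN ir<%add> = add vp<%red.phi>, ir<%val>
// becomes
//   REDUCE vp<%red.next> = reduce.add(ir<%val>, vp<%red.phi>)[, mask]
// with the chain operand remaining scalar and the vector operand being reduced.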
8963void LoopVectorizationPlanner::adjustRecipesForReductions(
8964    VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder,
8965    ElementCount MinVF) {
8966  VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion();
8967  VPBasicBlock *Header = VectorLoopRegion->getEntryBasicBlock();
  // Gather all VPReductionPHIRecipes and sort them so that intermediate stores
  // sunk outside of the loop keep the same order as they had in the original
  // loop.
8971  SmallVector<VPReductionPHIRecipe *> ReductionPHIList;
8972  for (VPRecipeBase &R : Header->phis()) {
8973    if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
8974      ReductionPHIList.emplace_back(ReductionPhi);
8975  }
8976  bool HasIntermediateStore = false;
8977  stable_sort(ReductionPHIList,
8978              [this, &HasIntermediateStore](const VPReductionPHIRecipe *R1,
8979                                            const VPReductionPHIRecipe *R2) {
8980                auto *IS1 = R1->getRecurrenceDescriptor().IntermediateStore;
8981                auto *IS2 = R2->getRecurrenceDescriptor().IntermediateStore;
8982                HasIntermediateStore |= IS1 || IS2;
8983
8984                // If neither of the recipes has an intermediate store, keep the
8985                // order the same.
8986                if (!IS1 && !IS2)
8987                  return false;
8988
8989                // If only one of the recipes has an intermediate store, then
8990                // move it towards the beginning of the list.
8991                if (IS1 && !IS2)
8992                  return true;
8993
8994                if (!IS1 && IS2)
8995                  return false;
8996
8997                // If both recipes have an intermediate store, then the recipe
8998                // with the later store should be processed earlier. So it
8999                // should go to the beginning of the list.
9000                return DT->dominates(IS2, IS1);
9001              });
9002
9003  if (HasIntermediateStore && ReductionPHIList.size() > 1)
9004    for (VPRecipeBase *R : ReductionPHIList)
9005      R->moveBefore(*Header, Header->getFirstNonPhi());
9006
9007  for (VPRecipeBase &R : Header->phis()) {
9008    auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
9009    if (!PhiR || !PhiR->isInLoop() || (MinVF.isScalar() && !PhiR->isOrdered()))
9010      continue;
9011
9012    const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
9013    RecurKind Kind = RdxDesc.getRecurrenceKind();
9014    assert(!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) &&
9015           "AnyOf reductions are not allowed for in-loop reductions");
9016
9017    // Collect the chain of "link" recipes for the reduction starting at PhiR.
9018    SetVector<VPSingleDefRecipe *> Worklist;
9019    Worklist.insert(PhiR);
9020    for (unsigned I = 0; I != Worklist.size(); ++I) {
9021      VPSingleDefRecipe *Cur = Worklist[I];
9022      for (VPUser *U : Cur->users()) {
9023        auto *UserRecipe = dyn_cast<VPSingleDefRecipe>(U);
9024        if (!UserRecipe) {
9025          assert(isa<VPLiveOut>(U) &&
9026                 "U must either be a VPSingleDef or VPLiveOut");
9027          continue;
9028        }
9029        Worklist.insert(UserRecipe);
9030      }
9031    }
9032
    // Visit operation "Links" along the reduction chain top-down, starting from
    // the phi until the loop exit value. We keep track of the previous item
    // (PreviousLink) to tell which of the two operands of a Link will remain
    // scalar and which will be reduced. For minmax by select(cmp), Link will be
    // the select instruction.
9038    VPSingleDefRecipe *PreviousLink = PhiR; // Aka Worklist[0].
9039    for (VPSingleDefRecipe *CurrentLink : Worklist.getArrayRef().drop_front()) {
9040      Instruction *CurrentLinkI = CurrentLink->getUnderlyingInstr();
9041
9042      // Index of the first operand which holds a non-mask vector operand.
9043      unsigned IndexOfFirstOperand;
9044      // Recognize a call to the llvm.fmuladd intrinsic.
9045      bool IsFMulAdd = (Kind == RecurKind::FMulAdd);
9046      VPValue *VecOp;
9047      VPBasicBlock *LinkVPBB = CurrentLink->getParent();
9048      if (IsFMulAdd) {
9049        assert(
9050            RecurrenceDescriptor::isFMulAddIntrinsic(CurrentLinkI) &&
9051            "Expected instruction to be a call to the llvm.fmuladd intrinsic");
9052        assert(((MinVF.isScalar() && isa<VPReplicateRecipe>(CurrentLink)) ||
9053                isa<VPWidenCallRecipe>(CurrentLink)) &&
9054               CurrentLink->getOperand(2) == PreviousLink &&
9055               "expected a call where the previous link is the added operand");
9056
9057        // If the instruction is a call to the llvm.fmuladd intrinsic then we
9058        // need to create an fmul recipe (multiplying the first two operands of
9059        // the fmuladd together) to use as the vector operand for the fadd
9060        // reduction.
9061        VPInstruction *FMulRecipe = new VPInstruction(
9062            Instruction::FMul,
9063            {CurrentLink->getOperand(0), CurrentLink->getOperand(1)},
9064            CurrentLinkI->getFastMathFlags());
9065        LinkVPBB->insert(FMulRecipe, CurrentLink->getIterator());
9066        VecOp = FMulRecipe;
9067      } else {
9068        if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9069          if (isa<VPWidenRecipe>(CurrentLink)) {
9070            assert(isa<CmpInst>(CurrentLinkI) &&
9071                   "need to have the compare of the select");
9072            continue;
9073          }
9074          assert(isa<VPWidenSelectRecipe>(CurrentLink) &&
9075                 "must be a select recipe");
9076          IndexOfFirstOperand = 1;
9077        } else {
9078          assert((MinVF.isScalar() || isa<VPWidenRecipe>(CurrentLink)) &&
9079                 "Expected to replace a VPWidenSC");
9080          IndexOfFirstOperand = 0;
9081        }
9082        // Note that for non-commutable operands (cmp-selects), the semantics of
9083        // the cmp-select are captured in the recurrence kind.
9084        unsigned VecOpId =
9085            CurrentLink->getOperand(IndexOfFirstOperand) == PreviousLink
9086                ? IndexOfFirstOperand + 1
9087                : IndexOfFirstOperand;
9088        VecOp = CurrentLink->getOperand(VecOpId);
9089        assert(VecOp != PreviousLink &&
9090               CurrentLink->getOperand(CurrentLink->getNumOperands() - 1 -
9091                                       (VecOpId - IndexOfFirstOperand)) ==
9092                   PreviousLink &&
9093               "PreviousLink must be the operand other than VecOp");
9094      }
9095
9096      BasicBlock *BB = CurrentLinkI->getParent();
9097      VPValue *CondOp = nullptr;
9098      if (CM.blockNeedsPredicationForAnyReason(BB)) {
9099        VPBuilder::InsertPointGuard Guard(Builder);
9100        Builder.setInsertPoint(CurrentLink);
9101        CondOp = RecipeBuilder.getBlockInMask(BB);
9102      }
9103
9104      VPReductionRecipe *RedRecipe = new VPReductionRecipe(
9105          RdxDesc, CurrentLinkI, PreviousLink, VecOp, CondOp);
9106      // Append the recipe to the end of the VPBasicBlock because we need to
      // ensure that it comes after all of its inputs, including CondOp.
9108      // Note that this transformation may leave over dead recipes (including
9109      // CurrentLink), which will be cleaned by a later VPlan transform.
9110      LinkVPBB->appendRecipe(RedRecipe);
9111      CurrentLink->replaceAllUsesWith(RedRecipe);
9112      PreviousLink = RedRecipe;
9113    }
9114  }
9115  Builder.setInsertPoint(&*LatchVPBB->begin());
9116  for (VPRecipeBase &R :
9117       Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
9118    VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
9119    if (!PhiR)
9120      continue;
9121
9122    const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
    // If the tail is folded by masking, introduce selects between the phi
9124    // and the live-out instruction of each reduction, at the beginning of the
9125    // dedicated latch block.
9126    auto *OrigExitingVPV = PhiR->getBackedgeValue();
9127    auto *NewExitingVPV = PhiR->getBackedgeValue();
9128    if (!PhiR->isInLoop() && CM.foldTailByMasking()) {
9129      VPValue *Cond = RecipeBuilder.getBlockInMask(OrigLoop->getHeader());
9130      assert(OrigExitingVPV->getDefiningRecipe()->getParent() != LatchVPBB &&
9131             "reduction recipe must be defined before latch");
9132      Type *PhiTy = PhiR->getOperand(0)->getLiveInIRValue()->getType();
9133      std::optional<FastMathFlags> FMFs =
9134          PhiTy->isFloatingPointTy()
9135              ? std::make_optional(RdxDesc.getFastMathFlags())
9136              : std::nullopt;
9137      NewExitingVPV =
9138          Builder.createSelect(Cond, OrigExitingVPV, PhiR, {}, "", FMFs);
9139      OrigExitingVPV->replaceUsesWithIf(NewExitingVPV, [](VPUser &U, unsigned) {
9140        return isa<VPInstruction>(&U) &&
9141               cast<VPInstruction>(&U)->getOpcode() ==
9142                   VPInstruction::ComputeReductionResult;
9143      });
9144      if (PreferPredicatedReductionSelect ||
9145          TTI.preferPredicatedReductionSelect(
9146              PhiR->getRecurrenceDescriptor().getOpcode(), PhiTy,
9147              TargetTransformInfo::ReductionFlags()))
9148        PhiR->setOperand(1, NewExitingVPV);
9149    }
9150
9151    // If the vector reduction can be performed in a smaller type, we truncate
9152    // then extend the loop exit value to enable InstCombine to evaluate the
9153    // entire expression in the smaller type.
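    // For example (illustrative IR), an i32 reduction that fits in i8 yields
    //   %t = trunc <VF x i32> %rdx to <VF x i8>
    //   %e = sext <VF x i8> %t to <VF x i32>   ; or zext for unsigned
    // with %e feeding the reduction result computation.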
9154    Type *PhiTy = PhiR->getStartValue()->getLiveInIRValue()->getType();
9155    if (MinVF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
9156      assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
9157      Type *RdxTy = RdxDesc.getRecurrenceType();
9158      auto *Trunc =
9159          new VPWidenCastRecipe(Instruction::Trunc, NewExitingVPV, RdxTy);
9160      auto *Extnd =
9161          RdxDesc.isSigned()
9162              ? new VPWidenCastRecipe(Instruction::SExt, Trunc, PhiTy)
9163              : new VPWidenCastRecipe(Instruction::ZExt, Trunc, PhiTy);
9164
9165      Trunc->insertAfter(NewExitingVPV->getDefiningRecipe());
9166      Extnd->insertAfter(Trunc);
9167      if (PhiR->getOperand(1) == NewExitingVPV)
9168        PhiR->setOperand(1, Extnd->getVPSingleValue());
9169      NewExitingVPV = Extnd;
9170    }
9171
9172    // We want code in the middle block to appear to execute on the location of
9173    // the scalar loop's latch terminator because: (a) it is all compiler
9174    // generated, (b) these instructions are always executed after evaluating
9175    // the latch conditional branch, and (c) other passes may add new
9176    // predecessors which terminate on this line. This is the easiest way to
9177    // ensure we don't accidentally cause an extra step back into the loop while
9178    // debugging.
9179    DebugLoc ExitDL = OrigLoop->getLoopLatch()->getTerminator()->getDebugLoc();
9180
9181    // TODO: At the moment ComputeReductionResult also drives creation of the
9182    // bc.merge.rdx phi nodes, hence it needs to be created unconditionally here
9183    // even for in-loop reductions, until the reduction resume value handling is
9184    // also modeled in VPlan.
9185    auto *FinalReductionResult = new VPInstruction(
9186        VPInstruction::ComputeReductionResult, {PhiR, NewExitingVPV}, ExitDL);
9187    cast<VPBasicBlock>(VectorLoopRegion->getSingleSuccessor())
9188        ->appendRecipe(FinalReductionResult);
9189    OrigExitingVPV->replaceUsesWithIf(
9190        FinalReductionResult,
9191        [](VPUser &User, unsigned) { return isa<VPLiveOut>(&User); });
9192  }
9193
9194  VPlanTransforms::clearReductionWrapFlags(*Plan);
9195}
9196
9197#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
9198void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
9199                               VPSlotTracker &SlotTracker) const {
9200  O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
9201  IG->getInsertPos()->printAsOperand(O, false);
9202  O << ", ";
9203  getAddr()->printAsOperand(O, SlotTracker);
9204  VPValue *Mask = getMask();
9205  if (Mask) {
9206    O << ", ";
9207    Mask->printAsOperand(O, SlotTracker);
9208  }
9209
9210  unsigned OpIdx = 0;
9211  for (unsigned i = 0; i < IG->getFactor(); ++i) {
9212    if (!IG->getMember(i))
9213      continue;
9214    if (getNumStoreOperands() > 0) {
9215      O << "\n" << Indent << "  store ";
9216      getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker);
9217      O << " to index " << i;
9218    } else {
9219      O << "\n" << Indent << "  ";
9220      getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
9221      O << " = load from index " << i;
9222    }
9223    ++OpIdx;
9224  }
9225}
9226#endif
9227
9228void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
9229  assert(IndDesc.getKind() == InductionDescriptor::IK_PtrInduction &&
9230         "Not a pointer induction according to InductionDescriptor!");
9231  assert(cast<PHINode>(getUnderlyingInstr())->getType()->isPointerTy() &&
9232         "Unexpected type.");
9233
9234  auto *IVR = getParent()->getPlan()->getCanonicalIV();
9235  PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0));
9236
9237  if (onlyScalarsGenerated(State.VF)) {
9238    // This is the normalized GEP that starts counting at zero.
9239    Value *PtrInd = State.Builder.CreateSExtOrTrunc(
9240        CanonicalIV, IndDesc.getStep()->getType());
9241    // Determine the number of scalars we need to generate for each unroll
9242    // iteration. If the instruction is uniform, we only need to generate the
9243    // first lane. Otherwise, we generate all VF values.
9244    bool IsUniform = vputils::onlyFirstLaneUsed(this);
9245    assert((IsUniform || !State.VF.isScalable()) &&
9246           "Cannot scalarize a scalable VF");
9247    unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue();
9248
9249    for (unsigned Part = 0; Part < State.UF; ++Part) {
9250      Value *PartStart =
9251          createStepForVF(State.Builder, PtrInd->getType(), State.VF, Part);
9252
9253      for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
9254        Value *Idx = State.Builder.CreateAdd(
9255            PartStart, ConstantInt::get(PtrInd->getType(), Lane));
9256        Value *GlobalIdx = State.Builder.CreateAdd(PtrInd, Idx);
9257
9258        Value *Step = State.get(getOperand(1), VPIteration(Part, Lane));
9259        Value *SclrGep = emitTransformedIndex(
9260            State.Builder, GlobalIdx, IndDesc.getStartValue(), Step,
9261            IndDesc.getKind(), IndDesc.getInductionBinOp());
9262        SclrGep->setName("next.gep");
9263        State.set(this, SclrGep, VPIteration(Part, Lane));
9264      }
9265    }
9266    return;
9267  }
9268
9269  Type *PhiType = IndDesc.getStep()->getType();
9270
9271  // Build a pointer phi
9272  Value *ScalarStartValue = getStartValue()->getLiveInIRValue();
9273  Type *ScStValueType = ScalarStartValue->getType();
9274  PHINode *NewPointerPhi =
9275      PHINode::Create(ScStValueType, 2, "pointer.phi", CanonicalIV);
9276
9277  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
9278  NewPointerPhi->addIncoming(ScalarStartValue, VectorPH);
9279
  // A pointer induction, performed by using a GEP.
9281  Instruction *InductionLoc = &*State.Builder.GetInsertPoint();
9282
9283  Value *ScalarStepValue = State.get(getOperand(1), VPIteration(0, 0));
9284  Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF);
9285  Value *NumUnrolledElems =
9286      State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF));
9287  Value *InductionGEP = GetElementPtrInst::Create(
9288      State.Builder.getInt8Ty(), NewPointerPhi,
9289      State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind",
9290      InductionLoc);
9291  // Add induction update using an incorrect block temporarily. The phi node
9292  // will be fixed after VPlan execution. Note that at this point the latch
9293  // block cannot be used, as it does not exist yet.
9294  // TODO: Model increment value in VPlan, by turning the recipe into a
9295  // multi-def and a subclass of VPHeaderPHIRecipe.
9296  NewPointerPhi->addIncoming(InductionGEP, VectorPH);
9297
  // Create UF actual address GEPs that use the pointer phi as their base and a
  // vectorized version of the step value (<step*0, ..., step*N>) as their
  // offset.
9301  for (unsigned Part = 0; Part < State.UF; ++Part) {
9302    Type *VecPhiType = VectorType::get(PhiType, State.VF);
9303    Value *StartOffsetScalar =
9304        State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part));
9305    Value *StartOffset =
9306        State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar);
9307    // Create a vector of consecutive numbers from zero to VF.
9308    StartOffset = State.Builder.CreateAdd(
9309        StartOffset, State.Builder.CreateStepVector(VecPhiType));
9310
9311    assert(ScalarStepValue == State.get(getOperand(1), VPIteration(Part, 0)) &&
9312           "scalar step must be the same across all parts");
9313    Value *GEP = State.Builder.CreateGEP(
9314        State.Builder.getInt8Ty(), NewPointerPhi,
9315        State.Builder.CreateMul(
9316            StartOffset,
9317            State.Builder.CreateVectorSplat(State.VF, ScalarStepValue),
9318            "vector.gep"));
9319    State.set(this, GEP, Part);
9320  }
9321}
9322
9323void VPDerivedIVRecipe::execute(VPTransformState &State) {
9324  assert(!State.Instance && "VPDerivedIVRecipe being replicated.");
9325
9326  // Fast-math-flags propagate from the original induction instruction.
9327  IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
9328  if (FPBinOp)
9329    State.Builder.setFastMathFlags(FPBinOp->getFastMathFlags());
9330
9331  Value *Step = State.get(getStepValue(), VPIteration(0, 0));
9332  Value *CanonicalIV = State.get(getCanonicalIV(), VPIteration(0, 0));
9333  Value *DerivedIV = emitTransformedIndex(
9334      State.Builder, CanonicalIV, getStartValue()->getLiveInIRValue(), Step,
9335      Kind, cast_if_present<BinaryOperator>(FPBinOp));
9336  DerivedIV->setName("offset.idx");
9337  if (TruncResultTy) {
9338    assert(TruncResultTy != DerivedIV->getType() &&
9339           Step->getType()->isIntegerTy() &&
9340           "Truncation requires an integer step");
9341    DerivedIV = State.Builder.CreateTrunc(DerivedIV, TruncResultTy);
9342  }
9343  assert(DerivedIV != CanonicalIV && "IV didn't need transforming?");
9344
9345  State.set(this, DerivedIV, VPIteration(0, 0));
9346}
9347
9348void VPInterleaveRecipe::execute(VPTransformState &State) {
9349  assert(!State.Instance && "Interleave group being replicated.");
9350  State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(),
9351                                      getStoredValues(), getMask(),
9352                                      NeedsMaskForGaps);
9353}
9354
9355void VPReductionRecipe::execute(VPTransformState &State) {
9356  assert(!State.Instance && "Reduction being replicated.");
9357  Value *PrevInChain = State.get(getChainOp(), 0);
9358  RecurKind Kind = RdxDesc.getRecurrenceKind();
9359  bool IsOrdered = State.ILV->useOrderedReductions(RdxDesc);
9360  // Propagate the fast-math flags carried by the underlying instruction.
9361  IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
9362  State.Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
9363  for (unsigned Part = 0; Part < State.UF; ++Part) {
9364    Value *NewVecOp = State.get(getVecOp(), Part);
9365    if (VPValue *Cond = getCondOp()) {
9366      Value *NewCond = State.VF.isVector() ? State.get(Cond, Part)
9367                                           : State.get(Cond, {Part, 0});
9368      VectorType *VecTy = dyn_cast<VectorType>(NewVecOp->getType());
9369      Type *ElementTy = VecTy ? VecTy->getElementType() : NewVecOp->getType();
9370      Value *Iden = RdxDesc.getRecurrenceIdentity(Kind, ElementTy,
9371                                                  RdxDesc.getFastMathFlags());
9372      if (State.VF.isVector()) {
9373        Iden =
9374            State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden);
9375      }
9376
9377      Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, Iden);
9378      NewVecOp = Select;
9379    }
9380    Value *NewRed;
9381    Value *NextInChain;
9382    if (IsOrdered) {
9383      if (State.VF.isVector())
9384        NewRed = createOrderedReduction(State.Builder, RdxDesc, NewVecOp,
9385                                        PrevInChain);
9386      else
9387        NewRed = State.Builder.CreateBinOp(
9388            (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), PrevInChain,
9389            NewVecOp);
9390      PrevInChain = NewRed;
9391    } else {
9392      PrevInChain = State.get(getChainOp(), Part);
9393      NewRed = createTargetReduction(State.Builder, RdxDesc, NewVecOp);
9394    }
9395    if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9396      NextInChain = createMinMaxOp(State.Builder, RdxDesc.getRecurrenceKind(),
9397                                   NewRed, PrevInChain);
9398    } else if (IsOrdered)
9399      NextInChain = NewRed;
9400    else
9401      NextInChain = State.Builder.CreateBinOp(
9402          (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), NewRed, PrevInChain);
9403    State.set(this, NextInChain, Part);
9404  }
9405}
9406
9407void VPReplicateRecipe::execute(VPTransformState &State) {
9408  Instruction *UI = getUnderlyingInstr();
9409  if (State.Instance) { // Generate a single instance.
9410    assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9411    State.ILV->scalarizeInstruction(UI, this, *State.Instance, State);
9412    // Insert scalar instance packing it into a vector.
9413    if (State.VF.isVector() && shouldPack()) {
9414      // If we're constructing lane 0, initialize to start from poison.
9415      if (State.Instance->Lane.isFirstLane()) {
9416        assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
9417        Value *Poison = PoisonValue::get(
9418            VectorType::get(UI->getType(), State.VF));
9419        State.set(this, Poison, State.Instance->Part);
9420      }
9421      State.packScalarIntoVectorValue(this, *State.Instance);
9422    }
9423    return;
9424  }
9425
9426  if (IsUniform) {
9427    // If the recipe is uniform across all parts (instead of just per VF), only
9428    // generate a single instance.
9429    if ((isa<LoadInst>(UI) || isa<StoreInst>(UI)) &&
9430        all_of(operands(), [](VPValue *Op) {
9431          return Op->isDefinedOutsideVectorRegions();
9432        })) {
9433      State.ILV->scalarizeInstruction(UI, this, VPIteration(0, 0), State);
9434      if (user_begin() != user_end()) {
9435        for (unsigned Part = 1; Part < State.UF; ++Part)
9436          State.set(this, State.get(this, VPIteration(0, 0)),
9437                    VPIteration(Part, 0));
9438      }
9439      return;
9440    }
9441
    // Uniform across the vector lanes means we only need to generate lane 0 for
    // each unrolled copy.
9444    for (unsigned Part = 0; Part < State.UF; ++Part)
9445      State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, 0), State);
9446    return;
9447  }
9448
9449  // A store of a loop varying value to a uniform address only needs the last
9450  // copy of the store.
9451  if (isa<StoreInst>(UI) &&
9452      vputils::isUniformAfterVectorization(getOperand(1))) {
9453    auto Lane = VPLane::getLastLaneForVF(State.VF);
9454    State.ILV->scalarizeInstruction(UI, this, VPIteration(State.UF - 1, Lane),
9455                                    State);
9456    return;
9457  }
9458
9459  // Generate scalar instances for all VF lanes of all UF parts.
9460  assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9461  const unsigned EndLane = State.VF.getKnownMinValue();
9462  for (unsigned Part = 0; Part < State.UF; ++Part)
9463    for (unsigned Lane = 0; Lane < EndLane; ++Lane)
9464      State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, Lane), State);
9465}
9466
9467void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
9468  VPValue *StoredValue = isStore() ? getStoredValue() : nullptr;
9469
  // Determine whether the widened memory ingredient is a load or a store.
9471  LoadInst *LI = dyn_cast<LoadInst>(&Ingredient);
9472  StoreInst *SI = dyn_cast<StoreInst>(&Ingredient);
9473
9474  assert((LI || SI) && "Invalid Load/Store instruction");
9475  assert((!SI || StoredValue) && "No stored value provided for widened store");
9476  assert((!LI || !StoredValue) && "Stored value provided for widened load");
9477
9478  Type *ScalarDataTy = getLoadStoreType(&Ingredient);
9479
9480  auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
9481  const Align Alignment = getLoadStoreAlignment(&Ingredient);
9482  bool CreateGatherScatter = !isConsecutive();
9483
9484  auto &Builder = State.Builder;
9485  InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF);
9486  bool isMaskRequired = getMask();
9487  if (isMaskRequired) {
    // A null mask stands for an all-ones mask; its reverse is itself, so mask
    // reversal is only needed for actual (non-null) masks.
9490    for (unsigned Part = 0; Part < State.UF; ++Part) {
9491      Value *Mask = State.get(getMask(), Part);
9492      if (isReverse())
9493        Mask = Builder.CreateVectorReverse(Mask, "reverse");
9494      BlockInMaskParts[Part] = Mask;
9495    }
9496  }
9497
9498  // Handle Stores:
9499  if (SI) {
9500    State.setDebugLocFrom(SI->getDebugLoc());
9501
9502    for (unsigned Part = 0; Part < State.UF; ++Part) {
9503      Instruction *NewSI = nullptr;
9504      Value *StoredVal = State.get(StoredValue, Part);
9505      if (CreateGatherScatter) {
9506        Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
9507        Value *VectorGep = State.get(getAddr(), Part);
9508        NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
9509                                            MaskPart);
9510      } else {
9511        if (isReverse()) {
9512          // If we store to reverse consecutive memory locations, then we need
9513          // to reverse the order of elements in the stored value.
9514          StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse");
9515          // We don't want to update the value in the map as it might be used in
9516          // another expression. So don't call resetVectorValue(StoredVal).
9517        }
9518        auto *VecPtr = State.get(getAddr(), Part);
9519        if (isMaskRequired)
9520          NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
9521                                            BlockInMaskParts[Part]);
9522        else
9523          NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
9524      }
9525      State.addMetadata(NewSI, SI);
9526    }
9527    return;
9528  }
9529
9530  // Handle loads.
9531  assert(LI && "Must have a load instruction");
9532  State.setDebugLocFrom(LI->getDebugLoc());
9533  for (unsigned Part = 0; Part < State.UF; ++Part) {
9534    Value *NewLI;
9535    if (CreateGatherScatter) {
9536      Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
9537      Value *VectorGep = State.get(getAddr(), Part);
9538      NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart,
9539                                         nullptr, "wide.masked.gather");
9540      State.addMetadata(NewLI, LI);
9541    } else {
9542      auto *VecPtr = State.get(getAddr(), Part);
9543      if (isMaskRequired)
9544        NewLI = Builder.CreateMaskedLoad(
9545            DataTy, VecPtr, Alignment, BlockInMaskParts[Part],
9546            PoisonValue::get(DataTy), "wide.masked.load");
9547      else
9548        NewLI =
9549            Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
9550
9551      // Add metadata to the load, but setVectorValue to the reverse shuffle.
9552      State.addMetadata(NewLI, LI);
9553      if (Reverse)
9554        NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
9555    }
9556
9557    State.set(getVPSingleValue(), NewLI, Part);
9558  }
9559}
9560
// Determine how to lower the scalar epilogue, which depends on 1) optimizing
// for minimum code-size, 2) predication compiler options, 3) loop hints forcing
// predication, and 4) a TTI hook that analyzes whether the loop is suitable
// for predication.
9565static ScalarEpilogueLowering getScalarEpilogueLowering(
9566    Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
9567    BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
9568    LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI) {
9569  // 1) OptSize takes precedence over all other options, i.e. if this is set,
9570  // don't look at hints or options, and don't request a scalar epilogue.
9571  // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
9572  // LoopAccessInfo (due to code dependency and not being able to reliably get
9573  // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
9574  // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
9575  // versioning when the vectorization is forced, unlike hasOptSize. So revert
9576  // back to the old way and vectorize with versioning when forced. See D81345.)
9577  if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
9578                                                      PGSOQueryType::IRPass) &&
9579                          Hints.getForce() != LoopVectorizeHints::FK_Enabled))
9580    return CM_ScalarEpilogueNotAllowedOptSize;
9581
9582  // 2) If set, obey the directives
9583  if (PreferPredicateOverEpilogue.getNumOccurrences()) {
9584    switch (PreferPredicateOverEpilogue) {
9585    case PreferPredicateTy::ScalarEpilogue:
9586      return CM_ScalarEpilogueAllowed;
9587    case PreferPredicateTy::PredicateElseScalarEpilogue:
9588      return CM_ScalarEpilogueNotNeededUsePredicate;
9589    case PreferPredicateTy::PredicateOrDontVectorize:
9590      return CM_ScalarEpilogueNotAllowedUsePredicate;
    }
9592  }
9593
9594  // 3) If set, obey the hints
9595  switch (Hints.getPredicate()) {
9596  case LoopVectorizeHints::FK_Enabled:
9597    return CM_ScalarEpilogueNotNeededUsePredicate;
9598  case LoopVectorizeHints::FK_Disabled:
9599    return CM_ScalarEpilogueAllowed;
  }
9601
  // 4) If the TTI hook indicates this is profitable, request predication.
9603  TailFoldingInfo TFI(TLI, &LVL, IAI);
9604  if (TTI->preferPredicateOverEpilogue(&TFI))
9605    return CM_ScalarEpilogueNotNeededUsePredicate;
9606
9607  return CM_ScalarEpilogueAllowed;
9608}
9609
9610// Process the loop in the VPlan-native vectorization path. This path builds
9611// VPlan upfront in the vectorization pipeline, which allows to apply
9612// VPlan-to-VPlan transformations from the very beginning without modifying the
9613// input LLVM IR.
9614static bool processLoopInVPlanNativePath(
9615    Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
9616    LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
9617    TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
9618    OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
9619    ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints,
9620    LoopVectorizationRequirements &Requirements) {
9621
9622  if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
9623    LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
9624    return false;
9625  }
9626  assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
9627  Function *F = L->getHeader()->getParent();
9628  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
9629
9630  ScalarEpilogueLowering SEL =
9631      getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, *LVL, &IAI);
9632
9633  LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
9634                                &Hints, IAI);
9635  // Use the planner for outer loop vectorization.
9636  // TODO: CM is not used at this point inside the planner. Turn CM into an
9637  // optional argument if we don't need it in the future.
9638  LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, LVL, CM, IAI, PSE, Hints,
9639                               ORE);
9640
9641  // Get user vectorization factor.
9642  ElementCount UserVF = Hints.getWidth();
9643
9644  CM.collectElementTypesForWidening();
9645
9646  // Plan how to best vectorize, return the best VF and its cost.
9647  const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
9648
9649  // If we are stress testing VPlan builds, do not attempt to generate vector
9650  // code. Masked vector code generation support will follow soon.
9651  // Also, do not attempt to vectorize if no vector code will be produced.
9652  if (VPlanBuildStressTest || VectorizationFactor::Disabled() == VF)
9653    return false;
9654
9655  VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
9656
9657  {
9658    bool AddBranchWeights =
9659        hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
9660    GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
9661                             F->getParent()->getDataLayout(), AddBranchWeights);
9662    InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
9663                           VF.Width, 1, LVL, &CM, BFI, PSI, Checks);
9664    LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
9665                      << L->getHeader()->getParent()->getName() << "\"\n");
9666    LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false);
9667  }
9668
9669  reportVectorization(ORE, L, VF, 1);
9670
9671  // Mark the loop as already vectorized to avoid vectorizing again.
9672  Hints.setAlreadyVectorized();
9673  assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
9674  return true;
9675}
9676
// Emit a remark if there are stores to floats that required a floating point
// extension. If the vectorized loop was generated with such extensions, there
// will be a performance penalty from the conversion overhead and the change in
// the vector width.
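//
// For example (illustrative IR), a store fed by a chain such as
//   %x.ext = fpext float %x to double
//   %mul   = fmul double %x.ext, %y
//   %res   = fptrunc double %mul to float
//   store float %res, ptr %p
// triggers the remark, because the fpext indicates the vectorized loop operates
// on the wider double type.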
9681static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
9682  SmallVector<Instruction *, 4> Worklist;
9683  for (BasicBlock *BB : L->getBlocks()) {
9684    for (Instruction &Inst : *BB) {
9685      if (auto *S = dyn_cast<StoreInst>(&Inst)) {
9686        if (S->getValueOperand()->getType()->isFloatTy())
9687          Worklist.push_back(S);
9688      }
9689    }
9690  }
9691
  // Traverse upwards from the floating point stores, searching for floating
  // point conversions.
9694  SmallPtrSet<const Instruction *, 4> Visited;
9695  SmallPtrSet<const Instruction *, 4> EmittedRemark;
9696  while (!Worklist.empty()) {
9697    auto *I = Worklist.pop_back_val();
9698    if (!L->contains(I))
9699      continue;
9700    if (!Visited.insert(I).second)
9701      continue;
9702
9703    // Emit a remark if the floating point store required a floating
9704    // point conversion.
9705    // TODO: More work could be done to identify the root cause such as a
9706    // constant or a function return type and point the user to it.
9707    if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
9708      ORE->emit([&]() {
9709        return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
9710                                          I->getDebugLoc(), L->getHeader())
9711               << "floating point conversion changes vector width. "
9712               << "Mixed floating point precision requires an up/down "
9713               << "cast that will negatively impact performance.";
9714      });
9715
9716    for (Use &Op : I->operands())
9717      if (auto *OpI = dyn_cast<Instruction>(Op))
9718        Worklist.push_back(OpI);
9719  }
9720}
9721
9722static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
9723                                       VectorizationFactor &VF,
9724                                       std::optional<unsigned> VScale, Loop *L,
9725                                       ScalarEvolution &SE,
9726                                       ScalarEpilogueLowering SEL) {
9727  InstructionCost CheckCost = Checks.getCost();
9728  if (!CheckCost.isValid())
9729    return false;
9730
  // When interleaving only, the scalar and vector costs will be equal, which in
  // turn would lead to a divide by 0. Fall back to the hard threshold.
9733  if (VF.Width.isScalar()) {
9734    if (CheckCost > VectorizeMemoryCheckThreshold) {
9735      LLVM_DEBUG(
9736          dbgs()
9737          << "LV: Interleaving only is not profitable due to runtime checks\n");
9738      return false;
9739    }
9740    return true;
9741  }
9742
  // The scalar cost should only be 0 when vectorizing with a user specified
  // VF/IC. In those cases, runtime checks should always be generated.
9744  double ScalarC = *VF.ScalarCost.getValue();
9745  if (ScalarC == 0)
9746    return true;
9747
9748  // First, compute the minimum iteration count required so that the vector
9749  // loop outperforms the scalar loop.
9750  //  The total cost of the scalar loop is
9751  //   ScalarC * TC
9752  //  where
9753  //  * TC is the actual trip count of the loop.
9754  //  * ScalarC is the cost of a single scalar iteration.
9755  //
9756  //  The total cost of the vector loop is
9757  //    RtC + VecC * (TC / VF) + EpiC
9758  //  where
9759  //  * RtC is the cost of the generated runtime checks
9760  //  * VecC is the cost of a single vector iteration.
9761  //  * TC is the actual trip count of the loop
9762  //  * VF is the vectorization factor
  //  * EpiC is the cost of the generated epilogue, including the cost
9764  //    of the remaining scalar operations.
9765  //
9766  // Vectorization is profitable once the total vector cost is less than the
9767  // total scalar cost:
9768  //   RtC + VecC * (TC / VF) + EpiC <  ScalarC * TC
9769  //
9770  // Now we can compute the minimum required trip count TC as
9771  //   (RtC + EpiC) / (ScalarC - (VecC / VF)) < TC
9772  //
9773  // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
9774  // the computations are performed on doubles, not integers and the result
9775  // is rounded up, hence we get an upper estimate of the TC.
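  //
  // Illustrative numbers only: with RtC = 20, ScalarC = 4, VecC = 8 and VF = 4,
  // MinTC1 = 20 / (4 - 8/4) = 10, i.e. at least 10 iterations are needed before
  // the vector loop plus the runtime checks beat the scalar loop.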
9776  unsigned IntVF = VF.Width.getKnownMinValue();
9777  if (VF.Width.isScalable()) {
9778    unsigned AssumedMinimumVscale = 1;
9779    if (VScale)
9780      AssumedMinimumVscale = *VScale;
9781    IntVF *= AssumedMinimumVscale;
9782  }
9783  double VecCOverVF = double(*VF.Cost.getValue()) / IntVF;
9784  double RtC = *CheckCost.getValue();
9785  double MinTC1 = RtC / (ScalarC - VecCOverVF);
9786
9787  // Second, compute a minimum iteration count so that the cost of the
9788  // runtime checks is only a fraction of the total scalar loop cost. This
9789  // adds a loop-dependent bound on the overhead incurred if the runtime
9790  // checks fail. In case the runtime checks fail, the cost is RtC + ScalarC
9791  // * TC. To bound the runtime check to be a fraction 1/X of the scalar
9792  // cost, compute
9793  //   RtC < ScalarC * TC * (1 / X)  ==>  RtC * X / ScalarC < TC
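  // With X = 10 (the value used below) and the illustrative numbers above,
  // MinTC2 = 20 * 10 / 4 = 50, so this bound would be the larger of the two.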
9794  double MinTC2 = RtC * 10 / ScalarC;
9795
9796  // Now pick the larger minimum. If it is not a multiple of VF and a scalar
9797  // epilogue is allowed, choose the next closest multiple of VF. This should
9798  // partly compensate for ignoring the epilogue cost.
9799  uint64_t MinTC = std::ceil(std::max(MinTC1, MinTC2));
9800  if (SEL == CM_ScalarEpilogueAllowed)
9801    MinTC = alignTo(MinTC, IntVF);
9802  VF.MinProfitableTripCount = ElementCount::getFixed(MinTC);
9803
9804  LLVM_DEBUG(
9805      dbgs() << "LV: Minimum required TC for runtime checks to be profitable:"
9806             << VF.MinProfitableTripCount << "\n");
9807
9808  // Skip vectorization if the expected trip count is less than the minimum
9809  // required trip count.
9810  if (auto ExpectedTC = getSmallBestKnownTC(SE, L)) {
9811    if (ElementCount::isKnownLT(ElementCount::getFixed(*ExpectedTC),
9812                                VF.MinProfitableTripCount)) {
9813      LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected "
9814                           "trip count < minimum profitable VF ("
9815                        << *ExpectedTC << " < " << VF.MinProfitableTripCount
9816                        << ")\n");
9817
9818      return false;
9819    }
9820  }
9821  return true;
9822}
9823
9824LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
9825    : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
9826                               !EnableLoopInterleaving),
9827      VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
9828                              !EnableLoopVectorization) {}
9829
9830bool LoopVectorizePass::processLoop(Loop *L) {
9831  assert((EnableVPlanNativePath || L->isInnermost()) &&
9832         "VPlan-native path is not enabled. Only process inner loops.");
9833
9834#ifndef NDEBUG
9835  const std::string DebugLocStr = getDebugLocString(L);
9836#endif /* NDEBUG */
9837
9838  LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '"
9839                    << L->getHeader()->getParent()->getName() << "' from "
9840                    << DebugLocStr << "\n");
9841
9842  LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI);
9843
9844  LLVM_DEBUG(
9845      dbgs() << "LV: Loop hints:"
9846             << " force="
9847             << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
9848                     ? "disabled"
9849                     : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
9850                            ? "enabled"
9851                            : "?"))
9852             << " width=" << Hints.getWidth()
9853             << " interleave=" << Hints.getInterleave() << "\n");
9854
  // Function containing the loop.
9856  Function *F = L->getHeader()->getParent();
9857
9858  // Looking at the diagnostic output is the only way to determine if a loop
9859  // was vectorized (other than looking at the IR or machine code), so it
9860  // is important to generate an optimization remark for each loop. Most of
  // these messages are generated as OptimizationRemarkAnalysis. Remarks
  // generated as OptimizationRemark and OptimizationRemarkMissed are less
  // verbose; they report vectorized loops and unvectorized loops that may
  // benefit from vectorization, respectively.
9865
9866  if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
9867    LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
9868    return false;
9869  }
9870
9871  PredicatedScalarEvolution PSE(*SE, *L);
9872
9873  // Check if it is legal to vectorize the loop.
9874  LoopVectorizationRequirements Requirements;
9875  LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, F, *LAIs, LI, ORE,
9876                                &Requirements, &Hints, DB, AC, BFI, PSI);
9877  if (!LVL.canVectorize(EnableVPlanNativePath)) {
9878    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
9879    Hints.emitRemarkWithHints();
9880    return false;
9881  }
9882
9883  // Entrance to the VPlan-native vectorization path. Outer loops are processed
9884  // here. They may require CFG and instruction level transformations before
9885  // even evaluating whether vectorization is profitable. Since we cannot modify
9886  // the incoming IR, we need to build VPlan upfront in the vectorization
9887  // pipeline.
9888  if (!L->isInnermost())
9889    return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
9890                                        ORE, BFI, PSI, Hints, Requirements);
9891
9892  assert(L->isInnermost() && "Inner loop expected.");
9893
9894  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
9895  bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
9896
9897  // If an override option has been passed in for interleaved accesses, use it.
9898  if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
9899    UseInterleaved = EnableInterleavedMemAccesses;
9900
9901  // Analyze interleaved memory accesses.
9902  if (UseInterleaved)
9903    IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
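  // For illustration, the kind of pattern the analysis groups (a sketch, not
  // tied to this function's inputs): loads of A[2*i] and A[2*i+1] in the same
  // iteration form an interleave group of factor 2 that can be lowered to one
  // wide load plus shuffles.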
9904
9905  // Check the function attributes and profiles to find out if this function
9906  // should be optimized for size.
9907  ScalarEpilogueLowering SEL =
9908      getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, LVL, &IAI);
9909
9910  // Check the loop for a trip count threshold: vectorize loops with a tiny trip
9911  // count by optimizing for size, to minimize overheads.
9912  auto ExpectedTC = getSmallBestKnownTC(*SE, L);
9913  if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
9914    LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
9915                      << "This loop is worth vectorizing only if no scalar "
9916                      << "iteration overheads are incurred.");
9917    if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
9918      LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
9919    else {
9920      if (*ExpectedTC > TTI->getMinTripCountTailFoldingThreshold()) {
9921        LLVM_DEBUG(dbgs() << "\n");
        // Predicated tail-folded loops are efficient even when the loop
9923        // iteration count is low. However, setting the epilogue policy to
9924        // `CM_ScalarEpilogueNotAllowedLowTripLoop` prevents vectorizing loops
9925        // with runtime checks. It's more effective to let
9926        // `areRuntimeChecksProfitable` determine if vectorization is beneficial
9927        // for the loop.
9928        if (SEL != CM_ScalarEpilogueNotNeededUsePredicate)
9929          SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
9930      } else {
        LLVM_DEBUG(dbgs() << " But the target considers the trip count too "
                             "small to be worth vectorizing.\n");
        reportVectorizationFailure(
            "The trip count is below the minimal threshold value.",
9935            "loop trip count is too low, avoiding vectorization",
9936            "LowTripCount", ORE, L);
9937        Hints.emitRemarkWithHints();
9938        return false;
9939      }
9940    }
9941  }
9942
9943  // Check the function attributes to see if implicit floats or vectors are
9944  // allowed.
9945  if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
9946    reportVectorizationFailure(
9947        "Can't vectorize when the NoImplicitFloat attribute is used",
9948        "loop not vectorized due to NoImplicitFloat attribute",
9949        "NoImplicitFloat", ORE, L);
9950    Hints.emitRemarkWithHints();
9951    return false;
9952  }
9953
9954  // Check if the target supports potentially unsafe FP vectorization.
9955  // FIXME: Add a check for the type of safety issue (denormal, signaling)
9956  // for the target we're vectorizing for, to make sure none of the
9957  // additional fp-math flags can help.
9958  if (Hints.isPotentiallyUnsafe() &&
9959      TTI->isFPVectorizationPotentiallyUnsafe()) {
9960    reportVectorizationFailure(
9961        "Potentially unsafe FP op prevents vectorization",
9962        "loop not vectorized due to unsafe FP support.",
9963        "UnsafeFP", ORE, L);
9964    Hints.emitRemarkWithHints();
9965    return false;
9966  }
9967
9968  bool AllowOrderedReductions;
9969  // If the flag is set, use that instead and override the TTI behaviour.
9970  if (ForceOrderedReductions.getNumOccurrences() > 0)
9971    AllowOrderedReductions = ForceOrderedReductions;
9972  else
9973    AllowOrderedReductions = TTI->enableOrderedReductions();
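  // For context (illustrative, not specific to any target): an "ordered"
  // in-order FP reduction keeps the sequential association
  //   (((acc + a[0]) + a[1]) + a[2]) + ...
  // whereas a plain vector reduction reassociates partial sums, which is only
  // safe with the appropriate fast-math flags.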
9974  if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) {
9975    ORE->emit([&]() {
9976      auto *ExactFPMathInst = Requirements.getExactFPInst();
9977      return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
9978                                                 ExactFPMathInst->getDebugLoc(),
9979                                                 ExactFPMathInst->getParent())
9980             << "loop not vectorized: cannot prove it is safe to reorder "
9981                "floating-point operations";
9982    });
9983    LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
9984                         "reorder floating-point operations\n");
9985    Hints.emitRemarkWithHints();
9986    return false;
9987  }
9988
9989  // Use the cost model.
9990  LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
9991                                F, &Hints, IAI);
9992  // Use the planner for vectorization.
9993  LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, &LVL, CM, IAI, PSE, Hints,
9994                               ORE);
9995
9996  // Get user vectorization factor and interleave count.
9997  ElementCount UserVF = Hints.getWidth();
9998  unsigned UserIC = Hints.getInterleave();
9999
10000  // Plan how to best vectorize, return the best VF and its cost.
10001  std::optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
10002
10003  VectorizationFactor VF = VectorizationFactor::Disabled();
10004  unsigned IC = 1;
10005
10006  bool AddBranchWeights =
10007      hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
10008  GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
10009                           F->getParent()->getDataLayout(), AddBranchWeights);
10010  if (MaybeVF) {
10011    VF = *MaybeVF;
10012    // Select the interleave count.
10013    IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
10014
10015    unsigned SelectedIC = std::max(IC, UserIC);
    // Optimistically generate runtime checks if they are needed. Drop them if
    // they turn out not to be profitable.
10018    if (VF.Width.isVector() || SelectedIC > 1)
10019      Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);
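    // These checks are typically pairwise pointer-overlap (and SCEV
    // wrap/stride) predicates; conceptually something like
    //   (A + TC * Size <= B) || (B + TC * Size <= A)
    // for each pair of possibly-aliasing accesses (illustrative form only).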
10020
10021    // Check if it is profitable to vectorize with runtime checks.
10022    bool ForceVectorization =
10023        Hints.getForce() == LoopVectorizeHints::FK_Enabled;
10024    if (!ForceVectorization &&
10025        !areRuntimeChecksProfitable(Checks, VF, getVScaleForTuning(L, *TTI), L,
10026                                    *PSE.getSE(), SEL)) {
10027      ORE->emit([&]() {
10028        return OptimizationRemarkAnalysisAliasing(
10029                   DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
10030                   L->getHeader())
10031               << "loop not vectorized: cannot prove it is safe to reorder "
10032                  "memory operations";
10033      });
10034      LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
10035      Hints.emitRemarkWithHints();
10036      return false;
10037    }
10038  }
10039
10040  // Identify the diagnostic messages that should be produced.
10041  std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
10042  bool VectorizeLoop = true, InterleaveLoop = true;
10043  if (VF.Width.isScalar()) {
10044    LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
10045    VecDiagMsg = std::make_pair(
10046        "VectorizationNotBeneficial",
10047        "the cost-model indicates that vectorization is not beneficial");
10048    VectorizeLoop = false;
10049  }
10050
10051  if (!MaybeVF && UserIC > 1) {
10052    // Tell the user interleaving was avoided up-front, despite being explicitly
10053    // requested.
10054    LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
10055                         "interleaving should be avoided up front\n");
10056    IntDiagMsg = std::make_pair(
10057        "InterleavingAvoided",
10058        "Ignoring UserIC, because interleaving was avoided up front");
10059    InterleaveLoop = false;
10060  } else if (IC == 1 && UserIC <= 1) {
10061    // Tell the user interleaving is not beneficial.
10062    LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
10063    IntDiagMsg = std::make_pair(
10064        "InterleavingNotBeneficial",
10065        "the cost-model indicates that interleaving is not beneficial");
10066    InterleaveLoop = false;
10067    if (UserIC == 1) {
10068      IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
10069      IntDiagMsg.second +=
10070          " and is explicitly disabled or interleave count is set to 1";
10071    }
10072  } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it was explicitly
    // disabled.
    LLVM_DEBUG(dbgs() << "LV: Interleaving is beneficial but is explicitly "
                         "disabled.\n");
10076    IntDiagMsg = std::make_pair(
10077        "InterleavingBeneficialButDisabled",
10078        "the cost-model indicates that interleaving is beneficial "
10079        "but is explicitly disabled or interleave count is set to 1");
10080    InterleaveLoop = false;
10081  }
10082
10083  // Override IC if user provided an interleave count.
10084  IC = UserIC > 0 ? UserIC : IC;
10085
10086  // Emit diagnostic messages, if any.
10087  const char *VAPassName = Hints.vectorizeAnalysisPassName();
10088  if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
10090    ORE->emit([&]() {
10091      return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
10092                                      L->getStartLoc(), L->getHeader())
10093             << VecDiagMsg.second;
10094    });
10095    ORE->emit([&]() {
10096      return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
10097                                      L->getStartLoc(), L->getHeader())
10098             << IntDiagMsg.second;
10099    });
10100    return false;
10101  } else if (!VectorizeLoop && InterleaveLoop) {
10102    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10103    ORE->emit([&]() {
10104      return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
10105                                        L->getStartLoc(), L->getHeader())
10106             << VecDiagMsg.second;
10107    });
10108  } else if (VectorizeLoop && !InterleaveLoop) {
10109    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10110                      << ") in " << DebugLocStr << '\n');
10111    ORE->emit([&]() {
10112      return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
10113                                        L->getStartLoc(), L->getHeader())
10114             << IntDiagMsg.second;
10115    });
10116  } else if (VectorizeLoop && InterleaveLoop) {
10117    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10118                      << ") in " << DebugLocStr << '\n');
10119    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10120  }
10121
10122  bool DisableRuntimeUnroll = false;
10123  MDNode *OrigLoopID = L->getLoopID();
10124  {
10125    using namespace ore;
10126    if (!VectorizeLoop) {
10127      assert(IC > 1 && "interleave count should not be 1 or 0");
      // If we decided that it is not beneficial to vectorize the loop, then
      // interleave it.
10130      InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
10131                                 &CM, BFI, PSI, Checks);
10132
10133      VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10134      LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false);
10135
10136      ORE->emit([&]() {
10137        return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
10138                                  L->getHeader())
10139               << "interleaved loop (interleaved count: "
10140               << NV("InterleaveCount", IC) << ")";
10141      });
10142    } else {
      // If we decided that it is *beneficial* to vectorize the loop, then do
      // it.
10144
10145      // Consider vectorizing the epilogue too if it's profitable.
10146      VectorizationFactor EpilogueVF =
10147          LVP.selectEpilogueVectorizationFactor(VF.Width, IC);
10148      if (EpilogueVF.Width.isVector()) {
10149
10150        // The first pass vectorizes the main loop and creates a scalar epilogue
10151        // to be vectorized by executing the plan (potentially with a different
10152        // factor) again shortly afterwards.
10153        EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1);
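        // For illustration (hypothetical factors): VF.Width = 8, IC = 2 and
        // EpilogueVF.Width = 4 describe a main vector loop handling 16
        // elements per iteration followed by a vector epilogue handling 4.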
10154        EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
10155                                           EPI, &LVL, &CM, BFI, PSI, Checks);
10156
10157        VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF);
10158        const auto &[ExpandedSCEVs, ReductionResumeValues] = LVP.executePlan(
10159            EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV, DT, true);
10160        ++LoopsVectorized;
10161
10162        // Second pass vectorizes the epilogue and adjusts the control flow
10163        // edges from the first pass.
10164        EPI.MainLoopVF = EPI.EpilogueVF;
10165        EPI.MainLoopUF = EPI.EpilogueUF;
10166        EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
10167                                                 ORE, EPI, &LVL, &CM, BFI, PSI,
10168                                                 Checks);
10169
10170        VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF);
10171        VPRegionBlock *VectorLoop = BestEpiPlan.getVectorLoopRegion();
10172        VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
10173        Header->setName("vec.epilog.vector.body");
10174
        // Re-use the trip count and steps expanded for the main loop, as
        // skeleton creation needs them as values that dominate both the
        // scalar and vector epilogue loops.
10178        // TODO: This is a workaround needed for epilogue vectorization and it
10179        // should be removed once induction resume value creation is done
10180        // directly in VPlan.
10181        EpilogILV.setTripCount(MainILV.getTripCount());
10182        for (auto &R : make_early_inc_range(*BestEpiPlan.getPreheader())) {
10183          auto *ExpandR = cast<VPExpandSCEVRecipe>(&R);
10184          auto *ExpandedVal = BestEpiPlan.getVPValueOrAddLiveIn(
10185              ExpandedSCEVs.find(ExpandR->getSCEV())->second);
10186          ExpandR->replaceAllUsesWith(ExpandedVal);
10187          ExpandR->eraseFromParent();
10188        }
10189
10190        // Ensure that the start values for any VPWidenIntOrFpInductionRecipe,
10191        // VPWidenPointerInductionRecipe and VPReductionPHIRecipes are updated
10192        // before vectorizing the epilogue loop.
10193        for (VPRecipeBase &R : Header->phis()) {
10194          if (isa<VPCanonicalIVPHIRecipe>(&R))
10195            continue;
10196
10197          Value *ResumeV = nullptr;
10198          // TODO: Move setting of resume values to prepareToExecute.
10199          if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
10200            ResumeV = ReductionResumeValues
10201                          .find(&ReductionPhi->getRecurrenceDescriptor())
10202                          ->second;
10203          } else {
10204            // Create induction resume values for both widened pointer and
10205            // integer/fp inductions and update the start value of the induction
10206            // recipes to use the resume value.
10207            PHINode *IndPhi = nullptr;
10208            const InductionDescriptor *ID;
10209            if (auto *Ind = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
10210              IndPhi = cast<PHINode>(Ind->getUnderlyingValue());
10211              ID = &Ind->getInductionDescriptor();
10212            } else {
10213              auto *WidenInd = cast<VPWidenIntOrFpInductionRecipe>(&R);
10214              IndPhi = WidenInd->getPHINode();
10215              ID = &WidenInd->getInductionDescriptor();
10216            }
10217
10218            ResumeV = MainILV.createInductionResumeValue(
10219                IndPhi, *ID, getExpandedStep(*ID, ExpandedSCEVs),
10220                {EPI.MainLoopIterationCountCheck});
10221          }
10222          assert(ResumeV && "Must have a resume value");
10223          VPValue *StartVal = BestEpiPlan.getVPValueOrAddLiveIn(ResumeV);
10224          cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal);
10225        }
10226
10227        LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
10228                        DT, true, &ExpandedSCEVs);
10229        ++LoopsEpilogueVectorized;
10230
10231        if (!MainILV.areSafetyChecksAdded())
10232          DisableRuntimeUnroll = true;
10233      } else {
10234        InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
10235                               VF.MinProfitableTripCount, IC, &LVL, &CM, BFI,
10236                               PSI, Checks);
10237
10238        VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10239        LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
10240        ++LoopsVectorized;
10241
        // Add metadata to disable runtime unrolling of the scalar loop when
        // there are no runtime checks for strides and memory. A scalar loop
        // that is rarely executed is not worth unrolling.
10245        if (!LB.areSafetyChecksAdded())
10246          DisableRuntimeUnroll = true;
10247      }
10248      // Report the vectorization decision.
10249      reportVectorization(ORE, L, VF, IC);
10250    }
10251
10252    if (ORE->allowExtraAnalysis(LV_NAME))
10253      checkMixedPrecision(L, ORE);
10254  }
10255
10256  std::optional<MDNode *> RemainderLoopID =
10257      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
10258                                      LLVMLoopVectorizeFollowupEpilogue});
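  // Sketch of the intent: if the original loop ID carries followup attributes
  // (e.g. under "llvm.loop.vectorize.followup_epilogue"), those become the
  // remainder loop's metadata; otherwise the defaults below are applied.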
10259  if (RemainderLoopID) {
10260    L->setLoopID(*RemainderLoopID);
10261  } else {
10262    if (DisableRuntimeUnroll)
10263      AddRuntimeUnrollDisableMetaData(L);
10264
10265    // Mark the loop as already vectorized to avoid vectorizing again.
10266    Hints.setAlreadyVectorized();
10267  }
10268
10269  assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10270  return true;
10271}
10272
10273LoopVectorizeResult LoopVectorizePass::runImpl(
10274    Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
10275    DominatorTree &DT_, BlockFrequencyInfo *BFI_, TargetLibraryInfo *TLI_,
10276    DemandedBits &DB_, AssumptionCache &AC_, LoopAccessInfoManager &LAIs_,
10277    OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
10278  SE = &SE_;
10279  LI = &LI_;
10280  TTI = &TTI_;
10281  DT = &DT_;
10282  BFI = BFI_;
10283  TLI = TLI_;
10284  AC = &AC_;
10285  LAIs = &LAIs_;
10286  DB = &DB_;
10287  ORE = &ORE_;
10288  PSI = PSI_;
10289
10290  // Don't attempt if
10291  // 1. the target claims to have no vector registers, and
10292  // 2. interleaving won't help ILP.
10293  //
10294  // The second condition is necessary because, even if the target has no
10295  // vector registers, loop vectorization may still enable scalar
10296  // interleaving.
10297  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
10298      TTI->getMaxInterleaveFactor(ElementCount::getFixed(1)) < 2)
10299    return LoopVectorizeResult(false, false);
10300
10301  bool Changed = false, CFGChanged = false;
10302
10303  // The vectorizer requires loops to be in simplified form.
10304  // Since simplification may add new inner loops, it has to run before the
10305  // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
10307  // vectorized.
10308  for (const auto &L : *LI)
10309    Changed |= CFGChanged |=
10310        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
10311
10312  // Build up a worklist of inner-loops to vectorize. This is necessary as
10313  // the act of vectorizing or partially unrolling a loop creates new loops
10314  // and can invalidate iterators across the loops.
10315  SmallVector<Loop *, 8> Worklist;
10316
10317  for (Loop *L : *LI)
10318    collectSupportedLoops(*L, LI, ORE, Worklist);
10319
10320  LoopsAnalyzed += Worklist.size();
10321
10322  // Now walk the identified inner loops.
10323  while (!Worklist.empty()) {
10324    Loop *L = Worklist.pop_back_val();
10325
10326    // For the inner loops we actually process, form LCSSA to simplify the
10327    // transform.
10328    Changed |= formLCSSARecursively(*L, *DT, LI, SE);
10329
10330    Changed |= CFGChanged |= processLoop(L);
10331
10332    if (Changed) {
10333      LAIs->clear();
10334
10335#ifndef NDEBUG
10336      if (VerifySCEV)
10337        SE->verify();
10338#endif
10339    }
10340  }
10341
10342  // Process each loop nest in the function.
10343  return LoopVectorizeResult(Changed, CFGChanged);
10344}
10345
10346PreservedAnalyses LoopVectorizePass::run(Function &F,
10347                                         FunctionAnalysisManager &AM) {
  auto &LI = AM.getResult<LoopAnalysis>(F);
  // There are no loops in the function. Return before computing other
  // expensive analyses.
  if (LI.empty())
    return PreservedAnalyses::all();
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);

  LoopAccessInfoManager &LAIs = AM.getResult<LoopAccessAnalysis>(F);
  auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
  ProfileSummaryInfo *PSI =
      MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  BlockFrequencyInfo *BFI = nullptr;
  if (PSI && PSI->hasProfileSummary())
    BFI = &AM.getResult<BlockFrequencyAnalysis>(F);
  LoopVectorizeResult Result =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AC, LAIs, ORE, PSI);
  if (!Result.MadeAnyChange)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  if (isAssignmentTrackingEnabled(*F.getParent())) {
    for (auto &BB : F)
      RemoveRedundantDbgInstrs(&BB);
  }

  // We currently do not preserve loopinfo/dominator analyses with outer loop
  // vectorization. Until this is addressed, mark these analyses as preserved
  // only for non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
    PA.preserve<ScalarEvolutionAnalysis>();
  }

  if (Result.MadeCFGChange) {
    // Making CFG changes likely means a loop got vectorized. Indicate that
    // extra simplification passes should be run.
    // TODO: MadeCFGChange is not a perfect proxy. Extra passes should only
    // be run if runtime checks have been added.
    AM.getResult<ShouldRunExtraVectorPasses>(F);
    PA.preserve<ShouldRunExtraVectorPasses>();
  } else {
    PA.preserveSet<CFGAnalyses>();
  }
  return PA;
10400}
10401
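// For reference, with the default options this prints the pass parameters as
// something like
//   loop-vectorize<no-interleave-forced-only;no-vectorize-forced-only;>
// (the pass name itself comes from the MapClassName2PassName mapping, so the
// exact prefix may differ).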
10402void LoopVectorizePass::printPipeline(
10403    raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
10404  static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
10405      OS, MapClassName2PassName);
10406
10407  OS << '<';
10408  OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
10409  OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
10410  OS << '>';
10411}
10412