//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
35// 36//===----------------------------------------------------------------------===// 37// 38// The reduction-variable vectorization is based on the paper: 39// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization. 40// 41// Variable uniformity checks are inspired by: 42// Karrenberg, R. and Hack, S. Whole Function Vectorization. 43// 44// The interleaved access vectorization is based on the paper: 45// Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved 46// Data for SIMD 47// 48// Other ideas/concepts are from: 49// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later. 50// 51// S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of 52// Vectorizing Compilers. 53// 54//===----------------------------------------------------------------------===// 55 56#include "llvm/Transforms/Vectorize/LoopVectorize.h" 57#include "LoopVectorizationPlanner.h" 58#include "VPRecipeBuilder.h" 59#include "VPlan.h" 60#include "VPlanAnalysis.h" 61#include "VPlanHCFGBuilder.h" 62#include "VPlanTransforms.h" 63#include "llvm/ADT/APInt.h" 64#include "llvm/ADT/ArrayRef.h" 65#include "llvm/ADT/DenseMap.h" 66#include "llvm/ADT/DenseMapInfo.h" 67#include "llvm/ADT/Hashing.h" 68#include "llvm/ADT/MapVector.h" 69#include "llvm/ADT/STLExtras.h" 70#include "llvm/ADT/SmallPtrSet.h" 71#include "llvm/ADT/SmallSet.h" 72#include "llvm/ADT/SmallVector.h" 73#include "llvm/ADT/Statistic.h" 74#include "llvm/ADT/StringRef.h" 75#include "llvm/ADT/Twine.h" 76#include "llvm/ADT/iterator_range.h" 77#include "llvm/Analysis/AssumptionCache.h" 78#include "llvm/Analysis/BasicAliasAnalysis.h" 79#include "llvm/Analysis/BlockFrequencyInfo.h" 80#include "llvm/Analysis/CFG.h" 81#include "llvm/Analysis/CodeMetrics.h" 82#include "llvm/Analysis/DemandedBits.h" 83#include "llvm/Analysis/GlobalsModRef.h" 84#include "llvm/Analysis/LoopAccessAnalysis.h" 85#include "llvm/Analysis/LoopAnalysisManager.h" 86#include "llvm/Analysis/LoopInfo.h" 87#include 
"llvm/Analysis/LoopIterator.h" 88#include "llvm/Analysis/OptimizationRemarkEmitter.h" 89#include "llvm/Analysis/ProfileSummaryInfo.h" 90#include "llvm/Analysis/ScalarEvolution.h" 91#include "llvm/Analysis/ScalarEvolutionExpressions.h" 92#include "llvm/Analysis/TargetLibraryInfo.h" 93#include "llvm/Analysis/TargetTransformInfo.h" 94#include "llvm/Analysis/ValueTracking.h" 95#include "llvm/Analysis/VectorUtils.h" 96#include "llvm/IR/Attributes.h" 97#include "llvm/IR/BasicBlock.h" 98#include "llvm/IR/CFG.h" 99#include "llvm/IR/Constant.h" 100#include "llvm/IR/Constants.h" 101#include "llvm/IR/DataLayout.h" 102#include "llvm/IR/DebugInfo.h" 103#include "llvm/IR/DebugInfoMetadata.h" 104#include "llvm/IR/DebugLoc.h" 105#include "llvm/IR/DerivedTypes.h" 106#include "llvm/IR/DiagnosticInfo.h" 107#include "llvm/IR/Dominators.h" 108#include "llvm/IR/Function.h" 109#include "llvm/IR/IRBuilder.h" 110#include "llvm/IR/InstrTypes.h" 111#include "llvm/IR/Instruction.h" 112#include "llvm/IR/Instructions.h" 113#include "llvm/IR/IntrinsicInst.h" 114#include "llvm/IR/Intrinsics.h" 115#include "llvm/IR/MDBuilder.h" 116#include "llvm/IR/Metadata.h" 117#include "llvm/IR/Module.h" 118#include "llvm/IR/Operator.h" 119#include "llvm/IR/PatternMatch.h" 120#include "llvm/IR/ProfDataUtils.h" 121#include "llvm/IR/Type.h" 122#include "llvm/IR/Use.h" 123#include "llvm/IR/User.h" 124#include "llvm/IR/Value.h" 125#include "llvm/IR/ValueHandle.h" 126#include "llvm/IR/Verifier.h" 127#include "llvm/Support/Casting.h" 128#include "llvm/Support/CommandLine.h" 129#include "llvm/Support/Compiler.h" 130#include "llvm/Support/Debug.h" 131#include "llvm/Support/ErrorHandling.h" 132#include "llvm/Support/InstructionCost.h" 133#include "llvm/Support/MathExtras.h" 134#include "llvm/Support/raw_ostream.h" 135#include "llvm/Transforms/Utils/BasicBlockUtils.h" 136#include "llvm/Transforms/Utils/InjectTLIMappings.h" 137#include "llvm/Transforms/Utils/LoopSimplify.h" 138#include "llvm/Transforms/Utils/LoopUtils.h" 
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>
#include <functional>
#include <iterator>
#include <limits>
#include <map>
#include <memory>
#include <string>
#include <tuple>
#include <utility>

using namespace llvm;

// LV_NAME is the pass name; DEBUG_TYPE reuses it.
#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

#ifndef NDEBUG
// Debug type string "<DEBUG_TYPE>-verbose"; only defined in builds with
// assertions enabled.
const char VerboseDebug[] = DEBUG_TYPE "-verbose";
#endif

/// @{
/// Metadata attribute names
const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
const char LLVMLoopVectorizeFollowupVectorized[] =
    "llvm.loop.vectorize.followup_vectorized";
const char LLVMLoopVectorizeFollowupEpilogue[] =
    "llvm.loop.vectorize.followup_epilogue";
/// @}

// Pass statistics counters.
STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");

/// Switch for vectorizing epilogue loops; enabled by default.
static cl::opt<bool> EnableEpilogueVectorization(
    "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
    cl::desc("Enable vectorization of epilogue loops."));

/// VF to force for epilogue loops. Defaults to 1; per the description only
/// values greater than 1 force a VF.
static cl::opt<unsigned> EpilogueVectorizationForceVF(
    "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
    cl::desc("When epilogue vectorization is enabled, and a value greater than "
             "1 is specified, forces the given VF for all applicable epilogue "
             "loops."));

/// Minimum main-loop VF (default 16) for a loop to be considered for epilogue
/// vectorization.
static cl::opt<unsigned> EpilogueVectorizationMinVF(
    "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
    cl::desc("Only loops with vectorization factor equal to or larger than "
             "the specified value are considered for epilogue vectorization."));

/// Loops with a known constant trip count below this number are vectorized only
/// if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

/// Upper bound (default 128) on the number of runtime memory checks.
static cl::opt<unsigned> VectorizeMemoryCheckThreshold(
    "vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
    cl::desc("The maximum allowed number of runtime memory checks"));

// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
// that predication is preferred, and this lists all options. I.e., the
// vectorizer will try to fold the tail-loop (epilogue) into the vector body
// and predicate the instructions accordingly. If tail-folding fails, there are
// different fallback strategies depending on these values:
namespace PreferPredicateTy {
  enum Option {
    // Do not tail-predicate; create a scalar epilogue.
    ScalarEpilogue = 0,
    // Prefer tail-folding; create a scalar epilogue if tail-folding fails.
    PredicateElseScalarEpilogue,
    // Prefer tail-folding; do not attempt vectorization if tail-folding fails.
    PredicateOrDontVectorize
  };
} // namespace PreferPredicateTy

/// Selects one of the PreferPredicateTy strategies; defaults to
/// ScalarEpilogue.
static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
    "prefer-predicate-over-epilogue",
    cl::init(PreferPredicateTy::ScalarEpilogue),
    cl::Hidden,
    cl::desc("Tail-folding and predication preferences over creating a scalar "
             "epilogue loop."),
    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
                         "scalar-epilogue",
                         "Don't tail-predicate loops, create scalar epilogue"),
              clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                         "predicate-else-scalar-epilogue",
                         "prefer tail-folding, create scalar epilogue if tail "
                         "folding fails."),
              clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                         "predicate-dont-vectorize",
                         "prefers tail-folding, don't attempt vectorization if "
                         "tail-folding fails.")));

/// Forces a particular tail-folding style; defaults to TailFoldingStyle::None.
/// Note that, unlike the neighbouring options, this one is not cl::Hidden.
static cl::opt<TailFoldingStyle> ForceTailFoldingStyle(
    "force-tail-folding-style", cl::desc("Force the tail folding style"),
    cl::init(TailFoldingStyle::None),
    cl::values(
        clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"),
        clEnumValN(
            TailFoldingStyle::Data, "data",
            "Create lane mask for data only, using active.lane.mask intrinsic"),
        clEnumValN(TailFoldingStyle::DataWithoutLaneMask,
                   "data-without-lane-mask",
                   "Create lane mask with compare/stepvector"),
        clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control",
                   "Create lane mask using active.lane.mask intrinsic, and use "
                   "it for both data and control flow"),
        clEnumValN(
            TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck,
            "data-and-control-without-rt-check",
            "Similar to data-and-control, but remove the runtime check")));

/// When set (off by default), the vectorization factor is chosen from the
/// smallest type in the loop in order to maximize bandwidth.
static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

/// Switch for vectorizing interleaved memory accesses; off by default.
static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));

/// Estimated constant trip count (default 128) below which loops are not
/// interleaved.
static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with a estimated constant trip count "
             "below this number"));

/// Override for the target's number of scalar registers. Defaults to 0.
static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

/// Override for the target's number of vector registers. Defaults to 0.
static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

/// Override for the target's maximum interleave factor of scalar loops.
/// Defaults to 0.
static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

/// Override for the target's maximum interleave factor of vectorized loops.
/// Defaults to 0.
static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

/// Replaces the target's expected per-instruction cost with one constant.
/// Defaults to 0; the description recommends it for consistent testing.
static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

/// Testing-only flag (off by default) that pretends the target supports
/// scalable vectors.
static cl::opt<bool> ForceTargetSupportsScalableVectors(
    "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
    cl::desc(
        "Pretend that scalable vectors are supported, even if the target does "
        "not support them. This flag should only be used for testing."));

/// Cost (default 20) of a loop that the interleaver treats as 'small'.
static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

/// Switch (on by default) for using block frequency analysis to access PGO
/// heuristics.
static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// Interleave small loops with scalar reductions.
static cl::opt<bool> InterleaveSmallLoopScalarReduction(
    "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
    cl::desc("Enable interleaving for loops with small iteration counts that "
             "contain scalar reductions to expose ILP."));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

/// Switch (on by default) for counting the induction variable only once when
/// interleaving.
static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

/// Switch (on by default) for if-predication of stores during vectorization.
static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if predication of stores during vectorization."));

/// Maximum interleave count (default 2) for a scalar reduction in a nested
/// loop.
static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

/// When set (off by default), in-loop vector reductions are preferred
/// regardless of the target's own preference.
static cl::opt<bool>
    PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
                           cl::Hidden,
                           cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the targets preference."));

/// Switch (off by default) for vectorizing loops with in-order (strict) FP
/// reductions.
static cl::opt<bool> ForceOrderedReductions(
    "force-ordered-reductions", cl::init(false), cl::Hidden,
    cl::desc("Enable the vectorisation of loops with in-order (strict) "
             "FP reductions"));

/// When set (off by default), predicating the reduction operation is preferred
/// over a select after the loop.
static cl::opt<bool> PreferPredicatedReductionSelect(
    "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
    cl::desc(
        "Prefer predicating a reduction operation over an after loop select."));

namespace llvm {
/// Enables the VPlan-native vectorization path. Defined inside namespace llvm
/// without `static`, and without an explicit cl::init value.
cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));
}

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

/// Definition of llvm::EnableLoopInterleaving; on by default.
cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
/// Definition of llvm::EnableLoopVectorization; on by default.
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

/// Selects dot format instead of plain text for VPlan dumps. No explicit
/// cl::init value is given.
static cl::opt<bool> PrintVPlansInDotFormat(
    "vplan-print-in-dot-format", cl::Hidden,
    cl::desc("Use dot format instead of plain text when dumping VPlans"));

/// Tri-state (cl::boolOrDefault) override of the cost-based decision to widen
/// div/rem through a safe divisor.
static cl::opt<cl::boolOrDefault> ForceSafeDivisor(
    "force-widen-divrem-via-safe-divisor", cl::Hidden,
    cl::desc(
        "Override cost based safe divisor widening for div/rem instructions"));

/// Switch (on by default) for trying wider VFs when they enable the use of
/// vector call variants.
static cl::opt<bool> UseWiderVFIfCallVariantsPresent(
    "vectorizer-maximize-bandwidth-for-vector-calls", cl::init(true),
    cl::Hidden,
    cl::desc("Try wider VFs if they enable the use of vector variants"));

// Likelihood of bypassing the vectorized loop because assumptions about SCEV
// variables not overflowing do not hold. See `emitSCEVChecks`.
static constexpr uint32_t SCEVCheckBypassWeights[] = {1, 127};
// Likelihood of bypassing the vectorized loop because pointers overlap. See
// `emitMemRuntimeChecks`.
static constexpr uint32_t MemCheckBypassWeights[] = {1, 127};
// Likelihood of bypassing the vectorized loop because there are zero trips left
// after prolog. See `emitIterationCountCheck`.
static constexpr uint32_t MinItersBypassWeights[] = {1, 127};

/// A helper function that returns true if the given type is irregular.
The 412/// type is irregular if its allocated size doesn't equal the store size of an 413/// element of the corresponding vector type. 414static bool hasIrregularType(Type *Ty, const DataLayout &DL) { 415 // Determine if an array of N elements of type Ty is "bitcast compatible" 416 // with a <N x Ty> vector. 417 // This is only true if there is no padding between the array elements. 418 return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty); 419} 420 421/// A helper function that returns the reciprocal of the block probability of 422/// predicated blocks. If we return X, we are assuming the predicated block 423/// will execute once for every X iterations of the loop header. 424/// 425/// TODO: We should use actual block probability here, if available. Currently, 426/// we always assume predicated blocks have a 50% chance of executing. 427static unsigned getReciprocalPredBlockProb() { return 2; } 428 429/// Returns "best known" trip count for the specified loop \p L as defined by 430/// the following procedure: 431/// 1) Returns exact trip count if it is known. 432/// 2) Returns expected trip count according to profile data if any. 433/// 3) Returns upper bound estimate if it is known. 434/// 4) Returns std::nullopt if all of the above failed. 435static std::optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, 436 Loop *L) { 437 // Check if exact trip count is known. 438 if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L)) 439 return ExpectedTC; 440 441 // Check if there is an expected trip count available from profile data. 442 if (LoopVectorizeWithBlockFrequency) 443 if (auto EstimatedTC = getLoopEstimatedTripCount(L)) 444 return *EstimatedTC; 445 446 // Check if upper bound estimate is known. 447 if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L)) 448 return ExpectedTC; 449 450 return std::nullopt; 451} 452 453/// Return a vector containing interleaved elements from multiple 454/// smaller input vectors. 
455static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals, 456 const Twine &Name) { 457 unsigned Factor = Vals.size(); 458 assert(Factor > 1 && "Tried to interleave invalid number of vectors"); 459 460 VectorType *VecTy = cast<VectorType>(Vals[0]->getType()); 461#ifndef NDEBUG 462 for (Value *Val : Vals) 463 assert(Val->getType() == VecTy && "Tried to interleave mismatched types"); 464#endif 465 466 // Scalable vectors cannot use arbitrary shufflevectors (only splats), so 467 // must use intrinsics to interleave. 468 if (VecTy->isScalableTy()) { 469 VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy); 470 return Builder.CreateIntrinsic( 471 WideVecTy, Intrinsic::experimental_vector_interleave2, Vals, 472 /*FMFSource=*/nullptr, Name); 473 } 474 475 // Fixed length. Start by concatenating all vectors into a wide vector. 476 Value *WideVec = concatenateVectors(Builder, Vals); 477 478 // Interleave the elements into the wide vector. 479 const unsigned NumElts = VecTy->getElementCount().getFixedValue(); 480 return Builder.CreateShuffleVector( 481 WideVec, createInterleaveMask(NumElts, Factor), Name); 482} 483 484namespace { 485// Forward declare GeneratedRTChecks. 486class GeneratedRTChecks; 487 488using SCEV2ValueTy = DenseMap<const SCEV *, Value *>; 489} // namespace 490 491namespace llvm { 492 493AnalysisKey ShouldRunExtraVectorPasses::Key; 494 495/// InnerLoopVectorizer vectorizes loops which contain only one basic 496/// block to a specified vectorization factor (VF). 497/// This class performs the widening of scalars into vectors, or multiple 498/// scalars. This class also implements the following features: 499/// * It inserts an epilogue loop for handling loops that don't have iteration 500/// counts that are known to be a multiple of the vectorization factor. 501/// * It handles the code generation for reduction variables. 502/// * Scalarization (implementation using scalars) of un-vectorizable 503/// instructions. 
504/// InnerLoopVectorizer does not perform any vectorization-legality 505/// checks, and relies on the caller to check for the different legality 506/// aspects. The InnerLoopVectorizer relies on the 507/// LoopVectorizationLegality class to provide information about the induction 508/// and reduction variables that were found to a given vectorization factor. 509class InnerLoopVectorizer { 510public: 511 InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE, 512 LoopInfo *LI, DominatorTree *DT, 513 const TargetLibraryInfo *TLI, 514 const TargetTransformInfo *TTI, AssumptionCache *AC, 515 OptimizationRemarkEmitter *ORE, ElementCount VecWidth, 516 ElementCount MinProfitableTripCount, 517 unsigned UnrollFactor, LoopVectorizationLegality *LVL, 518 LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, 519 ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks) 520 : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI), 521 AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor), 522 Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI), 523 PSI(PSI), RTChecks(RTChecks) { 524 // Query this against the original loop and save it here because the profile 525 // of the original loop header may change as the transformation happens. 526 OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize( 527 OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass); 528 529 if (MinProfitableTripCount.isZero()) 530 this->MinProfitableTripCount = VecWidth; 531 else 532 this->MinProfitableTripCount = MinProfitableTripCount; 533 } 534 535 virtual ~InnerLoopVectorizer() = default; 536 537 /// Create a new empty loop that will contain vectorized instructions later 538 /// on, while the old loop will be used as the scalar remainder. Control flow 539 /// is generated around the vectorized (and scalar epilogue) loops consisting 540 /// of various checks and bypasses. 
Return the pre-header block of the new 541 /// loop and the start value for the canonical induction, if it is != 0. The 542 /// latter is the case when vectorizing the epilogue loop. In the case of 543 /// epilogue vectorization, this function is overriden to handle the more 544 /// complex control flow around the loops. \p ExpandedSCEVs is used to 545 /// look up SCEV expansions for expressions needed during skeleton creation. 546 virtual std::pair<BasicBlock *, Value *> 547 createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs); 548 549 /// Fix the vectorized code, taking care of header phi's, live-outs, and more. 550 void fixVectorizedLoop(VPTransformState &State, VPlan &Plan); 551 552 // Return true if any runtime check is added. 553 bool areSafetyChecksAdded() { return AddedSafetyChecks; } 554 555 /// A type for vectorized values in the new loop. Each value from the 556 /// original loop, when vectorized, is represented by UF vector values in the 557 /// new unrolled loop, where UF is the unroll factor. 558 using VectorParts = SmallVector<Value *, 2>; 559 560 /// A helper function to scalarize a single Instruction in the innermost loop. 561 /// Generates a sequence of scalar instances for each lane between \p MinLane 562 /// and \p MaxLane, times each part between \p MinPart and \p MaxPart, 563 /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p 564 /// Instr's operands. 565 void scalarizeInstruction(const Instruction *Instr, 566 VPReplicateRecipe *RepRecipe, 567 const VPIteration &Instance, 568 VPTransformState &State); 569 570 /// Try to vectorize interleaved access group \p Group with the base address 571 /// given in \p Addr, optionally masking the vector operations if \p 572 /// BlockInMask is non-null. Use \p State to translate given VPValues to IR 573 /// values in the vectorized loop. 
574 void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group, 575 ArrayRef<VPValue *> VPDefs, 576 VPTransformState &State, VPValue *Addr, 577 ArrayRef<VPValue *> StoredValues, 578 VPValue *BlockInMask, bool NeedsMaskForGaps); 579 580 /// Fix the non-induction PHIs in \p Plan. 581 void fixNonInductionPHIs(VPlan &Plan, VPTransformState &State); 582 583 /// Returns true if the reordering of FP operations is not allowed, but we are 584 /// able to vectorize with strict in-order reductions for the given RdxDesc. 585 bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc); 586 587 /// Create a new phi node for the induction variable \p OrigPhi to resume 588 /// iteration count in the scalar epilogue, from where the vectorized loop 589 /// left off. \p Step is the SCEV-expanded induction step to use. In cases 590 /// where the loop skeleton is more complicated (i.e., epilogue vectorization) 591 /// and the resume values can come from an additional bypass block, the \p 592 /// AdditionalBypass pair provides information about the bypass block and the 593 /// end value on the edge from bypass to this loop. 594 PHINode *createInductionResumeValue( 595 PHINode *OrigPhi, const InductionDescriptor &ID, Value *Step, 596 ArrayRef<BasicBlock *> BypassBlocks, 597 std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr}); 598 599 /// Returns the original loop trip count. 600 Value *getTripCount() const { return TripCount; } 601 602 /// Used to set the trip count after ILV's construction and after the 603 /// preheader block has been executed. Note that this always holds the trip 604 /// count of the original loop for both main loop and epilogue vectorization. 605 void setTripCount(Value *TC) { TripCount = TC; } 606 607protected: 608 friend class LoopVectorizationPlanner; 609 610 /// A small list of PHINodes. 611 using PhiVector = SmallVector<PHINode *, 4>; 612 613 /// A type for scalarized values in the new loop. 
Each value from the 614 /// original loop, when scalarized, is represented by UF x VF scalar values 615 /// in the new unrolled loop, where UF is the unroll factor and VF is the 616 /// vectorization factor. 617 using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>; 618 619 /// Set up the values of the IVs correctly when exiting the vector loop. 620 void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II, 621 Value *VectorTripCount, Value *EndValue, 622 BasicBlock *MiddleBlock, BasicBlock *VectorHeader, 623 VPlan &Plan, VPTransformState &State); 624 625 /// Create the exit value of first order recurrences in the middle block and 626 /// update their users. 627 void fixFixedOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR, 628 VPTransformState &State); 629 630 /// Create code for the loop exit value of the reduction. 631 void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State); 632 633 /// Iteratively sink the scalarized operands of a predicated instruction into 634 /// the block that was created for it. 635 void sinkScalarOperands(Instruction *PredInst); 636 637 /// Returns (and creates if needed) the trip count of the widened loop. 638 Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock); 639 640 /// Returns a bitcasted value to the requested vector type. 641 /// Also handles bitcasts of vector<float> <-> vector<pointer> types. 642 Value *createBitOrPointerCast(Value *V, VectorType *DstVTy, 643 const DataLayout &DL); 644 645 /// Emit a bypass check to see if the vector trip count is zero, including if 646 /// it overflows. 647 void emitIterationCountCheck(BasicBlock *Bypass); 648 649 /// Emit a bypass check to see if all of the SCEV assumptions we've 650 /// had to make are correct. Returns the block containing the checks or 651 /// nullptr if no checks have been added. 652 BasicBlock *emitSCEVChecks(BasicBlock *Bypass); 653 654 /// Emit bypass checks to check any memory assumptions we may have made. 
  /// Returns the block containing the checks or nullptr if no checks have been
  /// added.
  BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass);

  /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
  /// vector loop preheader, middle block and scalar preheader.
  void createVectorLoopSkeleton(StringRef Prefix);

  /// Create new phi nodes for the induction variables to resume iteration count
  /// in the scalar epilogue, from where the vectorized loop left off.
  /// In cases where the loop skeleton is more complicated (e.g. epilogue
  /// vectorization) and the resume values can come from an additional bypass
  /// block, the \p AdditionalBypass pair provides information about the bypass
  /// block and the end value on the edge from bypass to this loop.
  /// When no additional bypass exists, the defaulted pair of null pointers is
  /// used.
  void createInductionResumeValues(
      const SCEV2ValueTy &ExpandedSCEVs,
      std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});

  /// Complete the loop skeleton by adding debug MDs, creating appropriate
  /// conditional branches in the middle block, preparing the builder and
  /// running the verifier. Return the preheader of the completed vector loop.
  BasicBlock *completeLoopSkeleton();

  /// Collect poison-generating recipes that may generate a poison value that is
  /// used after vectorization, even when their operands are not poison. Those
  /// recipes meet the following conditions:
  ///  * Contribute to the address computation of a recipe generating a widen
  ///    memory load/store (VPWidenMemoryInstructionRecipe or
  ///    VPInterleaveRecipe).
  ///  * Such a widen memory load/store has at least one underlying Instruction
  ///    that is in a basic block that needs predication and after vectorization
  ///    the generated instruction won't be predicated.
  /// The definition in this file drops the poison-generating flags of the
  /// matching recipes directly while walking the plan reachable from \p State.
  void collectPoisonGeneratingRecipes(VPTransformState &State);

  /// Allow subclasses to override and print debug traces before/after vplan
  /// execution, when trace information is requested. The default
  /// implementations do nothing.
  virtual void printDebugTracesAtStart(){};
  virtual void printDebugTracesAtEnd(){};

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  ElementCount VF;

  /// NOTE(review): undocumented in the original; presumably the smallest trip
  /// count for which running the vector loop is considered profitable --
  /// confirm against the constructor and the iteration-count checks.
  ElementCount MinProfitableTripCount;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The unique ExitBlock of the scalar loop if one exists.  Note that
  /// there can be multiple exiting edges reaching this block.
  BasicBlock *LoopExitBlock;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  /// BFI and PSI are used to check for profile guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  // Whether this loop should be optimized for size based on profile guided size
  // optimizations.
  bool OptForSizeBasedOnProfile;

  /// Structure to hold information about generated runtime checks, responsible
  /// for cleaning the checks, if vectorization turns out unprofitable.
  GeneratedRTChecks &RTChecks;

  // Holds the resume values for reductions in the loops, used to set the
  // correct start value of reduction PHIs when vectorizing the epilogue.
  SmallMapVector<const RecurrenceDescriptor *, PHINode *, 4>
      ReductionResumeValues;
};

/// An InnerLoopVectorizer that forwards a fixed element count of 1 for both
/// ElementCount arguments of the base-class constructor, so that only
/// \p UnrollFactor varies. All other arguments are forwarded unchanged.
class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                    ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            ElementCount::getFixed(1),
                            ElementCount::getFixed(1), UnrollFactor, LVL, CM,
                            BFI, PSI, Check) {}
};

/// Encapsulate information regarding vectorization of a loop and its epilogue.
/// This information is meant to be updated and used across two stages of
/// epilogue vectorization.
/// The block pointers and trip-count values start out null; only the four
/// VF/UF fields are set by the constructor.
struct EpilogueLoopVectorizationInfo {
  ElementCount MainLoopVF = ElementCount::getFixed(0);
  unsigned MainLoopUF = 0;
  ElementCount EpilogueVF = ElementCount::getFixed(0);
  unsigned EpilogueUF = 0;
  BasicBlock *MainLoopIterationCountCheck = nullptr;
  BasicBlock *EpilogueIterationCountCheck = nullptr;
  BasicBlock *SCEVSafetyCheck = nullptr;
  BasicBlock *MemSafetyCheck = nullptr;
  Value *TripCount = nullptr;
  Value *VectorTripCount = nullptr;

  /// \p MVF / \p MUF describe the main loop, \p EVF / \p EUF the epilogue
  /// loop. Asserts that \p EUF is 1.
  EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
                                ElementCount EVF, unsigned EUF)
      : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
    assert(EUF == 1 &&
           "A high UF for the epilogue loop is likely not beneficial.");
  }
};

/// An extension of the inner loop vectorizer that creates a skeleton for a
/// vectorized loop that has its epilogue (residual) also vectorized.
/// The idea is to run the vplan on a given loop twice, firstly to setup the
/// skeleton and vectorize the main loop, and secondly to complete the skeleton
/// from the first step and vectorize the epilogue.  This is achieved by
/// deriving two concrete strategy classes from this base class and invoking
/// them in succession from the loop vectorizer planner.
class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
public:
  /// Forwards to the InnerLoopVectorizer constructor using the main-loop VF
  /// (passed for both ElementCount arguments) and the main-loop UF taken from
  /// \p EPI, and keeps a reference to \p EPI.
  InnerLoopAndEpilogueVectorizer(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, LVL,
                            CM, BFI, PSI, Checks),
        EPI(EPI) {}

  // Override this function to handle the more complex control flow around the
  // three loops. It simply dispatches to the strategy-specific
  // createEpilogueVectorizedLoopSkeleton().
  std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton(
      const SCEV2ValueTy &ExpandedSCEVs) final {
    return createEpilogueVectorizedLoopSkeleton(ExpandedSCEVs);
  }

  /// The interface for creating a vectorized skeleton using one of two
  /// different strategies, each corresponding to one execution of the vplan
  /// as described above.
  virtual std::pair<BasicBlock *, Value *>
  createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) = 0;

  /// Holds and updates state information required to vectorize the main loop
  /// and its epilogue in two separate passes. This setup helps us avoid
  /// regenerating and recomputing runtime safety checks. It also helps us to
  /// shorten the iteration-count-check path length for the cases where the
  /// iteration count of the loop is so small that the main vector loop is
  /// completely skipped.
  EpilogueLoopVectorizationInfo &EPI;
};

/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *main* loops in the process of vectorizing loops and their
/// epilogues.
class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
public:
  /// Forwards every argument unchanged to InnerLoopAndEpilogueVectorizer.
  EpilogueVectorizerMainLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Check)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Check) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *main loop* strategy (i.e. the first pass of vplan execution).
  std::pair<BasicBlock *, Value *>
  createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;

protected:
  /// Emits an iteration count bypass check once for the main loop (when \p
  /// ForEpilogue is false) and once for the epilogue loop (when \p
  /// ForEpilogue is true).
  BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};

// A specialized derived class of inner loop vectorizer that performs
// vectorization of *epilogue* loops in the process of vectorizing loops and
// their epilogues.
909class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer { 910public: 911 EpilogueVectorizerEpilogueLoop( 912 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, 913 DominatorTree *DT, const TargetLibraryInfo *TLI, 914 const TargetTransformInfo *TTI, AssumptionCache *AC, 915 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, 916 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, 917 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, 918 GeneratedRTChecks &Checks) 919 : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 920 EPI, LVL, CM, BFI, PSI, Checks) { 921 TripCount = EPI.TripCount; 922 } 923 /// Implements the interface for creating a vectorized skeleton using the 924 /// *epilogue loop* strategy (ie the second pass of vplan execution). 925 std::pair<BasicBlock *, Value *> 926 createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final; 927 928protected: 929 /// Emits an iteration count bypass check after the main vector loop has 930 /// finished to see if there are any iterations left to execute by either 931 /// the vector epilogue or the scalar epilogue. 932 BasicBlock *emitMinimumVectorEpilogueIterCountCheck( 933 BasicBlock *Bypass, 934 BasicBlock *Insert); 935 void printDebugTracesAtStart() override; 936 void printDebugTracesAtEnd() override; 937}; 938} // end namespace llvm 939 940/// Look for a meaningful debug location on the instruction or it's 941/// operands. 942static DebugLoc getDebugLocFromInstOrOperands(Instruction *I) { 943 if (!I) 944 return DebugLoc(); 945 946 DebugLoc Empty; 947 if (I->getDebugLoc() != Empty) 948 return I->getDebugLoc(); 949 950 for (Use &Op : I->operands()) { 951 if (Instruction *OpInst = dyn_cast<Instruction>(Op)) 952 if (OpInst->getDebugLoc() != Empty) 953 return OpInst->getDebugLoc(); 954 } 955 956 return I->getDebugLoc(); 957} 958 959/// Write a \p DebugMsg about vectorization to the debug output stream. 
If \p I 960/// is passed, the message relates to that particular instruction. 961#ifndef NDEBUG 962static void debugVectorizationMessage(const StringRef Prefix, 963 const StringRef DebugMsg, 964 Instruction *I) { 965 dbgs() << "LV: " << Prefix << DebugMsg; 966 if (I != nullptr) 967 dbgs() << " " << *I; 968 else 969 dbgs() << '.'; 970 dbgs() << '\n'; 971} 972#endif 973 974/// Create an analysis remark that explains why vectorization failed 975/// 976/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p 977/// RemarkName is the identifier for the remark. If \p I is passed it is an 978/// instruction that prevents vectorization. Otherwise \p TheLoop is used for 979/// the location of the remark. \return the remark object that can be 980/// streamed to. 981static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName, 982 StringRef RemarkName, Loop *TheLoop, Instruction *I) { 983 Value *CodeRegion = TheLoop->getHeader(); 984 DebugLoc DL = TheLoop->getStartLoc(); 985 986 if (I) { 987 CodeRegion = I->getParent(); 988 // If there is no debug location attached to the instruction, revert back to 989 // using the loop's. 990 if (I->getDebugLoc()) 991 DL = I->getDebugLoc(); 992 } 993 994 return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion); 995} 996 997namespace llvm { 998 999/// Return a value for Step multiplied by VF. 1000Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF, 1001 int64_t Step) { 1002 assert(Ty->isIntegerTy() && "Expected an integer step"); 1003 return B.CreateElementCount(Ty, VF.multiplyCoefficientBy(Step)); 1004} 1005 1006/// Return the runtime value for VF. 
1007Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) { 1008 return B.CreateElementCount(Ty, VF); 1009} 1010 1011const SCEV *createTripCountSCEV(Type *IdxTy, PredicatedScalarEvolution &PSE, 1012 Loop *OrigLoop) { 1013 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 1014 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && "Invalid loop count"); 1015 1016 ScalarEvolution &SE = *PSE.getSE(); 1017 return SE.getTripCountFromExitCount(BackedgeTakenCount, IdxTy, OrigLoop); 1018} 1019 1020void reportVectorizationFailure(const StringRef DebugMsg, 1021 const StringRef OREMsg, const StringRef ORETag, 1022 OptimizationRemarkEmitter *ORE, Loop *TheLoop, 1023 Instruction *I) { 1024 LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I)); 1025 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE); 1026 ORE->emit( 1027 createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I) 1028 << "loop not vectorized: " << OREMsg); 1029} 1030 1031void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag, 1032 OptimizationRemarkEmitter *ORE, Loop *TheLoop, 1033 Instruction *I) { 1034 LLVM_DEBUG(debugVectorizationMessage("", Msg, I)); 1035 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE); 1036 ORE->emit( 1037 createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I) 1038 << Msg); 1039} 1040 1041/// Report successful vectorization of the loop. In case an outer loop is 1042/// vectorized, prepend "outer" to the vectorization remark. 1043static void reportVectorization(OptimizationRemarkEmitter *ORE, Loop *TheLoop, 1044 VectorizationFactor VF, unsigned IC) { 1045 LLVM_DEBUG(debugVectorizationMessage( 1046 "Vectorizing: ", TheLoop->isInnermost() ? "innermost loop" : "outer loop", 1047 nullptr)); 1048 StringRef LoopType = TheLoop->isInnermost() ? 
"" : "outer "; 1049 ORE->emit([&]() { 1050 return OptimizationRemark(LV_NAME, "Vectorized", TheLoop->getStartLoc(), 1051 TheLoop->getHeader()) 1052 << "vectorized " << LoopType << "loop (vectorization width: " 1053 << ore::NV("VectorizationFactor", VF.Width) 1054 << ", interleaved count: " << ore::NV("InterleaveCount", IC) << ")"; 1055 }); 1056} 1057 1058} // end namespace llvm 1059 1060#ifndef NDEBUG 1061/// \return string containing a file name and a line # for the given loop. 1062static std::string getDebugLocString(const Loop *L) { 1063 std::string Result; 1064 if (L) { 1065 raw_string_ostream OS(Result); 1066 if (const DebugLoc LoopDbgLoc = L->getStartLoc()) 1067 LoopDbgLoc.print(OS); 1068 else 1069 // Just print the module name. 1070 OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier(); 1071 OS.flush(); 1072 } 1073 return Result; 1074} 1075#endif 1076 1077void InnerLoopVectorizer::collectPoisonGeneratingRecipes( 1078 VPTransformState &State) { 1079 1080 // Collect recipes in the backward slice of `Root` that may generate a poison 1081 // value that is used after vectorization. 1082 SmallPtrSet<VPRecipeBase *, 16> Visited; 1083 auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) { 1084 SmallVector<VPRecipeBase *, 16> Worklist; 1085 Worklist.push_back(Root); 1086 1087 // Traverse the backward slice of Root through its use-def chain. 1088 while (!Worklist.empty()) { 1089 VPRecipeBase *CurRec = Worklist.back(); 1090 Worklist.pop_back(); 1091 1092 if (!Visited.insert(CurRec).second) 1093 continue; 1094 1095 // Prune search if we find another recipe generating a widen memory 1096 // instruction. Widen memory instructions involved in address computation 1097 // will lead to gather/scatter instructions, which don't need to be 1098 // handled. 
1099 if (isa<VPWidenMemoryInstructionRecipe>(CurRec) || 1100 isa<VPInterleaveRecipe>(CurRec) || 1101 isa<VPScalarIVStepsRecipe>(CurRec) || 1102 isa<VPCanonicalIVPHIRecipe>(CurRec) || 1103 isa<VPActiveLaneMaskPHIRecipe>(CurRec)) 1104 continue; 1105 1106 // This recipe contributes to the address computation of a widen 1107 // load/store. If the underlying instruction has poison-generating flags, 1108 // drop them directly. 1109 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(CurRec)) { 1110 RecWithFlags->dropPoisonGeneratingFlags(); 1111 } else { 1112 Instruction *Instr = dyn_cast_or_null<Instruction>( 1113 CurRec->getVPSingleValue()->getUnderlyingValue()); 1114 (void)Instr; 1115 assert((!Instr || !Instr->hasPoisonGeneratingFlags()) && 1116 "found instruction with poison generating flags not covered by " 1117 "VPRecipeWithIRFlags"); 1118 } 1119 1120 // Add new definitions to the worklist. 1121 for (VPValue *operand : CurRec->operands()) 1122 if (VPRecipeBase *OpDef = operand->getDefiningRecipe()) 1123 Worklist.push_back(OpDef); 1124 } 1125 }); 1126 1127 // Traverse all the recipes in the VPlan and collect the poison-generating 1128 // recipes in the backward slice starting at the address of a VPWidenRecipe or 1129 // VPInterleaveRecipe. 
1130 auto Iter = vp_depth_first_deep(State.Plan->getEntry()); 1131 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) { 1132 for (VPRecipeBase &Recipe : *VPBB) { 1133 if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) { 1134 Instruction &UnderlyingInstr = WidenRec->getIngredient(); 1135 VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe(); 1136 if (AddrDef && WidenRec->isConsecutive() && 1137 Legal->blockNeedsPredication(UnderlyingInstr.getParent())) 1138 collectPoisonGeneratingInstrsInBackwardSlice(AddrDef); 1139 } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) { 1140 VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe(); 1141 if (AddrDef) { 1142 // Check if any member of the interleave group needs predication. 1143 const InterleaveGroup<Instruction> *InterGroup = 1144 InterleaveRec->getInterleaveGroup(); 1145 bool NeedPredication = false; 1146 for (int I = 0, NumMembers = InterGroup->getNumMembers(); 1147 I < NumMembers; ++I) { 1148 Instruction *Member = InterGroup->getMember(I); 1149 if (Member) 1150 NeedPredication |= 1151 Legal->blockNeedsPredication(Member->getParent()); 1152 } 1153 1154 if (NeedPredication) 1155 collectPoisonGeneratingInstrsInBackwardSlice(AddrDef); 1156 } 1157 } 1158 } 1159 } 1160} 1161 1162namespace llvm { 1163 1164// Loop vectorization cost-model hints how the scalar epilogue loop should be 1165// lowered. 1166enum ScalarEpilogueLowering { 1167 1168 // The default: allowing scalar epilogues. 1169 CM_ScalarEpilogueAllowed, 1170 1171 // Vectorization with OptForSize: don't allow epilogues. 1172 CM_ScalarEpilogueNotAllowedOptSize, 1173 1174 // A special case of vectorisation with OptForSize: loops with a very small 1175 // trip count are considered for vectorization under OptForSize, thereby 1176 // making sure the cost of their loop body is dominant, free of runtime 1177 // guards and scalar iteration overheads. 
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate,

  // Directive indicating we must either tail fold or not vectorize
  CM_ScalarEpilogueNotAllowedUsePredicate
};

/// An (instruction, vectorization factor) pair.
using InstructionVFPair = std::pair<Instruction *, ElementCount>;

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
/// a number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
public:
  /// Stores the given analyses and settings in the corresponding members; no
  /// analysis is performed by the constructor itself. \p SEL becomes the
  /// scalar-epilogue status, \p L the loop under consideration and \p IAI the
  /// interleaved-access information used by the model.
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factors (both fixed and
  /// scalable). If the factors are 0, vectorization and interleaving should be
  /// avoided up front.
  FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// Setup cost-based decisions for user vectorization factor.
1221 /// \return true if the UserVF is a feasible VF to be chosen. 1222 bool selectUserVectorizationFactor(ElementCount UserVF) { 1223 collectUniformsAndScalars(UserVF); 1224 collectInstsToScalarize(UserVF); 1225 return expectedCost(UserVF).first.isValid(); 1226 } 1227 1228 /// \return The size (in bits) of the smallest and widest types in the code 1229 /// that needs to be vectorized. We ignore values that remain scalar such as 1230 /// 64 bit loop indices. 1231 std::pair<unsigned, unsigned> getSmallestAndWidestTypes(); 1232 1233 /// \return The desired interleave count. 1234 /// If interleave count has been specified by metadata it will be returned. 1235 /// Otherwise, the interleave count is computed and returned. VF and LoopCost 1236 /// are the selected vectorization factor and the cost of the selected VF. 1237 unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost); 1238 1239 /// Memory access instruction may be vectorized in more than one way. 1240 /// Form of instruction after vectorization depends on cost. 1241 /// This function takes cost-based decisions for Load/Store instructions 1242 /// and collects them in a map. This decisions map is used for building 1243 /// the lists of loop-uniform and loop-scalar instructions. 1244 /// The calculated cost is saved with widening decision in order to 1245 /// avoid redundant calculations. 1246 void setCostBasedWideningDecision(ElementCount VF); 1247 1248 /// A call may be vectorized in different ways depending on whether we have 1249 /// vectorized variants available and whether the target supports masking. 1250 /// This function analyzes all calls in the function at the supplied VF, 1251 /// makes a decision based on the costs of available options, and stores that 1252 /// decision in a map for use in planning and plan execution. 1253 void setVectorizedCallDecision(ElementCount VF); 1254 1255 /// A struct that represents some properties of the register usage 1256 /// of a loop. 
1257 struct RegisterUsage { 1258 /// Holds the number of loop invariant values that are used in the loop. 1259 /// The key is ClassID of target-provided register class. 1260 SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs; 1261 /// Holds the maximum number of concurrent live intervals in the loop. 1262 /// The key is ClassID of target-provided register class. 1263 SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers; 1264 }; 1265 1266 /// \return Returns information about the register usages of the loop for the 1267 /// given vectorization factors. 1268 SmallVector<RegisterUsage, 8> 1269 calculateRegisterUsage(ArrayRef<ElementCount> VFs); 1270 1271 /// Collect values we want to ignore in the cost model. 1272 void collectValuesToIgnore(); 1273 1274 /// Collect all element types in the loop for which widening is needed. 1275 void collectElementTypesForWidening(); 1276 1277 /// Split reductions into those that happen in the loop, and those that happen 1278 /// outside. In loop reductions are collected into InLoopReductions. 1279 void collectInLoopReductions(); 1280 1281 /// Returns true if we should use strict in-order reductions for the given 1282 /// RdxDesc. This is true if the -enable-strict-reductions flag is passed, 1283 /// the IsOrdered flag of RdxDesc is set and we do not allow reordering 1284 /// of FP operations. 1285 bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const { 1286 return !Hints->allowReordering() && RdxDesc.isOrdered(); 1287 } 1288 1289 /// \returns The smallest bitwidth each instruction can be represented with. 1290 /// The vector equivalents of these instructions should be truncated to this 1291 /// type. 1292 const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const { 1293 return MinBWs; 1294 } 1295 1296 /// \returns True if it is more profitable to scalarize instruction \p I for 1297 /// vectorization factor \p VF. 
1298 bool isProfitableToScalarize(Instruction *I, ElementCount VF) const { 1299 assert(VF.isVector() && 1300 "Profitable to scalarize relevant only for VF > 1."); 1301 1302 // Cost model is not run in the VPlan-native path - return conservative 1303 // result until this changes. 1304 if (EnableVPlanNativePath) 1305 return false; 1306 1307 auto Scalars = InstsToScalarize.find(VF); 1308 assert(Scalars != InstsToScalarize.end() && 1309 "VF not yet analyzed for scalarization profitability"); 1310 return Scalars->second.contains(I); 1311 } 1312 1313 /// Returns true if \p I is known to be uniform after vectorization. 1314 bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const { 1315 // Pseudo probe needs to be duplicated for each unrolled iteration and 1316 // vector lane so that profiled loop trip count can be accurately 1317 // accumulated instead of being under counted. 1318 if (isa<PseudoProbeInst>(I)) 1319 return false; 1320 1321 if (VF.isScalar()) 1322 return true; 1323 1324 // Cost model is not run in the VPlan-native path - return conservative 1325 // result until this changes. 1326 if (EnableVPlanNativePath) 1327 return false; 1328 1329 auto UniformsPerVF = Uniforms.find(VF); 1330 assert(UniformsPerVF != Uniforms.end() && 1331 "VF not yet analyzed for uniformity"); 1332 return UniformsPerVF->second.count(I); 1333 } 1334 1335 /// Returns true if \p I is known to be scalar after vectorization. 1336 bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const { 1337 if (VF.isScalar()) 1338 return true; 1339 1340 // Cost model is not run in the VPlan-native path - return conservative 1341 // result until this changes. 
1342 if (EnableVPlanNativePath) 1343 return false; 1344 1345 auto ScalarsPerVF = Scalars.find(VF); 1346 assert(ScalarsPerVF != Scalars.end() && 1347 "Scalar values are not calculated for VF"); 1348 return ScalarsPerVF->second.count(I); 1349 } 1350 1351 /// \returns True if instruction \p I can be truncated to a smaller bitwidth 1352 /// for vectorization factor \p VF. 1353 bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const { 1354 return VF.isVector() && MinBWs.contains(I) && 1355 !isProfitableToScalarize(I, VF) && 1356 !isScalarAfterVectorization(I, VF); 1357 } 1358 1359 /// Decision that was taken during cost calculation for memory instruction. 1360 enum InstWidening { 1361 CM_Unknown, 1362 CM_Widen, // For consecutive accesses with stride +1. 1363 CM_Widen_Reverse, // For consecutive accesses with stride -1. 1364 CM_Interleave, 1365 CM_GatherScatter, 1366 CM_Scalarize, 1367 CM_VectorCall, 1368 CM_IntrinsicCall 1369 }; 1370 1371 /// Save vectorization decision \p W and \p Cost taken by the cost model for 1372 /// instruction \p I and vector width \p VF. 1373 void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W, 1374 InstructionCost Cost) { 1375 assert(VF.isVector() && "Expected VF >=2"); 1376 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost); 1377 } 1378 1379 /// Save vectorization decision \p W and \p Cost taken by the cost model for 1380 /// interleaving group \p Grp and vector width \p VF. 1381 void setWideningDecision(const InterleaveGroup<Instruction> *Grp, 1382 ElementCount VF, InstWidening W, 1383 InstructionCost Cost) { 1384 assert(VF.isVector() && "Expected VF >=2"); 1385 /// Broadcast this decicion to all instructions inside the group. 1386 /// But the cost will be assigned to one instruction only. 
1387 for (unsigned i = 0; i < Grp->getFactor(); ++i) { 1388 if (auto *I = Grp->getMember(i)) { 1389 if (Grp->getInsertPos() == I) 1390 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost); 1391 else 1392 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0); 1393 } 1394 } 1395 } 1396 1397 /// Return the cost model decision for the given instruction \p I and vector 1398 /// width \p VF. Return CM_Unknown if this instruction did not pass 1399 /// through the cost modeling. 1400 InstWidening getWideningDecision(Instruction *I, ElementCount VF) const { 1401 assert(VF.isVector() && "Expected VF to be a vector VF"); 1402 // Cost model is not run in the VPlan-native path - return conservative 1403 // result until this changes. 1404 if (EnableVPlanNativePath) 1405 return CM_GatherScatter; 1406 1407 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); 1408 auto Itr = WideningDecisions.find(InstOnVF); 1409 if (Itr == WideningDecisions.end()) 1410 return CM_Unknown; 1411 return Itr->second.first; 1412 } 1413 1414 /// Return the vectorization cost for the given instruction \p I and vector 1415 /// width \p VF. 
1416 InstructionCost getWideningCost(Instruction *I, ElementCount VF) { 1417 assert(VF.isVector() && "Expected VF >=2"); 1418 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF); 1419 assert(WideningDecisions.contains(InstOnVF) && 1420 "The cost is not calculated"); 1421 return WideningDecisions[InstOnVF].second; 1422 } 1423 1424 struct CallWideningDecision { 1425 InstWidening Kind; 1426 Function *Variant; 1427 Intrinsic::ID IID; 1428 std::optional<unsigned> MaskPos; 1429 InstructionCost Cost; 1430 }; 1431 1432 void setCallWideningDecision(CallInst *CI, ElementCount VF, InstWidening Kind, 1433 Function *Variant, Intrinsic::ID IID, 1434 std::optional<unsigned> MaskPos, 1435 InstructionCost Cost) { 1436 assert(!VF.isScalar() && "Expected vector VF"); 1437 CallWideningDecisions[std::make_pair(CI, VF)] = {Kind, Variant, IID, 1438 MaskPos, Cost}; 1439 } 1440 1441 CallWideningDecision getCallWideningDecision(CallInst *CI, 1442 ElementCount VF) const { 1443 assert(!VF.isScalar() && "Expected vector VF"); 1444 return CallWideningDecisions.at(std::make_pair(CI, VF)); 1445 } 1446 1447 /// Return True if instruction \p I is an optimizable truncate whose operand 1448 /// is an induction variable. Such a truncate will be removed by adding a new 1449 /// induction variable with the destination type. 1450 bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) { 1451 // If the instruction is not a truncate, return false. 1452 auto *Trunc = dyn_cast<TruncInst>(I); 1453 if (!Trunc) 1454 return false; 1455 1456 // Get the source and destination types of the truncate. 1457 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF); 1458 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF); 1459 1460 // If the truncate is free for the given types, return false. Replacing a 1461 // free truncate with an induction variable would add an induction variable 1462 // update instruction to each iteration of the loop. 
We exclude from this 1463 // check the primary induction variable since it will need an update 1464 // instruction regardless. 1465 Value *Op = Trunc->getOperand(0); 1466 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy)) 1467 return false; 1468 1469 // If the truncated value is not an induction variable, return false. 1470 return Legal->isInductionPhi(Op); 1471 } 1472 1473 /// Collects the instructions to scalarize for each predicated instruction in 1474 /// the loop. 1475 void collectInstsToScalarize(ElementCount VF); 1476 1477 /// Collect Uniform and Scalar values for the given \p VF. 1478 /// The sets depend on CM decision for Load/Store instructions 1479 /// that may be vectorized as interleave, gather-scatter or scalarized. 1480 /// Also make a decision on what to do about call instructions in the loop 1481 /// at that VF -- scalarize, call a known vector routine, or call a 1482 /// vector intrinsic. 1483 void collectUniformsAndScalars(ElementCount VF) { 1484 // Do the analysis once. 1485 if (VF.isScalar() || Uniforms.contains(VF)) 1486 return; 1487 setCostBasedWideningDecision(VF); 1488 setVectorizedCallDecision(VF); 1489 collectLoopUniforms(VF); 1490 collectLoopScalars(VF); 1491 } 1492 1493 /// Returns true if the target machine supports masked store operation 1494 /// for the given \p DataType and kind of access to \p Ptr. 1495 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const { 1496 return Legal->isConsecutivePtr(DataType, Ptr) && 1497 TTI.isLegalMaskedStore(DataType, Alignment); 1498 } 1499 1500 /// Returns true if the target machine supports masked load operation 1501 /// for the given \p DataType and kind of access to \p Ptr. 
1502 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const { 1503 return Legal->isConsecutivePtr(DataType, Ptr) && 1504 TTI.isLegalMaskedLoad(DataType, Alignment); 1505 } 1506 1507 /// Returns true if the target machine can represent \p V as a masked gather 1508 /// or scatter operation. 1509 bool isLegalGatherOrScatter(Value *V, ElementCount VF) { 1510 bool LI = isa<LoadInst>(V); 1511 bool SI = isa<StoreInst>(V); 1512 if (!LI && !SI) 1513 return false; 1514 auto *Ty = getLoadStoreType(V); 1515 Align Align = getLoadStoreAlignment(V); 1516 if (VF.isVector()) 1517 Ty = VectorType::get(Ty, VF); 1518 return (LI && TTI.isLegalMaskedGather(Ty, Align)) || 1519 (SI && TTI.isLegalMaskedScatter(Ty, Align)); 1520 } 1521 1522 /// Returns true if the target machine supports all of the reduction 1523 /// variables found for the given VF. 1524 bool canVectorizeReductions(ElementCount VF) const { 1525 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 1526 const RecurrenceDescriptor &RdxDesc = Reduction.second; 1527 return TTI.isLegalToVectorizeReduction(RdxDesc, VF); 1528 })); 1529 } 1530 1531 /// Given costs for both strategies, return true if the scalar predication 1532 /// lowering should be used for div/rem. This incorporates an override 1533 /// option so it is not simply a cost comparison. 1534 bool isDivRemScalarWithPredication(InstructionCost ScalarCost, 1535 InstructionCost SafeDivisorCost) const { 1536 switch (ForceSafeDivisor) { 1537 case cl::BOU_UNSET: 1538 return ScalarCost < SafeDivisorCost; 1539 case cl::BOU_TRUE: 1540 return false; 1541 case cl::BOU_FALSE: 1542 return true; 1543 }; 1544 llvm_unreachable("impossible case value"); 1545 } 1546 1547 /// Returns true if \p I is an instruction which requires predication and 1548 /// for which our chosen predication strategy is scalarization (i.e. we 1549 /// don't have an alternate strategy such as masking available). 
1550 /// \p VF is the vectorization factor that will be used to vectorize \p I. 1551 bool isScalarWithPredication(Instruction *I, ElementCount VF) const; 1552 1553 /// Returns true if \p I is an instruction that needs to be predicated 1554 /// at runtime. The result is independent of the predication mechanism. 1555 /// Superset of instructions that return true for isScalarWithPredication. 1556 bool isPredicatedInst(Instruction *I) const; 1557 1558 /// Return the costs for our two available strategies for lowering a 1559 /// div/rem operation which requires speculating at least one lane. 1560 /// First result is for scalarization (will be invalid for scalable 1561 /// vectors); second is for the safe-divisor strategy. 1562 std::pair<InstructionCost, InstructionCost> 1563 getDivRemSpeculationCost(Instruction *I, 1564 ElementCount VF) const; 1565 1566 /// Returns true if \p I is a memory instruction with consecutive memory 1567 /// access that can be widened. 1568 bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF); 1569 1570 /// Returns true if \p I is a memory instruction in an interleaved-group 1571 /// of memory accesses that can be vectorized with wide vector loads/stores 1572 /// and shuffles. 1573 bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF); 1574 1575 /// Check if \p Instr belongs to any interleaved access group. 1576 bool isAccessInterleaved(Instruction *Instr) { 1577 return InterleaveInfo.isInterleaved(Instr); 1578 } 1579 1580 /// Get the interleaved access group that \p Instr belongs to. 1581 const InterleaveGroup<Instruction> * 1582 getInterleavedAccessGroup(Instruction *Instr) { 1583 return InterleaveInfo.getInterleaveGroup(Instr); 1584 } 1585 1586 /// Returns true if we're required to use a scalar epilogue for at least 1587 /// the final iteration of the original loop. 
bool requiresScalarEpilogue(bool IsVectorizing) const {
  // A scalar epilogue that is not allowed is never reported as required.
  if (!isScalarEpilogueAllowed())
    return false;
  // If we might exit from anywhere but the latch, must run the exiting
  // iteration in scalar form.
  if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
    return true;
  // Otherwise one is only needed when vectorizing and the interleave groups
  // ask for it.
  return IsVectorizing && InterleaveInfo.requiresScalarEpilogue();
}

/// Returns true if we're required to use a scalar epilogue for at least
/// the final iteration of the original loop for all VFs in \p Range.
/// A scalar epilogue must either be required for all VFs in \p Range or for
/// none; this is checked by an assertion.
bool requiresScalarEpilogue(VFRange Range) const {
  auto RequiresScalarEpilogue = [this](ElementCount VF) {
    return requiresScalarEpilogue(VF.isVector());
  };
  bool IsRequired = all_of(Range, RequiresScalarEpilogue);
  assert(
      (IsRequired || none_of(Range, RequiresScalarEpilogue)) &&
      "all VFs in range must agree on whether a scalar epilogue is required");
  return IsRequired;
}

/// Returns true if a scalar epilogue is allowed, i.e. ScalarEpilogueStatus
/// is CM_ScalarEpilogueAllowed. It is not allowed when it has been ruled out,
/// e.g. due to optsize or a loop hint annotation.
bool isScalarEpilogueAllowed() const {
  return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
}

/// Returns the TailFoldingStyle that is best for the current loop.
/// \p IVUpdateMayOverflow is forwarded to the target's preference query.
TailFoldingStyle
getTailFoldingStyle(bool IVUpdateMayOverflow = true) const {
  // Without the ability to fold the tail by masking there is no style to pick.
  if (!CanFoldTailByMasking)
    return TailFoldingStyle::None;

  // An explicitly given ForceTailFoldingStyle option overrides the target.
  if (ForceTailFoldingStyle.getNumOccurrences())
    return ForceTailFoldingStyle;

  return TTI.getPreferredTailFoldingStyle(IVUpdateMayOverflow);
}

/// Returns true if all loop blocks should be masked to fold tail loop.
bool foldTailByMasking() const {
  return getTailFoldingStyle() != TailFoldingStyle::None;
}

/// Returns true if the instructions in this block require predication
/// for any reason, e.g. because tail folding now requires a predicate
/// or because the block in the original loop was predicated.
bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const {
  return foldTailByMasking() || Legal->blockNeedsPredication(BB);
}

/// Returns true if the Phi is part of an inloop reduction.
bool isInLoopReduction(PHINode *Phi) const {
  return InLoopReductions.contains(Phi);
}

/// Estimate cost of an intrinsic call instruction CI if it were vectorized
/// with factor VF. Return the cost of the instruction, including
/// scalarization overhead if it's needed.
InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;

/// Estimate cost of a call instruction CI if it were vectorized with factor
/// VF. Return the cost of the instruction, including scalarization overhead
/// if it's needed.
InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF) const;

/// Invalidates decisions already taken by the cost model: clears
/// WideningDecisions, CallWideningDecisions, Uniforms and Scalars.
void invalidateCostModelingDecisions() {
  WideningDecisions.clear();
  CallWideningDecisions.clear();
  Uniforms.clear();
  Scalars.clear();
}

/// The vectorization cost is a combination of the cost itself and a boolean
/// indicating whether any of the contributing operations will actually
/// operate on vector values after type legalization in the backend. If this
/// latter value is false, then all operations will be scalarized (i.e. no
/// vectorization has actually taken place).
using VectorizationCostTy = std::pair<InstructionCost, bool>;

/// Returns the expected execution cost. The unit of the cost does
/// not matter because we use the 'cost' units to compare different
/// vector widths. The cost that is returned is *not* normalized by
/// the factor width. If \p Invalid is not nullptr, this function
/// will add a pair(Instruction*, ElementCount) to \p Invalid for
/// each instruction that has an Invalid cost for the given VF.
VectorizationCostTy
expectedCost(ElementCount VF,
             SmallVectorImpl<InstructionVFPair> *Invalid = nullptr);

/// Returns true if NumPredStores is non-zero.
bool hasPredStores() const { return NumPredStores > 0; }

/// Returns true if epilogue vectorization is considered profitable, and
/// false otherwise.
/// \p VF is the vectorization factor chosen for the original loop.
bool isEpilogueVectorizationProfitable(const ElementCount VF) const;

private:
/// Counter backing hasPredStores().
/// NOTE(review): presumably the number of predicated stores seen by the cost
/// model; it is updated outside this part of the file -- confirm.
unsigned NumPredStores = 0;

/// \return An upper bound for the vectorization factors for both
/// fixed and scalable vectorization, where the minimum-known number of
/// elements is a power-of-2 larger than zero. If scalable vectorization is
/// disabled or unsupported, then the scalable part will be equal to
/// ElementCount::getScalable(0).
FixedScalableVFPair computeFeasibleMaxVF(unsigned MaxTripCount,
                                         ElementCount UserVF,
                                         bool FoldTailByMasking);

/// \return the maximized element count based on the target's vector
/// registers and the loop trip-count, but limited to a maximum safe VF.
/// This is a helper function of computeFeasibleMaxVF.
ElementCount getMaximizedVFForTarget(unsigned MaxTripCount,
                                     unsigned SmallestType,
                                     unsigned WidestType,
                                     ElementCount MaxSafeVF,
                                     bool FoldTailByMasking);

/// \return the maximum legal scalable VF, based on the safe max number
/// of elements.
ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);

/// Returns the execution time cost of an instruction for a given vector
/// width. Vector width of one means scalar.
VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);

/// The cost-computation logic from getInstructionCost which provides
/// the vector type as an output parameter (\p VectorTy).
InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
                                   Type *&VectorTy);

/// Return the cost of instructions in an inloop reduction pattern, if \p I is
/// part of that pattern.
/// NOTE(review): the std::optional result is presumably empty when \p I is
/// not part of such a pattern -- confirm against the definition.
std::optional<InstructionCost>
getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
                        TTI::TargetCostKind CostKind) const;

/// Calculate vectorization cost of memory instruction \p I.
InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);

/// The cost computation for scalarized memory instruction.
InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);

/// The cost computation for interleaving group of memory instructions.
InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);

/// The cost computation for Gather/Scatter instruction.
InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);

/// The cost computation for widening instruction \p I with consecutive
/// memory access.
InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);

/// The cost calculation for Load/Store instruction \p I with uniform pointer -
/// Load: scalar load + broadcast.
/// Store: scalar store + (loop invariant value stored? 0 : extract of last
/// element)
InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);

/// Estimate the overhead of scalarizing an instruction. This is a
/// convenience wrapper for the type-based getScalarizationOverhead API.
InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF,
                                         TTI::TargetCostKind CostKind) const;

/// Returns true if an artificially high cost for emulated masked memrefs
/// should be used.
bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);

/// Map of scalar integer values to the smallest bitwidth they can be legally
/// represented as. The vector equivalents of these values should be truncated
/// to this type.
MapVector<Instruction *, uint64_t> MinBWs;

/// A type representing the costs for instructions if they were to be
/// scalarized rather than vectorized. The entries are Instruction-Cost
/// pairs.
using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;

/// Per-VF sets of the BasicBlocks that are known to be present after
/// vectorization as a predicated block.
DenseMap<ElementCount, SmallPtrSet<BasicBlock *, 4>>
    PredicatedBBsAfterVectorization;

/// Records whether it is allowed to have the original scalar loop execute at
/// least once. This may be needed as a fallback loop in case runtime
/// aliasing/dependence checks fail, or to handle the tail/remainder
/// iterations when the trip count is unknown or doesn't divide by the VF,
/// or as a peel-loop to handle gaps in interleave-groups.
/// Under optsize and when the trip count is very small we don't allow any
/// iterations to execute in the scalar loop.
ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;

/// All blocks of loop are to be masked to fold tail of scalar iterations.
bool CanFoldTailByMasking = false;

/// A map holding scalar costs for different vectorization factors. The
/// presence of a cost for an instruction in the mapping indicates that the
/// instruction will be scalarized when vectorizing with the associated
/// vectorization factor. The entries are VF-ScalarCostTy pairs.
DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;

/// Holds the instructions known to be uniform after vectorization.
/// The data is collected per VF.
DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;

/// Holds the instructions known to be scalar after vectorization.
/// The data is collected per VF.
DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;

/// Holds the instructions (address computations) that are forced to be
/// scalarized. The data is kept per VF.
DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;

/// PHINodes of the reductions that should be expanded in-loop.
SmallPtrSet<PHINode *, 4> InLoopReductions;

/// A Map of inloop reduction operations and their immediate chain operand.
/// FIXME: This can be removed once reductions can be costed correctly in
/// VPlan. This was added to allow quick lookup of the inloop operations.
DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;

/// Returns the expected difference in cost from scalarizing the expression
/// feeding a predicated instruction \p PredInst. The instructions to
/// scalarize and their scalar costs are collected in \p ScalarCosts. A
/// non-negative return value implies the expression will be scalarized.
/// Currently, only single-use chains are considered for scalarization.
InstructionCost computePredInstDiscount(Instruction *PredInst,
                                        ScalarCostsTy &ScalarCosts,
                                        ElementCount VF);

/// Collect the instructions that are uniform after vectorization. An
/// instruction is uniform if we represent it with a single scalar value in
/// the vectorized loop corresponding to each vector iteration. Examples of
/// uniform instructions include pointer operands of consecutive or
/// interleaved memory accesses. Note that although uniformity implies an
/// instruction will be scalar, the reverse is not true. In general, a
/// scalarized instruction will be represented by VF scalar values in the
/// vectorized loop, each corresponding to an iteration of the original
/// scalar loop.
void collectLoopUniforms(ElementCount VF);

/// Collect the instructions that are scalar after vectorization. An
/// instruction is scalar if it is known to be uniform or will be scalarized
/// during vectorization. collectLoopScalars should only add non-uniform nodes
/// to the list if they are used by a load/store instruction that is marked as
/// CM_Scalarize. Non-uniform scalarized instructions will be represented by
/// VF values in the vectorized loop, each corresponding to an iteration of
/// the original scalar loop.
void collectLoopScalars(ElementCount VF);

/// Keeps cost model vectorization decision and cost for instructions.
/// Right now it is used for memory instructions only.
/// Keyed by (instruction, VF); the value is (widening decision, cost).
using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
                              std::pair<InstWidening, InstructionCost>>;

/// The recorded widening decisions; cleared by
/// invalidateCostModelingDecisions().
DecisionList WideningDecisions;

/// Maps a (call instruction, VF) pair to its CallWideningDecision.
using CallDecisionList =
    DenseMap<std::pair<CallInst *, ElementCount>, CallWideningDecision>;

/// The recorded call widening decisions; cleared by
/// invalidateCostModelingDecisions().
CallDecisionList CallWideningDecisions;

/// Returns true if \p V is expected to be vectorized and it needs to be
/// extracted. Always false for a scalar \p VF, for non-instructions, and for
/// instructions that are outside the loop or loop-invariant.
bool needsExtract(Value *V, ElementCount VF) const {
  Instruction *I = dyn_cast<Instruction>(V);
  if (VF.isScalar() || !I || !TheLoop->contains(I) ||
      TheLoop->isLoopInvariant(I))
    return false;

  // Assume we can vectorize V (and hence we need extraction) if the
  // scalars are not computed yet. This can happen, because it is called
  // via getScalarizationOverhead from setCostBasedWideningDecision, before
  // the scalars are collected. That should be a safe assumption in most
  // cases, because we check if the operands have vectorizable types
  // beforehand in LoopVectorizationLegality.
  return !Scalars.contains(VF) || !isScalarAfterVectorization(I, VF);
};

/// Returns a vector containing only the operands in \p Ops for which
/// needsExtract() is true at \p VF.
SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
                                                 ElementCount VF) const {
  return SmallVector<Value *, 4>(make_filter_range(
      Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
}

public:
/// The loop that we evaluate.
Loop *TheLoop;

/// Predicated scalar evolution analysis.
PredicatedScalarEvolution &PSE;

/// Loop Info analysis.
LoopInfo *LI;

/// Vectorization legality.
LoopVectorizationLegality *Legal;

/// Vector target information.
const TargetTransformInfo &TTI;

/// Target Library Info.
const TargetLibraryInfo *TLI;

/// Demanded bits analysis.
DemandedBits *DB;

/// Assumption cache.
AssumptionCache *AC;

/// Interface to emit optimization remarks.
OptimizationRemarkEmitter *ORE;

/// NOTE(review): presumably the function containing TheLoop -- confirm
/// against the constructor, which is outside this part of the file.
const Function *TheFunction;

/// Loop Vectorize Hint.
const LoopVectorizeHints *Hints;

/// The interleave access information contains groups of interleaved accesses
/// with the same stride and close to each other.
InterleavedAccessInfo &InterleaveInfo;

/// Values to ignore in the cost model.
SmallPtrSet<const Value *, 16> ValuesToIgnore;

/// Values to ignore in the cost model when VF > 1.
SmallPtrSet<const Value *, 16> VecValuesToIgnore;

/// All element types found in the loop.
SmallPtrSet<Type *, 16> ElementTypesInLoop;
};
} // end namespace llvm

namespace {
/// Helper struct to manage generating runtime checks for vectorization.
1929/// 1930/// The runtime checks are created up-front in temporary blocks to allow better 1931/// estimating the cost and un-linked from the existing IR. After deciding to 1932/// vectorize, the checks are moved back. If deciding not to vectorize, the 1933/// temporary blocks are completely removed. 1934class GeneratedRTChecks { 1935 /// Basic block which contains the generated SCEV checks, if any. 1936 BasicBlock *SCEVCheckBlock = nullptr; 1937 1938 /// The value representing the result of the generated SCEV checks. If it is 1939 /// nullptr, either no SCEV checks have been generated or they have been used. 1940 Value *SCEVCheckCond = nullptr; 1941 1942 /// Basic block which contains the generated memory runtime checks, if any. 1943 BasicBlock *MemCheckBlock = nullptr; 1944 1945 /// The value representing the result of the generated memory runtime checks. 1946 /// If it is nullptr, either no memory runtime checks have been generated or 1947 /// they have been used. 1948 Value *MemRuntimeCheckCond = nullptr; 1949 1950 DominatorTree *DT; 1951 LoopInfo *LI; 1952 TargetTransformInfo *TTI; 1953 1954 SCEVExpander SCEVExp; 1955 SCEVExpander MemCheckExp; 1956 1957 bool CostTooHigh = false; 1958 const bool AddBranchWeights; 1959 1960 Loop *OuterLoop = nullptr; 1961 1962public: 1963 GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI, 1964 TargetTransformInfo *TTI, const DataLayout &DL, 1965 bool AddBranchWeights) 1966 : DT(DT), LI(LI), TTI(TTI), SCEVExp(SE, DL, "scev.check"), 1967 MemCheckExp(SE, DL, "scev.check"), AddBranchWeights(AddBranchWeights) {} 1968 1969 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can 1970 /// accurately estimate the cost of the runtime checks. The blocks are 1971 /// un-linked from the IR and is added back during vector code generation. If 1972 /// there is no vector code generation, the check blocks are removed 1973 /// completely. 
1974 void Create(Loop *L, const LoopAccessInfo &LAI, 1975 const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) { 1976 1977 // Hard cutoff to limit compile-time increase in case a very large number of 1978 // runtime checks needs to be generated. 1979 // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to 1980 // profile info. 1981 CostTooHigh = 1982 LAI.getNumRuntimePointerChecks() > VectorizeMemoryCheckThreshold; 1983 if (CostTooHigh) 1984 return; 1985 1986 BasicBlock *LoopHeader = L->getHeader(); 1987 BasicBlock *Preheader = L->getLoopPreheader(); 1988 1989 // Use SplitBlock to create blocks for SCEV & memory runtime checks to 1990 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those 1991 // may be used by SCEVExpander. The blocks will be un-linked from their 1992 // predecessors and removed from LI & DT at the end of the function. 1993 if (!UnionPred.isAlwaysTrue()) { 1994 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI, 1995 nullptr, "vector.scevcheck"); 1996 1997 SCEVCheckCond = SCEVExp.expandCodeForPredicate( 1998 &UnionPred, SCEVCheckBlock->getTerminator()); 1999 } 2000 2001 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking(); 2002 if (RtPtrChecking.Need) { 2003 auto *Pred = SCEVCheckBlock ? 
SCEVCheckBlock : Preheader; 2004 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr, 2005 "vector.memcheck"); 2006 2007 auto DiffChecks = RtPtrChecking.getDiffChecks(); 2008 if (DiffChecks) { 2009 Value *RuntimeVF = nullptr; 2010 MemRuntimeCheckCond = addDiffRuntimeChecks( 2011 MemCheckBlock->getTerminator(), *DiffChecks, MemCheckExp, 2012 [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) { 2013 if (!RuntimeVF) 2014 RuntimeVF = getRuntimeVF(B, B.getIntNTy(Bits), VF); 2015 return RuntimeVF; 2016 }, 2017 IC); 2018 } else { 2019 MemRuntimeCheckCond = addRuntimeChecks( 2020 MemCheckBlock->getTerminator(), L, RtPtrChecking.getChecks(), 2021 MemCheckExp, VectorizerParams::HoistRuntimeChecks); 2022 } 2023 assert(MemRuntimeCheckCond && 2024 "no RT checks generated although RtPtrChecking " 2025 "claimed checks are required"); 2026 } 2027 2028 if (!MemCheckBlock && !SCEVCheckBlock) 2029 return; 2030 2031 // Unhook the temporary block with the checks, update various places 2032 // accordingly. 2033 if (SCEVCheckBlock) 2034 SCEVCheckBlock->replaceAllUsesWith(Preheader); 2035 if (MemCheckBlock) 2036 MemCheckBlock->replaceAllUsesWith(Preheader); 2037 2038 if (SCEVCheckBlock) { 2039 SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 2040 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock); 2041 Preheader->getTerminator()->eraseFromParent(); 2042 } 2043 if (MemCheckBlock) { 2044 MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); 2045 new UnreachableInst(Preheader->getContext(), MemCheckBlock); 2046 Preheader->getTerminator()->eraseFromParent(); 2047 } 2048 2049 DT->changeImmediateDominator(LoopHeader, Preheader); 2050 if (MemCheckBlock) { 2051 DT->eraseNode(MemCheckBlock); 2052 LI->removeBlock(MemCheckBlock); 2053 } 2054 if (SCEVCheckBlock) { 2055 DT->eraseNode(SCEVCheckBlock); 2056 LI->removeBlock(SCEVCheckBlock); 2057 } 2058 2059 // Outer loop is used as part of the later cost calculations. 
2060 OuterLoop = L->getParentLoop(); 2061 } 2062 2063 InstructionCost getCost() { 2064 if (SCEVCheckBlock || MemCheckBlock) 2065 LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n"); 2066 2067 if (CostTooHigh) { 2068 InstructionCost Cost; 2069 Cost.setInvalid(); 2070 LLVM_DEBUG(dbgs() << " number of checks exceeded threshold\n"); 2071 return Cost; 2072 } 2073 2074 InstructionCost RTCheckCost = 0; 2075 if (SCEVCheckBlock) 2076 for (Instruction &I : *SCEVCheckBlock) { 2077 if (SCEVCheckBlock->getTerminator() == &I) 2078 continue; 2079 InstructionCost C = 2080 TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput); 2081 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n"); 2082 RTCheckCost += C; 2083 } 2084 if (MemCheckBlock) { 2085 InstructionCost MemCheckCost = 0; 2086 for (Instruction &I : *MemCheckBlock) { 2087 if (MemCheckBlock->getTerminator() == &I) 2088 continue; 2089 InstructionCost C = 2090 TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput); 2091 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n"); 2092 MemCheckCost += C; 2093 } 2094 2095 // If the runtime memory checks are being created inside an outer loop 2096 // we should find out if these checks are outer loop invariant. If so, 2097 // the checks will likely be hoisted out and so the effective cost will 2098 // reduce according to the outer loop trip count. 2099 if (OuterLoop) { 2100 ScalarEvolution *SE = MemCheckExp.getSE(); 2101 // TODO: If profitable, we could refine this further by analysing every 2102 // individual memory check, since there could be a mixture of loop 2103 // variant and invariant checks that mean the final condition is 2104 // variant. 2105 const SCEV *Cond = SE->getSCEV(MemRuntimeCheckCond); 2106 if (SE->isLoopInvariant(Cond, OuterLoop)) { 2107 // It seems reasonable to assume that we can reduce the effective 2108 // cost of the checks even when we know nothing about the trip 2109 // count. Assume that the outer loop executes at least twice. 
2110 unsigned BestTripCount = 2; 2111 2112 // If exact trip count is known use that. 2113 if (unsigned SmallTC = SE->getSmallConstantTripCount(OuterLoop)) 2114 BestTripCount = SmallTC; 2115 else if (LoopVectorizeWithBlockFrequency) { 2116 // Else use profile data if available. 2117 if (auto EstimatedTC = getLoopEstimatedTripCount(OuterLoop)) 2118 BestTripCount = *EstimatedTC; 2119 } 2120 2121 InstructionCost NewMemCheckCost = MemCheckCost / BestTripCount; 2122 2123 // Let's ensure the cost is always at least 1. 2124 NewMemCheckCost = std::max(*NewMemCheckCost.getValue(), 2125 (InstructionCost::CostType)1); 2126 2127 LLVM_DEBUG(dbgs() 2128 << "We expect runtime memory checks to be hoisted " 2129 << "out of the outer loop. Cost reduced from " 2130 << MemCheckCost << " to " << NewMemCheckCost << '\n'); 2131 2132 MemCheckCost = NewMemCheckCost; 2133 } 2134 } 2135 2136 RTCheckCost += MemCheckCost; 2137 } 2138 2139 if (SCEVCheckBlock || MemCheckBlock) 2140 LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost 2141 << "\n"); 2142 2143 return RTCheckCost; 2144 } 2145 2146 /// Remove the created SCEV & memory runtime check blocks & instructions, if 2147 /// unused. 2148 ~GeneratedRTChecks() { 2149 SCEVExpanderCleaner SCEVCleaner(SCEVExp); 2150 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp); 2151 if (!SCEVCheckCond) 2152 SCEVCleaner.markResultUsed(); 2153 2154 if (!MemRuntimeCheckCond) 2155 MemCheckCleaner.markResultUsed(); 2156 2157 if (MemRuntimeCheckCond) { 2158 auto &SE = *MemCheckExp.getSE(); 2159 // Memory runtime check generation creates compares that use expanded 2160 // values. Remove them before running the SCEVExpanderCleaners. 
2161 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) { 2162 if (MemCheckExp.isInsertedInstruction(&I)) 2163 continue; 2164 SE.forgetValue(&I); 2165 I.eraseFromParent(); 2166 } 2167 } 2168 MemCheckCleaner.cleanup(); 2169 SCEVCleaner.cleanup(); 2170 2171 if (SCEVCheckCond) 2172 SCEVCheckBlock->eraseFromParent(); 2173 if (MemRuntimeCheckCond) 2174 MemCheckBlock->eraseFromParent(); 2175 } 2176 2177 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and 2178 /// adjusts the branches to branch to the vector preheader or \p Bypass, 2179 /// depending on the generated condition. 2180 BasicBlock *emitSCEVChecks(BasicBlock *Bypass, 2181 BasicBlock *LoopVectorPreHeader, 2182 BasicBlock *LoopExitBlock) { 2183 if (!SCEVCheckCond) 2184 return nullptr; 2185 2186 Value *Cond = SCEVCheckCond; 2187 // Mark the check as used, to prevent it from being removed during cleanup. 2188 SCEVCheckCond = nullptr; 2189 if (auto *C = dyn_cast<ConstantInt>(Cond)) 2190 if (C->isZero()) 2191 return nullptr; 2192 2193 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2194 2195 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock); 2196 // Create new preheader for vector loop. 
2197 if (OuterLoop) 2198 OuterLoop->addBasicBlockToLoop(SCEVCheckBlock, *LI); 2199 2200 SCEVCheckBlock->getTerminator()->eraseFromParent(); 2201 SCEVCheckBlock->moveBefore(LoopVectorPreHeader); 2202 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2203 SCEVCheckBlock); 2204 2205 DT->addNewBlock(SCEVCheckBlock, Pred); 2206 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock); 2207 2208 BranchInst &BI = *BranchInst::Create(Bypass, LoopVectorPreHeader, Cond); 2209 if (AddBranchWeights) 2210 setBranchWeights(BI, SCEVCheckBypassWeights); 2211 ReplaceInstWithInst(SCEVCheckBlock->getTerminator(), &BI); 2212 return SCEVCheckBlock; 2213 } 2214 2215 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts 2216 /// the branches to branch to the vector preheader or \p Bypass, depending on 2217 /// the generated condition. 2218 BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass, 2219 BasicBlock *LoopVectorPreHeader) { 2220 // Check if we generated code that checks in runtime if arrays overlap. 2221 if (!MemRuntimeCheckCond) 2222 return nullptr; 2223 2224 auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); 2225 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, 2226 MemCheckBlock); 2227 2228 DT->addNewBlock(MemCheckBlock, Pred); 2229 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock); 2230 MemCheckBlock->moveBefore(LoopVectorPreHeader); 2231 2232 if (OuterLoop) 2233 OuterLoop->addBasicBlockToLoop(MemCheckBlock, *LI); 2234 2235 BranchInst &BI = 2236 *BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond); 2237 if (AddBranchWeights) { 2238 setBranchWeights(BI, MemCheckBypassWeights); 2239 } 2240 ReplaceInstWithInst(MemCheckBlock->getTerminator(), &BI); 2241 MemCheckBlock->getTerminator()->setDebugLoc( 2242 Pred->getTerminator()->getDebugLoc()); 2243 2244 // Mark the check as used, to prevent it from being removed during cleanup. 
2245 MemRuntimeCheckCond = nullptr; 2246 return MemCheckBlock; 2247 } 2248}; 2249} // namespace 2250 2251static bool useActiveLaneMask(TailFoldingStyle Style) { 2252 return Style == TailFoldingStyle::Data || 2253 Style == TailFoldingStyle::DataAndControlFlow || 2254 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck; 2255} 2256 2257static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style) { 2258 return Style == TailFoldingStyle::DataAndControlFlow || 2259 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck; 2260} 2261 2262// Return true if \p OuterLp is an outer loop annotated with hints for explicit 2263// vectorization. The loop needs to be annotated with #pragma omp simd 2264// simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 2265// vector length information is not provided, vectorization is not considered 2266// explicit. Interleave hints are not allowed either. These limitations will be 2267// relaxed in the future. 2268// Please, note that we are currently forced to abuse the pragma 'clang 2269// vectorize' semantics. This pragma provides *auto-vectorization hints* 2270// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 2271// provides *explicit vectorization hints* (LV can bypass legal checks and 2272// assume that vectorization is legal). However, both hints are implemented 2273// using the same metadata (llvm.loop.vectorize, processed by 2274// LoopVectorizeHints). This will be fixed in the future when the native IR 2275// representation for pragma 'omp simd' is introduced. 2276static bool isExplicitVecOuterLoop(Loop *OuterLp, 2277 OptimizationRemarkEmitter *ORE) { 2278 assert(!OuterLp->isInnermost() && "This is not an outer loop"); 2279 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 2280 2281 // Only outer loops with an explicit vectorization hint are supported. 2282 // Unannotated outer loops are ignored. 
2283 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 2284 return false; 2285 2286 Function *Fn = OuterLp->getHeader()->getParent(); 2287 if (!Hints.allowVectorization(Fn, OuterLp, 2288 true /*VectorizeOnlyWhenForced*/)) { 2289 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 2290 return false; 2291 } 2292 2293 if (Hints.getInterleave() > 1) { 2294 // TODO: Interleave support is future work. 2295 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 2296 "outer loops.\n"); 2297 Hints.emitRemarkWithHints(); 2298 return false; 2299 } 2300 2301 return true; 2302} 2303 2304static void collectSupportedLoops(Loop &L, LoopInfo *LI, 2305 OptimizationRemarkEmitter *ORE, 2306 SmallVectorImpl<Loop *> &V) { 2307 // Collect inner loops and outer loops without irreducible control flow. For 2308 // now, only collect outer loops that have explicit vectorization hints. If we 2309 // are stress testing the VPlan H-CFG construction, we collect the outermost 2310 // loop of every loop nest. 2311 if (L.isInnermost() || VPlanBuildStressTest || 2312 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 2313 LoopBlocksRPO RPOT(&L); 2314 RPOT.perform(LI); 2315 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 2316 V.push_back(&L); 2317 // TODO: Collect inner loops inside marked outer loops in case 2318 // vectorization fails for the outer loop. Do not invoke 2319 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 2320 // already known to be reducible. We can use an inherited attribute for 2321 // that. 2322 return; 2323 } 2324 } 2325 for (Loop *InnerL : L) 2326 collectSupportedLoops(*InnerL, LI, ORE, V); 2327} 2328 2329//===----------------------------------------------------------------------===// 2330// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 2331// LoopVectorizationCostModel and LoopVectorizationPlanner. 
2332//===----------------------------------------------------------------------===// 2333 2334/// Compute the transformed value of Index at offset StartValue using step 2335/// StepValue. 2336/// For integer induction, returns StartValue + Index * StepValue. 2337/// For pointer induction, returns StartValue[Index * StepValue]. 2338/// FIXME: The newly created binary instructions should contain nsw/nuw 2339/// flags, which can be found from the original scalar operations. 2340static Value * 2341emitTransformedIndex(IRBuilderBase &B, Value *Index, Value *StartValue, 2342 Value *Step, 2343 InductionDescriptor::InductionKind InductionKind, 2344 const BinaryOperator *InductionBinOp) { 2345 Type *StepTy = Step->getType(); 2346 Value *CastedIndex = StepTy->isIntegerTy() 2347 ? B.CreateSExtOrTrunc(Index, StepTy) 2348 : B.CreateCast(Instruction::SIToFP, Index, StepTy); 2349 if (CastedIndex != Index) { 2350 CastedIndex->setName(CastedIndex->getName() + ".cast"); 2351 Index = CastedIndex; 2352 } 2353 2354 // Note: the IR at this point is broken. We cannot use SE to create any new 2355 // SCEV and then expand it, hoping that SCEV's simplification will give us 2356 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 2357 // lead to various SCEV crashes. So all we can do is to use builder and rely 2358 // on InstCombine for future simplifications. Here we handle some trivial 2359 // cases only. 2360 auto CreateAdd = [&B](Value *X, Value *Y) { 2361 assert(X->getType() == Y->getType() && "Types don't match!"); 2362 if (auto *CX = dyn_cast<ConstantInt>(X)) 2363 if (CX->isZero()) 2364 return Y; 2365 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2366 if (CY->isZero()) 2367 return X; 2368 return B.CreateAdd(X, Y); 2369 }; 2370 2371 // We allow X to be a vector type, in which case Y will potentially be 2372 // splatted into a vector with the same element count. 
2373 auto CreateMul = [&B](Value *X, Value *Y) { 2374 assert(X->getType()->getScalarType() == Y->getType() && 2375 "Types don't match!"); 2376 if (auto *CX = dyn_cast<ConstantInt>(X)) 2377 if (CX->isOne()) 2378 return Y; 2379 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2380 if (CY->isOne()) 2381 return X; 2382 VectorType *XVTy = dyn_cast<VectorType>(X->getType()); 2383 if (XVTy && !isa<VectorType>(Y->getType())) 2384 Y = B.CreateVectorSplat(XVTy->getElementCount(), Y); 2385 return B.CreateMul(X, Y); 2386 }; 2387 2388 switch (InductionKind) { 2389 case InductionDescriptor::IK_IntInduction: { 2390 assert(!isa<VectorType>(Index->getType()) && 2391 "Vector indices not supported for integer inductions yet"); 2392 assert(Index->getType() == StartValue->getType() && 2393 "Index type does not match StartValue type"); 2394 if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne()) 2395 return B.CreateSub(StartValue, Index); 2396 auto *Offset = CreateMul(Index, Step); 2397 return CreateAdd(StartValue, Offset); 2398 } 2399 case InductionDescriptor::IK_PtrInduction: 2400 return B.CreatePtrAdd(StartValue, CreateMul(Index, Step)); 2401 case InductionDescriptor::IK_FpInduction: { 2402 assert(!isa<VectorType>(Index->getType()) && 2403 "Vector indices not supported for FP inductions yet"); 2404 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 2405 assert(InductionBinOp && 2406 (InductionBinOp->getOpcode() == Instruction::FAdd || 2407 InductionBinOp->getOpcode() == Instruction::FSub) && 2408 "Original bin op should be defined for FP induction"); 2409 2410 Value *MulExp = B.CreateFMul(Step, Index); 2411 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 2412 "induction"); 2413 } 2414 case InductionDescriptor::IK_NoInduction: 2415 return nullptr; 2416 } 2417 llvm_unreachable("invalid enum"); 2418} 2419 2420std::optional<unsigned> getMaxVScale(const Function &F, 2421 const TargetTransformInfo &TTI) { 2422 if (std::optional<unsigned> 
MaxVScale = TTI.getMaxVScale()) 2423 return MaxVScale; 2424 2425 if (F.hasFnAttribute(Attribute::VScaleRange)) 2426 return F.getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax(); 2427 2428 return std::nullopt; 2429} 2430 2431/// For the given VF and UF and maximum trip count computed for the loop, return 2432/// whether the induction variable might overflow in the vectorized loop. If not, 2433/// then we know a runtime overflow check always evaluates to false and can be 2434/// removed. 2435static bool isIndvarOverflowCheckKnownFalse( 2436 const LoopVectorizationCostModel *Cost, 2437 ElementCount VF, std::optional<unsigned> UF = std::nullopt) { 2438 // Always be conservative if we don't know the exact unroll factor. 2439 unsigned MaxUF = UF ? *UF : Cost->TTI.getMaxInterleaveFactor(VF); 2440 2441 Type *IdxTy = Cost->Legal->getWidestInductionType(); 2442 APInt MaxUIntTripCount = cast<IntegerType>(IdxTy)->getMask(); 2443 2444 // We know the runtime overflow check is known false iff the (max) trip-count 2445 // is known and (max) trip-count + (VF * UF) does not overflow in the type of 2446 // the vector loop induction variable. 2447 if (unsigned TC = 2448 Cost->PSE.getSE()->getSmallConstantMaxTripCount(Cost->TheLoop)) { 2449 uint64_t MaxVF = VF.getKnownMinValue(); 2450 if (VF.isScalable()) { 2451 std::optional<unsigned> MaxVScale = 2452 getMaxVScale(*Cost->TheFunction, Cost->TTI); 2453 if (!MaxVScale) 2454 return false; 2455 MaxVF *= *MaxVScale; 2456 } 2457 2458 return (MaxUIntTripCount - TC).ugt(MaxVF * MaxUF); 2459 } 2460 2461 return false; 2462} 2463 2464// Return whether we allow using masked interleave-groups (for dealing with 2465// strided loads/stores that reside in predicated blocks, or for dealing 2466// with gaps). 2467static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2468 // If an override option has been passed in for interleaved accesses, use it. 
2469 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2470 return EnableMaskedInterleavedMemAccesses; 2471 2472 return TTI.enableMaskedInterleavedAccessVectorization(); 2473} 2474 2475// Try to vectorize the interleave group that \p Instr belongs to. 2476// 2477// E.g. Translate following interleaved load group (factor = 3): 2478// for (i = 0; i < N; i+=3) { 2479// R = Pic[i]; // Member of index 0 2480// G = Pic[i+1]; // Member of index 1 2481// B = Pic[i+2]; // Member of index 2 2482// ... // do something to R, G, B 2483// } 2484// To: 2485// %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2486// %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements 2487// %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements 2488// %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements 2489// 2490// Or translate following interleaved store group (factor = 3): 2491// for (i = 0; i < N; i+=3) { 2492// ... do something to R, G, B 2493// Pic[i] = R; // Member of index 0 2494// Pic[i+1] = G; // Member of index 1 2495// Pic[i+2] = B; // Member of index 2 2496// } 2497// To: 2498// %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2499// %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u> 2500// %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2501// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2502// store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2503void InnerLoopVectorizer::vectorizeInterleaveGroup( 2504 const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs, 2505 VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues, 2506 VPValue *BlockInMask, bool NeedsMaskForGaps) { 2507 Instruction *Instr = Group->getInsertPos(); 2508 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2509 2510 // Prepare for the vector type of the interleaved load/store. 
2511 Type *ScalarTy = getLoadStoreType(Instr); 2512 unsigned InterleaveFactor = Group->getFactor(); 2513 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor); 2514 2515 // Prepare for the new pointers. 2516 SmallVector<Value *, 2> AddrParts; 2517 unsigned Index = Group->getIndex(Instr); 2518 2519 // TODO: extend the masked interleaved-group support to reversed access. 2520 assert((!BlockInMask || !Group->isReverse()) && 2521 "Reversed masked interleave-group not supported."); 2522 2523 Value *Idx; 2524 // If the group is reverse, adjust the index to refer to the last vector lane 2525 // instead of the first. We adjust the index from the first vector lane, 2526 // rather than directly getting the pointer for lane VF - 1, because the 2527 // pointer operand of the interleaved access is supposed to be uniform. For 2528 // uniform instructions, we're only required to generate a value for the 2529 // first vector lane in each unroll iteration. 2530 if (Group->isReverse()) { 2531 Value *RuntimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), VF); 2532 Idx = Builder.CreateSub(RuntimeVF, Builder.getInt32(1)); 2533 Idx = Builder.CreateMul(Idx, Builder.getInt32(Group->getFactor())); 2534 Idx = Builder.CreateAdd(Idx, Builder.getInt32(Index)); 2535 Idx = Builder.CreateNeg(Idx); 2536 } else 2537 Idx = Builder.getInt32(-Index); 2538 2539 for (unsigned Part = 0; Part < UF; Part++) { 2540 Value *AddrPart = State.get(Addr, VPIteration(Part, 0)); 2541 if (auto *I = dyn_cast<Instruction>(AddrPart)) 2542 State.setDebugLocFrom(I->getDebugLoc()); 2543 2544 // Notice current instruction could be any index. Need to adjust the address 2545 // to the member of index 0. 2546 // 2547 // E.g. a = A[i+1]; // Member of index 1 (Current instruction) 2548 // b = A[i]; // Member of index 0 2549 // Current pointer is pointed to A[i+1], adjust it to A[i]. 2550 // 2551 // E.g. 
A[i+1] = a; // Member of index 1 2552 // A[i] = b; // Member of index 0 2553 // A[i+2] = c; // Member of index 2 (Current instruction) 2554 // Current pointer is pointed to A[i+2], adjust it to A[i]. 2555 2556 bool InBounds = false; 2557 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) 2558 InBounds = gep->isInBounds(); 2559 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Idx, "", InBounds); 2560 AddrParts.push_back(AddrPart); 2561 } 2562 2563 State.setDebugLocFrom(Instr->getDebugLoc()); 2564 Value *PoisonVec = PoisonValue::get(VecTy); 2565 2566 auto CreateGroupMask = [this, &BlockInMask, &State, &InterleaveFactor]( 2567 unsigned Part, Value *MaskForGaps) -> Value * { 2568 if (VF.isScalable()) { 2569 assert(!MaskForGaps && "Interleaved groups with gaps are not supported."); 2570 assert(InterleaveFactor == 2 && 2571 "Unsupported deinterleave factor for scalable vectors"); 2572 auto *BlockInMaskPart = State.get(BlockInMask, Part); 2573 SmallVector<Value *, 2> Ops = {BlockInMaskPart, BlockInMaskPart}; 2574 auto *MaskTy = 2575 VectorType::get(Builder.getInt1Ty(), VF.getKnownMinValue() * 2, true); 2576 return Builder.CreateIntrinsic( 2577 MaskTy, Intrinsic::experimental_vector_interleave2, Ops, 2578 /*FMFSource=*/nullptr, "interleaved.mask"); 2579 } 2580 2581 if (!BlockInMask) 2582 return MaskForGaps; 2583 2584 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2585 Value *ShuffledMask = Builder.CreateShuffleVector( 2586 BlockInMaskPart, 2587 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), 2588 "interleaved.mask"); 2589 return MaskForGaps ? Builder.CreateBinOp(Instruction::And, ShuffledMask, 2590 MaskForGaps) 2591 : ShuffledMask; 2592 }; 2593 2594 // Vectorize the interleaved load group. 
2595 if (isa<LoadInst>(Instr)) { 2596 Value *MaskForGaps = nullptr; 2597 if (NeedsMaskForGaps) { 2598 MaskForGaps = 2599 createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2600 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2601 } 2602 2603 // For each unroll part, create a wide load for the group. 2604 SmallVector<Value *, 2> NewLoads; 2605 for (unsigned Part = 0; Part < UF; Part++) { 2606 Instruction *NewLoad; 2607 if (BlockInMask || MaskForGaps) { 2608 assert(useMaskedInterleavedAccesses(*TTI) && 2609 "masked interleaved groups are not allowed."); 2610 Value *GroupMask = CreateGroupMask(Part, MaskForGaps); 2611 NewLoad = 2612 Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(), 2613 GroupMask, PoisonVec, "wide.masked.vec"); 2614 } 2615 else 2616 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], 2617 Group->getAlign(), "wide.vec"); 2618 Group->addMetadata(NewLoad); 2619 NewLoads.push_back(NewLoad); 2620 } 2621 2622 if (VecTy->isScalableTy()) { 2623 assert(InterleaveFactor == 2 && 2624 "Unsupported deinterleave factor for scalable vectors"); 2625 2626 for (unsigned Part = 0; Part < UF; ++Part) { 2627 // Scalable vectors cannot use arbitrary shufflevectors (only splats), 2628 // so must use intrinsics to deinterleave. 2629 Value *DI = Builder.CreateIntrinsic( 2630 Intrinsic::experimental_vector_deinterleave2, VecTy, NewLoads[Part], 2631 /*FMFSource=*/nullptr, "strided.vec"); 2632 unsigned J = 0; 2633 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2634 Instruction *Member = Group->getMember(I); 2635 2636 if (!Member) 2637 continue; 2638 2639 Value *StridedVec = Builder.CreateExtractValue(DI, I); 2640 // If this member has different type, cast the result type. 
2641 if (Member->getType() != ScalarTy) { 2642 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2643 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2644 } 2645 2646 if (Group->isReverse()) 2647 StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse"); 2648 2649 State.set(VPDefs[J], StridedVec, Part); 2650 ++J; 2651 } 2652 } 2653 2654 return; 2655 } 2656 2657 // For each member in the group, shuffle out the appropriate data from the 2658 // wide loads. 2659 unsigned J = 0; 2660 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2661 Instruction *Member = Group->getMember(I); 2662 2663 // Skip the gaps in the group. 2664 if (!Member) 2665 continue; 2666 2667 auto StrideMask = 2668 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue()); 2669 for (unsigned Part = 0; Part < UF; Part++) { 2670 Value *StridedVec = Builder.CreateShuffleVector( 2671 NewLoads[Part], StrideMask, "strided.vec"); 2672 2673 // If this member has different type, cast the result type. 2674 if (Member->getType() != ScalarTy) { 2675 assert(!VF.isScalable() && "VF is assumed to be non scalable."); 2676 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2677 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2678 } 2679 2680 if (Group->isReverse()) 2681 StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse"); 2682 2683 State.set(VPDefs[J], StridedVec, Part); 2684 } 2685 ++J; 2686 } 2687 return; 2688 } 2689 2690 // The sub vector type for current instruction. 2691 auto *SubVT = VectorType::get(ScalarTy, VF); 2692 2693 // Vectorize the interleaved store group. 
2694 Value *MaskForGaps = 2695 createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); 2696 assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) && 2697 "masked interleaved groups are not allowed."); 2698 assert((!MaskForGaps || !VF.isScalable()) && 2699 "masking gaps for scalable vectors is not yet supported."); 2700 for (unsigned Part = 0; Part < UF; Part++) { 2701 // Collect the stored vector from each member. 2702 SmallVector<Value *, 4> StoredVecs; 2703 unsigned StoredIdx = 0; 2704 for (unsigned i = 0; i < InterleaveFactor; i++) { 2705 assert((Group->getMember(i) || MaskForGaps) && 2706 "Fail to get a member from an interleaved store group"); 2707 Instruction *Member = Group->getMember(i); 2708 2709 // Skip the gaps in the group. 2710 if (!Member) { 2711 Value *Undef = PoisonValue::get(SubVT); 2712 StoredVecs.push_back(Undef); 2713 continue; 2714 } 2715 2716 Value *StoredVec = State.get(StoredValues[StoredIdx], Part); 2717 ++StoredIdx; 2718 2719 if (Group->isReverse()) 2720 StoredVec = Builder.CreateVectorReverse(StoredVec, "reverse"); 2721 2722 // If this member has different type, cast it to a unified type. 2723 2724 if (StoredVec->getType() != SubVT) 2725 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2726 2727 StoredVecs.push_back(StoredVec); 2728 } 2729 2730 // Interleave all the smaller vectors into one wider vector. 
2731 Value *IVec = interleaveVectors(Builder, StoredVecs, "interleaved.vec"); 2732 Instruction *NewStoreInstr; 2733 if (BlockInMask || MaskForGaps) { 2734 Value *GroupMask = CreateGroupMask(Part, MaskForGaps); 2735 NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part], 2736 Group->getAlign(), GroupMask); 2737 } else 2738 NewStoreInstr = 2739 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); 2740 2741 Group->addMetadata(NewStoreInstr); 2742 } 2743} 2744 2745void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr, 2746 VPReplicateRecipe *RepRecipe, 2747 const VPIteration &Instance, 2748 VPTransformState &State) { 2749 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 2750 2751 // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for 2752 // the first lane and part. 2753 if (isa<NoAliasScopeDeclInst>(Instr)) 2754 if (!Instance.isFirstIteration()) 2755 return; 2756 2757 // Does this instruction return a value ? 2758 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 2759 2760 Instruction *Cloned = Instr->clone(); 2761 if (!IsVoidRetTy) { 2762 Cloned->setName(Instr->getName() + ".cloned"); 2763#if !defined(NDEBUG) 2764 // Verify that VPlan type inference results agree with the type of the 2765 // generated values. 2766 assert(State.TypeAnalysis.inferScalarType(RepRecipe) == Cloned->getType() && 2767 "inferred type and type from generated instructions do not match"); 2768#endif 2769 } 2770 2771 RepRecipe->setFlags(Cloned); 2772 2773 if (auto DL = Instr->getDebugLoc()) 2774 State.setDebugLocFrom(DL); 2775 2776 // Replace the operands of the cloned instructions with their scalar 2777 // equivalents in the new loop. 
2778 for (const auto &I : enumerate(RepRecipe->operands())) { 2779 auto InputInstance = Instance; 2780 VPValue *Operand = I.value(); 2781 if (vputils::isUniformAfterVectorization(Operand)) 2782 InputInstance.Lane = VPLane::getFirstLane(); 2783 Cloned->setOperand(I.index(), State.get(Operand, InputInstance)); 2784 } 2785 State.addNewMetadata(Cloned, Instr); 2786 2787 // Place the cloned scalar in the new loop. 2788 State.Builder.Insert(Cloned); 2789 2790 State.set(RepRecipe, Cloned, Instance); 2791 2792 // If we just cloned a new assumption, add it the assumption cache. 2793 if (auto *II = dyn_cast<AssumeInst>(Cloned)) 2794 AC->registerAssumption(II); 2795 2796 // End if-block. 2797 bool IfPredicateInstr = RepRecipe->getParent()->getParent()->isReplicator(); 2798 if (IfPredicateInstr) 2799 PredicatedInstructions.push_back(Cloned); 2800} 2801 2802Value * 2803InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) { 2804 if (VectorTripCount) 2805 return VectorTripCount; 2806 2807 Value *TC = getTripCount(); 2808 IRBuilder<> Builder(InsertBlock->getTerminator()); 2809 2810 Type *Ty = TC->getType(); 2811 // This is where we can make the step a runtime constant. 2812 Value *Step = createStepForVF(Builder, Ty, VF, UF); 2813 2814 // If the tail is to be folded by masking, round the number of iterations N 2815 // up to a multiple of Step instead of rounding down. This is done by first 2816 // adding Step-1 and then rounding down. Note that it's ok if this addition 2817 // overflows: the vector induction variable will eventually wrap to zero given 2818 // that it starts at zero and its Step is a power of two; the loop will then 2819 // exit, with the last early-exit vector comparison also producing all-true. 2820 // For scalable vectors the VF is not guaranteed to be a power of 2, but this 2821 // is accounted for in emitIterationCountCheck that adds an overflow check. 
2822 if (Cost->foldTailByMasking()) { 2823 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && 2824 "VF*UF must be a power of 2 when folding tail by masking"); 2825 Value *NumLanes = getRuntimeVF(Builder, Ty, VF * UF); 2826 TC = Builder.CreateAdd( 2827 TC, Builder.CreateSub(NumLanes, ConstantInt::get(Ty, 1)), "n.rnd.up"); 2828 } 2829 2830 // Now we need to generate the expression for the part of the loop that the 2831 // vectorized body will execute. This is equal to N - (N % Step) if scalar 2832 // iterations are not required for correctness, or N - Step, otherwise. Step 2833 // is equal to the vectorization factor (number of SIMD elements) times the 2834 // unroll factor (number of SIMD instructions). 2835 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 2836 2837 // There are cases where we *must* run at least one iteration in the remainder 2838 // loop. See the cost model for when this can happen. If the step evenly 2839 // divides the trip count, we set the remainder to be equal to the step. If 2840 // the step does not evenly divide the trip count, no adjustment is necessary 2841 // since there will already be scalar iterations. Note that the minimum 2842 // iterations check ensures that N >= Step. 2843 if (Cost->requiresScalarEpilogue(VF.isVector())) { 2844 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 2845 R = Builder.CreateSelect(IsZero, Step, R); 2846 } 2847 2848 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 2849 2850 return VectorTripCount; 2851} 2852 2853Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 2854 const DataLayout &DL) { 2855 // Verify that V is a vector type with same number of elements as DstVTy. 
2856 auto *DstFVTy = cast<VectorType>(DstVTy); 2857 auto VF = DstFVTy->getElementCount(); 2858 auto *SrcVecTy = cast<VectorType>(V->getType()); 2859 assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match"); 2860 Type *SrcElemTy = SrcVecTy->getElementType(); 2861 Type *DstElemTy = DstFVTy->getElementType(); 2862 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 2863 "Vector elements must have same size"); 2864 2865 // Do a direct cast if element types are castable. 2866 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 2867 return Builder.CreateBitOrPointerCast(V, DstFVTy); 2868 } 2869 // V cannot be directly casted to desired vector type. 2870 // May happen when V is a floating point vector but DstVTy is a vector of 2871 // pointers or vice-versa. Handle this using a two-step bitcast using an 2872 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 2873 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 2874 "Only one type should be a pointer type"); 2875 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 2876 "Only one type should be a floating point type"); 2877 Type *IntTy = 2878 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 2879 auto *VecIntTy = VectorType::get(IntTy, VF); 2880 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 2881 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); 2882} 2883 2884void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) { 2885 Value *Count = getTripCount(); 2886 // Reuse existing vector loop preheader for TC checks. 2887 // Note that new preheader block is generated for vector loop. 
2888 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 2889 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 2890 2891 // Generate code to check if the loop's trip count is less than VF * UF, or 2892 // equal to it in case a scalar epilogue is required; this implies that the 2893 // vector trip count is zero. This check also covers the case where adding one 2894 // to the backedge-taken count overflowed leading to an incorrect trip count 2895 // of zero. In this case we will also jump to the scalar loop. 2896 auto P = Cost->requiresScalarEpilogue(VF.isVector()) ? ICmpInst::ICMP_ULE 2897 : ICmpInst::ICMP_ULT; 2898 2899 // If tail is to be folded, vector loop takes care of all iterations. 2900 Type *CountTy = Count->getType(); 2901 Value *CheckMinIters = Builder.getFalse(); 2902 auto CreateStep = [&]() -> Value * { 2903 // Create step with max(MinProTripCount, UF * VF). 2904 if (UF * VF.getKnownMinValue() >= MinProfitableTripCount.getKnownMinValue()) 2905 return createStepForVF(Builder, CountTy, VF, UF); 2906 2907 Value *MinProfTC = 2908 createStepForVF(Builder, CountTy, MinProfitableTripCount, 1); 2909 if (!VF.isScalable()) 2910 return MinProfTC; 2911 return Builder.CreateBinaryIntrinsic( 2912 Intrinsic::umax, MinProfTC, createStepForVF(Builder, CountTy, VF, UF)); 2913 }; 2914 2915 TailFoldingStyle Style = Cost->getTailFoldingStyle(); 2916 if (Style == TailFoldingStyle::None) 2917 CheckMinIters = 2918 Builder.CreateICmp(P, Count, CreateStep(), "min.iters.check"); 2919 else if (VF.isScalable() && 2920 !isIndvarOverflowCheckKnownFalse(Cost, VF, UF) && 2921 Style != TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) { 2922 // vscale is not necessarily a power-of-2, which means we cannot guarantee 2923 // an overflow to zero when updating induction variables and so an 2924 // additional overflow check is required before entering the vector loop. 2925 2926 // Get the maximum unsigned value for the type. 
2927 Value *MaxUIntTripCount = 2928 ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask()); 2929 Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count); 2930 2931 // Don't execute the vector loop if (UMax - n) < (VF * UF). 2932 CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, CreateStep()); 2933 } 2934 2935 // Create new preheader for vector loop. 2936 LoopVectorPreHeader = 2937 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 2938 "vector.ph"); 2939 2940 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 2941 DT->getNode(Bypass)->getIDom()) && 2942 "TC check is expected to dominate Bypass"); 2943 2944 // Update dominator for Bypass & LoopExit (if needed). 2945 DT->changeImmediateDominator(Bypass, TCCheckBlock); 2946 if (!Cost->requiresScalarEpilogue(VF.isVector())) 2947 // If there is an epilogue which must run, there's no edge from the 2948 // middle block to exit blocks and thus no need to update the immediate 2949 // dominator of the exit blocks. 2950 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 2951 2952 BranchInst &BI = 2953 *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters); 2954 if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) 2955 setBranchWeights(BI, MinItersBypassWeights); 2956 ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI); 2957 LoopBypassBlocks.push_back(TCCheckBlock); 2958} 2959 2960BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) { 2961 BasicBlock *const SCEVCheckBlock = 2962 RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader, LoopExitBlock); 2963 if (!SCEVCheckBlock) 2964 return nullptr; 2965 2966 assert(!(SCEVCheckBlock->getParent()->hasOptSize() || 2967 (OptForSizeBasedOnProfile && 2968 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && 2969 "Cannot SCEV check stride or overflow when optimizing for size"); 2970 2971 2972 // Update dominator only if this is first RT check. 
2973 if (LoopBypassBlocks.empty()) { 2974 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 2975 if (!Cost->requiresScalarEpilogue(VF.isVector())) 2976 // If there is an epilogue which must run, there's no edge from the 2977 // middle block to exit blocks and thus no need to update the immediate 2978 // dominator of the exit blocks. 2979 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); 2980 } 2981 2982 LoopBypassBlocks.push_back(SCEVCheckBlock); 2983 AddedSafetyChecks = true; 2984 return SCEVCheckBlock; 2985} 2986 2987BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) { 2988 // VPlan-native path does not do any analysis for runtime checks currently. 2989 if (EnableVPlanNativePath) 2990 return nullptr; 2991 2992 BasicBlock *const MemCheckBlock = 2993 RTChecks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader); 2994 2995 // Check if we generated code that checks in runtime if arrays overlap. We put 2996 // the checks into a separate block to make the more common case of few 2997 // elements faster. 
2998 if (!MemCheckBlock) 2999 return nullptr; 3000 3001 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { 3002 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 3003 "Cannot emit memory checks when optimizing for size, unless forced " 3004 "to vectorize."); 3005 ORE->emit([&]() { 3006 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 3007 OrigLoop->getStartLoc(), 3008 OrigLoop->getHeader()) 3009 << "Code-size may be reduced by not forcing " 3010 "vectorization, or by source-code modifications " 3011 "eliminating the need for runtime checks " 3012 "(e.g., adding 'restrict')."; 3013 }); 3014 } 3015 3016 LoopBypassBlocks.push_back(MemCheckBlock); 3017 3018 AddedSafetyChecks = true; 3019 3020 return MemCheckBlock; 3021} 3022 3023void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { 3024 LoopScalarBody = OrigLoop->getHeader(); 3025 LoopVectorPreHeader = OrigLoop->getLoopPreheader(); 3026 assert(LoopVectorPreHeader && "Invalid loop structure"); 3027 LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr 3028 assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF.isVector())) && 3029 "multiple exit loop without required epilogue?"); 3030 3031 LoopMiddleBlock = 3032 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 3033 LI, nullptr, Twine(Prefix) + "middle.block"); 3034 LoopScalarPreHeader = 3035 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI, 3036 nullptr, Twine(Prefix) + "scalar.ph"); 3037 3038 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3039 3040 // Set up the middle block terminator. Two cases: 3041 // 1) If we know that we must execute the scalar epilogue, emit an 3042 // unconditional branch. 3043 // 2) Otherwise, we must have a single unique exit block (due to how we 3044 // implement the multiple exit case). 
In this case, set up a conditional 3045 // branch from the middle block to the loop scalar preheader, and the 3046 // exit block. completeLoopSkeleton will update the condition to use an 3047 // iteration check, if required to decide whether to execute the remainder. 3048 BranchInst *BrInst = 3049 Cost->requiresScalarEpilogue(VF.isVector()) 3050 ? BranchInst::Create(LoopScalarPreHeader) 3051 : BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, 3052 Builder.getTrue()); 3053 BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc()); 3054 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); 3055 3056 // Update dominator for loop exit. During skeleton creation, only the vector 3057 // pre-header and the middle block are created. The vector loop is entirely 3058 // created during VPlan exection. 3059 if (!Cost->requiresScalarEpilogue(VF.isVector())) 3060 // If there is an epilogue which must run, there's no edge from the 3061 // middle block to exit blocks and thus no need to update the immediate 3062 // dominator of the exit blocks. 3063 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); 3064} 3065 3066PHINode *InnerLoopVectorizer::createInductionResumeValue( 3067 PHINode *OrigPhi, const InductionDescriptor &II, Value *Step, 3068 ArrayRef<BasicBlock *> BypassBlocks, 3069 std::pair<BasicBlock *, Value *> AdditionalBypass) { 3070 Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader); 3071 assert(VectorTripCount && "Expected valid arguments"); 3072 3073 Instruction *OldInduction = Legal->getPrimaryInduction(); 3074 Value *&EndValue = IVEndValues[OrigPhi]; 3075 Value *EndValueFromAdditionalBypass = AdditionalBypass.second; 3076 if (OrigPhi == OldInduction) { 3077 // We know what the end value is. 3078 EndValue = VectorTripCount; 3079 } else { 3080 IRBuilder<> B(LoopVectorPreHeader->getTerminator()); 3081 3082 // Fast-math-flags propagate from the original induction instruction. 
3083 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3084 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3085 3086 EndValue = emitTransformedIndex(B, VectorTripCount, II.getStartValue(), 3087 Step, II.getKind(), II.getInductionBinOp()); 3088 EndValue->setName("ind.end"); 3089 3090 // Compute the end value for the additional bypass (if applicable). 3091 if (AdditionalBypass.first) { 3092 B.SetInsertPoint(AdditionalBypass.first, 3093 AdditionalBypass.first->getFirstInsertionPt()); 3094 EndValueFromAdditionalBypass = 3095 emitTransformedIndex(B, AdditionalBypass.second, II.getStartValue(), 3096 Step, II.getKind(), II.getInductionBinOp()); 3097 EndValueFromAdditionalBypass->setName("ind.end"); 3098 } 3099 } 3100 3101 // Create phi nodes to merge from the backedge-taken check block. 3102 PHINode *BCResumeVal = PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", 3103 LoopScalarPreHeader->getTerminator()); 3104 // Copy original phi DL over to the new one. 3105 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 3106 3107 // The new PHI merges the original incoming value, in case of a bypass, 3108 // or the value at the end of the vectorized loop. 3109 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3110 3111 // Fix the scalar body counter (PHI node). 3112 // The old induction's phi node in the scalar body needs the truncated 3113 // value. 3114 for (BasicBlock *BB : BypassBlocks) 3115 BCResumeVal->addIncoming(II.getStartValue(), BB); 3116 3117 if (AdditionalBypass.first) 3118 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first, 3119 EndValueFromAdditionalBypass); 3120 return BCResumeVal; 3121} 3122 3123/// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV 3124/// expansion results. 
3125static Value *getExpandedStep(const InductionDescriptor &ID, 3126 const SCEV2ValueTy &ExpandedSCEVs) { 3127 const SCEV *Step = ID.getStep(); 3128 if (auto *C = dyn_cast<SCEVConstant>(Step)) 3129 return C->getValue(); 3130 if (auto *U = dyn_cast<SCEVUnknown>(Step)) 3131 return U->getValue(); 3132 auto I = ExpandedSCEVs.find(Step); 3133 assert(I != ExpandedSCEVs.end() && "SCEV must be expanded at this point"); 3134 return I->second; 3135} 3136 3137void InnerLoopVectorizer::createInductionResumeValues( 3138 const SCEV2ValueTy &ExpandedSCEVs, 3139 std::pair<BasicBlock *, Value *> AdditionalBypass) { 3140 assert(((AdditionalBypass.first && AdditionalBypass.second) || 3141 (!AdditionalBypass.first && !AdditionalBypass.second)) && 3142 "Inconsistent information about additional bypass."); 3143 // We are going to resume the execution of the scalar loop. 3144 // Go over all of the induction variables that we found and fix the 3145 // PHIs that are left in the scalar version of the loop. 3146 // The starting values of PHI nodes depend on the counter of the last 3147 // iteration in the vectorized loop. 3148 // If we come from a bypass edge then we need to start from the original 3149 // start value. 3150 for (const auto &InductionEntry : Legal->getInductionVars()) { 3151 PHINode *OrigPhi = InductionEntry.first; 3152 const InductionDescriptor &II = InductionEntry.second; 3153 PHINode *BCResumeVal = createInductionResumeValue( 3154 OrigPhi, II, getExpandedStep(II, ExpandedSCEVs), LoopBypassBlocks, 3155 AdditionalBypass); 3156 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3157 } 3158} 3159 3160BasicBlock *InnerLoopVectorizer::completeLoopSkeleton() { 3161 // The trip counts should be cached by now. 
3162 Value *Count = getTripCount(); 3163 Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader); 3164 3165 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); 3166 3167 // Add a check in the middle block to see if we have completed 3168 // all of the iterations in the first vector loop. Three cases: 3169 // 1) If we require a scalar epilogue, there is no conditional branch as 3170 // we unconditionally branch to the scalar preheader. Do nothing. 3171 // 2) If (N - N%VF) == N, then we *don't* need to run the remainder. 3172 // Thus if tail is to be folded, we know we don't need to run the 3173 // remainder and we can use the previous value for the condition (true). 3174 // 3) Otherwise, construct a runtime check. 3175 if (!Cost->requiresScalarEpilogue(VF.isVector()) && 3176 !Cost->foldTailByMasking()) { 3177 // Here we use the same DebugLoc as the scalar loop latch terminator instead 3178 // of the corresponding compare because they may have ended up with 3179 // different line numbers and we want to avoid awkward line stepping while 3180 // debugging. Eg. if the compare has got a line number inside the loop. 3181 // TODO: At the moment, CreateICmpEQ will simplify conditions with constant 3182 // operands. Perform simplification directly on VPlan once the branch is 3183 // modeled there. 3184 IRBuilder<> B(LoopMiddleBlock->getTerminator()); 3185 B.SetCurrentDebugLocation(ScalarLatchTerm->getDebugLoc()); 3186 Value *CmpN = B.CreateICmpEQ(Count, VectorTripCount, "cmp.n"); 3187 BranchInst &BI = *cast<BranchInst>(LoopMiddleBlock->getTerminator()); 3188 BI.setCondition(CmpN); 3189 if (hasBranchWeightMD(*ScalarLatchTerm)) { 3190 // Assume that `Count % VectorTripCount` is equally distributed. 
3191 unsigned TripCount = UF * VF.getKnownMinValue(); 3192 assert(TripCount > 0 && "trip count should not be zero"); 3193 const uint32_t Weights[] = {1, TripCount - 1}; 3194 setBranchWeights(BI, Weights); 3195 } 3196 } 3197 3198#ifdef EXPENSIVE_CHECKS 3199 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 3200#endif 3201 3202 return LoopVectorPreHeader; 3203} 3204 3205std::pair<BasicBlock *, Value *> 3206InnerLoopVectorizer::createVectorizedLoopSkeleton( 3207 const SCEV2ValueTy &ExpandedSCEVs) { 3208 /* 3209 In this function we generate a new loop. The new loop will contain 3210 the vectorized instructions while the old loop will continue to run the 3211 scalar remainder. 3212 3213 [ ] <-- old preheader - loop iteration number check and SCEVs in Plan's 3214 / | preheader are expanded here. Eventually all required SCEV 3215 / | expansion should happen here. 3216 / v 3217 | [ ] <-- vector loop bypass (may consist of multiple blocks). 3218 | / | 3219 | / v 3220 || [ ] <-- vector pre header. 3221 |/ | 3222 | v 3223 | [ ] \ 3224 | [ ]_| <-- vector loop (created during VPlan execution). 3225 | | 3226 | v 3227 \ -[ ] <--- middle-block. 3228 \/ | 3229 /\ v 3230 | ->[ ] <--- new preheader. 3231 | | 3232 (opt) v <-- edge from middle to exit iff epilogue is not required. 3233 | [ ] \ 3234 | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue). 3235 \ | 3236 \ v 3237 >[ ] <-- exit block(s). 3238 ... 3239 */ 3240 3241 // Create an empty vector loop, and prepare basic blocks for the runtime 3242 // checks. 3243 createVectorLoopSkeleton(""); 3244 3245 // Now, compare the new count to zero. If it is zero skip the vector loop and 3246 // jump to the scalar loop. This check also covers the case where the 3247 // backedge-taken count is uint##_max: adding one to it will overflow leading 3248 // to an incorrect trip count of zero. In this (rare) case we will also jump 3249 // to the scalar loop. 
3250 emitIterationCountCheck(LoopScalarPreHeader); 3251 3252 // Generate the code to check any assumptions that we've made for SCEV 3253 // expressions. 3254 emitSCEVChecks(LoopScalarPreHeader); 3255 3256 // Generate the code that checks in runtime if arrays overlap. We put the 3257 // checks into a separate block to make the more common case of few elements 3258 // faster. 3259 emitMemRuntimeChecks(LoopScalarPreHeader); 3260 3261 // Emit phis for the new starting index of the scalar loop. 3262 createInductionResumeValues(ExpandedSCEVs); 3263 3264 return {completeLoopSkeleton(), nullptr}; 3265} 3266 3267// Fix up external users of the induction variable. At this point, we are 3268// in LCSSA form, with all external PHIs that use the IV having one input value, 3269// coming from the remainder loop. We need those PHIs to also have a correct 3270// value for the IV when arriving directly from the middle block. 3271void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, 3272 const InductionDescriptor &II, 3273 Value *VectorTripCount, Value *EndValue, 3274 BasicBlock *MiddleBlock, 3275 BasicBlock *VectorHeader, VPlan &Plan, 3276 VPTransformState &State) { 3277 // There are two kinds of external IV usages - those that use the value 3278 // computed in the last iteration (the PHI) and those that use the penultimate 3279 // value (the value that feeds into the phi from the loop latch). 3280 // We allow both, but they, obviously, have different values. 3281 3282 assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block"); 3283 3284 DenseMap<Value *, Value *> MissingVals; 3285 3286 // An external user of the last iteration's value should see the value that 3287 // the remainder loop uses to initialize its own IV. 
3288 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()); 3289 for (User *U : PostInc->users()) { 3290 Instruction *UI = cast<Instruction>(U); 3291 if (!OrigLoop->contains(UI)) { 3292 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3293 MissingVals[UI] = EndValue; 3294 } 3295 } 3296 3297 // An external user of the penultimate value need to see EndValue - Step. 3298 // The simplest way to get this is to recompute it from the constituent SCEVs, 3299 // that is Start + (Step * (CRD - 1)). 3300 for (User *U : OrigPhi->users()) { 3301 auto *UI = cast<Instruction>(U); 3302 if (!OrigLoop->contains(UI)) { 3303 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3304 IRBuilder<> B(MiddleBlock->getTerminator()); 3305 3306 // Fast-math-flags propagate from the original induction instruction. 3307 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp())) 3308 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); 3309 3310 Value *CountMinusOne = B.CreateSub( 3311 VectorTripCount, ConstantInt::get(VectorTripCount->getType(), 1)); 3312 CountMinusOne->setName("cmo"); 3313 3314 VPValue *StepVPV = Plan.getSCEVExpansion(II.getStep()); 3315 assert(StepVPV && "step must have been expanded during VPlan execution"); 3316 Value *Step = StepVPV->isLiveIn() ? StepVPV->getLiveInIRValue() 3317 : State.get(StepVPV, {0, 0}); 3318 Value *Escape = 3319 emitTransformedIndex(B, CountMinusOne, II.getStartValue(), Step, 3320 II.getKind(), II.getInductionBinOp()); 3321 Escape->setName("ind.escape"); 3322 MissingVals[UI] = Escape; 3323 } 3324 } 3325 3326 for (auto &I : MissingVals) { 3327 PHINode *PHI = cast<PHINode>(I.first); 3328 // One corner case we have to handle is two IVs "chasing" each-other, 3329 // that is %IV2 = phi [...], [ %IV1, %latch ] 3330 // In this case, if IV1 has an external use, we need to avoid adding both 3331 // "last value of IV1" and "penultimate value of IV2". 
So, verify that we 3332 // don't already have an incoming value for the middle block. 3333 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) { 3334 PHI->addIncoming(I.second, MiddleBlock); 3335 Plan.removeLiveOut(PHI); 3336 } 3337 } 3338} 3339 3340namespace { 3341 3342struct CSEDenseMapInfo { 3343 static bool canHandle(const Instruction *I) { 3344 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3345 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3346 } 3347 3348 static inline Instruction *getEmptyKey() { 3349 return DenseMapInfo<Instruction *>::getEmptyKey(); 3350 } 3351 3352 static inline Instruction *getTombstoneKey() { 3353 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3354 } 3355 3356 static unsigned getHashValue(const Instruction *I) { 3357 assert(canHandle(I) && "Unknown instruction!"); 3358 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3359 I->value_op_end())); 3360 } 3361 3362 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 3363 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3364 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3365 return LHS == RHS; 3366 return LHS->isIdenticalTo(RHS); 3367 } 3368}; 3369 3370} // end anonymous namespace 3371 3372///Perform cse of induction variable instructions. 3373static void cse(BasicBlock *BB) { 3374 // Perform simple cse. 3375 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3376 for (Instruction &In : llvm::make_early_inc_range(*BB)) { 3377 if (!CSEDenseMapInfo::canHandle(&In)) 3378 continue; 3379 3380 // Check if we can replace this instruction with any of the 3381 // visited instructions. 
3382 if (Instruction *V = CSEMap.lookup(&In)) { 3383 In.replaceAllUsesWith(V); 3384 In.eraseFromParent(); 3385 continue; 3386 } 3387 3388 CSEMap[&In] = &In; 3389 } 3390} 3391 3392InstructionCost 3393LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, 3394 ElementCount VF) const { 3395 // We only need to calculate a cost if the VF is scalar; for actual vectors 3396 // we should already have a pre-calculated cost at each VF. 3397 if (!VF.isScalar()) 3398 return CallWideningDecisions.at(std::make_pair(CI, VF)).Cost; 3399 3400 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 3401 Type *RetTy = CI->getType(); 3402 if (RecurrenceDescriptor::isFMulAddIntrinsic(CI)) 3403 if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind)) 3404 return *RedCost; 3405 3406 SmallVector<Type *, 4> Tys; 3407 for (auto &ArgOp : CI->args()) 3408 Tys.push_back(ArgOp->getType()); 3409 3410 InstructionCost ScalarCallCost = 3411 TTI.getCallInstrCost(CI->getCalledFunction(), RetTy, Tys, CostKind); 3412 3413 // If this is an intrinsic we may have a lower cost for it. 
3414 if (getVectorIntrinsicIDForCall(CI, TLI)) { 3415 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF); 3416 return std::min(ScalarCallCost, IntrinsicCost); 3417 } 3418 return ScalarCallCost; 3419} 3420 3421static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) { 3422 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy())) 3423 return Elt; 3424 return VectorType::get(Elt, VF); 3425} 3426 3427InstructionCost 3428LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3429 ElementCount VF) const { 3430 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3431 assert(ID && "Expected intrinsic call!"); 3432 Type *RetTy = MaybeVectorizeType(CI->getType(), VF); 3433 FastMathFlags FMF; 3434 if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) 3435 FMF = FPMO->getFastMathFlags(); 3436 3437 SmallVector<const Value *> Arguments(CI->args()); 3438 FunctionType *FTy = CI->getCalledFunction()->getFunctionType(); 3439 SmallVector<Type *> ParamTys; 3440 std::transform(FTy->param_begin(), FTy->param_end(), 3441 std::back_inserter(ParamTys), 3442 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); }); 3443 3444 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF, 3445 dyn_cast<IntrinsicInst>(CI)); 3446 return TTI.getIntrinsicInstrCost(CostAttrs, 3447 TargetTransformInfo::TCK_RecipThroughput); 3448} 3449 3450static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3451 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3452 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3453 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3454} 3455 3456static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3457 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType()); 3458 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType()); 3459 return I1->getBitWidth() > I2->getBitWidth() ? 
T1 : T2; 3460} 3461 3462void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State, 3463 VPlan &Plan) { 3464 // Fix widened non-induction PHIs by setting up the PHI operands. 3465 if (EnableVPlanNativePath) 3466 fixNonInductionPHIs(Plan, State); 3467 3468 // At this point every instruction in the original loop is widened to a 3469 // vector form. Now we need to fix the recurrences in the loop. These PHI 3470 // nodes are currently empty because we did not want to introduce cycles. 3471 // This is the second stage of vectorizing recurrences. Note that fixing 3472 // reduction phis are already modeled in VPlan. 3473 // TODO: Also model fixing fixed-order recurrence phis in VPlan. 3474 VPRegionBlock *VectorRegion = State.Plan->getVectorLoopRegion(); 3475 VPBasicBlock *HeaderVPBB = VectorRegion->getEntryBasicBlock(); 3476 for (VPRecipeBase &R : HeaderVPBB->phis()) { 3477 if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R)) 3478 fixFixedOrderRecurrence(FOR, State); 3479 } 3480 3481 // Forget the original basic block. 3482 PSE.getSE()->forgetLoop(OrigLoop); 3483 PSE.getSE()->forgetBlockAndLoopDispositions(); 3484 3485 // After vectorization, the exit blocks of the original loop will have 3486 // additional predecessors. Invalidate SCEVs for the exit phis in case SE 3487 // looked through single-entry phis. 3488 SmallVector<BasicBlock *> ExitBlocks; 3489 OrigLoop->getExitBlocks(ExitBlocks); 3490 for (BasicBlock *Exit : ExitBlocks) 3491 for (PHINode &PN : Exit->phis()) 3492 PSE.getSE()->forgetLcssaPhiWithNewPredecessor(OrigLoop, &PN); 3493 3494 VPBasicBlock *LatchVPBB = VectorRegion->getExitingBasicBlock(); 3495 Loop *VectorLoop = LI->getLoopFor(State.CFG.VPBB2IRBB[LatchVPBB]); 3496 if (Cost->requiresScalarEpilogue(VF.isVector())) { 3497 // No edge from the middle block to the unique exit block has been inserted 3498 // and there is nothing to fix from vector loop; phis should have incoming 3499 // from scalar loop only. 
3500 } else { 3501 // TODO: Check VPLiveOuts to see if IV users need fixing instead of checking 3502 // the cost model. 3503 3504 // If we inserted an edge from the middle block to the unique exit block, 3505 // update uses outside the loop (phis) to account for the newly inserted 3506 // edge. 3507 3508 // Fix-up external users of the induction variables. 3509 for (const auto &Entry : Legal->getInductionVars()) 3510 fixupIVUsers(Entry.first, Entry.second, 3511 getOrCreateVectorTripCount(VectorLoop->getLoopPreheader()), 3512 IVEndValues[Entry.first], LoopMiddleBlock, 3513 VectorLoop->getHeader(), Plan, State); 3514 } 3515 3516 // Fix LCSSA phis not already fixed earlier. Extracts may need to be generated 3517 // in the exit block, so update the builder. 3518 State.Builder.SetInsertPoint(State.CFG.ExitBB, 3519 State.CFG.ExitBB->getFirstNonPHIIt()); 3520 for (const auto &KV : Plan.getLiveOuts()) 3521 KV.second->fixPhi(Plan, State); 3522 3523 for (Instruction *PI : PredicatedInstructions) 3524 sinkScalarOperands(&*PI); 3525 3526 // Remove redundant induction instructions. 3527 cse(VectorLoop->getHeader()); 3528 3529 // Set/update profile weights for the vector and remainder loops as original 3530 // loop iterations are now distributed among them. Note that original loop 3531 // represented by LoopScalarBody becomes remainder loop after vectorization. 3532 // 3533 // For cases like foldTailByMasking() and requiresScalarEpiloque() we may 3534 // end up getting slightly roughened result but that should be OK since 3535 // profile is not inherently precise anyway. Note also possible bypass of 3536 // vector code caused by legality checks is ignored, assigning all the weight 3537 // to the vector loop, optimistically. 3538 // 3539 // For scalable vectorization we can't know at compile time how many iterations 3540 // of the loop are handled in one vector iteration, so instead assume a pessimistic 3541 // vscale of '1'. 
3542 setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody), VectorLoop, 3543 LI->getLoopFor(LoopScalarBody), 3544 VF.getKnownMinValue() * UF); 3545} 3546 3547void InnerLoopVectorizer::fixFixedOrderRecurrence( 3548 VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State) { 3549 // This is the second phase of vectorizing first-order recurrences. An 3550 // overview of the transformation is described below. Suppose we have the 3551 // following loop. 3552 // 3553 // for (int i = 0; i < n; ++i) 3554 // b[i] = a[i] - a[i - 1]; 3555 // 3556 // There is a first-order recurrence on "a". For this loop, the shorthand 3557 // scalar IR looks like: 3558 // 3559 // scalar.ph: 3560 // s_init = a[-1] 3561 // br scalar.body 3562 // 3563 // scalar.body: 3564 // i = phi [0, scalar.ph], [i+1, scalar.body] 3565 // s1 = phi [s_init, scalar.ph], [s2, scalar.body] 3566 // s2 = a[i] 3567 // b[i] = s2 - s1 3568 // br cond, scalar.body, ... 3569 // 3570 // In this example, s1 is a recurrence because it's value depends on the 3571 // previous iteration. In the first phase of vectorization, we created a 3572 // vector phi v1 for s1. We now complete the vectorization and produce the 3573 // shorthand vector IR shown below (for VF = 4, UF = 1). 3574 // 3575 // vector.ph: 3576 // v_init = vector(..., ..., ..., a[-1]) 3577 // br vector.body 3578 // 3579 // vector.body 3580 // i = phi [0, vector.ph], [i+4, vector.body] 3581 // v1 = phi [v_init, vector.ph], [v2, vector.body] 3582 // v2 = a[i, i+1, i+2, i+3]; 3583 // v3 = vector(v1(3), v2(0, 1, 2)) 3584 // b[i, i+1, i+2, i+3] = v2 - v3 3585 // br cond, vector.body, middle.block 3586 // 3587 // middle.block: 3588 // x = v2(3) 3589 // br scalar.ph 3590 // 3591 // scalar.ph: 3592 // s_init = phi [x, middle.block], [a[-1], otherwise] 3593 // br scalar.body 3594 // 3595 // After execution completes the vector loop, we extract the next value of 3596 // the recurrence (x) to use as the initial value in the scalar loop. 
3597 3598 // Extract the last vector element in the middle block. This will be the 3599 // initial value for the recurrence when jumping to the scalar loop. 3600 VPValue *PreviousDef = PhiR->getBackedgeValue(); 3601 Value *Incoming = State.get(PreviousDef, UF - 1); 3602 auto *ExtractForScalar = Incoming; 3603 auto *IdxTy = Builder.getInt32Ty(); 3604 Value *RuntimeVF = nullptr; 3605 if (VF.isVector()) { 3606 auto *One = ConstantInt::get(IdxTy, 1); 3607 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 3608 RuntimeVF = getRuntimeVF(Builder, IdxTy, VF); 3609 auto *LastIdx = Builder.CreateSub(RuntimeVF, One); 3610 ExtractForScalar = 3611 Builder.CreateExtractElement(Incoming, LastIdx, "vector.recur.extract"); 3612 } 3613 3614 auto RecurSplice = cast<VPInstruction>(*PhiR->user_begin()); 3615 assert(PhiR->getNumUsers() == 1 && 3616 RecurSplice->getOpcode() == 3617 VPInstruction::FirstOrderRecurrenceSplice && 3618 "recurrence phi must have a single user: FirstOrderRecurrenceSplice"); 3619 SmallVector<VPLiveOut *> LiveOuts; 3620 for (VPUser *U : RecurSplice->users()) 3621 if (auto *LiveOut = dyn_cast<VPLiveOut>(U)) 3622 LiveOuts.push_back(LiveOut); 3623 3624 if (!LiveOuts.empty()) { 3625 // Extract the second last element in the middle block if the 3626 // Phi is used outside the loop. We need to extract the phi itself 3627 // and not the last element (the phi update in the current iteration). This 3628 // will be the value when jumping to the exit block from the 3629 // LoopMiddleBlock, when the scalar loop is not run at all. 
3630 Value *ExtractForPhiUsedOutsideLoop = nullptr; 3631 if (VF.isVector()) { 3632 auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2)); 3633 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( 3634 Incoming, Idx, "vector.recur.extract.for.phi"); 3635 } else { 3636 assert(UF > 1 && "VF and UF cannot both be 1"); 3637 // When loop is unrolled without vectorizing, initialize 3638 // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled 3639 // value of `Incoming`. This is analogous to the vectorized case above: 3640 // extracting the second last element when VF > 1. 3641 ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2); 3642 } 3643 3644 for (VPLiveOut *LiveOut : LiveOuts) { 3645 assert(!Cost->requiresScalarEpilogue(VF.isVector())); 3646 PHINode *LCSSAPhi = LiveOut->getPhi(); 3647 LCSSAPhi->addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock); 3648 State.Plan->removeLiveOut(LCSSAPhi); 3649 } 3650 } 3651 3652 // Fix the initial value of the original recurrence in the scalar loop. 3653 Builder.SetInsertPoint(LoopScalarPreHeader, LoopScalarPreHeader->begin()); 3654 PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue()); 3655 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init"); 3656 auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue(); 3657 for (auto *BB : predecessors(LoopScalarPreHeader)) { 3658 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit; 3659 Start->addIncoming(Incoming, BB); 3660 } 3661 3662 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start); 3663 Phi->setName("scalar.recur"); 3664} 3665 3666void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { 3667 // The basic block and loop containing the predicated instruction. 3668 auto *PredBB = PredInst->getParent(); 3669 auto *VectorLoop = LI->getLoopFor(PredBB); 3670 3671 // Initialize a worklist with the operands of the predicated instruction. 
3672 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end()); 3673 3674 // Holds instructions that we need to analyze again. An instruction may be 3675 // reanalyzed if we don't yet know if we can sink it or not. 3676 SmallVector<Instruction *, 8> InstsToReanalyze; 3677 3678 // Returns true if a given use occurs in the predicated block. Phi nodes use 3679 // their operands in their corresponding predecessor blocks. 3680 auto isBlockOfUsePredicated = [&](Use &U) -> bool { 3681 auto *I = cast<Instruction>(U.getUser()); 3682 BasicBlock *BB = I->getParent(); 3683 if (auto *Phi = dyn_cast<PHINode>(I)) 3684 BB = Phi->getIncomingBlock( 3685 PHINode::getIncomingValueNumForOperand(U.getOperandNo())); 3686 return BB == PredBB; 3687 }; 3688 3689 // Iteratively sink the scalarized operands of the predicated instruction 3690 // into the block we created for it. When an instruction is sunk, it's 3691 // operands are then added to the worklist. The algorithm ends after one pass 3692 // through the worklist doesn't sink a single instruction. 3693 bool Changed; 3694 do { 3695 // Add the instructions that need to be reanalyzed to the worklist, and 3696 // reset the changed indicator. 3697 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end()); 3698 InstsToReanalyze.clear(); 3699 Changed = false; 3700 3701 while (!Worklist.empty()) { 3702 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val()); 3703 3704 // We can't sink an instruction if it is a phi node, is not in the loop, 3705 // may have side effects or may read from memory. 3706 // TODO Could dor more granular checking to allow sinking a load past non-store instructions. 3707 if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) || 3708 I->mayHaveSideEffects() || I->mayReadFromMemory()) 3709 continue; 3710 3711 // If the instruction is already in PredBB, check if we can sink its 3712 // operands. 
In that case, VPlan's sinkScalarOperands() succeeded in 3713 // sinking the scalar instruction I, hence it appears in PredBB; but it 3714 // may have failed to sink I's operands (recursively), which we try 3715 // (again) here. 3716 if (I->getParent() == PredBB) { 3717 Worklist.insert(I->op_begin(), I->op_end()); 3718 continue; 3719 } 3720 3721 // It's legal to sink the instruction if all its uses occur in the 3722 // predicated block. Otherwise, there's nothing to do yet, and we may 3723 // need to reanalyze the instruction. 3724 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) { 3725 InstsToReanalyze.push_back(I); 3726 continue; 3727 } 3728 3729 // Move the instruction to the beginning of the predicated block, and add 3730 // it's operands to the worklist. 3731 I->moveBefore(&*PredBB->getFirstInsertionPt()); 3732 Worklist.insert(I->op_begin(), I->op_end()); 3733 3734 // The sinking may have enabled other instructions to be sunk, so we will 3735 // need to iterate. 3736 Changed = true; 3737 } 3738 } while (Changed); 3739} 3740 3741void InnerLoopVectorizer::fixNonInductionPHIs(VPlan &Plan, 3742 VPTransformState &State) { 3743 auto Iter = vp_depth_first_deep(Plan.getEntry()); 3744 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) { 3745 for (VPRecipeBase &P : VPBB->phis()) { 3746 VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(&P); 3747 if (!VPPhi) 3748 continue; 3749 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0)); 3750 // Make sure the builder has a valid insert point. 
3751 Builder.SetInsertPoint(NewPhi); 3752 for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) { 3753 VPValue *Inc = VPPhi->getIncomingValue(i); 3754 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i); 3755 NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]); 3756 } 3757 } 3758 } 3759} 3760 3761bool InnerLoopVectorizer::useOrderedReductions( 3762 const RecurrenceDescriptor &RdxDesc) { 3763 return Cost->useOrderedReductions(RdxDesc); 3764} 3765 3766void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { 3767 // We should not collect Scalars more than once per VF. Right now, this 3768 // function is called from collectUniformsAndScalars(), which already does 3769 // this check. Collecting Scalars for VF=1 does not make any sense. 3770 assert(VF.isVector() && !Scalars.contains(VF) && 3771 "This function should not be visited twice for the same VF"); 3772 3773 // This avoids any chances of creating a REPLICATE recipe during planning 3774 // since that would result in generation of scalarized code during execution, 3775 // which is not supported for scalable vectors. 3776 if (VF.isScalable()) { 3777 Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end()); 3778 return; 3779 } 3780 3781 SmallSetVector<Instruction *, 8> Worklist; 3782 3783 // These sets are used to seed the analysis with pointers used by memory 3784 // accesses that will remain scalar. 3785 SmallSetVector<Instruction *, 8> ScalarPtrs; 3786 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 3787 auto *Latch = TheLoop->getLoopLatch(); 3788 3789 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 3790 // The pointer operands of loads and stores will be scalar as long as the 3791 // memory access is not a gather or scatter operation. The value operand of a 3792 // store will remain scalar if the store is scalarized. 
// Returns true if MemAccess's use of Ptr will be a scalar use at this VF.
  // When Ptr is the value operand of a store, the use is scalar only if the
  // store is scalarized; when Ptr is the pointer operand of the access, the
  // use is scalar unless the access becomes a gather/scatter.
  auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
    InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
    assert(WideningDecision != CM_Unknown &&
           "Widening decision should be ready at this moment");
    if (auto *Store = dyn_cast<StoreInst>(MemAccess))
      if (Ptr == Store->getValueOperand())
        return WideningDecision == CM_Scalarize;
    assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
           "Ptr is neither a value or pointer operand");
    return WideningDecision != CM_GatherScatter;
  };

  // A helper that returns true if the given value is a pointer-typed bitcast
  // or a getelementptr instruction that is not loop-invariant.
  auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
    return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
            isa<GetElementPtrInst>(V)) &&
           !TheLoop->isLoopInvariant(V);
  };

  // A helper that evaluates a memory access's use of a pointer. If the use will
  // be a scalar use and the pointer is only used by memory accesses, we place
  // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
  // PossibleNonScalarPtrs.
  auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
    // We only care about bitcast and getelementptr instructions contained in
    // the loop.
    if (!isLoopVaryingBitCastOrGEP(Ptr))
      return;

    // If the pointer has already been identified as scalar (e.g., if it was
    // also identified as uniform), there's nothing to do.
    auto *I = cast<Instruction>(Ptr);
    if (Worklist.count(I))
      return;

    // If the use of the pointer will be a scalar use, and all users of the
    // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
    // place the pointer in PossibleNonScalarPtrs.
    if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
          return isa<LoadInst>(U) || isa<StoreInst>(U);
        }))
      ScalarPtrs.insert(I);
    else
      PossibleNonScalarPtrs.insert(I);
  };

  // We seed the scalars analysis with two classes of instructions: (1)
  // instructions marked uniform-after-vectorization and (2) bitcast and
  // getelementptr instructions used by memory accesses requiring a scalar
  // use. Forced scalars are added afterwards, and induction variables are
  // handled separately at the end.
  //
  // (1) Add to the worklist all instructions that have been identified as
  // uniform-after-vectorization.
  Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());

  // (2) Add to the worklist all bitcast and getelementptr instructions used by
  // memory accesses requiring a scalar use. The pointer operands of loads and
  // stores will be scalar as long as the memory access is not a gather or
  // scatter operation. The value operand of a store will remain scalar if the
  // store is scalarized.
  for (auto *BB : TheLoop->blocks())
    for (auto &I : *BB) {
      if (auto *Load = dyn_cast<LoadInst>(&I)) {
        evaluatePtrUse(Load, Load->getPointerOperand());
      } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
        evaluatePtrUse(Store, Store->getPointerOperand());
        evaluatePtrUse(Store, Store->getValueOperand());
      }
    }
  // A pointer only counts as scalar if no use classified it as possibly
  // non-scalar.
  for (auto *I : ScalarPtrs)
    if (!PossibleNonScalarPtrs.count(I)) {
      LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
      Worklist.insert(I);
    }

  // Insert the forced scalars.
  // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector
  // induction variable when the PHI user is scalarized.
  auto ForcedScalar = ForcedScalars.find(VF);
  if (ForcedScalar != ForcedScalars.end())
    for (auto *I : ForcedScalar->second) {
      LLVM_DEBUG(dbgs() << "LV: Found (forced) scalar instruction: " << *I << "\n");
      Worklist.insert(I);
    }

  // Expand the worklist by looking through any bitcasts and getelementptr
  // instructions we've already identified as scalar. This is similar to the
  // expansion step in collectLoopUniforms(); however, here we're only
  // expanding to include additional bitcasts and getelementptr instructions.
  // Only operand 0 of each worklist entry is inspected. Worklist grows while
  // it is being walked, so it is indexed rather than iterated.
  unsigned Idx = 0;
  while (Idx != Worklist.size()) {
    Instruction *Dst = Worklist[Idx++];
    if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
      continue;
    auto *Src = cast<Instruction>(Dst->getOperand(0));
    // Src is scalar if every user is outside the loop, already known to be
    // scalar, or a load/store that uses Src in a scalar way.
    if (llvm::all_of(Src->users(), [&](User *U) -> bool {
          auto *J = cast<Instruction>(U);
          return !TheLoop->contains(J) || Worklist.count(J) ||
                 ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
                  isScalarUse(J, Src));
        })) {
      Worklist.insert(Src);
      LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
    }
  }

  // An induction variable will remain scalar if all users of the induction
  // variable and induction variable update remain scalar.
  for (const auto &Induction : Legal->getInductionVars()) {
    auto *Ind = Induction.first;
    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));

    // If tail-folding is applied, the primary induction variable will be used
    // to feed a vector compare.
    if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
      continue;

    // Returns true if \p Indvar is a pointer induction that is used directly by
    // load/store instruction \p I, and that use is a scalar use.
    auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
                                              Instruction *I) {
      return Induction.second.getKind() ==
                 InductionDescriptor::IK_PtrInduction &&
             (isa<LoadInst>(I) || isa<StoreInst>(I)) &&
             Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar);
    };

    // Determine if all users of the induction variable are scalar after
    // vectorization.
    auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
      auto *I = cast<Instruction>(U);
      return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
             IsDirectLoadStoreFromPtrIndvar(Ind, I);
    });
    if (!ScalarInd)
      continue;

    // Determine if all users of the induction variable update instruction are
    // scalar after vectorization.
    auto ScalarIndUpdate =
        llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
          auto *I = cast<Instruction>(U);
          return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
                 IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
        });
    if (!ScalarIndUpdate)
      continue;

    // The induction variable and its update instruction will remain scalar.
    Worklist.insert(Ind);
    Worklist.insert(IndUpdate);
    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
                      << "\n");
  }

  // Publish the result of the analysis for this VF.
  Scalars[VF].insert(Worklist.begin(), Worklist.end());
}

// Returns true if \p I is a predicated instruction (see isPredicatedInst) for
// which no non-scalar lowering is available at \p VF, i.e. it has to be
// scalarized and predicated. Instructions that are not predicated always
// yield false.
bool LoopVectorizationCostModel::isScalarWithPredication(
    Instruction *I, ElementCount VF) const {
  if (!isPredicatedInst(I))
    return false;

  // Do we have a non-scalar lowering for this predicated
  // instruction? No - it is scalar with predication.
  switch(I->getOpcode()) {
  default:
    return true;
  case Instruction::Call:
    // A call is scalar at scalar VF; otherwise defer to the call-widening
    // decision already recorded for (call, VF).
    if (VF.isScalar())
      return true;
    return CallWideningDecisions.at(std::make_pair(cast<CallInst>(I), VF))
               .Kind == CM_Scalarize;
  case Instruction::Load:
  case Instruction::Store: {
    // A predicated load/store avoids scalarization if either a masked
    // load/store or a masked gather/scatter is legal for it. The masked
    // load/store query uses the scalar type; the gather/scatter query uses
    // the vector type when VF is a vector.
    auto *Ptr = getLoadStorePointerOperand(I);
    auto *Ty = getLoadStoreType(I);
    Type *VTy = Ty;
    if (VF.isVector())
      VTy = VectorType::get(Ty, VF);
    const Align Alignment = getLoadStoreAlignment(I);
    return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
                                TTI.isLegalMaskedGather(VTy, Alignment))
                            : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
                                TTI.isLegalMaskedScatter(VTy, Alignment));
  }
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem: {
    // We have the option to use the safe-divisor idiom to avoid predication.
    // The cost based decision here will always select safe-divisor for
    // scalable vectors as scalarization isn't legal.
    const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
    return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost);
  }
  }
}

// Returns true if \p I must be executed under some form of predication: its
// parent block needs predication for any reason and \p I is a load, store,
// call or integer div/rem that cannot be shown to be safe to execute
// unconditionally. All other opcodes yield false.
bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const {
  if (!blockNeedsPredicationForAnyReason(I->getParent()))
    return false;

  // Can we prove this instruction is safe to unconditionally execute?
  // If not, we must use some form of predication.
  switch(I->getOpcode()) {
  default:
    return false;
  case Instruction::Load:
  case Instruction::Store: {
    if (!Legal->isMaskRequired(I))
      return false;
    // When we know the load's address is loop invariant and the instruction
    // in the original scalar loop was unconditionally executed then we
    // don't need to mark it as a predicated instruction. Tail folding may
    // introduce additional predication, but we're guaranteed to always have
    // at least one active lane. We call Legal->blockNeedsPredication here
    // because it doesn't query tail-folding. For stores, we need to prove
    // both speculation safety (which follows from the same argument as loads),
    // but also must prove the value being stored is correct. The easiest
    // form of the latter is to require that all values stored are the same.
    if (Legal->isInvariant(getLoadStorePointerOperand(I)) &&
        (isa<LoadInst>(I) ||
         (isa<StoreInst>(I) &&
          TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()))) &&
        !Legal->blockNeedsPredication(I->getParent()))
      return false;
    return true;
  }
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem:
    // TODO: We can use the loop-preheader as context point here and get
    // context sensitive reasoning
    return !isSafeToSpeculativelyExecute(I);
  case Instruction::Call:
    return Legal->isMaskRequired(I);
  }
}

// Computes the two candidate costs for lowering the integer div/rem \p I,
// which must not be safe to speculate, at \p VF. The returned pair is
// {cost of scalarizing with predication, cost of the safe-divisor lowering}.
// The first element is an invalid cost for scalable VFs.
std::pair<InstructionCost, InstructionCost>
LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
                                                    ElementCount VF) const {
  assert(I->getOpcode() == Instruction::UDiv ||
         I->getOpcode() == Instruction::SDiv ||
         I->getOpcode() == Instruction::SRem ||
         I->getOpcode() == Instruction::URem);
  assert(!isSafeToSpeculativelyExecute(I));

  const TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

  // Scalarization isn't legal for scalable vector types
  InstructionCost ScalarizationCost = InstructionCost::getInvalid();
  if (!VF.isScalable()) {
    // Accumulate the scalarization cost; it is scaled at the end of this
    // block by the probability of executing the predicated block.
    ScalarizationCost = 0;

    // These instructions have a non-void type, so account for the phi nodes
    // that we will create. This cost is likely to be zero. The phi node
    // cost, if any, should be scaled by the block probability because it
    // models a copy at the end of each predicated block.
    ScalarizationCost += VF.getKnownMinValue() *
      TTI.getCFInstrCost(Instruction::PHI, CostKind);

    // The cost of the non-predicated instruction.
    ScalarizationCost += VF.getKnownMinValue() *
      TTI.getArithmeticInstrCost(I->getOpcode(), I->getType(), CostKind);

    // The cost of insertelement and extractelement instructions needed for
    // scalarization.
    ScalarizationCost += getScalarizationOverhead(I, VF, CostKind);

    // Scale the cost by the probability of executing the predicated blocks.
    // This assumes the predicated block for each vector lane is equally
    // likely.
    ScalarizationCost = ScalarizationCost / getReciprocalPredBlockProb();
  }
  InstructionCost SafeDivisorCost = 0;

  auto *VecTy = ToVectorTy(I->getType(), VF);

  // The cost of the select guard to ensure all lanes are well defined
  // after we speculate above any internal control flow.
  SafeDivisorCost += TTI.getCmpSelInstrCost(
    Instruction::Select, VecTy,
    ToVectorTy(Type::getInt1Ty(I->getContext()), VF),
    CmpInst::BAD_ICMP_PREDICATE, CostKind);

  // Certain instructions can be cheaper to vectorize if they have a constant
  // second vector operand. One example of this are shifts on x86.
  // A loop-invariant second operand of otherwise unknown kind is reported to
  // the target as uniform.
  Value *Op2 = I->getOperand(1);
  auto Op2Info = TTI.getOperandInfo(Op2);
  if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
      Legal->isInvariant(Op2))
    Op2Info.Kind = TargetTransformInfo::OK_UniformValue;

  // The cost of the vector div/rem itself.
  SmallVector<const Value *, 4> Operands(I->operand_values());
  SafeDivisorCost += TTI.getArithmeticInstrCost(
    I->getOpcode(), VecTy, CostKind,
    {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
    Op2Info, Operands, I);
  return {ScalarizationCost, SafeDivisorCost};
}

// Returns true if the interleave group containing \p I can be widened. \p I
// must belong to an interleave group and must not yet have a widening
// decision for \p VF. The group is rejected for irregular element types, for
// members that mix non-integral pointers with other types (or non-integral
// pointers in different address spaces), and, when masking is required, for
// reversed groups or when the target has no legal masked load/store for the
// element type.
bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
    Instruction *I, ElementCount VF) {
  assert(isAccessInterleaved(I) && "Expecting interleaved access.");
  assert(getWideningDecision(I, VF) == CM_Unknown &&
         "Decision should not be set yet.");
  auto *Group = getInterleavedAccessGroup(I);
  assert(Group && "Must have a group.");

  // If the instruction's allocated size doesn't equal its type size, it
  // requires padding and will be scalarized.
  auto &DL = I->getModule()->getDataLayout();
  auto *ScalarTy = getLoadStoreType(I);
  if (hasIrregularType(ScalarTy, DL))
    return false;

  // If the group involves a non-integral pointer, we may not be able to
  // losslessly cast all values to a common type.
  unsigned InterleaveFactor = Group->getFactor();
  bool ScalarNI = DL.isNonIntegralPointerType(ScalarTy);
  for (unsigned i = 0; i < InterleaveFactor; i++) {
    // Gaps in the group have no member instruction.
    Instruction *Member = Group->getMember(i);
    if (!Member)
      continue;
    auto *MemberTy = getLoadStoreType(Member);
    bool MemberNI = DL.isNonIntegralPointerType(MemberTy);
    // Don't coerce non-integral pointers to integers or vice versa.
    if (MemberNI != ScalarNI) {
      // TODO: Consider adding special nullptr value case here
      return false;
    } else if (MemberNI && ScalarNI &&
               ScalarTy->getPointerAddressSpace() !=
               MemberTy->getPointerAddressSpace()) {
      return false;
    }
  }

  // Check if masking is required.
  // A Group may need masking for one of two reasons: it resides in a block that
  // needs predication, or it was decided to use masking to deal with gaps
  // (either a gap at the end of a load-access that may result in a speculative
  // load, or any gaps in a store-access).
  bool PredicatedAccessRequiresMasking =
      blockNeedsPredicationForAnyReason(I->getParent()) &&
      Legal->isMaskRequired(I);
  bool LoadAccessWithGapsRequiresEpilogMasking =
      isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
      !isScalarEpilogueAllowed();
  bool StoreAccessWithGapsRequiresMasking =
      isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor());
  if (!PredicatedAccessRequiresMasking &&
      !LoadAccessWithGapsRequiresEpilogMasking &&
      !StoreAccessWithGapsRequiresMasking)
    return true;

  // If masked interleaving is required, we expect that the user/target had
  // enabled it, because otherwise it either wouldn't have been created or
  // it should have been invalidated by the CostModel.
  assert(useMaskedInterleavedAccesses(TTI) &&
         "Masked interleave-groups for predicated accesses are not enabled.");

  // Masked reversed groups are not widened.
  if (Group->isReverse())
    return false;

  auto *Ty = getLoadStoreType(I);
  const Align Alignment = getLoadStoreAlignment(I);
  return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
                          : TTI.isLegalMaskedStore(Ty, Alignment);
}

// Returns true if the load or store \p I can be widened at \p VF: its pointer
// must be consecutive, it must not require scalarization with predication,
// and its type must not be irregular.
bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
    Instruction *I, ElementCount VF) {
  // Get and ensure we have a valid memory instruction.
  assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");

  auto *Ptr = getLoadStorePointerOperand(I);
  auto *ScalarTy = getLoadStoreType(I);

  // In order to be widened, the pointer should be consecutive, first of all.
  if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
    return false;

  // If the instruction has to be scalarized with predication, it cannot be
  // widened.
  if (isScalarWithPredication(I, VF))
    return false;

  // If the instruction's allocated size doesn't equal its type size, it
  // requires padding and will be scalarized.
  auto &DL = I->getModule()->getDataLayout();
  if (hasIrregularType(ScalarTy, DL))
    return false;

  return true;
}

// Collects into Uniforms[VF] the instructions inside the loop that remain
// uniform after vectorization with \p VF, in the sense used throughout this
// function: only lane 0 of them is demanded. \p VF must be a vector VF that
// has not been analyzed before.
void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
  // We should not collect Uniforms more than once per VF. Right now,
  // this function is called from collectUniformsAndScalars(), which
  // already does this check. Collecting Uniforms for VF=1 does not make any
  // sense.

  assert(VF.isVector() && !Uniforms.contains(VF) &&
         "This function should not be visited twice for the same VF");

  // Create the entry for VF up front. Even if we do not find any uniform
  // value, we will not analyze again: Uniforms.count(VF) will return 1.
  Uniforms[VF].clear();

  // We now know that the loop is vectorizable!
  // Collect instructions inside the loop that will remain uniform after
  // vectorization.

  // Global values, params and instructions outside of current loop are out of
  // scope.
  auto isOutOfScope = [&](Value *V) -> bool {
    Instruction *I = dyn_cast<Instruction>(V);
    return (!I || !TheLoop->contains(I));
  };

  // Worklist containing uniform instructions demanding lane 0.
  SetVector<Instruction *> Worklist;
  BasicBlock *Latch = TheLoop->getLoopLatch();

  // Add uniform instructions demanding lane 0 to the worklist. Instructions
  // that are scalar with predication must not be considered uniform after
  // vectorization, because that would create an erroneous replicating region
  // where only a single instance out of VF should be formed.
  // TODO: optimize such seldom cases if found important, see PR40816.
  auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
    if (isOutOfScope(I)) {
      LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
                        << *I << "\n");
      return;
    }
    if (isScalarWithPredication(I, VF)) {
      LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
                        << *I << "\n");
      return;
    }
    LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
    Worklist.insert(I);
  };

  // Start with the conditional branch. If the branch condition is an
  // instruction contained in the loop that is only used by the branch, it is
  // uniform.
  auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
  if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
    addToWorklistIfAllowed(Cmp);

  auto PrevVF = VF.divideCoefficientBy(2);
  // Return true if all lanes perform the same memory operation, and we can
  // thus choose to execute only one.
  auto isUniformMemOpUse = [&](Instruction *I) {
    // If the value was already known to not be uniform for the previous
    // (smaller VF), it cannot be uniform for the larger VF.
    if (PrevVF.isVector()) {
      auto Iter = Uniforms.find(PrevVF);
      if (Iter != Uniforms.end() && !Iter->second.contains(I))
        return false;
    }
    if (!Legal->isUniformMemOp(*I, VF))
      return false;
    if (isa<LoadInst>(I))
      // Loading the same address always produces the same result - at least
      // assuming aliasing and ordering which have already been checked.
      return true;
    // Storing the same value on every iteration.
    return TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand());
  };

  // Returns true if the memory access I is a uniform memory operation, or if
  // its widening decision for VF is a widen, reverse-widen or interleave.
  auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
    InstWidening WideningDecision = getWideningDecision(I, VF);
    assert(WideningDecision != CM_Unknown &&
           "Widening decision should be ready at this moment");

    if (isUniformMemOpUse(I))
      return true;

    return (WideningDecision == CM_Widen ||
            WideningDecision == CM_Widen_Reverse ||
            WideningDecision == CM_Interleave);
  };

  // Returns true if Ptr is the pointer operand of a memory access instruction
  // I, I is known to not require scalarization, and the pointer is not also
  // stored.
  auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
    if (isa<StoreInst>(I) && I->getOperand(0) == Ptr)
      return false;
    return getLoadStorePointerOperand(I) == Ptr &&
           (isUniformDecision(I, VF) || Legal->isInvariant(Ptr));
  };

  // Holds a list of values which are known to have at least one uniform use.
  // Note that there may be other uses which aren't uniform. A "uniform use"
  // here is something which only demands lane 0 of the unrolled iterations;
  // it does not imply that all lanes produce the same value (e.g. this is not
  // the usual meaning of uniform)
  SetVector<Value *> HasUniformUse;

  // Scan the loop for instructions which are either a) known to have only
  // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
  for (auto *BB : TheLoop->blocks())
    for (auto &I : *BB) {
      // The intrinsics listed below are uniform when all of their operands
      // are loop-invariant.
      if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
        switch (II->getIntrinsicID()) {
        case Intrinsic::sideeffect:
        case Intrinsic::experimental_noalias_scope_decl:
        case Intrinsic::assume:
        case Intrinsic::lifetime_start:
        case Intrinsic::lifetime_end:
          if (TheLoop->hasLoopInvariantOperands(&I))
            addToWorklistIfAllowed(&I);
          break;
        default:
          break;
        }
      }

      // ExtractValue instructions must be uniform, because the operands are
      // known to be loop-invariant.
      if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) {
        assert(isOutOfScope(EVI->getAggregateOperand()) &&
               "Expected aggregate value to be loop invariant");
        addToWorklistIfAllowed(EVI);
        continue;
      }

      // If there's no pointer operand, there's nothing to do.
      auto *Ptr = getLoadStorePointerOperand(&I);
      if (!Ptr)
        continue;

      if (isUniformMemOpUse(&I))
        addToWorklistIfAllowed(&I);

      if (isVectorizedMemAccessUse(&I, Ptr))
        HasUniformUse.insert(Ptr);
    }

  // Add to the worklist any operands which have *only* uniform (e.g. lane 0
  // demanding) users. Since loops are assumed to be in LCSSA form, this
  // disallows uses outside the loop as well.
  for (auto *V : HasUniformUse) {
    if (isOutOfScope(V))
      continue;
    auto *I = cast<Instruction>(V);
    auto UsersAreMemAccesses =
      llvm::all_of(I->users(), [&](User *U) -> bool {
        return isVectorizedMemAccessUse(cast<Instruction>(U), V);
      });
    if (UsersAreMemAccesses)
      addToWorklistIfAllowed(I);
  }

  // Expand Worklist in topological order: whenever a new instruction
  // is added, its users should be already inside Worklist. It ensures
  // a uniform instruction will only be used by uniform instructions.
  // Worklist grows while it is being walked, so it is indexed rather than
  // iterated.
  unsigned idx = 0;
  while (idx != Worklist.size()) {
    Instruction *I = Worklist[idx++];

    for (auto *OV : I->operand_values()) {
      // isOutOfScope operands cannot be uniform instructions.
      if (isOutOfScope(OV))
        continue;
      // First order recurrence Phi's should typically be considered
      // non-uniform.
      auto *OP = dyn_cast<PHINode>(OV);
      if (OP && Legal->isFixedOrderRecurrence(OP))
        continue;
      // If all the users of the operand are uniform, then add the
      // operand into the uniform worklist.
      auto *OI = cast<Instruction>(OV);
      if (llvm::all_of(OI->users(), [&](User *U) -> bool {
            auto *J = cast<Instruction>(U);
            return Worklist.count(J) || isVectorizedMemAccessUse(J, OI);
          }))
        addToWorklistIfAllowed(OI);
    }
  }

  // For an instruction to be added into Worklist above, all its users inside
  // the loop should also be in Worklist. However, this condition cannot be
  // true for phi nodes that form a cyclic dependence. We must process phi
  // nodes separately. An induction variable will remain uniform if all users
  // of the induction variable and induction variable update remain uniform.
  // The code below handles both pointer and non-pointer induction variables.
  for (const auto &Induction : Legal->getInductionVars()) {
    auto *Ind = Induction.first;
    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));

    // Determine if all users of the induction variable are uniform after
    // vectorization.
    auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
      auto *I = cast<Instruction>(U);
      return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
             isVectorizedMemAccessUse(I, Ind);
    });
    if (!UniformInd)
      continue;

    // Determine if all users of the induction variable update instruction are
    // uniform after vectorization.
    auto UniformIndUpdate =
        llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
          auto *I = cast<Instruction>(U);
          return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
                 isVectorizedMemAccessUse(I, IndUpdate);
        });
    if (!UniformIndUpdate)
      continue;

    // The induction variable and its update instruction will remain uniform.
    addToWorklistIfAllowed(Ind);
    addToWorklistIfAllowed(IndUpdate);
  }

  // Publish the result of the analysis for this VF.
  Uniforms[VF].insert(Worklist.begin(), Worklist.end());
}

// Returns true, after reporting a vectorization failure, if vectorizing the
// loop would need runtime pointer checks, a SCEV predicate that is not always
// true, or symbolic-stride checks. Returns false if none of these is needed.
bool LoopVectorizationCostModel::runtimeChecksRequired() {
  LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");

  if (Legal->getRuntimePointerChecking()->Need) {
    reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
        "runtime pointer checks needed. Enable vectorization of this "
        "loop with '#pragma clang loop vectorize(enable)' when "
        "compiling with -Os/-Oz",
        "CantVersionLoopWithOptForSize", ORE, TheLoop);
    return true;
  }

  if (!PSE.getPredicate().isAlwaysTrue()) {
    reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
        "runtime SCEV checks needed. Enable vectorization of this "
        "loop with '#pragma clang loop vectorize(enable)' when "
        "compiling with -Os/-Oz",
        "CantVersionLoopWithOptForSize", ORE, TheLoop);
    return true;
  }

  // FIXME: Avoid specializing for stride==1 instead of bailing out.
  if (!Legal->getLAI()->getSymbolicStrides().empty()) {
    reportVectorizationFailure("Runtime stride check for small trip count",
        "runtime stride == 1 checks needed. Enable vectorization of "
        "this loop without such check by compiling with -Os/-Oz",
        "CantVersionLoopWithOptForSize", ORE, TheLoop);
    return true;
  }

  return false;
}

// Returns the maximum legal scalable VF for the loop. The result is a
// scalable VF of zero when scalable vectors are not supported (and not
// forced), are disabled by the hints, when the loop has reductions or element
// types unsupported for scalable vectors, or when the loop is not safe for
// any vector width and the maximum vscale is unknown. Otherwise the result is
// the largest representable scalable VF if the loop is safe for any vector
// width, or \p MaxSafeElements divided by the maximum vscale if it is not.
ElementCount
LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
  if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors)
    return ElementCount::getScalable(0);

  if (Hints->isScalableVectorizationDisabled()) {
    reportVectorizationInfo("Scalable vectorization is explicitly disabled",
                            "ScalableVectorizationDisabled", ORE, TheLoop);
    return ElementCount::getScalable(0);
  }

  LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");

  auto MaxScalableVF = ElementCount::getScalable(
      std::numeric_limits<ElementCount::ScalarTy>::max());

  // Test that the loop-vectorizer can legalize all operations for this MaxVF.
  // FIXME: While for scalable vectors this is currently sufficient, this should
  // be replaced by a more detailed mechanism that filters out specific VFs,
  // instead of invalidating vectorization for a whole set of VFs based on the
  // MaxVF.

  // Disable scalable vectorization if the loop contains unsupported reductions.
  if (!canVectorizeReductions(MaxScalableVF)) {
    reportVectorizationInfo(
        "Scalable vectorization not supported for the reduction "
        "operations found in this loop.",
        "ScalableVFUnfeasible", ORE, TheLoop);
    return ElementCount::getScalable(0);
  }

  // Disable scalable vectorization if the loop contains any instructions
  // with element types not supported for scalable vectors.
  if (any_of(ElementTypesInLoop, [&](Type *Ty) {
        return !Ty->isVoidTy() &&
               !this->TTI.isElementTypeLegalForScalableVector(Ty);
      })) {
    reportVectorizationInfo("Scalable vectorization is not supported "
                            "for all element types found in this loop.",
                            "ScalableVFUnfeasible", ORE, TheLoop);
    return ElementCount::getScalable(0);
  }

  if (Legal->isSafeForAnyVectorWidth())
    return MaxScalableVF;

  // Limit MaxScalableVF by the maximum safe dependence distance.
  if (std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI))
    MaxScalableVF = ElementCount::getScalable(MaxSafeElements / *MaxVScale);
  else
    MaxScalableVF = ElementCount::getScalable(0);

  if (!MaxScalableVF)
    reportVectorizationInfo(
        "Max legal vector width too small, scalable vectorization "
        "unfeasible.",
        "ScalableVFUnfeasible", ORE, TheLoop);

  return MaxScalableVF;
}

// Computes the maximum feasible fixed and scalable VFs for the loop. A
// non-zero \p UserVF that does not exceed the maximum safe VF of its kind is
// returned directly (a scalable UserVF is paired with the fixed VF of the
// same minimum element count). An unsafe fixed UserVF is clamped to the
// maximum safe fixed VF. An unsafe scalable UserVF is ignored, with a remark,
// and the target-maximized VFs are returned instead. Also recomputes MinBWs
// as a side effect.
FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
    unsigned MaxTripCount, ElementCount UserVF, bool FoldTailByMasking) {
  MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
  unsigned SmallestType, WidestType;
  std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();

  // Get the maximum safe dependence distance in bits computed by LAA.
  // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
  // the memory access that is most restrictive (involved in the smallest
  // dependence distance).
  unsigned MaxSafeElements =
      llvm::bit_floor(Legal->getMaxSafeVectorWidthInBits() / WidestType);

  auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
  auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);

  LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
                    << ".\n");
  LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
                    << ".\n");

  // First analyze the UserVF, fall back if the UserVF should be ignored.
  if (UserVF) {
    auto MaxSafeUserVF =
        UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;

    if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) {
      // If `VF=vscale x N` is safe, then so is `VF=N`
      if (UserVF.isScalable())
        return FixedScalableVFPair(
            ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF);
      else
        return UserVF;
    }

    assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));

    // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
    // is better to ignore the hint and let the compiler choose a suitable VF.
    if (!UserVF.isScalable()) {
      LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
                        << " is unsafe, clamping to max safe VF="
                        << MaxSafeFixedVF << ".\n");
      ORE->emit([&]() {
        return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
                                          TheLoop->getStartLoc(),
                                          TheLoop->getHeader())
               << "User-specified vectorization factor "
               << ore::NV("UserVectorizationFactor", UserVF)
               << " is unsafe, clamping to maximum safe vectorization factor "
               << ore::NV("VectorizationFactor", MaxSafeFixedVF);
      });
      return MaxSafeFixedVF;
    }

    // The scalable UserVF is dropped; only the reason reported differs.
    if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) {
      LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
                        << " is ignored because scalable vectors are not "
                           "available.\n");
      ORE->emit([&]() {
        return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
                                          TheLoop->getStartLoc(),
                                          TheLoop->getHeader())
               << "User-specified vectorization factor "
               << ore::NV("UserVectorizationFactor", UserVF)
               << " is ignored because the target does not support scalable "
                  "vectors. The compiler will pick a more suitable value.";
      });
    } else {
      LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
                        << " is unsafe. Ignoring scalable UserVF.\n");
      ORE->emit([&]() {
        return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
                                          TheLoop->getStartLoc(),
                                          TheLoop->getHeader())
               << "User-specified vectorization factor "
               << ore::NV("UserVectorizationFactor", UserVF)
               << " is unsafe. Ignoring the hint to let the compiler pick a "
                  "more suitable value.";
      });
    }
  }

  LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
                    << " / " << WidestType << " bits.\n");

  // Start from fixed VF=1 and no scalable VF; replace each with the
  // target-maximized VF when one is found.
  FixedScalableVFPair Result(ElementCount::getFixed(1),
                             ElementCount::getScalable(0));
  if (auto MaxVF =
          getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
                                  MaxSafeFixedVF, FoldTailByMasking))
    Result.FixedVF = MaxVF;

  if (auto MaxVF =
          getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
                                  MaxSafeScalableVF, FoldTailByMasking))
    if (MaxVF.isScalable()) {
      Result.ScalableVF = MaxVF;
      LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
                        << "\n");
    }

  return Result;
}

FixedScalableVFPair
LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
  if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
    // TODO: It may be useful to do since it's still likely to be dynamically
    // uniform if the target can skip.
    reportVectorizationFailure(
        "Not inserting runtime ptr check for divergent target",
        "runtime pointer checks needed. Not enabled for divergent target",
        "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
    return FixedScalableVFPair::getNone();
  }

  unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
  unsigned MaxTC = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop);
  LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
  if (TC == 1) {
    reportVectorizationFailure("Single iteration (non) loop",
        "loop trip count is one, irrelevant for vectorization",
        "SingleIterationLoop", ORE, TheLoop);
    return FixedScalableVFPair::getNone();
  }

  switch (ScalarEpilogueStatus) {
  case CM_ScalarEpilogueAllowed:
    return computeFeasibleMaxVF(MaxTC, UserVF, false);
  case CM_ScalarEpilogueNotAllowedUsePredicate:
    [[fallthrough]];
  case CM_ScalarEpilogueNotNeededUsePredicate:
    LLVM_DEBUG(
        dbgs() << "LV: vector predicate hint/switch found.\n"
               << "LV: Not allowing scalar epilogue, creating predicated "
               << "vector loop.\n");
    break;
  case CM_ScalarEpilogueNotAllowedLowTripLoop:
    // fallthrough as a special case of OptForSize
  case CM_ScalarEpilogueNotAllowedOptSize:
    if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
      LLVM_DEBUG(
          dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
    else
      LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
                        << "count.\n");

    // Bail if runtime checks are required, which are not good when optimising
    // for size.
    if (runtimeChecksRequired())
      return FixedScalableVFPair::getNone();

    break;
  }

  // The only loops we can vectorize without a scalar epilogue, are loops with
  // a bottom-test and a single exiting block. We'd have to handle the fact
  // that not every instruction executes on the last iteration. This will
  // require a lane mask which varies through the vector loop body. (TODO)
  if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
    // If there was a tail-folding hint/switch, but we can't fold the tail by
    // masking, fallback to a vectorization with a scalar epilogue.
    if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
      LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
                           "scalar epilogue instead.\n");
      ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
      return computeFeasibleMaxVF(MaxTC, UserVF, false);
    }
    return FixedScalableVFPair::getNone();
  }

  // Now try the tail folding

  // Invalidate interleave groups that require an epilogue if we can't mask
  // the interleave-group.
  if (!useMaskedInterleavedAccesses(TTI)) {
    assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
           "No decisions should have been taken at this point");
    // Note: There is no need to invalidate any cost modeling decisions here, as
    // none were taken so far.
    InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
  }

  FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(MaxTC, UserVF, true);

  // Avoid tail folding if the trip count is known to be a multiple of any VF
  // we choose.
  std::optional<unsigned> MaxPowerOf2RuntimeVF =
      MaxFactors.FixedVF.getFixedValue();
  if (MaxFactors.ScalableVF) {
    std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI);
    if (MaxVScale && TTI.isVScaleKnownToBeAPowerOfTwo()) {
      MaxPowerOf2RuntimeVF = std::max<unsigned>(
          *MaxPowerOf2RuntimeVF,
          *MaxVScale * MaxFactors.ScalableVF.getKnownMinValue());
    } else
      MaxPowerOf2RuntimeVF = std::nullopt; // Stick with tail-folding for now.
  }

  if (MaxPowerOf2RuntimeVF && *MaxPowerOf2RuntimeVF > 0) {
    assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) &&
           "MaxFixedVF must be a power of 2");
    unsigned MaxVFtimesIC =
        UserIC ? *MaxPowerOf2RuntimeVF * UserIC : *MaxPowerOf2RuntimeVF;
    ScalarEvolution *SE = PSE.getSE();
    // The exit count is the backedge-taken count plus one; check whether it
    // is a multiple of the largest VF * IC we may pick.
    const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
    const SCEV *ExitCount = SE->getAddExpr(
        BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
    const SCEV *Rem = SE->getURemExpr(
        SE->applyLoopGuards(ExitCount, TheLoop),
        SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
    if (Rem->isZero()) {
      // Accept MaxFixedVF if we do not have a tail.
      LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
      return MaxFactors;
    }
  }

  // If we don't know the precise trip count, or if the trip count that we
  // found modulo the vectorization factor is not zero, try to fold the tail
  // by masking.
  // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
  if (Legal->prepareToFoldTailByMasking()) {
    CanFoldTailByMasking = true;
    return MaxFactors;
  }

  // If there was a tail-folding hint/switch, but we can't fold the tail by
  // masking, fallback to a vectorization with a scalar epilogue.
4748 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { 4749 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " 4750 "scalar epilogue instead.\n"); 4751 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 4752 return MaxFactors; 4753 } 4754 4755 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) { 4756 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n"); 4757 return FixedScalableVFPair::getNone(); 4758 } 4759 4760 if (TC == 0) { 4761 reportVectorizationFailure( 4762 "Unable to calculate the loop count due to complex control flow", 4763 "unable to calculate the loop count due to complex control flow", 4764 "UnknownLoopCountComplexCFG", ORE, TheLoop); 4765 return FixedScalableVFPair::getNone(); 4766 } 4767 4768 reportVectorizationFailure( 4769 "Cannot optimize for size and vectorize at the same time.", 4770 "cannot optimize for size and vectorize at the same time. " 4771 "Enable vectorization of this loop with '#pragma clang loop " 4772 "vectorize(enable)' when compiling with -Os/-Oz", 4773 "NoTailLoopWithOptForSize", ORE, TheLoop); 4774 return FixedScalableVFPair::getNone(); 4775} 4776 4777ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( 4778 unsigned MaxTripCount, unsigned SmallestType, unsigned WidestType, 4779 ElementCount MaxSafeVF, bool FoldTailByMasking) { 4780 bool ComputeScalableMaxVF = MaxSafeVF.isScalable(); 4781 const TypeSize WidestRegister = TTI.getRegisterBitWidth( 4782 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector 4783 : TargetTransformInfo::RGK_FixedWidthVector); 4784 4785 // Convenience function to return the minimum of two ElementCounts. 4786 auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) { 4787 assert((LHS.isScalable() == RHS.isScalable()) && 4788 "Scalable flags must match"); 4789 return ElementCount::isKnownLT(LHS, RHS) ? 
LHS : RHS; 4790 }; 4791 4792 // Ensure MaxVF is a power of 2; the dependence distance bound may not be. 4793 // Note that both WidestRegister and WidestType may not be a powers of 2. 4794 auto MaxVectorElementCount = ElementCount::get( 4795 llvm::bit_floor(WidestRegister.getKnownMinValue() / WidestType), 4796 ComputeScalableMaxVF); 4797 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF); 4798 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 4799 << (MaxVectorElementCount * WidestType) << " bits.\n"); 4800 4801 if (!MaxVectorElementCount) { 4802 LLVM_DEBUG(dbgs() << "LV: The target has no " 4803 << (ComputeScalableMaxVF ? "scalable" : "fixed") 4804 << " vector registers.\n"); 4805 return ElementCount::getFixed(1); 4806 } 4807 4808 unsigned WidestRegisterMinEC = MaxVectorElementCount.getKnownMinValue(); 4809 if (MaxVectorElementCount.isScalable() && 4810 TheFunction->hasFnAttribute(Attribute::VScaleRange)) { 4811 auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange); 4812 auto Min = Attr.getVScaleRangeMin(); 4813 WidestRegisterMinEC *= Min; 4814 } 4815 4816 // When a scalar epilogue is required, at least one iteration of the scalar 4817 // loop has to execute. Adjust MaxTripCount accordingly to avoid picking a 4818 // max VF that results in a dead vector loop. 4819 if (MaxTripCount > 0 && requiresScalarEpilogue(true)) 4820 MaxTripCount -= 1; 4821 4822 if (MaxTripCount && MaxTripCount <= WidestRegisterMinEC && 4823 (!FoldTailByMasking || isPowerOf2_32(MaxTripCount))) { 4824 // If upper bound loop trip count (TC) is known at compile time there is no 4825 // point in choosing VF greater than TC (as done in the loop below). Select 4826 // maximum power of two which doesn't exceed TC. If MaxVectorElementCount is 4827 // scalable, we only fall back on a fixed VF when the TC is less than or 4828 // equal to the known number of lanes. 
4829 auto ClampedUpperTripCount = llvm::bit_floor(MaxTripCount); 4830 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not " 4831 "exceeding the constant trip count: " 4832 << ClampedUpperTripCount << "\n"); 4833 return ElementCount::get( 4834 ClampedUpperTripCount, 4835 FoldTailByMasking ? MaxVectorElementCount.isScalable() : false); 4836 } 4837 4838 TargetTransformInfo::RegisterKind RegKind = 4839 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector 4840 : TargetTransformInfo::RGK_FixedWidthVector; 4841 ElementCount MaxVF = MaxVectorElementCount; 4842 if (MaximizeBandwidth || 4843 (MaximizeBandwidth.getNumOccurrences() == 0 && 4844 (TTI.shouldMaximizeVectorBandwidth(RegKind) || 4845 (UseWiderVFIfCallVariantsPresent && Legal->hasVectorCallVariants())))) { 4846 auto MaxVectorElementCountMaxBW = ElementCount::get( 4847 llvm::bit_floor(WidestRegister.getKnownMinValue() / SmallestType), 4848 ComputeScalableMaxVF); 4849 MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF); 4850 4851 // Collect all viable vectorization factors larger than the default MaxVF 4852 // (i.e. MaxVectorElementCount). 4853 SmallVector<ElementCount, 8> VFs; 4854 for (ElementCount VS = MaxVectorElementCount * 2; 4855 ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2) 4856 VFs.push_back(VS); 4857 4858 // For each VF calculate its register usage. 4859 auto RUs = calculateRegisterUsage(VFs); 4860 4861 // Select the largest VF which doesn't require more registers than existing 4862 // ones. 
4863 for (int i = RUs.size() - 1; i >= 0; --i) { 4864 bool Selected = true; 4865 for (auto &pair : RUs[i].MaxLocalUsers) { 4866 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 4867 if (pair.second > TargetNumRegisters) 4868 Selected = false; 4869 } 4870 if (Selected) { 4871 MaxVF = VFs[i]; 4872 break; 4873 } 4874 } 4875 if (ElementCount MinVF = 4876 TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) { 4877 if (ElementCount::isKnownLT(MaxVF, MinVF)) { 4878 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 4879 << ") with target's minimum: " << MinVF << '\n'); 4880 MaxVF = MinVF; 4881 } 4882 } 4883 4884 // Invalidate any widening decisions we might have made, in case the loop 4885 // requires prediction (decided later), but we have already made some 4886 // load/store widening decisions. 4887 invalidateCostModelingDecisions(); 4888 } 4889 return MaxVF; 4890} 4891 4892/// Convenience function that returns the value of vscale_range iff 4893/// vscale_range.min == vscale_range.max or otherwise returns the value 4894/// returned by the corresponding TTI method. 
4895static std::optional<unsigned> 4896getVScaleForTuning(const Loop *L, const TargetTransformInfo &TTI) { 4897 const Function *Fn = L->getHeader()->getParent(); 4898 if (Fn->hasFnAttribute(Attribute::VScaleRange)) { 4899 auto Attr = Fn->getFnAttribute(Attribute::VScaleRange); 4900 auto Min = Attr.getVScaleRangeMin(); 4901 auto Max = Attr.getVScaleRangeMax(); 4902 if (Max && Min == Max) 4903 return Max; 4904 } 4905 4906 return TTI.getVScaleForTuning(); 4907} 4908 4909bool LoopVectorizationPlanner::isMoreProfitable( 4910 const VectorizationFactor &A, const VectorizationFactor &B) const { 4911 InstructionCost CostA = A.Cost; 4912 InstructionCost CostB = B.Cost; 4913 4914 unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(OrigLoop); 4915 4916 if (!A.Width.isScalable() && !B.Width.isScalable() && MaxTripCount) { 4917 // If the trip count is a known (possibly small) constant, the trip count 4918 // will be rounded up to an integer number of iterations under 4919 // FoldTailByMasking. The total cost in that case will be 4920 // VecCost*ceil(TripCount/VF). When not folding the tail, the total 4921 // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be 4922 // some extra overheads, but for the purpose of comparing the costs of 4923 // different VFs we can use this to compare the total loop-body cost 4924 // expected after vectorization. 4925 auto GetCostForTC = [MaxTripCount, this](unsigned VF, 4926 InstructionCost VectorCost, 4927 InstructionCost ScalarCost) { 4928 return CM.foldTailByMasking() ? VectorCost * divideCeil(MaxTripCount, VF) 4929 : VectorCost * (MaxTripCount / VF) + 4930 ScalarCost * (MaxTripCount % VF); 4931 }; 4932 auto RTCostA = GetCostForTC(A.Width.getFixedValue(), CostA, A.ScalarCost); 4933 auto RTCostB = GetCostForTC(B.Width.getFixedValue(), CostB, B.ScalarCost); 4934 4935 return RTCostA < RTCostB; 4936 } 4937 4938 // Improve estimate for the vector width if it is scalable. 
4939 unsigned EstimatedWidthA = A.Width.getKnownMinValue(); 4940 unsigned EstimatedWidthB = B.Width.getKnownMinValue(); 4941 if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI)) { 4942 if (A.Width.isScalable()) 4943 EstimatedWidthA *= *VScale; 4944 if (B.Width.isScalable()) 4945 EstimatedWidthB *= *VScale; 4946 } 4947 4948 // Assume vscale may be larger than 1 (or the value being tuned for), 4949 // so that scalable vectorization is slightly favorable over fixed-width 4950 // vectorization. 4951 if (A.Width.isScalable() && !B.Width.isScalable()) 4952 return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA); 4953 4954 // To avoid the need for FP division: 4955 // (CostA / A.Width) < (CostB / B.Width) 4956 // <=> (CostA * B.Width) < (CostB * A.Width) 4957 return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA); 4958} 4959 4960static void emitInvalidCostRemarks(SmallVector<InstructionVFPair> InvalidCosts, 4961 OptimizationRemarkEmitter *ORE, 4962 Loop *TheLoop) { 4963 if (InvalidCosts.empty()) 4964 return; 4965 4966 // Emit a report of VFs with invalid costs in the loop. 4967 4968 // Group the remarks per instruction, keeping the instruction order from 4969 // InvalidCosts. 4970 std::map<Instruction *, unsigned> Numbering; 4971 unsigned I = 0; 4972 for (auto &Pair : InvalidCosts) 4973 if (!Numbering.count(Pair.first)) 4974 Numbering[Pair.first] = I++; 4975 4976 // Sort the list, first on instruction(number) then on VF. 
4977 sort(InvalidCosts, [&Numbering](InstructionVFPair &A, InstructionVFPair &B) { 4978 if (Numbering[A.first] != Numbering[B.first]) 4979 return Numbering[A.first] < Numbering[B.first]; 4980 ElementCountComparator ECC; 4981 return ECC(A.second, B.second); 4982 }); 4983 4984 // For a list of ordered instruction-vf pairs: 4985 // [(load, vf1), (load, vf2), (store, vf1)] 4986 // Group the instructions together to emit separate remarks for: 4987 // load (vf1, vf2) 4988 // store (vf1) 4989 auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts); 4990 auto Subset = ArrayRef<InstructionVFPair>(); 4991 do { 4992 if (Subset.empty()) 4993 Subset = Tail.take_front(1); 4994 4995 Instruction *I = Subset.front().first; 4996 4997 // If the next instruction is different, or if there are no other pairs, 4998 // emit a remark for the collated subset. e.g. 4999 // [(load, vf1), (load, vf2))] 5000 // to emit: 5001 // remark: invalid costs for 'load' at VF=(vf, vf2) 5002 if (Subset == Tail || Tail[Subset.size()].first != I) { 5003 std::string OutString; 5004 raw_string_ostream OS(OutString); 5005 assert(!Subset.empty() && "Unexpected empty range"); 5006 OS << "Instruction with invalid costs prevented vectorization at VF=("; 5007 for (const auto &Pair : Subset) 5008 OS << (Pair.second == Subset.front().second ? 
"" : ", ") << Pair.second; 5009 OS << "):"; 5010 if (auto *CI = dyn_cast<CallInst>(I)) 5011 OS << " call to " << CI->getCalledFunction()->getName(); 5012 else 5013 OS << " " << I->getOpcodeName(); 5014 OS.flush(); 5015 reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I); 5016 Tail = Tail.drop_front(Subset.size()); 5017 Subset = {}; 5018 } else 5019 // Grow the subset by one element 5020 Subset = Tail.take_front(Subset.size() + 1); 5021 } while (!Tail.empty()); 5022} 5023 5024VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor( 5025 const ElementCountSet &VFCandidates) { 5026 InstructionCost ExpectedCost = 5027 CM.expectedCost(ElementCount::getFixed(1)).first; 5028 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n"); 5029 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop"); 5030 assert(VFCandidates.count(ElementCount::getFixed(1)) && 5031 "Expected Scalar VF to be a candidate"); 5032 5033 const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost, 5034 ExpectedCost); 5035 VectorizationFactor ChosenFactor = ScalarCost; 5036 5037 bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled; 5038 if (ForceVectorization && VFCandidates.size() > 1) { 5039 // Ignore scalar width, because the user explicitly wants vectorization. 5040 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 5041 // evaluation. 5042 ChosenFactor.Cost = InstructionCost::getMax(); 5043 } 5044 5045 SmallVector<InstructionVFPair> InvalidCosts; 5046 for (const auto &i : VFCandidates) { 5047 // The cost for scalar VF=1 is already calculated, so ignore it. 
5048 if (i.isScalar()) 5049 continue; 5050 5051 LoopVectorizationCostModel::VectorizationCostTy C = 5052 CM.expectedCost(i, &InvalidCosts); 5053 VectorizationFactor Candidate(i, C.first, ScalarCost.ScalarCost); 5054 5055#ifndef NDEBUG 5056 unsigned AssumedMinimumVscale = 5057 getVScaleForTuning(OrigLoop, TTI).value_or(1); 5058 unsigned Width = 5059 Candidate.Width.isScalable() 5060 ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale 5061 : Candidate.Width.getFixedValue(); 5062 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i 5063 << " costs: " << (Candidate.Cost / Width)); 5064 if (i.isScalable()) 5065 LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of " 5066 << AssumedMinimumVscale << ")"); 5067 LLVM_DEBUG(dbgs() << ".\n"); 5068#endif 5069 5070 if (!C.second && !ForceVectorization) { 5071 LLVM_DEBUG( 5072 dbgs() << "LV: Not considering vector loop of width " << i 5073 << " because it will not generate any vector instructions.\n"); 5074 continue; 5075 } 5076 5077 // If profitable add it to ProfitableVF list. 
5078 if (isMoreProfitable(Candidate, ScalarCost)) 5079 ProfitableVFs.push_back(Candidate); 5080 5081 if (isMoreProfitable(Candidate, ChosenFactor)) 5082 ChosenFactor = Candidate; 5083 } 5084 5085 emitInvalidCostRemarks(InvalidCosts, ORE, OrigLoop); 5086 5087 if (!EnableCondStoresVectorization && CM.hasPredStores()) { 5088 reportVectorizationFailure( 5089 "There are conditional stores.", 5090 "store that is conditionally executed prevents vectorization", 5091 "ConditionalStore", ORE, OrigLoop); 5092 ChosenFactor = ScalarCost; 5093 } 5094 5095 LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() && 5096 !isMoreProfitable(ChosenFactor, ScalarCost)) dbgs() 5097 << "LV: Vectorization seems to be not beneficial, " 5098 << "but was forced by a user.\n"); 5099 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n"); 5100 return ChosenFactor; 5101} 5102 5103bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization( 5104 ElementCount VF) const { 5105 // Cross iteration phis such as reductions need special handling and are 5106 // currently unsupported. 5107 if (any_of(OrigLoop->getHeader()->phis(), 5108 [&](PHINode &Phi) { return Legal->isFixedOrderRecurrence(&Phi); })) 5109 return false; 5110 5111 // Phis with uses outside of the loop require special handling and are 5112 // currently unsupported. 5113 for (const auto &Entry : Legal->getInductionVars()) { 5114 // Look for uses of the value of the induction at the last iteration. 5115 Value *PostInc = 5116 Entry.first->getIncomingValueForBlock(OrigLoop->getLoopLatch()); 5117 for (User *U : PostInc->users()) 5118 if (!OrigLoop->contains(cast<Instruction>(U))) 5119 return false; 5120 // Look for uses of penultimate value of the induction. 5121 for (User *U : Entry.first->users()) 5122 if (!OrigLoop->contains(cast<Instruction>(U))) 5123 return false; 5124 } 5125 5126 // Epilogue vectorization code has not been auditted to ensure it handles 5127 // non-latch exits properly. 
It may be fine, but it needs auditted and 5128 // tested. 5129 if (OrigLoop->getExitingBlock() != OrigLoop->getLoopLatch()) 5130 return false; 5131 5132 return true; 5133} 5134 5135bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( 5136 const ElementCount VF) const { 5137 // FIXME: We need a much better cost-model to take different parameters such 5138 // as register pressure, code size increase and cost of extra branches into 5139 // account. For now we apply a very crude heuristic and only consider loops 5140 // with vectorization factors larger than a certain value. 5141 5142 // Allow the target to opt out entirely. 5143 if (!TTI.preferEpilogueVectorization()) 5144 return false; 5145 5146 // We also consider epilogue vectorization unprofitable for targets that don't 5147 // consider interleaving beneficial (eg. MVE). 5148 if (TTI.getMaxInterleaveFactor(VF) <= 1) 5149 return false; 5150 5151 unsigned Multiplier = 1; 5152 if (VF.isScalable()) 5153 Multiplier = getVScaleForTuning(TheLoop, TTI).value_or(1); 5154 if ((Multiplier * VF.getKnownMinValue()) >= EpilogueVectorizationMinVF) 5155 return true; 5156 return false; 5157} 5158 5159VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor( 5160 const ElementCount MainLoopVF, unsigned IC) { 5161 VectorizationFactor Result = VectorizationFactor::Disabled(); 5162 if (!EnableEpilogueVectorization) { 5163 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n"); 5164 return Result; 5165 } 5166 5167 if (!CM.isScalarEpilogueAllowed()) { 5168 LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because no " 5169 "epilogue is allowed.\n"); 5170 return Result; 5171 } 5172 5173 // Not really a cost consideration, but check for unsupported cases here to 5174 // simplify the logic. 
5175 if (!isCandidateForEpilogueVectorization(MainLoopVF)) { 5176 LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because the loop " 5177 "is not a supported candidate.\n"); 5178 return Result; 5179 } 5180 5181 if (EpilogueVectorizationForceVF > 1) { 5182 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n"); 5183 ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF); 5184 if (hasPlanWithVF(ForcedEC)) 5185 return {ForcedEC, 0, 0}; 5186 else { 5187 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization forced factor is not " 5188 "viable.\n"); 5189 return Result; 5190 } 5191 } 5192 5193 if (OrigLoop->getHeader()->getParent()->hasOptSize() || 5194 OrigLoop->getHeader()->getParent()->hasMinSize()) { 5195 LLVM_DEBUG( 5196 dbgs() << "LEV: Epilogue vectorization skipped due to opt for size.\n"); 5197 return Result; 5198 } 5199 5200 if (!CM.isEpilogueVectorizationProfitable(MainLoopVF)) { 5201 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for " 5202 "this loop\n"); 5203 return Result; 5204 } 5205 5206 // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know 5207 // the main loop handles 8 lanes per iteration. We could still benefit from 5208 // vectorizing the epilogue loop with VF=4. 5209 ElementCount EstimatedRuntimeVF = MainLoopVF; 5210 if (MainLoopVF.isScalable()) { 5211 EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue()); 5212 if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI)) 5213 EstimatedRuntimeVF *= *VScale; 5214 } 5215 5216 ScalarEvolution &SE = *PSE.getSE(); 5217 Type *TCType = Legal->getWidestInductionType(); 5218 const SCEV *RemainingIterations = nullptr; 5219 for (auto &NextVF : ProfitableVFs) { 5220 // Skip candidate VFs without a corresponding VPlan. 
5221 if (!hasPlanWithVF(NextVF.Width)) 5222 continue; 5223 5224 // Skip candidate VFs with widths >= the estimate runtime VF (scalable 5225 // vectors) or the VF of the main loop (fixed vectors). 5226 if ((!NextVF.Width.isScalable() && MainLoopVF.isScalable() && 5227 ElementCount::isKnownGE(NextVF.Width, EstimatedRuntimeVF)) || 5228 ElementCount::isKnownGE(NextVF.Width, MainLoopVF)) 5229 continue; 5230 5231 // If NextVF is greater than the number of remaining iterations, the 5232 // epilogue loop would be dead. Skip such factors. 5233 if (!MainLoopVF.isScalable() && !NextVF.Width.isScalable()) { 5234 // TODO: extend to support scalable VFs. 5235 if (!RemainingIterations) { 5236 const SCEV *TC = createTripCountSCEV(TCType, PSE, OrigLoop); 5237 RemainingIterations = SE.getURemExpr( 5238 TC, SE.getConstant(TCType, MainLoopVF.getKnownMinValue() * IC)); 5239 } 5240 if (SE.isKnownPredicate( 5241 CmpInst::ICMP_UGT, 5242 SE.getConstant(TCType, NextVF.Width.getKnownMinValue()), 5243 RemainingIterations)) 5244 continue; 5245 } 5246 5247 if (Result.Width.isScalar() || isMoreProfitable(NextVF, Result)) 5248 Result = NextVF; 5249 } 5250 5251 if (Result != VectorizationFactor::Disabled()) 5252 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = " 5253 << Result.Width << "\n"); 5254 return Result; 5255} 5256 5257std::pair<unsigned, unsigned> 5258LoopVectorizationCostModel::getSmallestAndWidestTypes() { 5259 unsigned MinWidth = -1U; 5260 unsigned MaxWidth = 8; 5261 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5262 // For in-loop reductions, no element types are added to ElementTypesInLoop 5263 // if there are no loads/stores in the loop. In this case, check through the 5264 // reduction variables to determine the maximum width. 5265 if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) { 5266 // Reset MaxWidth so that we can find the smallest type used by recurrences 5267 // in the loop. 
5268 MaxWidth = -1U; 5269 for (const auto &PhiDescriptorPair : Legal->getReductionVars()) { 5270 const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second; 5271 // When finding the min width used by the recurrence we need to account 5272 // for casts on the input operands of the recurrence. 5273 MaxWidth = std::min<unsigned>( 5274 MaxWidth, std::min<unsigned>( 5275 RdxDesc.getMinWidthCastToRecurrenceTypeInBits(), 5276 RdxDesc.getRecurrenceType()->getScalarSizeInBits())); 5277 } 5278 } else { 5279 for (Type *T : ElementTypesInLoop) { 5280 MinWidth = std::min<unsigned>( 5281 MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue()); 5282 MaxWidth = std::max<unsigned>( 5283 MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue()); 5284 } 5285 } 5286 return {MinWidth, MaxWidth}; 5287} 5288 5289void LoopVectorizationCostModel::collectElementTypesForWidening() { 5290 ElementTypesInLoop.clear(); 5291 // For each block. 5292 for (BasicBlock *BB : TheLoop->blocks()) { 5293 // For each instruction in the loop. 5294 for (Instruction &I : BB->instructionsWithoutDebug()) { 5295 Type *T = I.getType(); 5296 5297 // Skip ignored values. 5298 if (ValuesToIgnore.count(&I)) 5299 continue; 5300 5301 // Only examine Loads, Stores and PHINodes. 5302 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 5303 continue; 5304 5305 // Examine PHI nodes that are reduction variables. Update the type to 5306 // account for the recurrence type. 5307 if (auto *PN = dyn_cast<PHINode>(&I)) { 5308 if (!Legal->isReductionVariable(PN)) 5309 continue; 5310 const RecurrenceDescriptor &RdxDesc = 5311 Legal->getReductionVars().find(PN)->second; 5312 if (PreferInLoopReductions || useOrderedReductions(RdxDesc) || 5313 TTI.preferInLoopReduction(RdxDesc.getOpcode(), 5314 RdxDesc.getRecurrenceType(), 5315 TargetTransformInfo::ReductionFlags())) 5316 continue; 5317 T = RdxDesc.getRecurrenceType(); 5318 } 5319 5320 // Examine the stored values. 
5321 if (auto *ST = dyn_cast<StoreInst>(&I)) 5322 T = ST->getValueOperand()->getType(); 5323 5324 assert(T->isSized() && 5325 "Expected the load/store/recurrence type to be sized"); 5326 5327 ElementTypesInLoop.insert(T); 5328 } 5329 } 5330} 5331 5332unsigned 5333LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, 5334 InstructionCost LoopCost) { 5335 // -- The interleave heuristics -- 5336 // We interleave the loop in order to expose ILP and reduce the loop overhead. 5337 // There are many micro-architectural considerations that we can't predict 5338 // at this level. For example, frontend pressure (on decode or fetch) due to 5339 // code size, or the number and capabilities of the execution ports. 5340 // 5341 // We use the following heuristics to select the interleave count: 5342 // 1. If the code has reductions, then we interleave to break the cross 5343 // iteration dependency. 5344 // 2. If the loop is really small, then we interleave to reduce the loop 5345 // overhead. 5346 // 3. We don't interleave if we think that we will spill registers to memory 5347 // due to the increased register pressure. 5348 5349 if (!isScalarEpilogueAllowed()) 5350 return 1; 5351 5352 // We used the distance for the interleave count. 5353 if (!Legal->isSafeForAnyVectorWidth()) 5354 return 1; 5355 5356 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); 5357 const bool HasReductions = !Legal->getReductionVars().empty(); 5358 // Do not interleave loops with a relatively small known or estimated trip 5359 // count. But we will interleave when InterleaveSmallLoopScalarReduction is 5360 // enabled, and the code has scalar reductions(HasReductions && VF = 1), 5361 // because with the above conditions interleaving can expose ILP and break 5362 // cross iteration dependences for reductions. 
5363 if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) && 5364 !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar())) 5365 return 1; 5366 5367 // If we did not calculate the cost for VF (because the user selected the VF) 5368 // then we calculate the cost of VF here. 5369 if (LoopCost == 0) { 5370 LoopCost = expectedCost(VF).first; 5371 assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost"); 5372 5373 // Loop body is free and there is no need for interleaving. 5374 if (LoopCost == 0) 5375 return 1; 5376 } 5377 5378 RegisterUsage R = calculateRegisterUsage({VF})[0]; 5379 // We divide by these constants so assume that we have at least one 5380 // instruction that uses at least one register. 5381 for (auto& pair : R.MaxLocalUsers) { 5382 pair.second = std::max(pair.second, 1U); 5383 } 5384 5385 // We calculate the interleave count using the following formula. 5386 // Subtract the number of loop invariants from the number of available 5387 // registers. These registers are used by all of the interleaved instances. 5388 // Next, divide the remaining registers by the number of registers that is 5389 // required by the loop, in order to estimate how many parallel instances 5390 // fit without causing spills. All of this is rounded down if necessary to be 5391 // a power of two. We want power of two interleave count to simplify any 5392 // addressing operations or alignment considerations. 5393 // We also want power of two interleave counts to ensure that the induction 5394 // variable of the vector loop wraps to zero, when tail is folded by masking; 5395 // this currently happens when OptForSize, in which case IC is set to 1 above. 
5396 unsigned IC = UINT_MAX; 5397 5398 for (auto& pair : R.MaxLocalUsers) { 5399 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5400 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 5401 << " registers of " 5402 << TTI.getRegisterClassName(pair.first) << " register class\n"); 5403 if (VF.isScalar()) { 5404 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 5405 TargetNumRegisters = ForceTargetNumScalarRegs; 5406 } else { 5407 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 5408 TargetNumRegisters = ForceTargetNumVectorRegs; 5409 } 5410 unsigned MaxLocalUsers = pair.second; 5411 unsigned LoopInvariantRegs = 0; 5412 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) 5413 LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; 5414 5415 unsigned TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs) / 5416 MaxLocalUsers); 5417 // Don't count the induction variable as interleaved. 5418 if (EnableIndVarRegisterHeur) { 5419 TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs - 1) / 5420 std::max(1U, (MaxLocalUsers - 1))); 5421 } 5422 5423 IC = std::min(IC, TmpIC); 5424 } 5425 5426 // Clamp the interleave ranges to reasonable counts. 5427 unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF); 5428 5429 // Check if the user has overridden the max. 
5430 if (VF.isScalar()) { 5431 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 5432 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 5433 } else { 5434 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 5435 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 5436 } 5437 5438 unsigned EstimatedVF = VF.getKnownMinValue(); 5439 if (VF.isScalable()) { 5440 if (std::optional<unsigned> VScale = getVScaleForTuning(TheLoop, TTI)) 5441 EstimatedVF *= *VScale; 5442 } 5443 assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1"); 5444 5445 unsigned KnownTC = PSE.getSE()->getSmallConstantTripCount(TheLoop); 5446 if (KnownTC) { 5447 // If trip count is known we select between two prospective ICs, where 5448 // 1) the aggressive IC is capped by the trip count divided by VF 5449 // 2) the conservative IC is capped by the trip count divided by (VF * 2) 5450 // The final IC is selected in a way that the epilogue loop trip count is 5451 // minimized while maximizing the IC itself, so that we either run the 5452 // vector loop at least once if it generates a small epilogue loop, or else 5453 // we run the vector loop at least twice. 
5454 5455 unsigned InterleaveCountUB = bit_floor( 5456 std::max(1u, std::min(KnownTC / EstimatedVF, MaxInterleaveCount))); 5457 unsigned InterleaveCountLB = bit_floor(std::max( 5458 1u, std::min(KnownTC / (EstimatedVF * 2), MaxInterleaveCount))); 5459 MaxInterleaveCount = InterleaveCountLB; 5460 5461 if (InterleaveCountUB != InterleaveCountLB) { 5462 unsigned TailTripCountUB = (KnownTC % (EstimatedVF * InterleaveCountUB)); 5463 unsigned TailTripCountLB = (KnownTC % (EstimatedVF * InterleaveCountLB)); 5464 // If both produce same scalar tail, maximize the IC to do the same work 5465 // in fewer vector loop iterations 5466 if (TailTripCountUB == TailTripCountLB) 5467 MaxInterleaveCount = InterleaveCountUB; 5468 } 5469 } else if (BestKnownTC) { 5470 // If trip count is an estimated compile time constant, limit the 5471 // IC to be capped by the trip count divided by VF * 2, such that the vector 5472 // loop runs at least twice to make interleaving seem profitable when there 5473 // is an epilogue loop present. Since exact Trip count is not known we 5474 // choose to be conservative in our IC estimate. 5475 MaxInterleaveCount = bit_floor(std::max( 5476 1u, std::min(*BestKnownTC / (EstimatedVF * 2), MaxInterleaveCount))); 5477 } 5478 5479 assert(MaxInterleaveCount > 0 && 5480 "Maximum interleave count must be greater than 0"); 5481 5482 // Clamp the calculated IC to be between the 1 and the max interleave count 5483 // that the target and trip count allows. 5484 if (IC > MaxInterleaveCount) 5485 IC = MaxInterleaveCount; 5486 else 5487 // Make sure IC is greater than 0. 5488 IC = std::max(1u, IC); 5489 5490 assert(IC > 0 && "Interleave count must be greater than 0."); 5491 5492 // Interleave if we vectorized this loop and there is a reduction that could 5493 // benefit from interleaving. 
5494 if (VF.isVector() && HasReductions) { 5495 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 5496 return IC; 5497 } 5498 5499 // For any scalar loop that either requires runtime checks or predication we 5500 // are better off leaving this to the unroller. Note that if we've already 5501 // vectorized the loop we will have done the runtime check and so interleaving 5502 // won't require further checks. 5503 bool ScalarInterleavingRequiresPredication = 5504 (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) { 5505 return Legal->blockNeedsPredication(BB); 5506 })); 5507 bool ScalarInterleavingRequiresRuntimePointerCheck = 5508 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); 5509 5510 // We want to interleave small loops in order to reduce the loop overhead and 5511 // potentially expose ILP opportunities. 5512 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n' 5513 << "LV: IC is " << IC << '\n' 5514 << "LV: VF is " << VF << '\n'); 5515 const bool AggressivelyInterleaveReductions = 5516 TTI.enableAggressiveInterleaving(HasReductions); 5517 if (!ScalarInterleavingRequiresRuntimePointerCheck && 5518 !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) { 5519 // We assume that the cost overhead is 1 and we use the cost model 5520 // to estimate the cost of the loop and interleave until the cost of the 5521 // loop overhead is about 5% of the cost of the loop. 5522 unsigned SmallIC = std::min(IC, (unsigned)llvm::bit_floor<uint64_t>( 5523 SmallLoopCost / *LoopCost.getValue())); 5524 5525 // Interleave until store/load ports (estimated by max interleave count) are 5526 // saturated. 5527 unsigned NumStores = Legal->getNumStores(); 5528 unsigned NumLoads = Legal->getNumLoads(); 5529 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 5530 unsigned LoadsIC = IC / (NumLoads ? 
NumLoads : 1); 5531 5532 // There is little point in interleaving for reductions containing selects 5533 // and compares when VF=1 since it may just create more overhead than it's 5534 // worth for loops with small trip counts. This is because we still have to 5535 // do the final reduction after the loop. 5536 bool HasSelectCmpReductions = 5537 HasReductions && 5538 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 5539 const RecurrenceDescriptor &RdxDesc = Reduction.second; 5540 return RecurrenceDescriptor::isAnyOfRecurrenceKind( 5541 RdxDesc.getRecurrenceKind()); 5542 }); 5543 if (HasSelectCmpReductions) { 5544 LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n"); 5545 return 1; 5546 } 5547 5548 // If we have a scalar reduction (vector reductions are already dealt with 5549 // by this point), we can increase the critical path length if the loop 5550 // we're interleaving is inside another loop. For tree-wise reductions 5551 // set the limit to 2, and for ordered reductions it's best to disable 5552 // interleaving entirely. 
5553 if (HasReductions && TheLoop->getLoopDepth() > 1) { 5554 bool HasOrderedReductions = 5555 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { 5556 const RecurrenceDescriptor &RdxDesc = Reduction.second; 5557 return RdxDesc.isOrdered(); 5558 }); 5559 if (HasOrderedReductions) { 5560 LLVM_DEBUG( 5561 dbgs() << "LV: Not interleaving scalar ordered reductions.\n"); 5562 return 1; 5563 } 5564 5565 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); 5566 SmallIC = std::min(SmallIC, F); 5567 StoresIC = std::min(StoresIC, F); 5568 LoadsIC = std::min(LoadsIC, F); 5569 } 5570 5571 if (EnableLoadStoreRuntimeInterleave && 5572 std::max(StoresIC, LoadsIC) > SmallIC) { 5573 LLVM_DEBUG( 5574 dbgs() << "LV: Interleaving to saturate store or load ports.\n"); 5575 return std::max(StoresIC, LoadsIC); 5576 } 5577 5578 // If there are scalar reductions and TTI has enabled aggressive 5579 // interleaving for reductions, we will interleave to expose ILP. 5580 if (InterleaveSmallLoopScalarReduction && VF.isScalar() && 5581 AggressivelyInterleaveReductions) { 5582 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 5583 // Interleave no less than SmallIC but not as aggressive as the normal IC 5584 // to satisfy the rare situation when resources are too limited. 5585 return std::max(IC / 2, SmallIC); 5586 } else { 5587 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n"); 5588 return SmallIC; 5589 } 5590 } 5591 5592 // Interleave if this is a large loop (small loops are already dealt with by 5593 // this point) that could benefit from interleaving. 
5594 if (AggressivelyInterleaveReductions) { 5595 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 5596 return IC; 5597 } 5598 5599 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n"); 5600 return 1; 5601} 5602 5603SmallVector<LoopVectorizationCostModel::RegisterUsage, 8> 5604LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) { 5605 // This function calculates the register usage by measuring the highest number 5606 // of values that are alive at a single location. Obviously, this is a very 5607 // rough estimation. We scan the loop in a topological order in order and 5608 // assign a number to each instruction. We use RPO to ensure that defs are 5609 // met before their users. We assume that each instruction that has in-loop 5610 // users starts an interval. We record every time that an in-loop value is 5611 // used, so we have a list of the first and last occurrences of each 5612 // instruction. Next, we transpose this data structure into a multi map that 5613 // holds the list of intervals that *end* at a specific location. This multi 5614 // map allows us to perform a linear search. We scan the instructions linearly 5615 // and record each time that a new interval starts, by placing it in a set. 5616 // If we find this value in the multi-map then we remove it from the set. 5617 // The max register usage is the maximum size of the set. 5618 // We also search for instructions that are defined outside the loop, but are 5619 // used inside the loop. We need this number separately from the max-interval 5620 // usage number because when we unroll, loop-invariant values do not take 5621 // more register. 5622 LoopBlocksDFS DFS(TheLoop); 5623 DFS.perform(LI); 5624 5625 RegisterUsage RU; 5626 5627 // Each 'key' in the map opens a new interval. The values 5628 // of the map are the index of the 'last seen' usage of the 5629 // instruction that is the key. 
5630 using IntervalMap = DenseMap<Instruction *, unsigned>; 5631 5632 // Maps instruction to its index. 5633 SmallVector<Instruction *, 64> IdxToInstr; 5634 // Marks the end of each interval. 5635 IntervalMap EndPoint; 5636 // Saves the list of instruction indices that are used in the loop. 5637 SmallPtrSet<Instruction *, 8> Ends; 5638 // Saves the list of values that are used in the loop but are defined outside 5639 // the loop (not including non-instruction values such as arguments and 5640 // constants). 5641 SmallSetVector<Instruction *, 8> LoopInvariants; 5642 5643 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 5644 for (Instruction &I : BB->instructionsWithoutDebug()) { 5645 IdxToInstr.push_back(&I); 5646 5647 // Save the end location of each USE. 5648 for (Value *U : I.operands()) { 5649 auto *Instr = dyn_cast<Instruction>(U); 5650 5651 // Ignore non-instruction values such as arguments, constants, etc. 5652 // FIXME: Might need some motivation why these values are ignored. If 5653 // for example an argument is used inside the loop it will increase the 5654 // register pressure (so shouldn't we add it to LoopInvariants). 5655 if (!Instr) 5656 continue; 5657 5658 // If this instruction is outside the loop then record it and continue. 5659 if (!TheLoop->contains(Instr)) { 5660 LoopInvariants.insert(Instr); 5661 continue; 5662 } 5663 5664 // Overwrite previous end points. 5665 EndPoint[Instr] = IdxToInstr.size(); 5666 Ends.insert(Instr); 5667 } 5668 } 5669 } 5670 5671 // Saves the list of intervals that end with the index in 'key'. 5672 using InstrList = SmallVector<Instruction *, 2>; 5673 DenseMap<unsigned, InstrList> TransposeEnds; 5674 5675 // Transpose the EndPoints to a list of values that end at each index. 
5676 for (auto &Interval : EndPoint) 5677 TransposeEnds[Interval.second].push_back(Interval.first); 5678 5679 SmallPtrSet<Instruction *, 8> OpenIntervals; 5680 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 5681 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); 5682 5683 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 5684 5685 const auto &TTICapture = TTI; 5686 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned { 5687 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty)) 5688 return 0; 5689 return TTICapture.getRegUsageForType(VectorType::get(Ty, VF)); 5690 }; 5691 5692 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 5693 Instruction *I = IdxToInstr[i]; 5694 5695 // Remove all of the instructions that end at this location. 5696 InstrList &List = TransposeEnds[i]; 5697 for (Instruction *ToRemove : List) 5698 OpenIntervals.erase(ToRemove); 5699 5700 // Ignore instructions that are never used within the loop. 5701 if (!Ends.count(I)) 5702 continue; 5703 5704 // Skip ignored values. 5705 if (ValuesToIgnore.count(I)) 5706 continue; 5707 5708 collectInLoopReductions(); 5709 5710 // For each VF find the maximum usage of registers. 5711 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 5712 // Count the number of registers used, per register class, given all open 5713 // intervals. 5714 // Note that elements in this SmallMapVector will be default constructed 5715 // as 0. So we can use "RegUsage[ClassID] += n" in the code below even if 5716 // there is no previous entry for ClassID. 5717 SmallMapVector<unsigned, unsigned, 4> RegUsage; 5718 5719 if (VFs[j].isScalar()) { 5720 for (auto *Inst : OpenIntervals) { 5721 unsigned ClassID = 5722 TTI.getRegisterClassForType(false, Inst->getType()); 5723 // FIXME: The target might use more than one register for the type 5724 // even in the scalar case. 
5725 RegUsage[ClassID] += 1; 5726 } 5727 } else { 5728 collectUniformsAndScalars(VFs[j]); 5729 for (auto *Inst : OpenIntervals) { 5730 // Skip ignored values for VF > 1. 5731 if (VecValuesToIgnore.count(Inst)) 5732 continue; 5733 if (isScalarAfterVectorization(Inst, VFs[j])) { 5734 unsigned ClassID = 5735 TTI.getRegisterClassForType(false, Inst->getType()); 5736 // FIXME: The target might use more than one register for the type 5737 // even in the scalar case. 5738 RegUsage[ClassID] += 1; 5739 } else { 5740 unsigned ClassID = 5741 TTI.getRegisterClassForType(true, Inst->getType()); 5742 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 5743 } 5744 } 5745 } 5746 5747 for (auto& pair : RegUsage) { 5748 auto &Entry = MaxUsages[j][pair.first]; 5749 Entry = std::max(Entry, pair.second); 5750 } 5751 } 5752 5753 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 5754 << OpenIntervals.size() << '\n'); 5755 5756 // Add the current instruction to the list of open intervals. 5757 OpenIntervals.insert(I); 5758 } 5759 5760 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 5761 // Note that elements in this SmallMapVector will be default constructed 5762 // as 0. So we can use "Invariant[ClassID] += n" in the code below even if 5763 // there is no previous entry for ClassID. 5764 SmallMapVector<unsigned, unsigned, 4> Invariant; 5765 5766 for (auto *Inst : LoopInvariants) { 5767 // FIXME: The target might use more than one register for the type 5768 // even in the scalar case. 5769 bool IsScalar = all_of(Inst->users(), [&](User *U) { 5770 auto *I = cast<Instruction>(U); 5771 return TheLoop != LI->getLoopFor(I->getParent()) || 5772 isScalarAfterVectorization(I, VFs[i]); 5773 }); 5774 5775 ElementCount VF = IsScalar ? 
ElementCount::getFixed(1) : VFs[i]; 5776 unsigned ClassID = 5777 TTI.getRegisterClassForType(VF.isVector(), Inst->getType()); 5778 Invariant[ClassID] += GetRegUsage(Inst->getType(), VF); 5779 } 5780 5781 LLVM_DEBUG({ 5782 dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; 5783 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() 5784 << " item\n"; 5785 for (const auto &pair : MaxUsages[i]) { 5786 dbgs() << "LV(REG): RegisterClass: " 5787 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 5788 << " registers\n"; 5789 } 5790 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() 5791 << " item\n"; 5792 for (const auto &pair : Invariant) { 5793 dbgs() << "LV(REG): RegisterClass: " 5794 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 5795 << " registers\n"; 5796 } 5797 }); 5798 5799 RU.LoopInvariantRegs = Invariant; 5800 RU.MaxLocalUsers = MaxUsages[i]; 5801 RUs[i] = RU; 5802 } 5803 5804 return RUs; 5805} 5806 5807bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I, 5808 ElementCount VF) { 5809 // TODO: Cost model for emulated masked load/store is completely 5810 // broken. This hack guides the cost model to use an artificially 5811 // high enough value to practically disable vectorization with such 5812 // operations, except where previously deployed legality hack allowed 5813 // using very low cost values. This is to avoid regressions coming simply 5814 // from moving "masked load/store" check from legality to cost model. 5815 // Masked Load/Gather emulation was previously never allowed. 5816 // Limited number of Masked Store/Scatter emulation was allowed. 
5817 assert((isPredicatedInst(I)) && 5818 "Expecting a scalar emulated instruction"); 5819 return isa<LoadInst>(I) || 5820 (isa<StoreInst>(I) && 5821 NumPredStores > NumberOfStoresToPredicate); 5822} 5823 5824void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { 5825 // If we aren't vectorizing the loop, or if we've already collected the 5826 // instructions to scalarize, there's nothing to do. Collection may already 5827 // have occurred if we have a user-selected VF and are now computing the 5828 // expected cost for interleaving. 5829 if (VF.isScalar() || VF.isZero() || InstsToScalarize.contains(VF)) 5830 return; 5831 5832 // Initialize a mapping for VF in InstsToScalalarize. If we find that it's 5833 // not profitable to scalarize any instructions, the presence of VF in the 5834 // map will indicate that we've analyzed it already. 5835 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF]; 5836 5837 PredicatedBBsAfterVectorization[VF].clear(); 5838 5839 // Find all the instructions that are scalar with predication in the loop and 5840 // determine if it would be better to not if-convert the blocks they are in. 5841 // If so, we also record the instructions to scalarize. 5842 for (BasicBlock *BB : TheLoop->blocks()) { 5843 if (!blockNeedsPredicationForAnyReason(BB)) 5844 continue; 5845 for (Instruction &I : *BB) 5846 if (isScalarWithPredication(&I, VF)) { 5847 ScalarCostsTy ScalarCosts; 5848 // Do not apply discount if scalable, because that would lead to 5849 // invalid scalarization costs. 5850 // Do not apply discount logic if hacked cost is needed 5851 // for emulated masked memrefs. 5852 if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I, VF) && 5853 computePredInstDiscount(&I, ScalarCosts, VF) >= 0) 5854 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); 5855 // Remember that BB will remain after vectorization. 
5856 PredicatedBBsAfterVectorization[VF].insert(BB); 5857 } 5858 } 5859} 5860 5861InstructionCost LoopVectorizationCostModel::computePredInstDiscount( 5862 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) { 5863 assert(!isUniformAfterVectorization(PredInst, VF) && 5864 "Instruction marked uniform-after-vectorization will be predicated"); 5865 5866 // Initialize the discount to zero, meaning that the scalar version and the 5867 // vector version cost the same. 5868 InstructionCost Discount = 0; 5869 5870 // Holds instructions to analyze. The instructions we visit are mapped in 5871 // ScalarCosts. Those instructions are the ones that would be scalarized if 5872 // we find that the scalar version costs less. 5873 SmallVector<Instruction *, 8> Worklist; 5874 5875 // Returns true if the given instruction can be scalarized. 5876 auto canBeScalarized = [&](Instruction *I) -> bool { 5877 // We only attempt to scalarize instructions forming a single-use chain 5878 // from the original predicated block that would otherwise be vectorized. 5879 // Although not strictly necessary, we give up on instructions we know will 5880 // already be scalar to avoid traversing chains that are unlikely to be 5881 // beneficial. 5882 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || 5883 isScalarAfterVectorization(I, VF)) 5884 return false; 5885 5886 // If the instruction is scalar with predication, it will be analyzed 5887 // separately. We ignore it within the context of PredInst. 5888 if (isScalarWithPredication(I, VF)) 5889 return false; 5890 5891 // If any of the instruction's operands are uniform after vectorization, 5892 // the instruction cannot be scalarized. This prevents, for example, a 5893 // masked load from being scalarized. 5894 // 5895 // We assume we will only emit a value for lane zero of an instruction 5896 // marked uniform after vectorization, rather than VF identical values. 
5897 // Thus, if we scalarize an instruction that uses a uniform, we would 5898 // create uses of values corresponding to the lanes we aren't emitting code 5899 // for. This behavior can be changed by allowing getScalarValue to clone 5900 // the lane zero values for uniforms rather than asserting. 5901 for (Use &U : I->operands()) 5902 if (auto *J = dyn_cast<Instruction>(U.get())) 5903 if (isUniformAfterVectorization(J, VF)) 5904 return false; 5905 5906 // Otherwise, we can scalarize the instruction. 5907 return true; 5908 }; 5909 5910 // Compute the expected cost discount from scalarizing the entire expression 5911 // feeding the predicated instruction. We currently only consider expressions 5912 // that are single-use instruction chains. 5913 Worklist.push_back(PredInst); 5914 while (!Worklist.empty()) { 5915 Instruction *I = Worklist.pop_back_val(); 5916 5917 // If we've already analyzed the instruction, there's nothing to do. 5918 if (ScalarCosts.contains(I)) 5919 continue; 5920 5921 // Compute the cost of the vector instruction. Note that this cost already 5922 // includes the scalarization overhead of the predicated instruction. 5923 InstructionCost VectorCost = getInstructionCost(I, VF).first; 5924 5925 // Compute the cost of the scalarized instruction. This cost is the cost of 5926 // the instruction as if it wasn't if-converted and instead remained in the 5927 // predicated block. We will scale this cost by block probability after 5928 // computing the scalarization overhead. 5929 InstructionCost ScalarCost = 5930 VF.getFixedValue() * 5931 getInstructionCost(I, ElementCount::getFixed(1)).first; 5932 5933 // Compute the scalarization overhead of needed insertelement instructions 5934 // and phi nodes. 
5935 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 5936 if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) { 5937 ScalarCost += TTI.getScalarizationOverhead( 5938 cast<VectorType>(ToVectorTy(I->getType(), VF)), 5939 APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ true, 5940 /*Extract*/ false, CostKind); 5941 ScalarCost += 5942 VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind); 5943 } 5944 5945 // Compute the scalarization overhead of needed extractelement 5946 // instructions. For each of the instruction's operands, if the operand can 5947 // be scalarized, add it to the worklist; otherwise, account for the 5948 // overhead. 5949 for (Use &U : I->operands()) 5950 if (auto *J = dyn_cast<Instruction>(U.get())) { 5951 assert(VectorType::isValidElementType(J->getType()) && 5952 "Instruction has non-scalar type"); 5953 if (canBeScalarized(J)) 5954 Worklist.push_back(J); 5955 else if (needsExtract(J, VF)) { 5956 ScalarCost += TTI.getScalarizationOverhead( 5957 cast<VectorType>(ToVectorTy(J->getType(), VF)), 5958 APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ false, 5959 /*Extract*/ true, CostKind); 5960 } 5961 } 5962 5963 // Scale the total scalar cost by block probability. 5964 ScalarCost /= getReciprocalPredBlockProb(); 5965 5966 // Compute the discount. A non-negative discount means the vector version 5967 // of the instruction costs more, and scalarizing would be beneficial. 5968 Discount += VectorCost - ScalarCost; 5969 ScalarCosts[I] = ScalarCost; 5970 } 5971 5972 return Discount; 5973} 5974 5975LoopVectorizationCostModel::VectorizationCostTy 5976LoopVectorizationCostModel::expectedCost( 5977 ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) { 5978 VectorizationCostTy Cost; 5979 5980 // For each block. 5981 for (BasicBlock *BB : TheLoop->blocks()) { 5982 VectorizationCostTy BlockCost; 5983 5984 // For each instruction in the old loop. 
5985 for (Instruction &I : BB->instructionsWithoutDebug()) { 5986 // Skip ignored values. 5987 if (ValuesToIgnore.count(&I) || 5988 (VF.isVector() && VecValuesToIgnore.count(&I))) 5989 continue; 5990 5991 VectorizationCostTy C = getInstructionCost(&I, VF); 5992 5993 // Check if we should override the cost. 5994 if (C.first.isValid() && 5995 ForceTargetInstructionCost.getNumOccurrences() > 0) 5996 C.first = InstructionCost(ForceTargetInstructionCost); 5997 5998 // Keep a list of instructions with invalid costs. 5999 if (Invalid && !C.first.isValid()) 6000 Invalid->emplace_back(&I, VF); 6001 6002 BlockCost.first += C.first; 6003 BlockCost.second |= C.second; 6004 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 6005 << " for VF " << VF << " For instruction: " << I 6006 << '\n'); 6007 } 6008 6009 // If we are vectorizing a predicated block, it will have been 6010 // if-converted. This means that the block's instructions (aside from 6011 // stores and instructions that may divide by zero) will now be 6012 // unconditionally executed. For the scalar case, we may not always execute 6013 // the predicated block, if it is an if-else block. Thus, scale the block's 6014 // cost by the probability of executing it. blockNeedsPredication from 6015 // Legal is used so as to not include all blocks in tail folded loops. 6016 if (VF.isScalar() && Legal->blockNeedsPredication(BB)) 6017 BlockCost.first /= getReciprocalPredBlockProb(); 6018 6019 Cost.first += BlockCost.first; 6020 Cost.second |= BlockCost.second; 6021 } 6022 6023 return Cost; 6024} 6025 6026/// Gets Address Access SCEV after verifying that the access pattern 6027/// is loop invariant except the induction variable dependence. 6028/// 6029/// This SCEV can be sent to the Target in order to estimate the address 6030/// calculation cost. 
6031static const SCEV *getAddressAccessSCEV( 6032 Value *Ptr, 6033 LoopVectorizationLegality *Legal, 6034 PredicatedScalarEvolution &PSE, 6035 const Loop *TheLoop) { 6036 6037 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 6038 if (!Gep) 6039 return nullptr; 6040 6041 // We are looking for a gep with all loop invariant indices except for one 6042 // which should be an induction variable. 6043 auto SE = PSE.getSE(); 6044 unsigned NumOperands = Gep->getNumOperands(); 6045 for (unsigned i = 1; i < NumOperands; ++i) { 6046 Value *Opd = Gep->getOperand(i); 6047 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 6048 !Legal->isInductionVariable(Opd)) 6049 return nullptr; 6050 } 6051 6052 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 6053 return PSE.getSCEV(Ptr); 6054} 6055 6056InstructionCost 6057LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 6058 ElementCount VF) { 6059 assert(VF.isVector() && 6060 "Scalarization cost of instruction implies vectorization."); 6061 if (VF.isScalable()) 6062 return InstructionCost::getInvalid(); 6063 6064 Type *ValTy = getLoadStoreType(I); 6065 auto SE = PSE.getSE(); 6066 6067 unsigned AS = getLoadStoreAddressSpace(I); 6068 Value *Ptr = getLoadStorePointerOperand(I); 6069 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 6070 // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost` 6071 // that it is being called from this specific place. 6072 6073 // Figure out whether the access is strided and get the stride value 6074 // if it's known in compile time 6075 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 6076 6077 // Get the cost of the scalar memory instruction and address computation. 6078 InstructionCost Cost = 6079 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 6080 6081 // Don't pass *I here, since it is scalar but will actually be part of a 6082 // vectorized loop where the user of it is a vectorized instruction. 
6083 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6084 const Align Alignment = getLoadStoreAlignment(I); 6085 Cost += VF.getKnownMinValue() * TTI.getMemoryOpCost(I->getOpcode(), 6086 ValTy->getScalarType(), 6087 Alignment, AS, CostKind); 6088 6089 // Get the overhead of the extractelement and insertelement instructions 6090 // we might create due to scalarization. 6091 Cost += getScalarizationOverhead(I, VF, CostKind); 6092 6093 // If we have a predicated load/store, it will need extra i1 extracts and 6094 // conditional branches, but may not be executed for each vector lane. Scale 6095 // the cost by the probability of executing the predicated block. 6096 if (isPredicatedInst(I)) { 6097 Cost /= getReciprocalPredBlockProb(); 6098 6099 // Add the cost of an i1 extract and a branch 6100 auto *Vec_i1Ty = 6101 VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF); 6102 Cost += TTI.getScalarizationOverhead( 6103 Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()), 6104 /*Insert=*/false, /*Extract=*/true, CostKind); 6105 Cost += TTI.getCFInstrCost(Instruction::Br, CostKind); 6106 6107 if (useEmulatedMaskMemRefHack(I, VF)) 6108 // Artificially setting to a high enough value to practically disable 6109 // vectorization with such operations. 
6110 Cost = 3000000; 6111 } 6112 6113 return Cost; 6114} 6115 6116InstructionCost 6117LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 6118 ElementCount VF) { 6119 Type *ValTy = getLoadStoreType(I); 6120 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6121 Value *Ptr = getLoadStorePointerOperand(I); 6122 unsigned AS = getLoadStoreAddressSpace(I); 6123 int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr); 6124 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6125 6126 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6127 "Stride should be 1 or -1 for consecutive memory access"); 6128 const Align Alignment = getLoadStoreAlignment(I); 6129 InstructionCost Cost = 0; 6130 if (Legal->isMaskRequired(I)) { 6131 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6132 CostKind); 6133 } else { 6134 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0)); 6135 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, 6136 CostKind, OpInfo, I); 6137 } 6138 6139 bool Reverse = ConsecutiveStride < 0; 6140 if (Reverse) 6141 Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 6142 std::nullopt, CostKind, 0); 6143 return Cost; 6144} 6145 6146InstructionCost 6147LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 6148 ElementCount VF) { 6149 assert(Legal->isUniformMemOp(*I, VF)); 6150 6151 Type *ValTy = getLoadStoreType(I); 6152 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6153 const Align Alignment = getLoadStoreAlignment(I); 6154 unsigned AS = getLoadStoreAddressSpace(I); 6155 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6156 if (isa<LoadInst>(I)) { 6157 return TTI.getAddressComputationCost(ValTy) + 6158 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, 6159 CostKind) + 6160 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 6161 } 6162 StoreInst *SI = cast<StoreInst>(I); 6163 6164 bool 
isLoopInvariantStoreValue = Legal->isInvariant(SI->getValueOperand()); 6165 return TTI.getAddressComputationCost(ValTy) + 6166 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, 6167 CostKind) + 6168 (isLoopInvariantStoreValue 6169 ? 0 6170 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, 6171 CostKind, VF.getKnownMinValue() - 1)); 6172} 6173 6174InstructionCost 6175LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, 6176 ElementCount VF) { 6177 Type *ValTy = getLoadStoreType(I); 6178 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6179 const Align Alignment = getLoadStoreAlignment(I); 6180 const Value *Ptr = getLoadStorePointerOperand(I); 6181 6182 return TTI.getAddressComputationCost(VectorTy) + 6183 TTI.getGatherScatterOpCost( 6184 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment, 6185 TargetTransformInfo::TCK_RecipThroughput, I); 6186} 6187 6188InstructionCost 6189LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, 6190 ElementCount VF) { 6191 Type *ValTy = getLoadStoreType(I); 6192 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF)); 6193 unsigned AS = getLoadStoreAddressSpace(I); 6194 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6195 6196 auto Group = getInterleavedAccessGroup(I); 6197 assert(Group && "Fail to get an interleaved access group."); 6198 6199 unsigned InterleaveFactor = Group->getFactor(); 6200 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); 6201 6202 // Holds the indices of existing members in the interleaved group. 6203 SmallVector<unsigned, 4> Indices; 6204 for (unsigned IF = 0; IF < InterleaveFactor; IF++) 6205 if (Group->getMember(IF)) 6206 Indices.push_back(IF); 6207 6208 // Calculate the cost of the whole interleaved group. 
6209 bool UseMaskForGaps = 6210 (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) || 6211 (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor())); 6212 InstructionCost Cost = TTI.getInterleavedMemoryOpCost( 6213 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(), 6214 AS, CostKind, Legal->isMaskRequired(I), UseMaskForGaps); 6215 6216 if (Group->isReverse()) { 6217 // TODO: Add support for reversed masked interleaved access. 6218 assert(!Legal->isMaskRequired(I) && 6219 "Reverse masked interleaved access not supported."); 6220 Cost += Group->getNumMembers() * 6221 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 6222 std::nullopt, CostKind, 0); 6223 } 6224 return Cost; 6225} 6226 6227std::optional<InstructionCost> 6228LoopVectorizationCostModel::getReductionPatternCost( 6229 Instruction *I, ElementCount VF, Type *Ty, 6230 TTI::TargetCostKind CostKind) const { 6231 using namespace llvm::PatternMatch; 6232 // Early exit for no inloop reductions 6233 if (InLoopReductions.empty() || VF.isScalar() || !isa<VectorType>(Ty)) 6234 return std::nullopt; 6235 auto *VectorTy = cast<VectorType>(Ty); 6236 6237 // We are looking for a pattern of, and finding the minimal acceptable cost: 6238 // reduce(mul(ext(A), ext(B))) or 6239 // reduce(mul(A, B)) or 6240 // reduce(ext(A)) or 6241 // reduce(A). 6242 // The basic idea is that we walk down the tree to do that, finding the root 6243 // reduction instruction in InLoopReductionImmediateChains. From there we find 6244 // the pattern of mul/ext and test the cost of the entire pattern vs the cost 6245 // of the components. If the reduction cost is lower then we return it for the 6246 // reduction instruction and 0 for the other instructions in the pattern. If 6247 // it is not we return an invalid cost specifying the orignal cost method 6248 // should be used. 
6249 Instruction *RetI = I; 6250 if (match(RetI, m_ZExtOrSExt(m_Value()))) { 6251 if (!RetI->hasOneUser()) 6252 return std::nullopt; 6253 RetI = RetI->user_back(); 6254 } 6255 6256 if (match(RetI, m_OneUse(m_Mul(m_Value(), m_Value()))) && 6257 RetI->user_back()->getOpcode() == Instruction::Add) { 6258 RetI = RetI->user_back(); 6259 } 6260 6261 // Test if the found instruction is a reduction, and if not return an invalid 6262 // cost specifying the parent to use the original cost modelling. 6263 if (!InLoopReductionImmediateChains.count(RetI)) 6264 return std::nullopt; 6265 6266 // Find the reduction this chain is a part of and calculate the basic cost of 6267 // the reduction on its own. 6268 Instruction *LastChain = InLoopReductionImmediateChains.at(RetI); 6269 Instruction *ReductionPhi = LastChain; 6270 while (!isa<PHINode>(ReductionPhi)) 6271 ReductionPhi = InLoopReductionImmediateChains.at(ReductionPhi); 6272 6273 const RecurrenceDescriptor &RdxDesc = 6274 Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second; 6275 6276 InstructionCost BaseCost = TTI.getArithmeticReductionCost( 6277 RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind); 6278 6279 // For a call to the llvm.fmuladd intrinsic we need to add the cost of a 6280 // normal fmul instruction to the cost of the fadd reduction. 6281 if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd) 6282 BaseCost += 6283 TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind); 6284 6285 // If we're using ordered reductions then we can just return the base cost 6286 // here, since getArithmeticReductionCost calculates the full ordered 6287 // reduction cost when FP reassociation is not allowed. 6288 if (useOrderedReductions(RdxDesc)) 6289 return BaseCost; 6290 6291 // Get the operand that was not the reduction chain and match it to one of the 6292 // patterns, returning the better cost if it is found. 6293 Instruction *RedOp = RetI->getOperand(1) == LastChain 6294 ? 
dyn_cast<Instruction>(RetI->getOperand(0)) 6295 : dyn_cast<Instruction>(RetI->getOperand(1)); 6296 6297 VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy); 6298 6299 Instruction *Op0, *Op1; 6300 if (RedOp && RdxDesc.getOpcode() == Instruction::Add && 6301 match(RedOp, 6302 m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) && 6303 match(Op0, m_ZExtOrSExt(m_Value())) && 6304 Op0->getOpcode() == Op1->getOpcode() && 6305 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() && 6306 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) && 6307 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) { 6308 6309 // Matched reduce.add(ext(mul(ext(A), ext(B))) 6310 // Note that the extend opcodes need to all match, or if A==B they will have 6311 // been converted to zext(mul(sext(A), sext(A))) as it is known positive, 6312 // which is equally fine. 6313 bool IsUnsigned = isa<ZExtInst>(Op0); 6314 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy); 6315 auto *MulType = VectorType::get(Op0->getType(), VectorTy); 6316 6317 InstructionCost ExtCost = 6318 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType, 6319 TTI::CastContextHint::None, CostKind, Op0); 6320 InstructionCost MulCost = 6321 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind); 6322 InstructionCost Ext2Cost = 6323 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType, 6324 TTI::CastContextHint::None, CostKind, RedOp); 6325 6326 InstructionCost RedCost = TTI.getMulAccReductionCost( 6327 IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind); 6328 6329 if (RedCost.isValid() && 6330 RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost) 6331 return I == RetI ? 
RedCost : 0; 6332 } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) && 6333 !TheLoop->isLoopInvariant(RedOp)) { 6334 // Matched reduce(ext(A)) 6335 bool IsUnsigned = isa<ZExtInst>(RedOp); 6336 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy); 6337 InstructionCost RedCost = TTI.getExtendedReductionCost( 6338 RdxDesc.getOpcode(), IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, 6339 RdxDesc.getFastMathFlags(), CostKind); 6340 6341 InstructionCost ExtCost = 6342 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType, 6343 TTI::CastContextHint::None, CostKind, RedOp); 6344 if (RedCost.isValid() && RedCost < BaseCost + ExtCost) 6345 return I == RetI ? RedCost : 0; 6346 } else if (RedOp && RdxDesc.getOpcode() == Instruction::Add && 6347 match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) { 6348 if (match(Op0, m_ZExtOrSExt(m_Value())) && 6349 Op0->getOpcode() == Op1->getOpcode() && 6350 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) { 6351 bool IsUnsigned = isa<ZExtInst>(Op0); 6352 Type *Op0Ty = Op0->getOperand(0)->getType(); 6353 Type *Op1Ty = Op1->getOperand(0)->getType(); 6354 Type *LargestOpTy = 6355 Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty 6356 : Op0Ty; 6357 auto *ExtType = VectorType::get(LargestOpTy, VectorTy); 6358 6359 // Matched reduce.add(mul(ext(A), ext(B))), where the two ext may be of 6360 // different sizes. We take the largest type as the ext to reduce, and add 6361 // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))). 
6362 InstructionCost ExtCost0 = TTI.getCastInstrCost( 6363 Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy), 6364 TTI::CastContextHint::None, CostKind, Op0); 6365 InstructionCost ExtCost1 = TTI.getCastInstrCost( 6366 Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy), 6367 TTI::CastContextHint::None, CostKind, Op1); 6368 InstructionCost MulCost = 6369 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 6370 6371 InstructionCost RedCost = TTI.getMulAccReductionCost( 6372 IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind); 6373 InstructionCost ExtraExtCost = 0; 6374 if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) { 6375 Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1; 6376 ExtraExtCost = TTI.getCastInstrCost( 6377 ExtraExtOp->getOpcode(), ExtType, 6378 VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy), 6379 TTI::CastContextHint::None, CostKind, ExtraExtOp); 6380 } 6381 6382 if (RedCost.isValid() && 6383 (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost)) 6384 return I == RetI ? RedCost : 0; 6385 } else if (!match(I, m_ZExtOrSExt(m_Value()))) { 6386 // Matched reduce.add(mul()) 6387 InstructionCost MulCost = 6388 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 6389 6390 InstructionCost RedCost = TTI.getMulAccReductionCost( 6391 true, RdxDesc.getRecurrenceType(), VectorTy, CostKind); 6392 6393 if (RedCost.isValid() && RedCost < MulCost + BaseCost) 6394 return I == RetI ? RedCost : 0; 6395 } 6396 } 6397 6398 return I == RetI ? std::optional<InstructionCost>(BaseCost) : std::nullopt; 6399} 6400 6401InstructionCost 6402LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 6403 ElementCount VF) { 6404 // Calculate scalar cost only. Vectorization cost should be ready at this 6405 // moment. 
6406 if (VF.isScalar()) { 6407 Type *ValTy = getLoadStoreType(I); 6408 const Align Alignment = getLoadStoreAlignment(I); 6409 unsigned AS = getLoadStoreAddressSpace(I); 6410 6411 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0)); 6412 return TTI.getAddressComputationCost(ValTy) + 6413 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, 6414 TTI::TCK_RecipThroughput, OpInfo, I); 6415 } 6416 return getWideningCost(I, VF); 6417} 6418 6419LoopVectorizationCostModel::VectorizationCostTy 6420LoopVectorizationCostModel::getInstructionCost(Instruction *I, 6421 ElementCount VF) { 6422 // If we know that this instruction will remain uniform, check the cost of 6423 // the scalar version. 6424 if (isUniformAfterVectorization(I, VF)) 6425 VF = ElementCount::getFixed(1); 6426 6427 if (VF.isVector() && isProfitableToScalarize(I, VF)) 6428 return VectorizationCostTy(InstsToScalarize[VF][I], false); 6429 6430 // Forced scalars do not have any scalarization overhead. 6431 auto ForcedScalar = ForcedScalars.find(VF); 6432 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { 6433 auto InstSet = ForcedScalar->second; 6434 if (InstSet.count(I)) 6435 return VectorizationCostTy( 6436 (getInstructionCost(I, ElementCount::getFixed(1)).first * 6437 VF.getKnownMinValue()), 6438 false); 6439 } 6440 6441 Type *VectorTy; 6442 InstructionCost C = getInstructionCost(I, VF, VectorTy); 6443 6444 bool TypeNotScalarized = false; 6445 if (VF.isVector() && VectorTy->isVectorTy()) { 6446 if (unsigned NumParts = TTI.getNumberOfParts(VectorTy)) { 6447 if (VF.isScalable()) 6448 // <vscale x 1 x iN> is assumed to be profitable over iN because 6449 // scalable registers are a distinct register class from scalar ones. 6450 // If we ever find a target which wants to lower scalable vectors 6451 // back to scalars, we'll need to update this code to explicitly 6452 // ask TTI about the register class uses for each part. 
6453 TypeNotScalarized = NumParts <= VF.getKnownMinValue(); 6454 else 6455 TypeNotScalarized = NumParts < VF.getKnownMinValue(); 6456 } else 6457 C = InstructionCost::getInvalid(); 6458 } 6459 return VectorizationCostTy(C, TypeNotScalarized); 6460} 6461 6462InstructionCost LoopVectorizationCostModel::getScalarizationOverhead( 6463 Instruction *I, ElementCount VF, TTI::TargetCostKind CostKind) const { 6464 6465 // There is no mechanism yet to create a scalable scalarization loop, 6466 // so this is currently Invalid. 6467 if (VF.isScalable()) 6468 return InstructionCost::getInvalid(); 6469 6470 if (VF.isScalar()) 6471 return 0; 6472 6473 InstructionCost Cost = 0; 6474 Type *RetTy = ToVectorTy(I->getType(), VF); 6475 if (!RetTy->isVoidTy() && 6476 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 6477 Cost += TTI.getScalarizationOverhead( 6478 cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()), 6479 /*Insert*/ true, 6480 /*Extract*/ false, CostKind); 6481 6482 // Some targets keep addresses scalar. 6483 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 6484 return Cost; 6485 6486 // Some targets support efficient element stores. 6487 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 6488 return Cost; 6489 6490 // Collect operands to consider. 6491 CallInst *CI = dyn_cast<CallInst>(I); 6492 Instruction::op_range Ops = CI ? CI->args() : I->operands(); 6493 6494 // Skip operands that do not require extraction/scalarization and do not incur 6495 // any overhead. 
6496 SmallVector<Type *> Tys; 6497 for (auto *V : filterExtractingOperands(Ops, VF)) 6498 Tys.push_back(MaybeVectorizeType(V->getType(), VF)); 6499 return Cost + TTI.getOperandsScalarizationOverhead( 6500 filterExtractingOperands(Ops, VF), Tys, CostKind); 6501} 6502 6503void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { 6504 if (VF.isScalar()) 6505 return; 6506 NumPredStores = 0; 6507 for (BasicBlock *BB : TheLoop->blocks()) { 6508 // For each instruction in the old loop. 6509 for (Instruction &I : *BB) { 6510 Value *Ptr = getLoadStorePointerOperand(&I); 6511 if (!Ptr) 6512 continue; 6513 6514 // TODO: We should generate better code and update the cost model for 6515 // predicated uniform stores. Today they are treated as any other 6516 // predicated store (see added test cases in 6517 // invariant-store-vectorization.ll). 6518 if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF)) 6519 NumPredStores++; 6520 6521 if (Legal->isUniformMemOp(I, VF)) { 6522 auto isLegalToScalarize = [&]() { 6523 if (!VF.isScalable()) 6524 // Scalarization of fixed length vectors "just works". 6525 return true; 6526 6527 // We have dedicated lowering for unpredicated uniform loads and 6528 // stores. Note that even with tail folding we know that at least 6529 // one lane is active (i.e. generalized predication is not possible 6530 // here), and the logic below depends on this fact. 6531 if (!foldTailByMasking()) 6532 return true; 6533 6534 // For scalable vectors, a uniform memop load is always 6535 // uniform-by-parts and we know how to scalarize that. 6536 if (isa<LoadInst>(I)) 6537 return true; 6538 6539 // A uniform store isn't neccessarily uniform-by-part 6540 // and we can't assume scalarization. 6541 auto &SI = cast<StoreInst>(I); 6542 return TheLoop->isLoopInvariant(SI.getValueOperand()); 6543 }; 6544 6545 const InstructionCost GatherScatterCost = 6546 isLegalGatherOrScatter(&I, VF) ? 
6547 getGatherScatterCost(&I, VF) : InstructionCost::getInvalid(); 6548 6549 // Load: Scalar load + broadcast 6550 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract 6551 // FIXME: This cost is a significant under-estimate for tail folded 6552 // memory ops. 6553 const InstructionCost ScalarizationCost = isLegalToScalarize() ? 6554 getUniformMemOpCost(&I, VF) : InstructionCost::getInvalid(); 6555 6556 // Choose better solution for the current VF, Note that Invalid 6557 // costs compare as maximumal large. If both are invalid, we get 6558 // scalable invalid which signals a failure and a vectorization abort. 6559 if (GatherScatterCost < ScalarizationCost) 6560 setWideningDecision(&I, VF, CM_GatherScatter, GatherScatterCost); 6561 else 6562 setWideningDecision(&I, VF, CM_Scalarize, ScalarizationCost); 6563 continue; 6564 } 6565 6566 // We assume that widening is the best solution when possible. 6567 if (memoryInstructionCanBeWidened(&I, VF)) { 6568 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF); 6569 int ConsecutiveStride = Legal->isConsecutivePtr( 6570 getLoadStoreType(&I), getLoadStorePointerOperand(&I)); 6571 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6572 "Expected consecutive stride."); 6573 InstWidening Decision = 6574 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 6575 setWideningDecision(&I, VF, Decision, Cost); 6576 continue; 6577 } 6578 6579 // Choose between Interleaving, Gather/Scatter or Scalarization. 6580 InstructionCost InterleaveCost = InstructionCost::getInvalid(); 6581 unsigned NumAccesses = 1; 6582 if (isAccessInterleaved(&I)) { 6583 auto Group = getInterleavedAccessGroup(&I); 6584 assert(Group && "Fail to get an interleaved access group."); 6585 6586 // Make one decision for the whole group. 
6587 if (getWideningDecision(&I, VF) != CM_Unknown) 6588 continue; 6589 6590 NumAccesses = Group->getNumMembers(); 6591 if (interleavedAccessCanBeWidened(&I, VF)) 6592 InterleaveCost = getInterleaveGroupCost(&I, VF); 6593 } 6594 6595 InstructionCost GatherScatterCost = 6596 isLegalGatherOrScatter(&I, VF) 6597 ? getGatherScatterCost(&I, VF) * NumAccesses 6598 : InstructionCost::getInvalid(); 6599 6600 InstructionCost ScalarizationCost = 6601 getMemInstScalarizationCost(&I, VF) * NumAccesses; 6602 6603 // Choose better solution for the current VF, 6604 // write down this decision and use it during vectorization. 6605 InstructionCost Cost; 6606 InstWidening Decision; 6607 if (InterleaveCost <= GatherScatterCost && 6608 InterleaveCost < ScalarizationCost) { 6609 Decision = CM_Interleave; 6610 Cost = InterleaveCost; 6611 } else if (GatherScatterCost < ScalarizationCost) { 6612 Decision = CM_GatherScatter; 6613 Cost = GatherScatterCost; 6614 } else { 6615 Decision = CM_Scalarize; 6616 Cost = ScalarizationCost; 6617 } 6618 // If the instructions belongs to an interleave group, the whole group 6619 // receives the same decision. The whole group receives the cost, but 6620 // the cost will actually be assigned to one instruction. 6621 if (auto Group = getInterleavedAccessGroup(&I)) 6622 setWideningDecision(Group, VF, Decision, Cost); 6623 else 6624 setWideningDecision(&I, VF, Decision, Cost); 6625 } 6626 } 6627 6628 // Make sure that any load of address and any other address computation 6629 // remains scalar unless there is gather/scatter support. This avoids 6630 // inevitable extracts into address registers, and also has the benefit of 6631 // activating LSR more, since that pass can't optimize vectorized 6632 // addresses. 6633 if (TTI.prefersVectorizedAddressing()) 6634 return; 6635 6636 // Start with all scalar pointer uses. 
6637 SmallPtrSet<Instruction *, 8> AddrDefs; 6638 for (BasicBlock *BB : TheLoop->blocks()) 6639 for (Instruction &I : *BB) { 6640 Instruction *PtrDef = 6641 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 6642 if (PtrDef && TheLoop->contains(PtrDef) && 6643 getWideningDecision(&I, VF) != CM_GatherScatter) 6644 AddrDefs.insert(PtrDef); 6645 } 6646 6647 // Add all instructions used to generate the addresses. 6648 SmallVector<Instruction *, 4> Worklist; 6649 append_range(Worklist, AddrDefs); 6650 while (!Worklist.empty()) { 6651 Instruction *I = Worklist.pop_back_val(); 6652 for (auto &Op : I->operands()) 6653 if (auto *InstOp = dyn_cast<Instruction>(Op)) 6654 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) && 6655 AddrDefs.insert(InstOp).second) 6656 Worklist.push_back(InstOp); 6657 } 6658 6659 for (auto *I : AddrDefs) { 6660 if (isa<LoadInst>(I)) { 6661 // Setting the desired widening decision should ideally be handled in 6662 // by cost functions, but since this involves the task of finding out 6663 // if the loaded register is involved in an address computation, it is 6664 // instead changed here when we know this is the case. 6665 InstWidening Decision = getWideningDecision(I, VF); 6666 if (Decision == CM_Widen || Decision == CM_Widen_Reverse) 6667 // Scalarize a widened load of address. 6668 setWideningDecision( 6669 I, VF, CM_Scalarize, 6670 (VF.getKnownMinValue() * 6671 getMemoryInstructionCost(I, ElementCount::getFixed(1)))); 6672 else if (auto Group = getInterleavedAccessGroup(I)) { 6673 // Scalarize an interleave group of address loads. 6674 for (unsigned I = 0; I < Group->getFactor(); ++I) { 6675 if (Instruction *Member = Group->getMember(I)) 6676 setWideningDecision( 6677 Member, VF, CM_Scalarize, 6678 (VF.getKnownMinValue() * 6679 getMemoryInstructionCost(Member, ElementCount::getFixed(1)))); 6680 } 6681 } 6682 } else 6683 // Make sure I gets scalarized and a cost estimate without 6684 // scalarization overhead. 
6685 ForcedScalars[VF].insert(I); 6686 } 6687} 6688 6689void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) { 6690 assert(!VF.isScalar() && 6691 "Trying to set a vectorization decision for a scalar VF"); 6692 6693 for (BasicBlock *BB : TheLoop->blocks()) { 6694 // For each instruction in the old loop. 6695 for (Instruction &I : *BB) { 6696 CallInst *CI = dyn_cast<CallInst>(&I); 6697 6698 if (!CI) 6699 continue; 6700 6701 InstructionCost ScalarCost = InstructionCost::getInvalid(); 6702 InstructionCost VectorCost = InstructionCost::getInvalid(); 6703 InstructionCost IntrinsicCost = InstructionCost::getInvalid(); 6704 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6705 6706 Function *ScalarFunc = CI->getCalledFunction(); 6707 Type *ScalarRetTy = CI->getType(); 6708 SmallVector<Type *, 4> Tys, ScalarTys; 6709 bool MaskRequired = Legal->isMaskRequired(CI); 6710 for (auto &ArgOp : CI->args()) 6711 ScalarTys.push_back(ArgOp->getType()); 6712 6713 // Compute corresponding vector type for return value and arguments. 6714 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 6715 for (Type *ScalarTy : ScalarTys) 6716 Tys.push_back(ToVectorTy(ScalarTy, VF)); 6717 6718 // An in-loop reduction using an fmuladd intrinsic is a special case; 6719 // we don't want the normal cost for that intrinsic. 6720 if (RecurrenceDescriptor::isFMulAddIntrinsic(CI)) 6721 if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind)) { 6722 setCallWideningDecision(CI, VF, CM_IntrinsicCall, nullptr, 6723 getVectorIntrinsicIDForCall(CI, TLI), 6724 std::nullopt, *RedCost); 6725 continue; 6726 } 6727 6728 // Estimate cost of scalarized vector call. The source operands are 6729 // assumed to be vectors, so we need to extract individual elements from 6730 // there, execute VF scalar calls, and then gather the result into the 6731 // vector return value. 
6732 InstructionCost ScalarCallCost = 6733 TTI.getCallInstrCost(ScalarFunc, ScalarRetTy, ScalarTys, CostKind); 6734 6735 // Compute costs of unpacking argument values for the scalar calls and 6736 // packing the return values to a vector. 6737 InstructionCost ScalarizationCost = 6738 getScalarizationOverhead(CI, VF, CostKind); 6739 6740 ScalarCost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; 6741 6742 // Find the cost of vectorizing the call, if we can find a suitable 6743 // vector variant of the function. 6744 bool UsesMask = false; 6745 VFInfo FuncInfo; 6746 Function *VecFunc = nullptr; 6747 // Search through any available variants for one we can use at this VF. 6748 for (VFInfo &Info : VFDatabase::getMappings(*CI)) { 6749 // Must match requested VF. 6750 if (Info.Shape.VF != VF) 6751 continue; 6752 6753 // Must take a mask argument if one is required 6754 if (MaskRequired && !Info.isMasked()) 6755 continue; 6756 6757 // Check that all parameter kinds are supported 6758 bool ParamsOk = true; 6759 for (VFParameter Param : Info.Shape.Parameters) { 6760 switch (Param.ParamKind) { 6761 case VFParamKind::Vector: 6762 break; 6763 case VFParamKind::OMP_Uniform: { 6764 Value *ScalarParam = CI->getArgOperand(Param.ParamPos); 6765 // Make sure the scalar parameter in the loop is invariant. 6766 if (!PSE.getSE()->isLoopInvariant(PSE.getSCEV(ScalarParam), 6767 TheLoop)) 6768 ParamsOk = false; 6769 break; 6770 } 6771 case VFParamKind::OMP_Linear: { 6772 Value *ScalarParam = CI->getArgOperand(Param.ParamPos); 6773 // Find the stride for the scalar parameter in this loop and see if 6774 // it matches the stride for the variant. 6775 // TODO: do we need to figure out the cost of an extract to get the 6776 // first lane? Or do we hope that it will be folded away? 
6777 ScalarEvolution *SE = PSE.getSE(); 6778 const auto *SAR = 6779 dyn_cast<SCEVAddRecExpr>(SE->getSCEV(ScalarParam)); 6780 6781 if (!SAR || SAR->getLoop() != TheLoop) { 6782 ParamsOk = false; 6783 break; 6784 } 6785 6786 const SCEVConstant *Step = 6787 dyn_cast<SCEVConstant>(SAR->getStepRecurrence(*SE)); 6788 6789 if (!Step || 6790 Step->getAPInt().getSExtValue() != Param.LinearStepOrPos) 6791 ParamsOk = false; 6792 6793 break; 6794 } 6795 case VFParamKind::GlobalPredicate: 6796 UsesMask = true; 6797 break; 6798 default: 6799 ParamsOk = false; 6800 break; 6801 } 6802 } 6803 6804 if (!ParamsOk) 6805 continue; 6806 6807 // Found a suitable candidate, stop here. 6808 VecFunc = CI->getModule()->getFunction(Info.VectorName); 6809 FuncInfo = Info; 6810 break; 6811 } 6812 6813 // Add in the cost of synthesizing a mask if one wasn't required. 6814 InstructionCost MaskCost = 0; 6815 if (VecFunc && UsesMask && !MaskRequired) 6816 MaskCost = TTI.getShuffleCost( 6817 TargetTransformInfo::SK_Broadcast, 6818 VectorType::get(IntegerType::getInt1Ty( 6819 VecFunc->getFunctionType()->getContext()), 6820 VF)); 6821 6822 if (TLI && VecFunc && !CI->isNoBuiltin()) 6823 VectorCost = 6824 TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind) + MaskCost; 6825 6826 // Find the cost of an intrinsic; some targets may have instructions that 6827 // perform the operation without needing an actual call. 
6828 Intrinsic::ID IID = getVectorIntrinsicIDForCall(CI, TLI); 6829 if (IID != Intrinsic::not_intrinsic) 6830 IntrinsicCost = getVectorIntrinsicCost(CI, VF); 6831 6832 InstructionCost Cost = ScalarCost; 6833 InstWidening Decision = CM_Scalarize; 6834 6835 if (VectorCost <= Cost) { 6836 Cost = VectorCost; 6837 Decision = CM_VectorCall; 6838 } 6839 6840 if (IntrinsicCost <= Cost) { 6841 Cost = IntrinsicCost; 6842 Decision = CM_IntrinsicCall; 6843 } 6844 6845 setCallWideningDecision(CI, VF, Decision, VecFunc, IID, 6846 FuncInfo.getParamIndexForOptionalMask(), Cost); 6847 } 6848 } 6849} 6850 6851InstructionCost 6852LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, 6853 Type *&VectorTy) { 6854 Type *RetTy = I->getType(); 6855 if (canTruncateToMinimalBitwidth(I, VF)) 6856 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); 6857 auto SE = PSE.getSE(); 6858 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 6859 6860 auto hasSingleCopyAfterVectorization = [this](Instruction *I, 6861 ElementCount VF) -> bool { 6862 if (VF.isScalar()) 6863 return true; 6864 6865 auto Scalarized = InstsToScalarize.find(VF); 6866 assert(Scalarized != InstsToScalarize.end() && 6867 "VF not yet analyzed for scalarization profitability"); 6868 return !Scalarized->second.count(I) && 6869 llvm::all_of(I->users(), [&](User *U) { 6870 auto *UI = cast<Instruction>(U); 6871 return !Scalarized->second.count(UI); 6872 }); 6873 }; 6874 (void) hasSingleCopyAfterVectorization; 6875 6876 if (isScalarAfterVectorization(I, VF)) { 6877 // With the exception of GEPs and PHIs, after scalarization there should 6878 // only be one copy of the instruction generated in the loop. This is 6879 // because the VF is either 1, or any instructions that need scalarizing 6880 // have already been dealt with by the time we get here. As a result, 6881 // it means we don't have to multiply the instruction cost by VF. 
6882 assert(I->getOpcode() == Instruction::GetElementPtr || 6883 I->getOpcode() == Instruction::PHI || 6884 (I->getOpcode() == Instruction::BitCast && 6885 I->getType()->isPointerTy()) || 6886 hasSingleCopyAfterVectorization(I, VF)); 6887 VectorTy = RetTy; 6888 } else 6889 VectorTy = ToVectorTy(RetTy, VF); 6890 6891 // TODO: We need to estimate the cost of intrinsic calls. 6892 switch (I->getOpcode()) { 6893 case Instruction::GetElementPtr: 6894 // We mark this instruction as zero-cost because the cost of GEPs in 6895 // vectorized code depends on whether the corresponding memory instruction 6896 // is scalarized or not. Therefore, we handle GEPs with the memory 6897 // instruction cost. 6898 return 0; 6899 case Instruction::Br: { 6900 // In cases of scalarized and predicated instructions, there will be VF 6901 // predicated blocks in the vectorized loop. Each branch around these 6902 // blocks requires also an extract of its vector compare i1 element. 6903 bool ScalarPredicatedBB = false; 6904 BranchInst *BI = cast<BranchInst>(I); 6905 if (VF.isVector() && BI->isConditional() && 6906 (PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(0)) || 6907 PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1)))) 6908 ScalarPredicatedBB = true; 6909 6910 if (ScalarPredicatedBB) { 6911 // Not possible to scalarize scalable vector with predicated instructions. 6912 if (VF.isScalable()) 6913 return InstructionCost::getInvalid(); 6914 // Return cost for branches around scalarized and predicated blocks. 
6915 auto *Vec_i1Ty = 6916 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); 6917 return ( 6918 TTI.getScalarizationOverhead( 6919 Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()), 6920 /*Insert*/ false, /*Extract*/ true, CostKind) + 6921 (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue())); 6922 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar()) 6923 // The back-edge branch will remain, as will all scalar branches. 6924 return TTI.getCFInstrCost(Instruction::Br, CostKind); 6925 else 6926 // This branch will be eliminated by if-conversion. 6927 return 0; 6928 // Note: We currently assume zero cost for an unconditional branch inside 6929 // a predicated block since it will become a fall-through, although we 6930 // may decide in the future to call TTI for all branches. 6931 } 6932 case Instruction::PHI: { 6933 auto *Phi = cast<PHINode>(I); 6934 6935 // First-order recurrences are replaced by vector shuffles inside the loop. 6936 if (VF.isVector() && Legal->isFixedOrderRecurrence(Phi)) { 6937 SmallVector<int> Mask(VF.getKnownMinValue()); 6938 std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1); 6939 return TTI.getShuffleCost(TargetTransformInfo::SK_Splice, 6940 cast<VectorType>(VectorTy), Mask, CostKind, 6941 VF.getKnownMinValue() - 1); 6942 } 6943 6944 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 6945 // converted into select instructions. We require N - 1 selects per phi 6946 // node, where N is the number of incoming values. 
6947 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) 6948 return (Phi->getNumIncomingValues() - 1) * 6949 TTI.getCmpSelInstrCost( 6950 Instruction::Select, ToVectorTy(Phi->getType(), VF), 6951 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), 6952 CmpInst::BAD_ICMP_PREDICATE, CostKind); 6953 6954 return TTI.getCFInstrCost(Instruction::PHI, CostKind); 6955 } 6956 case Instruction::UDiv: 6957 case Instruction::SDiv: 6958 case Instruction::URem: 6959 case Instruction::SRem: 6960 if (VF.isVector() && isPredicatedInst(I)) { 6961 const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF); 6962 return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost) ? 6963 ScalarCost : SafeDivisorCost; 6964 } 6965 // We've proven all lanes safe to speculate, fall through. 6966 [[fallthrough]]; 6967 case Instruction::Add: 6968 case Instruction::FAdd: 6969 case Instruction::Sub: 6970 case Instruction::FSub: 6971 case Instruction::Mul: 6972 case Instruction::FMul: 6973 case Instruction::FDiv: 6974 case Instruction::FRem: 6975 case Instruction::Shl: 6976 case Instruction::LShr: 6977 case Instruction::AShr: 6978 case Instruction::And: 6979 case Instruction::Or: 6980 case Instruction::Xor: { 6981 // If we're speculating on the stride being 1, the multiplication may 6982 // fold away. We can generalize this for all operations using the notion 6983 // of neutral elements. (TODO) 6984 if (I->getOpcode() == Instruction::Mul && 6985 (PSE.getSCEV(I->getOperand(0))->isOne() || 6986 PSE.getSCEV(I->getOperand(1))->isOne())) 6987 return 0; 6988 6989 // Detect reduction patterns 6990 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 6991 return *RedCost; 6992 6993 // Certain instructions can be cheaper to vectorize if they have a constant 6994 // second vector operand. One example of this are shifts on x86. 
6995 Value *Op2 = I->getOperand(1); 6996 auto Op2Info = TTI.getOperandInfo(Op2); 6997 if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue && 6998 Legal->isInvariant(Op2)) 6999 Op2Info.Kind = TargetTransformInfo::OK_UniformValue; 7000 7001 SmallVector<const Value *, 4> Operands(I->operand_values()); 7002 auto InstrCost = TTI.getArithmeticInstrCost( 7003 I->getOpcode(), VectorTy, CostKind, 7004 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, 7005 Op2Info, Operands, I); 7006 7007 // Some targets can replace frem with vector library calls. 7008 InstructionCost VecCallCost = InstructionCost::getInvalid(); 7009 if (I->getOpcode() == Instruction::FRem) { 7010 LibFunc Func; 7011 if (TLI->getLibFunc(I->getOpcode(), I->getType(), Func) && 7012 TLI->isFunctionVectorizable(TLI->getName(Func), VF)) { 7013 SmallVector<Type *, 4> OpTypes; 7014 for (auto &Op : I->operands()) 7015 OpTypes.push_back(Op->getType()); 7016 VecCallCost = 7017 TTI.getCallInstrCost(nullptr, VectorTy, OpTypes, CostKind); 7018 } 7019 } 7020 return std::min(InstrCost, VecCallCost); 7021 } 7022 case Instruction::FNeg: { 7023 return TTI.getArithmeticInstrCost( 7024 I->getOpcode(), VectorTy, CostKind, 7025 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, 7026 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, 7027 I->getOperand(0), I); 7028 } 7029 case Instruction::Select: { 7030 SelectInst *SI = cast<SelectInst>(I); 7031 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 7032 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 7033 7034 const Value *Op0, *Op1; 7035 using namespace llvm::PatternMatch; 7036 if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) || 7037 match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) { 7038 // select x, y, false --> x & y 7039 // select x, true, y --> x | y 7040 const auto [Op1VK, Op1VP] = TTI::getOperandInfo(Op0); 7041 const auto [Op2VK, Op2VP] = TTI::getOperandInfo(Op1); 7042 
assert(Op0->getType()->getScalarSizeInBits() == 1 && 7043 Op1->getType()->getScalarSizeInBits() == 1); 7044 7045 SmallVector<const Value *, 2> Operands{Op0, Op1}; 7046 return TTI.getArithmeticInstrCost( 7047 match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And, VectorTy, 7048 CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, I); 7049 } 7050 7051 Type *CondTy = SI->getCondition()->getType(); 7052 if (!ScalarCond) 7053 CondTy = VectorType::get(CondTy, VF); 7054 7055 CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE; 7056 if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition())) 7057 Pred = Cmp->getPredicate(); 7058 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred, 7059 CostKind, I); 7060 } 7061 case Instruction::ICmp: 7062 case Instruction::FCmp: { 7063 Type *ValTy = I->getOperand(0)->getType(); 7064 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 7065 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 7066 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 7067 VectorTy = ToVectorTy(ValTy, VF); 7068 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, 7069 cast<CmpInst>(I)->getPredicate(), CostKind, 7070 I); 7071 } 7072 case Instruction::Store: 7073 case Instruction::Load: { 7074 ElementCount Width = VF; 7075 if (Width.isVector()) { 7076 InstWidening Decision = getWideningDecision(I, Width); 7077 assert(Decision != CM_Unknown && 7078 "CM decision should be taken at this point"); 7079 if (getWideningCost(I, VF) == InstructionCost::getInvalid()) 7080 return InstructionCost::getInvalid(); 7081 if (Decision == CM_Scalarize) 7082 Width = ElementCount::getFixed(1); 7083 } 7084 VectorTy = ToVectorTy(getLoadStoreType(I), Width); 7085 return getMemoryInstructionCost(I, VF); 7086 } 7087 case Instruction::BitCast: 7088 if (I->getType()->isPointerTy()) 7089 return 0; 7090 [[fallthrough]]; 7091 case Instruction::ZExt: 7092 case Instruction::SExt: 7093 case Instruction::FPToUI: 7094 
case Instruction::FPToSI: 7095 case Instruction::FPExt: 7096 case Instruction::PtrToInt: 7097 case Instruction::IntToPtr: 7098 case Instruction::SIToFP: 7099 case Instruction::UIToFP: 7100 case Instruction::Trunc: 7101 case Instruction::FPTrunc: { 7102 // Computes the CastContextHint from a Load/Store instruction. 7103 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { 7104 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 7105 "Expected a load or a store!"); 7106 7107 if (VF.isScalar() || !TheLoop->contains(I)) 7108 return TTI::CastContextHint::Normal; 7109 7110 switch (getWideningDecision(I, VF)) { 7111 case LoopVectorizationCostModel::CM_GatherScatter: 7112 return TTI::CastContextHint::GatherScatter; 7113 case LoopVectorizationCostModel::CM_Interleave: 7114 return TTI::CastContextHint::Interleave; 7115 case LoopVectorizationCostModel::CM_Scalarize: 7116 case LoopVectorizationCostModel::CM_Widen: 7117 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked 7118 : TTI::CastContextHint::Normal; 7119 case LoopVectorizationCostModel::CM_Widen_Reverse: 7120 return TTI::CastContextHint::Reversed; 7121 case LoopVectorizationCostModel::CM_Unknown: 7122 llvm_unreachable("Instr did not go through cost modelling?"); 7123 case LoopVectorizationCostModel::CM_VectorCall: 7124 case LoopVectorizationCostModel::CM_IntrinsicCall: 7125 llvm_unreachable_internal("Instr has invalid widening decision"); 7126 } 7127 7128 llvm_unreachable("Unhandled case!"); 7129 }; 7130 7131 unsigned Opcode = I->getOpcode(); 7132 TTI::CastContextHint CCH = TTI::CastContextHint::None; 7133 // For Trunc, the context is the only user, which must be a StoreInst. 7134 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { 7135 if (I->hasOneUse()) 7136 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin())) 7137 CCH = ComputeCCH(Store); 7138 } 7139 // For Z/Sext, the context is the operand, which must be a LoadInst. 
7140 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || 7141 Opcode == Instruction::FPExt) { 7142 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0))) 7143 CCH = ComputeCCH(Load); 7144 } 7145 7146 // We optimize the truncation of induction variables having constant 7147 // integer steps. The cost of these truncations is the same as the scalar 7148 // operation. 7149 if (isOptimizableIVTruncate(I, VF)) { 7150 auto *Trunc = cast<TruncInst>(I); 7151 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 7152 Trunc->getSrcTy(), CCH, CostKind, Trunc); 7153 } 7154 7155 // Detect reduction patterns 7156 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) 7157 return *RedCost; 7158 7159 Type *SrcScalarTy = I->getOperand(0)->getType(); 7160 Type *SrcVecTy = 7161 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 7162 if (canTruncateToMinimalBitwidth(I, VF)) { 7163 // This cast is going to be shrunk. This may remove the cast or it might 7164 // turn it into slightly different cast. For example, if MinBW == 16, 7165 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 7166 // 7167 // Calculate the modified src and dest types. 7168 Type *MinVecTy = VectorTy; 7169 if (Opcode == Instruction::Trunc) { 7170 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 7171 VectorTy = 7172 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7173 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { 7174 // Leave SrcVecTy unchanged - we only shrink the destination element 7175 // type. 
7176 VectorTy = 7177 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 7178 } 7179 } 7180 7181 return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); 7182 } 7183 case Instruction::Call: 7184 return getVectorCallCost(cast<CallInst>(I), VF); 7185 case Instruction::ExtractValue: 7186 return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput); 7187 case Instruction::Alloca: 7188 // We cannot easily widen alloca to a scalable alloca, as 7189 // the result would need to be a vector of pointers. 7190 if (VF.isScalable()) 7191 return InstructionCost::getInvalid(); 7192 [[fallthrough]]; 7193 default: 7194 // This opcode is unknown. Assume that it is the same as 'mul'. 7195 return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); 7196 } // end of switch. 7197} 7198 7199void LoopVectorizationCostModel::collectValuesToIgnore() { 7200 // Ignore ephemeral values. 7201 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 7202 7203 // Find all stores to invariant variables. Since they are going to sink 7204 // outside the loop we do not need calculate cost for them. 7205 for (BasicBlock *BB : TheLoop->blocks()) 7206 for (Instruction &I : *BB) { 7207 StoreInst *SI; 7208 if ((SI = dyn_cast<StoreInst>(&I)) && 7209 Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) 7210 ValuesToIgnore.insert(&I); 7211 } 7212 7213 // Ignore type-promoting instructions we identified during reduction 7214 // detection. 7215 for (const auto &Reduction : Legal->getReductionVars()) { 7216 const RecurrenceDescriptor &RedDes = Reduction.second; 7217 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 7218 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7219 } 7220 // Ignore type-casting instructions we identified during induction 7221 // detection. 
7222 for (const auto &Induction : Legal->getInductionVars()) { 7223 const InductionDescriptor &IndDes = Induction.second; 7224 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 7225 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 7226 } 7227} 7228 7229void LoopVectorizationCostModel::collectInLoopReductions() { 7230 for (const auto &Reduction : Legal->getReductionVars()) { 7231 PHINode *Phi = Reduction.first; 7232 const RecurrenceDescriptor &RdxDesc = Reduction.second; 7233 7234 // We don't collect reductions that are type promoted (yet). 7235 if (RdxDesc.getRecurrenceType() != Phi->getType()) 7236 continue; 7237 7238 // If the target would prefer this reduction to happen "in-loop", then we 7239 // want to record it as such. 7240 unsigned Opcode = RdxDesc.getOpcode(); 7241 if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) && 7242 !TTI.preferInLoopReduction(Opcode, Phi->getType(), 7243 TargetTransformInfo::ReductionFlags())) 7244 continue; 7245 7246 // Check that we can correctly put the reductions into the loop, by 7247 // finding the chain of operations that leads from the phi to the loop 7248 // exit value. 7249 SmallVector<Instruction *, 4> ReductionOperations = 7250 RdxDesc.getReductionOpChain(Phi, TheLoop); 7251 bool InLoop = !ReductionOperations.empty(); 7252 7253 if (InLoop) { 7254 InLoopReductions.insert(Phi); 7255 // Add the elements to InLoopReductionImmediateChains for cost modelling. 7256 Instruction *LastChain = Phi; 7257 for (auto *I : ReductionOperations) { 7258 InLoopReductionImmediateChains[I] = LastChain; 7259 LastChain = I; 7260 } 7261 } 7262 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? 
"inloop" : "out of loop") 7263 << " reduction for phi: " << *Phi << "\n"); 7264 } 7265} 7266 7267VPValue *VPBuilder::createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B, 7268 DebugLoc DL, const Twine &Name) { 7269 assert(Pred >= CmpInst::FIRST_ICMP_PREDICATE && 7270 Pred <= CmpInst::LAST_ICMP_PREDICATE && "invalid predicate"); 7271 return tryInsertInstruction( 7272 new VPInstruction(Instruction::ICmp, Pred, A, B, DL, Name)); 7273} 7274 7275// This function will select a scalable VF if the target supports scalable 7276// vectors and a fixed one otherwise. 7277// TODO: we could return a pair of values that specify the max VF and 7278// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of 7279// `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment 7280// doesn't have a cost model that can choose which plan to execute if 7281// more than one is generated. 7282static ElementCount determineVPlanVF(const TargetTransformInfo &TTI, 7283 LoopVectorizationCostModel &CM) { 7284 unsigned WidestType; 7285 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes(); 7286 7287 TargetTransformInfo::RegisterKind RegKind = 7288 TTI.enableScalableVectorization() 7289 ? TargetTransformInfo::RGK_ScalableVector 7290 : TargetTransformInfo::RGK_FixedWidthVector; 7291 7292 TypeSize RegSize = TTI.getRegisterBitWidth(RegKind); 7293 unsigned N = RegSize.getKnownMinValue() / WidestType; 7294 return ElementCount::get(N, RegSize.isScalable()); 7295} 7296 7297VectorizationFactor 7298LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { 7299 ElementCount VF = UserVF; 7300 // Outer loop handling: They may require CFG and instruction level 7301 // transformations before even evaluating whether vectorization is profitable. 7302 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 7303 // the vectorization pipeline. 
7304 if (!OrigLoop->isInnermost()) { 7305 // If the user doesn't provide a vectorization factor, determine a 7306 // reasonable one. 7307 if (UserVF.isZero()) { 7308 VF = determineVPlanVF(TTI, CM); 7309 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); 7310 7311 // Make sure we have a VF > 1 for stress testing. 7312 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) { 7313 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " 7314 << "overriding computed VF.\n"); 7315 VF = ElementCount::getFixed(4); 7316 } 7317 } else if (UserVF.isScalable() && !TTI.supportsScalableVectors() && 7318 !ForceTargetSupportsScalableVectors) { 7319 LLVM_DEBUG(dbgs() << "LV: Not vectorizing. Scalable VF requested, but " 7320 << "not supported by the target.\n"); 7321 reportVectorizationFailure( 7322 "Scalable vectorization requested but not supported by the target", 7323 "the scalable user-specified vectorization width for outer-loop " 7324 "vectorization cannot be used because the target does not support " 7325 "scalable vectors.", 7326 "ScalableVFUnfeasible", ORE, OrigLoop); 7327 return VectorizationFactor::Disabled(); 7328 } 7329 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 7330 assert(isPowerOf2_32(VF.getKnownMinValue()) && 7331 "VF needs to be a power of two"); 7332 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "") 7333 << "VF " << VF << " to build VPlans.\n"); 7334 buildVPlans(VF, VF); 7335 7336 // For VPlan build stress testing, we bail out after VPlan construction. 7337 if (VPlanBuildStressTest) 7338 return VectorizationFactor::Disabled(); 7339 7340 return {VF, 0 /*Cost*/, 0 /* ScalarCost */}; 7341 } 7342 7343 LLVM_DEBUG( 7344 dbgs() << "LV: Not vectorizing. 
Inner loops aren't supported in the " 7345 "VPlan-native path.\n"); 7346 return VectorizationFactor::Disabled(); 7347} 7348 7349std::optional<VectorizationFactor> 7350LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { 7351 assert(OrigLoop->isInnermost() && "Inner loop expected."); 7352 CM.collectValuesToIgnore(); 7353 CM.collectElementTypesForWidening(); 7354 7355 FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC); 7356 if (!MaxFactors) // Cases that should not to be vectorized nor interleaved. 7357 return std::nullopt; 7358 7359 // Invalidate interleave groups if all blocks of loop will be predicated. 7360 if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) && 7361 !useMaskedInterleavedAccesses(TTI)) { 7362 LLVM_DEBUG( 7363 dbgs() 7364 << "LV: Invalidate all interleaved groups due to fold-tail by masking " 7365 "which requires masked-interleaved support.\n"); 7366 if (CM.InterleaveInfo.invalidateGroups()) 7367 // Invalidating interleave groups also requires invalidating all decisions 7368 // based on them, which includes widening decisions and uniform and scalar 7369 // values. 7370 CM.invalidateCostModelingDecisions(); 7371 } 7372 7373 ElementCount MaxUserVF = 7374 UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF; 7375 bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF); 7376 if (!UserVF.isZero() && UserVFIsLegal) { 7377 assert(isPowerOf2_32(UserVF.getKnownMinValue()) && 7378 "VF needs to be a power of two"); 7379 // Collect the instructions (and their associated costs) that will be more 7380 // profitable to scalarize. 
7381 CM.collectInLoopReductions(); 7382 if (CM.selectUserVectorizationFactor(UserVF)) { 7383 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); 7384 buildVPlansWithVPRecipes(UserVF, UserVF); 7385 if (!hasPlanWithVF(UserVF)) { 7386 LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << UserVF 7387 << ".\n"); 7388 return std::nullopt; 7389 } 7390 7391 LLVM_DEBUG(printPlans(dbgs())); 7392 return {{UserVF, 0, 0}}; 7393 } else 7394 reportVectorizationInfo("UserVF ignored because of invalid costs.", 7395 "InvalidCost", ORE, OrigLoop); 7396 } 7397 7398 // Populate the set of Vectorization Factor Candidates. 7399 ElementCountSet VFCandidates; 7400 for (auto VF = ElementCount::getFixed(1); 7401 ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2) 7402 VFCandidates.insert(VF); 7403 for (auto VF = ElementCount::getScalable(1); 7404 ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2) 7405 VFCandidates.insert(VF); 7406 7407 CM.collectInLoopReductions(); 7408 for (const auto &VF : VFCandidates) { 7409 // Collect Uniform and Scalar instructions after vectorization with VF. 7410 CM.collectUniformsAndScalars(VF); 7411 7412 // Collect the instructions (and their associated costs) that will be more 7413 // profitable to scalarize. 7414 if (VF.isVector()) 7415 CM.collectInstsToScalarize(VF); 7416 } 7417 7418 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF); 7419 buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF); 7420 7421 LLVM_DEBUG(printPlans(dbgs())); 7422 if (!MaxFactors.hasVector()) 7423 return VectorizationFactor::Disabled(); 7424 7425 // Select the optimal vectorization factor. 
7426 VectorizationFactor VF = selectVectorizationFactor(VFCandidates); 7427 assert((VF.Width.isScalar() || VF.ScalarCost > 0) && "when vectorizing, the scalar cost must be non-zero."); 7428 if (!hasPlanWithVF(VF.Width)) { 7429 LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << VF.Width 7430 << ".\n"); 7431 return std::nullopt; 7432 } 7433 return VF; 7434} 7435 7436VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const { 7437 assert(count_if(VPlans, 7438 [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) == 7439 1 && 7440 "Best VF has not a single VPlan."); 7441 7442 for (const VPlanPtr &Plan : VPlans) { 7443 if (Plan->hasVF(VF)) 7444 return *Plan.get(); 7445 } 7446 llvm_unreachable("No plan found!"); 7447} 7448 7449static void AddRuntimeUnrollDisableMetaData(Loop *L) { 7450 SmallVector<Metadata *, 4> MDs; 7451 // Reserve first location for self reference to the LoopID metadata node. 7452 MDs.push_back(nullptr); 7453 bool IsUnrollMetadata = false; 7454 MDNode *LoopID = L->getLoopID(); 7455 if (LoopID) { 7456 // First find existing loop unrolling disable metadata. 7457 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 7458 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 7459 if (MD) { 7460 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 7461 IsUnrollMetadata = 7462 S && S->getString().starts_with("llvm.loop.unroll.disable"); 7463 } 7464 MDs.push_back(LoopID->getOperand(i)); 7465 } 7466 } 7467 7468 if (!IsUnrollMetadata) { 7469 // Add runtime unroll disable metadata. 7470 LLVMContext &Context = L->getHeader()->getContext(); 7471 SmallVector<Metadata *, 1> DisableOperands; 7472 DisableOperands.push_back( 7473 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 7474 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 7475 MDs.push_back(DisableNode); 7476 MDNode *NewLoopID = MDNode::get(Context, MDs); 7477 // Set operand 0 to refer to the loop id itself. 
7478 NewLoopID->replaceOperandWith(0, NewLoopID); 7479 L->setLoopID(NewLoopID); 7480 } 7481} 7482 7483// Check if \p RedResult is a ComputeReductionResult instruction, and if it is 7484// create a merge phi node for it and add it to \p ReductionResumeValues. 7485static void createAndCollectMergePhiForReduction( 7486 VPInstruction *RedResult, 7487 DenseMap<const RecurrenceDescriptor *, Value *> &ReductionResumeValues, 7488 VPTransformState &State, Loop *OrigLoop, BasicBlock *LoopMiddleBlock) { 7489 if (!RedResult || 7490 RedResult->getOpcode() != VPInstruction::ComputeReductionResult) 7491 return; 7492 7493 auto *PhiR = cast<VPReductionPHIRecipe>(RedResult->getOperand(0)); 7494 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); 7495 7496 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue(); 7497 Value *FinalValue = 7498 State.get(RedResult, VPIteration(State.UF - 1, VPLane::getFirstLane())); 7499 auto *ResumePhi = 7500 dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue()); 7501 7502 // TODO: bc.merge.rdx should not be created here, instead it should be 7503 // modeled in VPlan. 7504 BasicBlock *LoopScalarPreHeader = OrigLoop->getLoopPreheader(); 7505 // Create a phi node that merges control-flow from the backedge-taken check 7506 // block and the middle block. 7507 auto *BCBlockPhi = PHINode::Create(FinalValue->getType(), 2, "bc.merge.rdx", 7508 LoopScalarPreHeader->getTerminator()); 7509 7510 // If we are fixing reductions in the epilogue loop then we should already 7511 // have created a bc.merge.rdx Phi after the main vector body. Ensure that 7512 // we carry over the incoming values correctly. 
7513 for (auto *Incoming : predecessors(LoopScalarPreHeader)) { 7514 if (Incoming == LoopMiddleBlock) 7515 BCBlockPhi->addIncoming(FinalValue, Incoming); 7516 else if (ResumePhi && is_contained(ResumePhi->blocks(), Incoming)) 7517 BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming), 7518 Incoming); 7519 else 7520 BCBlockPhi->addIncoming(ReductionStartValue, Incoming); 7521 } 7522 7523 auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue()); 7524 // TODO: This fixup should instead be modeled in VPlan. 7525 // Fix the scalar loop reduction variable with the incoming reduction sum 7526 // from the vector body and from the backedge value. 7527 int IncomingEdgeBlockIdx = 7528 OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch()); 7529 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); 7530 // Pick the other block. 7531 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); 7532 OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); 7533 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr(); 7534 OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); 7535 7536 ReductionResumeValues[&RdxDesc] = BCBlockPhi; 7537} 7538 7539std::pair<DenseMap<const SCEV *, Value *>, 7540 DenseMap<const RecurrenceDescriptor *, Value *>> 7541LoopVectorizationPlanner::executePlan( 7542 ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan, 7543 InnerLoopVectorizer &ILV, DominatorTree *DT, bool IsEpilogueVectorization, 7544 const DenseMap<const SCEV *, Value *> *ExpandedSCEVs) { 7545 assert(BestVPlan.hasVF(BestVF) && 7546 "Trying to execute plan with unsupported VF"); 7547 assert(BestVPlan.hasUF(BestUF) && 7548 "Trying to execute plan with unsupported UF"); 7549 assert( 7550 (IsEpilogueVectorization || !ExpandedSCEVs) && 7551 "expanded SCEVs to reuse can only be used during epilogue vectorization"); 7552 7553 LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF 7554 << '\n'); 7555 7556 if (!IsEpilogueVectorization) 7557 
VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE); 7558 7559 // Perform the actual loop transformation. 7560 VPTransformState State(BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan, 7561 OrigLoop->getHeader()->getContext()); 7562 7563 // 0. Generate SCEV-dependent code into the preheader, including TripCount, 7564 // before making any changes to the CFG. 7565 if (!BestVPlan.getPreheader()->empty()) { 7566 State.CFG.PrevBB = OrigLoop->getLoopPreheader(); 7567 State.Builder.SetInsertPoint(OrigLoop->getLoopPreheader()->getTerminator()); 7568 BestVPlan.getPreheader()->execute(&State); 7569 } 7570 if (!ILV.getTripCount()) 7571 ILV.setTripCount(State.get(BestVPlan.getTripCount(), {0, 0})); 7572 else 7573 assert(IsEpilogueVectorization && "should only re-use the existing trip " 7574 "count during epilogue vectorization"); 7575 7576 // 1. Set up the skeleton for vectorization, including vector pre-header and 7577 // middle block. The vector loop is created during VPlan execution. 7578 Value *CanonicalIVStartValue; 7579 std::tie(State.CFG.PrevBB, CanonicalIVStartValue) = 7580 ILV.createVectorizedLoopSkeleton(ExpandedSCEVs ? *ExpandedSCEVs 7581 : State.ExpandedSCEVs); 7582 7583 // Only use noalias metadata when using memory checks guaranteeing no overlap 7584 // across all iterations. 7585 const LoopAccessInfo *LAI = ILV.Legal->getLAI(); 7586 std::unique_ptr<LoopVersioning> LVer = nullptr; 7587 if (LAI && !LAI->getRuntimePointerChecking()->getChecks().empty() && 7588 !LAI->getRuntimePointerChecking()->getDiffChecks()) { 7589 7590 // We currently don't use LoopVersioning for the actual loop cloning but we 7591 // still use it to add the noalias metadata. 7592 // TODO: Find a better way to re-use LoopVersioning functionality to add 7593 // metadata. 
7594 LVer = std::make_unique<LoopVersioning>( 7595 *LAI, LAI->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, DT, 7596 PSE.getSE()); 7597 State.LVer = &*LVer; 7598 State.LVer->prepareNoAliasMetadata(); 7599 } 7600 7601 ILV.collectPoisonGeneratingRecipes(State); 7602 7603 ILV.printDebugTracesAtStart(); 7604 7605 //===------------------------------------------------===// 7606 // 7607 // Notice: any optimization or new instruction that go 7608 // into the code below should also be implemented in 7609 // the cost-model. 7610 // 7611 //===------------------------------------------------===// 7612 7613 // 2. Copy and widen instructions from the old loop into the new loop. 7614 BestVPlan.prepareToExecute(ILV.getTripCount(), 7615 ILV.getOrCreateVectorTripCount(nullptr), 7616 CanonicalIVStartValue, State); 7617 7618 BestVPlan.execute(&State); 7619 7620 // 2.5 Collect reduction resume values. 7621 DenseMap<const RecurrenceDescriptor *, Value *> ReductionResumeValues; 7622 auto *ExitVPBB = 7623 cast<VPBasicBlock>(BestVPlan.getVectorLoopRegion()->getSingleSuccessor()); 7624 for (VPRecipeBase &R : *ExitVPBB) { 7625 createAndCollectMergePhiForReduction(dyn_cast<VPInstruction>(&R), 7626 ReductionResumeValues, State, OrigLoop, 7627 State.CFG.VPBB2IRBB[ExitVPBB]); 7628 } 7629 7630 // 2.6. Maintain Loop Hints 7631 // Keep all loop hints from the original loop on the vector loop (we'll 7632 // replace the vectorizer-specific hints below). 
7633 MDNode *OrigLoopID = OrigLoop->getLoopID(); 7634 7635 std::optional<MDNode *> VectorizedLoopID = 7636 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 7637 LLVMLoopVectorizeFollowupVectorized}); 7638 7639 VPBasicBlock *HeaderVPBB = 7640 BestVPlan.getVectorLoopRegion()->getEntryBasicBlock(); 7641 Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]); 7642 if (VectorizedLoopID) 7643 L->setLoopID(*VectorizedLoopID); 7644 else { 7645 // Keep all loop hints from the original loop on the vector loop (we'll 7646 // replace the vectorizer-specific hints below). 7647 if (MDNode *LID = OrigLoop->getLoopID()) 7648 L->setLoopID(LID); 7649 7650 LoopVectorizeHints Hints(L, true, *ORE); 7651 Hints.setAlreadyVectorized(); 7652 } 7653 TargetTransformInfo::UnrollingPreferences UP; 7654 TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE); 7655 if (!UP.UnrollVectorizedLoop || CanonicalIVStartValue) 7656 AddRuntimeUnrollDisableMetaData(L); 7657 7658 // 3. Fix the vectorized code: take care of header phi's, live-outs, 7659 // predication, updating analyses. 7660 ILV.fixVectorizedLoop(State, BestVPlan); 7661 7662 ILV.printDebugTracesAtEnd(); 7663 7664 return {State.ExpandedSCEVs, ReductionResumeValues}; 7665} 7666 7667#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 7668void LoopVectorizationPlanner::printPlans(raw_ostream &O) { 7669 for (const auto &Plan : VPlans) 7670 if (PrintVPlansInDotFormat) 7671 Plan->printDOT(O); 7672 else 7673 Plan->print(O); 7674} 7675#endif 7676 7677//===--------------------------------------------------------------------===// 7678// EpilogueVectorizerMainLoop 7679//===--------------------------------------------------------------------===// 7680 7681/// This function is partially responsible for generating the control flow 7682/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 
7683std::pair<BasicBlock *, Value *> 7684EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton( 7685 const SCEV2ValueTy &ExpandedSCEVs) { 7686 createVectorLoopSkeleton(""); 7687 7688 // Generate the code to check the minimum iteration count of the vector 7689 // epilogue (see below). 7690 EPI.EpilogueIterationCountCheck = 7691 emitIterationCountCheck(LoopScalarPreHeader, true); 7692 EPI.EpilogueIterationCountCheck->setName("iter.check"); 7693 7694 // Generate the code to check any assumptions that we've made for SCEV 7695 // expressions. 7696 EPI.SCEVSafetyCheck = emitSCEVChecks(LoopScalarPreHeader); 7697 7698 // Generate the code that checks at runtime if arrays overlap. We put the 7699 // checks into a separate block to make the more common case of few elements 7700 // faster. 7701 EPI.MemSafetyCheck = emitMemRuntimeChecks(LoopScalarPreHeader); 7702 7703 // Generate the iteration count check for the main loop, *after* the check 7704 // for the epilogue loop, so that the path-length is shorter for the case 7705 // that goes directly through the vector epilogue. The longer-path length for 7706 // the main loop is compensated for, by the gain from vectorizing the larger 7707 // trip count. Note: the branch will get updated later on when we vectorize 7708 // the epilogue. 7709 EPI.MainLoopIterationCountCheck = 7710 emitIterationCountCheck(LoopScalarPreHeader, false); 7711 7712 // Generate the induction variable. 7713 EPI.VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader); 7714 7715 // Skip induction resume value creation here because they will be created in 7716 // the second pass for the scalar loop. The induction resume values for the 7717 // inductions in the epilogue loop are created before executing the plan for 7718 // the epilogue loop. 
7719 7720 return {completeLoopSkeleton(), nullptr}; 7721} 7722 7723void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { 7724 LLVM_DEBUG({ 7725 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n" 7726 << "Main Loop VF:" << EPI.MainLoopVF 7727 << ", Main Loop UF:" << EPI.MainLoopUF 7728 << ", Epilogue Loop VF:" << EPI.EpilogueVF 7729 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; 7730 }); 7731} 7732 7733void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { 7734 DEBUG_WITH_TYPE(VerboseDebug, { 7735 dbgs() << "intermediate fn:\n" 7736 << *OrigLoop->getHeader()->getParent() << "\n"; 7737 }); 7738} 7739 7740BasicBlock * 7741EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass, 7742 bool ForEpilogue) { 7743 assert(Bypass && "Expected valid bypass basic block."); 7744 ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF; 7745 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF; 7746 Value *Count = getTripCount(); 7747 // Reuse existing vector loop preheader for TC checks. 7748 // Note that new preheader block is generated for vector loop. 7749 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 7750 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 7751 7752 // Generate code to check if the loop's trip count is less than VF * UF of the 7753 // main vector loop. 7754 auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF.isVector() 7755 : VF.isVector()) 7756 ? ICmpInst::ICMP_ULE 7757 : ICmpInst::ICMP_ULT; 7758 7759 Value *CheckMinIters = Builder.CreateICmp( 7760 P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor), 7761 "min.iters.check"); 7762 7763 if (!ForEpilogue) 7764 TCCheckBlock->setName("vector.main.loop.iter.check"); 7765 7766 // Create new preheader for vector loop. 
7767 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), 7768 DT, LI, nullptr, "vector.ph"); 7769 7770 if (ForEpilogue) { 7771 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 7772 DT->getNode(Bypass)->getIDom()) && 7773 "TC check is expected to dominate Bypass"); 7774 7775 // Update dominator for Bypass & LoopExit. 7776 DT->changeImmediateDominator(Bypass, TCCheckBlock); 7777 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector())) 7778 // For loops with multiple exits, there's no edge from the middle block 7779 // to exit blocks (as the epilogue must run) and thus no need to update 7780 // the immediate dominator of the exit blocks. 7781 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 7782 7783 LoopBypassBlocks.push_back(TCCheckBlock); 7784 7785 // Save the trip count so we don't have to regenerate it in the 7786 // vec.epilog.iter.check. This is safe to do because the trip count 7787 // generated here dominates the vector epilog iter check. 7788 EPI.TripCount = Count; 7789 } 7790 7791 BranchInst &BI = 7792 *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters); 7793 if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) 7794 setBranchWeights(BI, MinItersBypassWeights); 7795 ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI); 7796 7797 return TCCheckBlock; 7798} 7799 7800//===--------------------------------------------------------------------===// 7801// EpilogueVectorizerEpilogueLoop 7802//===--------------------------------------------------------------------===// 7803 7804/// This function is partially responsible for generating the control flow 7805/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 
std::pair<BasicBlock *, Value *>
EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton(
    const SCEV2ValueTy &ExpandedSCEVs) {
  createVectorLoopSkeleton("vec.epilog.");

  // Now, compare the remaining count and if there aren't enough iterations to
  // execute the vectorized epilogue skip to the scalar part.
  BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader;
  VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check");
  LoopVectorPreHeader =
      SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
                 LI, nullptr, "vec.epilog.ph");
  emitMinimumVectorEpilogueIterCountCheck(LoopScalarPreHeader,
                                          VecEpilogueIterationCountCheck);

  // Adjust the control flow taking the state info from the main loop
  // vectorization into account.
  assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck &&
         "expected this to be saved from the previous pass.");
  // The main-loop check now jumps straight to the epilogue vector preheader
  // instead of the epilogue iteration-count check.
  EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith(
      VecEpilogueIterationCountCheck, LoopVectorPreHeader);

  DT->changeImmediateDominator(LoopVectorPreHeader,
                               EPI.MainLoopIterationCountCheck);

  // The remaining check blocks from the first pass are redirected to the
  // scalar preheader.
  EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
      VecEpilogueIterationCountCheck, LoopScalarPreHeader);

  if (EPI.SCEVSafetyCheck)
    EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith(
        VecEpilogueIterationCountCheck, LoopScalarPreHeader);
  if (EPI.MemSafetyCheck)
    EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith(
        VecEpilogueIterationCountCheck, LoopScalarPreHeader);

  DT->changeImmediateDominator(
      VecEpilogueIterationCountCheck,
      VecEpilogueIterationCountCheck->getSinglePredecessor());

  DT->changeImmediateDominator(LoopScalarPreHeader,
                               EPI.EpilogueIterationCountCheck);
  if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector()))
    // If there is an epilogue which must run, there's no edge from the
    // middle block to exit blocks and thus no need to update the immediate
    // dominator of the exit blocks.
    DT->changeImmediateDominator(LoopExitBlock,
                                 EPI.EpilogueIterationCountCheck);

  // Keep track of bypass blocks, as they feed start values to the induction and
  // reduction phis in the scalar loop preheader.
  if (EPI.SCEVSafetyCheck)
    LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
  if (EPI.MemSafetyCheck)
    LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
  LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);

  // The vec.epilog.iter.check block may contain Phi nodes from inductions or
  // reductions which merge control-flow from the latch block and the middle
  // block. Update the incoming values here and move the Phi into the preheader.
  // The phis are collected first because the loop below moves them out of the
  // block being iterated.
  SmallVector<PHINode *, 4> PhisInBlock;
  for (PHINode &Phi : VecEpilogueIterationCountCheck->phis())
    PhisInBlock.push_back(&Phi);

  for (PHINode *Phi : PhisInBlock) {
    Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHI());
    Phi->replaceIncomingBlockWith(
        VecEpilogueIterationCountCheck->getSinglePredecessor(),
        VecEpilogueIterationCountCheck);

    // If the phi doesn't have an incoming value from the
    // EpilogueIterationCountCheck, we are done. Otherwise remove the incoming
    // value and also those from other check blocks. This is needed for
    // reduction phis only.
    if (none_of(Phi->blocks(), [&](BasicBlock *IncB) {
          return EPI.EpilogueIterationCountCheck == IncB;
        }))
      continue;
    Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck);
    if (EPI.SCEVSafetyCheck)
      Phi->removeIncomingValue(EPI.SCEVSafetyCheck);
    if (EPI.MemSafetyCheck)
      Phi->removeIncomingValue(EPI.MemSafetyCheck);
  }

  // Generate a resume induction for the vector epilogue and put it in the
  // vector epilogue preheader.
  // It is EPI.VectorTripCount when arriving from the epilogue iteration-count
  // check and 0 when arriving from the main-loop iteration-count check.
  Type *IdxTy = Legal->getWidestInductionType();
  PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val");
  EPResumeVal->insertBefore(LoopVectorPreHeader->getFirstNonPHIIt());
  EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
  EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
                           EPI.MainLoopIterationCountCheck);

  // Generate induction resume values. These variables save the new starting
  // indexes for the scalar loop. They are used to test if there are any tail
  // iterations left once the vector loop has completed.
  // Note that when the vectorized epilogue is skipped due to iteration count
  // check, then the resume value for the induction variable comes from
  // the trip count of the main vector loop, hence passing the AdditionalBypass
  // argument.
  createInductionResumeValues(ExpandedSCEVs,
                              {VecEpilogueIterationCountCheck,
                               EPI.VectorTripCount} /* AdditionalBypass */);

  // The second element of the result is the epilogue resume phi created above.
  return {completeLoopSkeleton(), EPResumeVal};
}

/// Emit, before the terminator of \p Insert, a check of the iterations that
/// remain after the main vector loop (EPI.TripCount - EPI.VectorTripCount)
/// against the step built from the epilogue VF and UF. The terminator of
/// \p Insert is replaced by a conditional branch to \p Bypass when the check
/// holds and to LoopVectorPreHeader otherwise.
///
/// \p Insert is recorded in LoopBypassBlocks and returned.
BasicBlock *
EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
    BasicBlock *Bypass, BasicBlock *Insert) {

  // NOTE(review): "safed" in the message below is presumably a typo for
  // "saved"; it is a runtime string and therefore left untouched here.
  assert(EPI.TripCount &&
         "Expected trip count to have been safed in the first pass.");
  assert(
      (!isa<Instruction>(EPI.TripCount) ||
       DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
      "saved trip count does not dominate insertion point.");
  Value *TC = EPI.TripCount;
  IRBuilder<> Builder(Insert->getTerminator());
  Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");

  // Generate code to check if the loop's trip count is less than VF * UF of the
  // vector epilogue loop.
  auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector())
               ? ICmpInst::ICMP_ULE
               : ICmpInst::ICMP_ULT;

  Value *CheckMinIters =
      Builder.CreateICmp(P, Count,
                         createStepForVF(Builder, Count->getType(),
                                         EPI.EpilogueVF, EPI.EpilogueUF),
                         "min.epilog.iters.check");

  BranchInst &BI =
      *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
  if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
    unsigned MainLoopStep = UF * VF.getKnownMinValue();
    unsigned EpilogueLoopStep =
        EPI.EpilogueUF * EPI.EpilogueVF.getKnownMinValue();
    // We assume the remaining `Count` is equally distributed in
    // [0, MainLoopStep)
    // So the probability for `Count < EpilogueLoopStep` should be
    // min(MainLoopStep, EpilogueLoopStep) / MainLoopStep
    unsigned EstimatedSkipCount = std::min(MainLoopStep, EpilogueLoopStep);
    const uint32_t Weights[] = {EstimatedSkipCount,
                                MainLoopStep - EstimatedSkipCount};
    setBranchWeights(BI, Weights);
  }
  ReplaceInstWithInst(Insert->getTerminator(), &BI);

  LoopBypassBlocks.push_back(Insert);
  return Insert;
}

/// Under LLVM_DEBUG, print a banner for the second epilogue-vectorization pass
/// followed by the epilogue-loop VF/UF values held in EPI.
void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
  LLVM_DEBUG({
    dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
           << "Epilogue Loop VF:" << EPI.EpilogueVF
           << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
  });
}

/// Under the VerboseDebug debug type, dump the function that contains the
/// original loop's header as it looks after the second pass.
void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
  DEBUG_WITH_TYPE(VerboseDebug, {
    dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n";
  });
}

/// Evaluate \p Predicate at \p Range.Start and clamp \p Range.End to the first
/// VF in VFRange(Range.Start * 2, Range.End) for which the predicate gives a
/// different answer, so that the decision is the same for every VF left in
/// \p Range.
///
/// \returns The predicate's value at \p Range.Start.
bool LoopVectorizationPlanner::getDecisionAndClampRange(
    const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
  assert(!Range.isEmpty() && "Trying to test an empty VF range.");
  bool PredicateAtRangeStart = Predicate(Range.Start);

  for (ElementCount TmpVF : VFRange(Range.Start * 2, Range.End))
    if (Predicate(TmpVF) != PredicateAtRangeStart) {
      Range.End = TmpVF;
      break;
    }

  return PredicateAtRangeStart;
}

/// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
/// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
/// of VF's starting at a given VF and extending it as much as possible. Each
/// vectorization decision can potentially shorten this sub-range during
/// buildVPlan().
void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
                                           ElementCount MaxVF) {
  auto MaxVFTimes2 = MaxVF * 2;
  for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
    VFRange SubRange = {VF, MaxVFTimes2};
    VPlans.push_back(buildVPlan(SubRange));
    // Continue with the first VF not covered by the plan just built.
    VF = SubRange.End;
  }
}

/// Return the mask guarding the CFG edge \p Src -> \p Dst, computing it and
/// caching it in EdgeMaskCache on first request. A null result models an
/// all-true mask.
VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
                                         VPlan &Plan) {
  assert(is_contained(predecessors(Dst), Src) && "Invalid edge");

  // Look for cached value.
  std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
  EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
  if (ECEntryIt != EdgeMaskCache.end())
    return ECEntryIt->second;

  VPValue *SrcMask = getBlockInMask(Src);

  // The terminator has to be a branch inst!
  BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
  assert(BI && "Unexpected terminator found");

  // An unconditional branch, or a conditional one whose successors coincide,
  // does not restrict the source block's mask.
  if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
    return EdgeMaskCache[Edge] = SrcMask;

  // If source is an exiting block, we know the exit edge is dynamically dead
  // in the vector loop, and thus we don't need to restrict the mask. Avoid
  // adding uses of an otherwise potentially dead instruction.
  if (OrigLoop->isLoopExiting(Src))
    return EdgeMaskCache[Edge] = SrcMask;

  VPValue *EdgeMask = Plan.getVPValueOrAddLiveIn(BI->getCondition());
  assert(EdgeMask && "No Edge Mask found for condition");

  // The edge to the second successor is taken when the condition is false.
  if (BI->getSuccessor(0) != Dst)
    EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc());

  if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
    // The condition is 'SrcMask && EdgeMask', which is equivalent to
    // 'select i1 SrcMask, i1 EdgeMask, i1 false'.
    // The select version does not introduce new UB if SrcMask is false and
    // EdgeMask is poison. Using 'and' here introduces undefined behavior.
    VPValue *False = Plan.getVPValueOrAddLiveIn(
        ConstantInt::getFalse(BI->getCondition()->getType()));
    EdgeMask =
        Builder.createSelect(SrcMask, EdgeMask, False, BI->getDebugLoc());
  }

  return EdgeMaskCache[Edge] = EdgeMask;
}

/// Record the mask of the loop header in BlockMaskCache: nullptr when the tail
/// is not folded by masking, otherwise an `IV <= BTC` compare built on a
/// widened canonical IV placed among the header's first non-phi recipes.
void VPRecipeBuilder::createHeaderMask(VPlan &Plan) {
  BasicBlock *Header = OrigLoop->getHeader();

  // When not folding the tail, use nullptr to model all-true mask.
  if (!CM.foldTailByMasking()) {
    BlockMaskCache[Header] = nullptr;
    return;
  }

  // Introduce the early-exit compare IV <= BTC to form header block mask.
  // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by
  // constructing the desired canonical IV in the header block as its first
  // non-phi instructions.

  VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
  auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi();
  auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV());
  HeaderVPBB->insert(IV, NewInsertionPoint);

  // The guard restores the builder's previous insert point on scope exit.
  VPBuilder::InsertPointGuard Guard(Builder);
  Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
  VPValue *BlockMask = nullptr;
  VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
  BlockMask = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC);
  BlockMaskCache[Header] = BlockMask;
}

/// Return the mask previously recorded for \p BB in BlockMaskCache (possibly
/// nullptr). Asserts that an entry exists; this function never creates one.
VPValue *VPRecipeBuilder::getBlockInMask(BasicBlock *BB) const {
  // Return the cached value.
  BlockMaskCacheTy::const_iterator BCEntryIt = BlockMaskCache.find(BB);
  assert(BCEntryIt != BlockMaskCache.end() &&
         "Trying to access mask for block without one.");
  return BCEntryIt->second;
}

/// Compute the mask of the non-header loop block \p BB as the OR of the masks
/// of its incoming edges and record it in BlockMaskCache. If any incoming edge
/// has no mask (all-true), the block's mask is recorded as nullptr as well.
void VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlan &Plan) {
  assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
  assert(BlockMaskCache.count(BB) == 0 && "Mask for block already computed");
  assert(OrigLoop->getHeader() != BB &&
         "Loop header must have cached block mask");

  // All-one mask is modelled as no-mask following the convention for masked
  // load/store/gather/scatter. Initialize BlockMask to no-mask.
  VPValue *BlockMask = nullptr;
  // This is the block mask. We OR all incoming edges.
  for (auto *Predecessor : predecessors(BB)) {
    VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
    if (!EdgeMask) { // Mask of predecessor is all-one so mask of block is too.
      BlockMaskCache[BB] = EdgeMask;
      return;
    }

    if (!BlockMask) { // BlockMask has its initialized nullptr value.
      BlockMask = EdgeMask;
      continue;
    }

    BlockMask = Builder.createOr(BlockMask, EdgeMask, {});
  }

  BlockMaskCache[BB] = BlockMask;
}

/// Try to create a widened memory recipe for the load or store \p I.
///
/// \p Range is clamped to the VFs that share the widening decision made at
/// \p Range.Start. Returns nullptr when that decision is not to widen. For
/// consecutive (possibly reversed) accesses a VPVectorPointerRecipe is appended
/// to the builder's current block and used as the address operand.
VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
                                                ArrayRef<VPValue *> Operands,
                                                VFRange &Range,
                                                VPlanPtr &Plan) {
  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
         "Must be called with either a load or store");

  // Interleaved accesses count as widened; otherwise the access is widened
  // unless it is scalarized by any of the cost-model queries below.
  auto willWiden = [&](ElementCount VF) -> bool {
    LoopVectorizationCostModel::InstWidening Decision =
        CM.getWideningDecision(I, VF);
    assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
           "CM decision should be taken at this point.");
    if (Decision == LoopVectorizationCostModel::CM_Interleave)
      return true;
    if (CM.isScalarAfterVectorization(I, VF) ||
        CM.isProfitableToScalarize(I, VF))
      return false;
    return Decision != LoopVectorizationCostModel::CM_Scalarize;
  };

  if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
    return nullptr;

  VPValue *Mask = nullptr;
  if (Legal->isMaskRequired(I))
    Mask = getBlockInMask(I->getParent());

  // Determine if the pointer operand of the access is either consecutive or
  // reverse consecutive.
  LoopVectorizationCostModel::InstWidening Decision =
      CM.getWideningDecision(I, Range.Start);
  bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
  bool Consecutive =
      Reverse || Decision == LoopVectorizationCostModel::CM_Widen;

  // The address is operand 0 of a load and operand 1 of a store.
  VPValue *Ptr = isa<LoadInst>(I) ? Operands[0] : Operands[1];
  if (Consecutive) {
    auto *GEP = dyn_cast<GetElementPtrInst>(
        Ptr->getUnderlyingValue()->stripPointerCasts());
    auto *VectorPtr = new VPVectorPointerRecipe(
        Ptr, getLoadStoreType(I), Reverse, GEP ? GEP->isInBounds() : false,
        I->getDebugLoc());
    Builder.getInsertBlock()->appendRecipe(VectorPtr);
    Ptr = VectorPtr;
  }
  if (LoadInst *Load = dyn_cast<LoadInst>(I))
    return new VPWidenMemoryInstructionRecipe(*Load, Ptr, Mask, Consecutive,
                                              Reverse);

  StoreInst *Store = cast<StoreInst>(I);
  return new VPWidenMemoryInstructionRecipe(*Store, Ptr, Operands[0], Mask,
                                            Consecutive, Reverse);
}

/// Creates a VPWidenIntOrFpInductionRecipe for \p Phi. If needed, it will also
/// insert a recipe to expand the step for the induction recipe.
/// When \p PhiOrTrunc is a truncate, it is passed on to the created recipe.
static VPWidenIntOrFpInductionRecipe *
createWidenInductionRecipes(PHINode *Phi, Instruction *PhiOrTrunc,
                            VPValue *Start, const InductionDescriptor &IndDesc,
                            VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop,
                            VFRange &Range) {
  assert(IndDesc.getStartValue() ==
         Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader()));
  assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) &&
         "step must be loop invariant");

  VPValue *Step =
      vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep(), SE);
  if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) {
    return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, TruncI);
  }
  assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here");
  return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc);
}

/// Return an induction recipe for \p Phi if Legal describes it as an
/// integer/FP induction or as a pointer induction; otherwise return nullptr.
/// For pointer inductions \p Range is clamped to the VFs that agree on whether
/// the phi is scalar after vectorization.
VPRecipeBase *VPRecipeBuilder::tryToOptimizeInductionPHI(
    PHINode *Phi, ArrayRef<VPValue *> Operands, VPlan &Plan, VFRange &Range) {

  // Check if this is an integer or fp induction. If so, build the recipe that
  // produces its scalar and vector values.
  if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi))
    return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, Plan,
                                       *PSE.getSE(), *OrigLoop, Range);

  // Check if this is pointer induction. If so, build the recipe for it.
  if (auto *II = Legal->getPointerInductionDescriptor(Phi)) {
    VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, II->getStep(),
                                                           *PSE.getSE());
    return new VPWidenPointerInductionRecipe(
        Phi, Operands[0], Step, *II,
        LoopVectorizationPlanner::getDecisionAndClampRange(
            [&](ElementCount VF) {
              return CM.isScalarAfterVectorization(Phi, VF);
            },
            Range));
  }
  return nullptr;
}

/// Return a widened induction recipe that folds the truncate \p I into its
/// induction phi when the cost model reports the truncate as optimizable at
/// \p Range.Start (clamping \p Range accordingly); otherwise return nullptr.
VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
    TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range, VPlan &Plan) {
  // Optimize the special case where the source is a constant integer
  // induction variable. Notice that we can only optimize the 'trunc' case
  // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
  // (c) other casts depend on pointer size.

  // Determine whether \p K is a truncation based on an induction variable that
  // can be optimized.
  auto isOptimizableIVTruncate =
      [&](Instruction *K) -> std::function<bool(ElementCount)> {
    return [=](ElementCount VF) -> bool {
      return CM.isOptimizableIVTruncate(K, VF);
    };
  };

  if (LoopVectorizationPlanner::getDecisionAndClampRange(
          isOptimizableIVTruncate(I), Range)) {

    auto *Phi = cast<PHINode>(I->getOperand(0));
    const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi);
    VPValue *Start = Plan.getVPValueOrAddLiveIn(II.getStartValue());
    return createWidenInductionRecipes(Phi, I, Start, II, Plan, *PSE.getSE(),
                                       *OrigLoop, Range);
  }
  return nullptr;
}

/// Convert the phi \p Phi into either one of its incoming VPValues or a
/// VPBlendRecipe. An incoming value is returned directly when all operands are
/// equal or when one operand is an in-loop reduction phi (the other operand is
/// returned); otherwise a blend of the incoming values and their edge masks is
/// created.
VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi,
                                                ArrayRef<VPValue *> Operands,
                                                VPlanPtr &Plan) {
  // If all incoming values are equal, the incoming VPValue can be used directly
  // instead of creating a new VPBlendRecipe.
  if (llvm::all_equal(Operands))
    return Operands[0];

  unsigned NumIncoming = Phi->getNumIncomingValues();
  // For in-loop reductions, we do not need to create an additional select.
  VPValue *InLoopVal = nullptr;
  for (unsigned In = 0; In < NumIncoming; In++) {
    PHINode *PhiOp =
        dyn_cast_or_null<PHINode>(Operands[In]->getUnderlyingValue());
    if (PhiOp && CM.isInLoopReduction(PhiOp)) {
      assert(!InLoopVal && "Found more than one in-loop reduction!");
      InLoopVal = Operands[In];
    }
  }

  assert((!InLoopVal || NumIncoming == 2) &&
         "Found an in-loop reduction for PHI with unexpected number of "
         "incoming values");
  // Return the operand that is not the in-loop reduction value.
  if (InLoopVal)
    return Operands[Operands[0] == InLoopVal ? 1 : 0];

  // We know that all PHIs in non-header blocks are converted into selects, so
  // we don't have to worry about the insertion order and we can just use the
  // builder. At this point we generate the predication tree. There may be
  // duplications since this is a simple recursive scan, but future
  // optimizations will clean it up.
  SmallVector<VPValue *, 2> OperandsWithMask;

  // Each incoming value is followed by its edge mask, when there is one.
  for (unsigned In = 0; In < NumIncoming; In++) {
    VPValue *EdgeMask =
        createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), *Plan);
    assert((EdgeMask || NumIncoming == 1) &&
           "Multiple predecessors with one having a full mask");
    OperandsWithMask.push_back(Operands[In]);
    if (EdgeMask)
      OperandsWithMask.push_back(EdgeMask);
  }
  return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask));
}

/// Try to widen the call \p CI either as a vector intrinsic or as a call to a
/// vector variant of the callee, clamping \p Range to the VFs that share each
/// decision.
///
/// Returns nullptr when the call is scalar-with-predication at \p Range.Start,
/// when it maps to one of the intrinsics listed below, or when neither form of
/// widening is selected by the cost model.
VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
                                                   ArrayRef<VPValue *> Operands,
                                                   VFRange &Range,
                                                   VPlanPtr &Plan) {
  bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
      [this, CI](ElementCount VF) {
        return CM.isScalarWithPredication(CI, VF);
      },
      Range);

  if (IsPredicated)
    return nullptr;

  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
  if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
             ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
             ID == Intrinsic::pseudoprobe ||
             ID == Intrinsic::experimental_noalias_scope_decl))
    return nullptr;

  // Only the call's argument operands are forwarded to the recipe.
  SmallVector<VPValue *, 4> Ops(Operands.take_front(CI->arg_size()));

  // Is it beneficial to perform intrinsic call compared to lib call?
  bool ShouldUseVectorIntrinsic =
      ID && LoopVectorizationPlanner::getDecisionAndClampRange(
                [&](ElementCount VF) -> bool {
                  return CM.getCallWideningDecision(CI, VF).Kind ==
                         LoopVectorizationCostModel::CM_IntrinsicCall;
                },
                Range);
  if (ShouldUseVectorIntrinsic)
    return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()), ID,
                                 CI->getDebugLoc());

  Function *Variant = nullptr;
  std::optional<unsigned> MaskPos;
  // Is it better to call a vectorized version of the function than to
  // scalarize the call?
  auto ShouldUseVectorCall = LoopVectorizationPlanner::getDecisionAndClampRange(
      [&](ElementCount VF) -> bool {
        // The following case may be scalarized depending on the VF.
        // The flag shows whether we can use a usual Call for vectorized
        // version of the instruction.

        // If we've found a variant at a previous VF, then stop looking. A
        // vectorized variant of a function expects input in a certain shape
        // -- basically the number of input registers, the number of lanes
        // per register, and whether there's a mask required.
        // We store a pointer to the variant in the VPWidenCallRecipe, so
        // once we have an appropriate variant it's only valid for that VF.
        // This will force a different vplan to be generated for each VF that
        // finds a valid variant.
        if (Variant)
          return false;
        LoopVectorizationCostModel::CallWideningDecision Decision =
            CM.getCallWideningDecision(CI, VF);
        if (Decision.Kind == LoopVectorizationCostModel::CM_VectorCall) {
          Variant = Decision.Variant;
          MaskPos = Decision.MaskPos;
          return true;
        }

        return false;
      },
      Range);
  if (ShouldUseVectorCall) {
    if (MaskPos.has_value()) {
      // We have 2 cases that would require a mask:
      //   1) The block needs to be predicated, either due to a conditional
      //      in the scalar loop or use of an active lane mask with
      //      tail-folding, and we use the appropriate mask for the block.
      //   2) No mask is required for the block, but the only available
      //      vector variant at this VF requires a mask, so we synthesize an
      //      all-true mask.
      VPValue *Mask = nullptr;
      if (Legal->isMaskRequired(CI))
        Mask = getBlockInMask(CI->getParent());
      else
        Mask = Plan->getVPValueOrAddLiveIn(ConstantInt::getTrue(
            IntegerType::getInt1Ty(Variant->getFunctionType()->getContext())));

      // Place the mask at the argument position the variant expects.
      Ops.insert(Ops.begin() + *MaskPos, Mask);
    }

    return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()),
                                 Intrinsic::not_intrinsic, CI->getDebugLoc(),
                                 Variant);
  }

  return nullptr;
}

/// Return true if \p I should be widened at \p Range.Start, clamping \p Range
/// to the VFs for which the answer is the same. Must not be called for
/// branches, phis, loads or stores.
bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
  assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
         !isa<StoreInst>(I) && "Instruction should have been handled earlier");
  // Instruction should be widened, unless it is scalar after vectorization,
  // scalarization is profitable or it is predicated.
  auto WillScalarize = [this, I](ElementCount VF) -> bool {
    return CM.isScalarAfterVectorization(I, VF) ||
           CM.isProfitableToScalarize(I, VF) ||
           CM.isScalarWithPredication(I, VF);
  };
  return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
                                                             Range);
}

/// Create a VPWidenRecipe for \p I if its opcode is one of those listed in the
/// switch below; return nullptr for any other opcode. For predicated integer
/// div/rem, a select producing a safe divisor is appended to \p VPBB first and
/// used as the second operand.
VPRecipeBase *VPRecipeBuilder::tryToWiden(Instruction *I,
                                          ArrayRef<VPValue *> Operands,
                                          VPBasicBlock *VPBB, VPlanPtr &Plan) {
  switch (I->getOpcode()) {
  default:
    return nullptr;
  case Instruction::SDiv:
  case Instruction::UDiv:
  case Instruction::SRem:
  case Instruction::URem: {
    // If not provably safe, use a select to form a safe divisor before widening the
    // div/rem operation itself.  Otherwise fall through to general handling below.
    if (CM.isPredicatedInst(I)) {
      SmallVector<VPValue *> Ops(Operands.begin(), Operands.end());
      VPValue *Mask = getBlockInMask(I->getParent());
      // Masked-off lanes divide by 1 instead of the original divisor.
      VPValue *One = Plan->getVPValueOrAddLiveIn(
          ConstantInt::get(I->getType(), 1u, false));
      auto *SafeRHS =
          new VPInstruction(Instruction::Select, {Mask, Ops[1], One},
                            I->getDebugLoc());
      VPBB->appendRecipe(SafeRHS);
      Ops[1] = SafeRHS;
      return new VPWidenRecipe(*I, make_range(Ops.begin(), Ops.end()));
    }
    [[fallthrough]];
  }
  case Instruction::Add:
  case Instruction::And:
  case Instruction::AShr:
  case Instruction::FAdd:
  case Instruction::FCmp:
  case Instruction::FDiv:
  case Instruction::FMul:
  case Instruction::FNeg:
  case Instruction::FRem:
  case Instruction::FSub:
  case Instruction::ICmp:
  case Instruction::LShr:
  case Instruction::Mul:
  case Instruction::Or:
  case Instruction::Select:
  case Instruction::Shl:
  case Instruction::Sub:
  case Instruction::Xor:
  case Instruction::Freeze:
    return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end()));
  };
}

/// For every header phi recipe queued in PhisToFix, add as operand the value
/// defined by the recipe created for the phi's incoming value from the
/// original loop latch.
void VPRecipeBuilder::fixHeaderPhis() {
  BasicBlock *OrigLatch = OrigLoop->getLoopLatch();
  for (VPHeaderPHIRecipe *R : PhisToFix) {
    auto *PN = cast<PHINode>(R->getUnderlyingValue());
    VPRecipeBase *IncR =
        getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch)));
    R->addOperand(IncR->getVPSingleValue());
  }
}

/// Build a VPReplicateRecipe for \p I. \p Range is clamped to the VFs that
/// agree on whether \p I is uniform after vectorization. If the cost model
/// reports \p I as predicated, the mask of its parent block is attached to the
/// recipe.
VPRecipeOrVPValueTy VPRecipeBuilder::handleReplication(Instruction *I,
                                                       VFRange &Range,
                                                       VPlan &Plan) {
  bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
      [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
      Range);

  bool IsPredicated = CM.isPredicatedInst(I);

  // Even if the instruction is not marked as uniform, there are certain
  // intrinsic calls that can be effectively treated as such, so we check for
  // them here. Conservatively, we only do this for scalable vectors, since
  // for fixed-width VFs we can always fall back on full scalarization.
  if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) {
    switch (cast<IntrinsicInst>(I)->getIntrinsicID()) {
    case Intrinsic::assume:
    case Intrinsic::lifetime_start:
    case Intrinsic::lifetime_end:
      // For scalable vectors if one of the operands is variant then we still
      // want to mark as uniform, which will generate one instruction for just
      // the first lane of the vector. We can't scalarize the call in the same
      // way as for fixed-width vectors because we don't know how many lanes
      // there are.
      //
      // The reasons for doing it this way for scalable vectors are:
      //   1. For the assume intrinsic generating the instruction for the first
      //      lane is still better than not generating any at all. For
      //      example, the input may be a splat across all lanes.
      //   2. For the lifetime start/end intrinsics the pointer operand only
      //      does anything useful when the input comes from a stack object,
      //      which suggests it should always be uniform. For non-stack objects
      //      the effect is to poison the object, which still allows us to
      //      remove the call.
      IsUniform = true;
      break;
    default:
      break;
    }
  }
  VPValue *BlockInMask = nullptr;
  if (!IsPredicated) {
    // Finalize the recipe for Instr, first if it is not predicated.
    LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
  } else {
    LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
    // Instructions marked for predication are replicated and a mask operand is
    // added initially. Masked replicate recipes will later be placed under an
    // if-then construct to prevent side-effects. Generate recipes to compute
    // the block mask for this region.
    BlockInMask = getBlockInMask(I->getParent());
  }

  auto *Recipe = new VPReplicateRecipe(I, Plan.mapToVPValues(I->operands()),
                                       IsUniform, BlockInMask);
  return toVPRecipeResult(Recipe);
}

/// Try to create a widening recipe (or reuse an existing VPValue) for
/// \p Instr, dispatching on the kind of instruction: phis, optimizable
/// induction truncates, calls, loads/stores, GEPs, selects, casts and finally
/// the generic opcodes handled by tryToWiden. \p Range may be clamped by the
/// decisions taken along the way. A null result means no widening recipe was
/// created here.
VPRecipeOrVPValueTy
VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
                                        ArrayRef<VPValue *> Operands,
                                        VFRange &Range, VPBasicBlock *VPBB,
                                        VPlanPtr &Plan) {
  // First, check for specific widening recipes that deal with inductions, Phi
  // nodes, calls and memory operations.
  VPRecipeBase *Recipe;
  if (auto Phi = dyn_cast<PHINode>(Instr)) {
    if (Phi->getParent() != OrigLoop->getHeader())
      return tryToBlend(Phi, Operands, Plan);

    // Always record recipes for header phis. Later first-order recurrence phis
    // can have earlier phis as incoming values.
    recordRecipeOf(Phi);

    if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, *Plan, Range)))
      return toVPRecipeResult(Recipe);

    VPHeaderPHIRecipe *PhiRecipe = nullptr;
    assert((Legal->isReductionVariable(Phi) ||
            Legal->isFixedOrderRecurrence(Phi)) &&
           "can only widen reductions and fixed-order recurrences here");
    VPValue *StartV = Operands[0];
    if (Legal->isReductionVariable(Phi)) {
      const RecurrenceDescriptor &RdxDesc =
          Legal->getReductionVars().find(Phi)->second;
      assert(RdxDesc.getRecurrenceStartValue() ==
             Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
      PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV,
                                           CM.isInLoopReduction(Phi),
                                           CM.useOrderedReductions(RdxDesc));
    } else {
      // TODO: Currently fixed-order recurrences are modeled as chains of
      // first-order recurrences. If there are no users of the intermediate
      // recurrences in the chain, the fixed order recurrence should be modeled
      // directly, enabling more efficient codegen.
      PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
    }

    // Record the incoming value from the backedge, so we can add the incoming
    // value from the backedge after all recipes have been created.
    auto *Inc = cast<Instruction>(
        Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
    auto RecipeIter = Ingredient2Recipe.find(Inc);
    if (RecipeIter == Ingredient2Recipe.end())
      recordRecipeOf(Inc);

    PhisToFix.push_back(PhiRecipe);
    return toVPRecipeResult(PhiRecipe);
  }

  if (isa<TruncInst>(Instr) &&
      (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands,
                                               Range, *Plan)))
    return toVPRecipeResult(Recipe);

  // All widen recipes below deal only with VF > 1.
  if (LoopVectorizationPlanner::getDecisionAndClampRange(
          [&](ElementCount VF) { return VF.isScalar(); }, Range))
    return nullptr;

  if (auto *CI = dyn_cast<CallInst>(Instr))
    return toVPRecipeResult(tryToWidenCall(CI, Operands, Range, Plan));

  if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
    return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan));

  if (!shouldWiden(Instr, Range))
    return nullptr;

  if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
    return toVPRecipeResult(new VPWidenGEPRecipe(
        GEP, make_range(Operands.begin(), Operands.end())));

  if (auto *SI = dyn_cast<SelectInst>(Instr)) {
    return toVPRecipeResult(new VPWidenSelectRecipe(
        *SI, make_range(Operands.begin(), Operands.end())));
  }

  if (auto *CI = dyn_cast<CastInst>(Instr)) {
    return toVPRecipeResult(new VPWidenCastRecipe(CI->getOpcode(), Operands[0],
                                                  CI->getType(), *CI));
  }

  return toVPRecipeResult(tryToWiden(Instr, Operands, VPBB, Plan));
}

/// Build recipe-based VPlans for the VFs from \p MinVF up to \p MaxVF by
/// repeatedly building a plan for a sub-range and continuing at that
/// sub-range's end. Each plan that could be built is optimized, checked by the
/// verifier under assert, and appended to VPlans. Only valid for innermost
/// loops.
void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
                                                        ElementCount MaxVF) {
  assert(OrigLoop->isInnermost() && "Inner loop expected.");

  auto MaxVFTimes2 = MaxVF * 2;
  for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
    VFRange SubRange = {VF, MaxVFTimes2};
    if (auto Plan = tryToBuildVPlanWithVPRecipes(SubRange)) {
      // Now optimize the initial VPlan.
      // Truncation to minimal bitwidths is skipped for plans that include the
      // scalar VF.
      if (!Plan->hasVF(ElementCount::getFixed(1)))
        VPlanTransforms::truncateToMinimalBitwidths(
            *Plan, CM.getMinimalBitwidths(), PSE.getSE()->getContext());
      VPlanTransforms::optimize(*Plan, *PSE.getSE());
      assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid");
      VPlans.push_back(std::move(Plan));
    }
    VF = SubRange.End;
  }
}

// Add the necessary canonical IV and branch recipes required to control the
// loop.
static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
                                  DebugLoc DL) {
  Value *StartIdx = ConstantInt::get(IdxTy, 0);
  auto *StartV = Plan.getVPValueOrAddLiveIn(StartIdx);

  // Add a VPCanonicalIVPHIRecipe starting at 0 to the header.
  auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL);
  VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
  VPBasicBlock *Header = TopRegion->getEntryBasicBlock();
  Header->insert(CanonicalIVPHI, Header->begin());

  // Add a CanonicalIVIncrement{NUW} VPInstruction to increment the scalar
  // IV by VF * UF.
  auto *CanonicalIVIncrement =
      new VPInstruction(Instruction::Add, {CanonicalIVPHI, &Plan.getVFxUF()},
                        {HasNUW, false}, DL, "index.next");
  CanonicalIVPHI->addOperand(CanonicalIVIncrement);

  VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
  EB->appendRecipe(CanonicalIVIncrement);

  // Add the BranchOnCount VPInstruction to the latch.
  VPInstruction *BranchBack =
      new VPInstruction(VPInstruction::BranchOnCount,
                        {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
  EB->appendRecipe(BranchBack);
}

// Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the
// original exit block.
static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB, Loop *OrigLoop,
                                VPlan &Plan) {
  BasicBlock *ExitBB = OrigLoop->getUniqueExitBlock();
  BasicBlock *ExitingBB = OrigLoop->getExitingBlock();
  // Only handle single-exit loops with unique exit blocks for now.
  if (!ExitBB || !ExitBB->getSinglePredecessor() || !ExitingBB)
    return;

  // Introduce VPUsers modeling the exit values.
  for (PHINode &ExitPhi : ExitBB->phis()) {
    Value *IncomingValue =
        ExitPhi.getIncomingValueForBlock(ExitingBB);
    VPValue *V = Plan.getVPValueOrAddLiveIn(IncomingValue);
    Plan.addLiveOut(&ExitPhi, V);
  }
}

VPlanPtr
LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {

  SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;

  VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);

  // ---------------------------------------------------------------------------
  // Pre-construction: record ingredients whose recipes we'll need to further
  // process after constructing the initial VPlan.
  // ---------------------------------------------------------------------------

  // For each interleave group which is relevant for this (possibly trimmed)
  // Range, add it to the set of groups to be later applied to the VPlan and add
  // placeholders for its members' Recipes which we'll be replacing with a
  // single VPInterleaveRecipe.
8678 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) { 8679 auto applyIG = [IG, this](ElementCount VF) -> bool { 8680 bool Result = (VF.isVector() && // Query is illegal for VF == 1 8681 CM.getWideningDecision(IG->getInsertPos(), VF) == 8682 LoopVectorizationCostModel::CM_Interleave); 8683 // For scalable vectors, the only interleave factor currently supported 8684 // is 2 since we require the (de)interleave2 intrinsics instead of 8685 // shufflevectors. 8686 assert((!Result || !VF.isScalable() || IG->getFactor() == 2) && 8687 "Unsupported interleave factor for scalable vectors"); 8688 return Result; 8689 }; 8690 if (!getDecisionAndClampRange(applyIG, Range)) 8691 continue; 8692 InterleaveGroups.insert(IG); 8693 for (unsigned i = 0; i < IG->getFactor(); i++) 8694 if (Instruction *Member = IG->getMember(i)) 8695 RecipeBuilder.recordRecipeOf(Member); 8696 }; 8697 8698 // --------------------------------------------------------------------------- 8699 // Build initial VPlan: Scan the body of the loop in a topological order to 8700 // visit each basic block after having visited its predecessor basic blocks. 8701 // --------------------------------------------------------------------------- 8702 8703 // Create initial VPlan skeleton, having a basic block for the pre-header 8704 // which contains SCEV expansions that need to happen before the CFG is 8705 // modified; a basic block for the vector pre-header, followed by a region for 8706 // the vector loop, followed by the middle basic block. The skeleton vector 8707 // loop region contains a header and latch basic blocks. 
8708 VPlanPtr Plan = VPlan::createInitialVPlan( 8709 createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop), 8710 *PSE.getSE()); 8711 VPBasicBlock *HeaderVPBB = new VPBasicBlock("vector.body"); 8712 VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch"); 8713 VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB); 8714 Plan->getVectorLoopRegion()->setEntry(HeaderVPBB); 8715 Plan->getVectorLoopRegion()->setExiting(LatchVPBB); 8716 8717 // Don't use getDecisionAndClampRange here, because we don't know the UF 8718 // so this function is better to be conservative, rather than to split 8719 // it up into different VPlans. 8720 // TODO: Consider using getDecisionAndClampRange here to split up VPlans. 8721 bool IVUpdateMayOverflow = false; 8722 for (ElementCount VF : Range) 8723 IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(&CM, VF); 8724 8725 DebugLoc DL = getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()); 8726 TailFoldingStyle Style = CM.getTailFoldingStyle(IVUpdateMayOverflow); 8727 // When not folding the tail, we know that the induction increment will not 8728 // overflow. 8729 bool HasNUW = Style == TailFoldingStyle::None; 8730 addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DL); 8731 8732 // Scan the body of the loop in a topological order to visit each basic block 8733 // after having visited its predecessor basic blocks. 8734 LoopBlocksDFS DFS(OrigLoop); 8735 DFS.perform(LI); 8736 8737 VPBasicBlock *VPBB = HeaderVPBB; 8738 bool NeedsMasks = CM.foldTailByMasking() || 8739 any_of(OrigLoop->blocks(), [this](BasicBlock *BB) { 8740 return Legal->blockNeedsPredication(BB); 8741 }); 8742 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 8743 // Relevant instructions from basic block BB will be grouped into VPRecipe 8744 // ingredients and fill a new VPBasicBlock. 
8745 if (VPBB != HeaderVPBB) 8746 VPBB->setName(BB->getName()); 8747 Builder.setInsertPoint(VPBB); 8748 8749 if (VPBB == HeaderVPBB) 8750 RecipeBuilder.createHeaderMask(*Plan); 8751 else if (NeedsMasks) 8752 RecipeBuilder.createBlockInMask(BB, *Plan); 8753 8754 // Introduce each ingredient into VPlan. 8755 // TODO: Model and preserve debug intrinsics in VPlan. 8756 for (Instruction &I : drop_end(BB->instructionsWithoutDebug(false))) { 8757 Instruction *Instr = &I; 8758 SmallVector<VPValue *, 4> Operands; 8759 auto *Phi = dyn_cast<PHINode>(Instr); 8760 if (Phi && Phi->getParent() == OrigLoop->getHeader()) { 8761 Operands.push_back(Plan->getVPValueOrAddLiveIn( 8762 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()))); 8763 } else { 8764 auto OpRange = Plan->mapToVPValues(Instr->operands()); 8765 Operands = {OpRange.begin(), OpRange.end()}; 8766 } 8767 8768 // Invariant stores inside loop will be deleted and a single store 8769 // with the final reduction value will be added to the exit block 8770 StoreInst *SI; 8771 if ((SI = dyn_cast<StoreInst>(&I)) && 8772 Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) 8773 continue; 8774 8775 auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe( 8776 Instr, Operands, Range, VPBB, Plan); 8777 if (!RecipeOrValue) 8778 RecipeOrValue = RecipeBuilder.handleReplication(Instr, Range, *Plan); 8779 // If Instr can be simplified to an existing VPValue, use it. 8780 if (isa<VPValue *>(RecipeOrValue)) { 8781 auto *VPV = cast<VPValue *>(RecipeOrValue); 8782 Plan->addVPValue(Instr, VPV); 8783 // If the re-used value is a recipe, register the recipe for the 8784 // instruction, in case the recipe for Instr needs to be recorded. 8785 if (VPRecipeBase *R = VPV->getDefiningRecipe()) 8786 RecipeBuilder.setRecipe(Instr, R); 8787 continue; 8788 } 8789 // Otherwise, add the new recipe. 
8790 VPRecipeBase *Recipe = cast<VPRecipeBase *>(RecipeOrValue); 8791 for (auto *Def : Recipe->definedValues()) { 8792 auto *UV = Def->getUnderlyingValue(); 8793 Plan->addVPValue(UV, Def); 8794 } 8795 8796 RecipeBuilder.setRecipe(Instr, Recipe); 8797 if (isa<VPHeaderPHIRecipe>(Recipe)) { 8798 // VPHeaderPHIRecipes must be kept in the phi section of HeaderVPBB. In 8799 // the following cases, VPHeaderPHIRecipes may be created after non-phi 8800 // recipes and need to be moved to the phi section of HeaderVPBB: 8801 // * tail-folding (non-phi recipes computing the header mask are 8802 // introduced earlier than regular header phi recipes, and should appear 8803 // after them) 8804 // * Optimizing truncates to VPWidenIntOrFpInductionRecipe. 8805 8806 assert((HeaderVPBB->getFirstNonPhi() == VPBB->end() || 8807 CM.foldTailByMasking() || isa<TruncInst>(Instr)) && 8808 "unexpected recipe needs moving"); 8809 Recipe->insertBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi()); 8810 } else 8811 VPBB->appendRecipe(Recipe); 8812 } 8813 8814 VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB); 8815 VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor()); 8816 } 8817 8818 // After here, VPBB should not be used. 8819 VPBB = nullptr; 8820 8821 if (CM.requiresScalarEpilogue(Range)) { 8822 // No edge from the middle block to the unique exit block has been inserted 8823 // and there is nothing to fix from vector loop; phis should have incoming 8824 // from scalar loop only. 
8825 } else 8826 addUsersInExitBlock(HeaderVPBB, OrigLoop, *Plan); 8827 8828 assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) && 8829 !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() && 8830 "entry block must be set to a VPRegionBlock having a non-empty entry " 8831 "VPBasicBlock"); 8832 RecipeBuilder.fixHeaderPhis(); 8833 8834 // --------------------------------------------------------------------------- 8835 // Transform initial VPlan: Apply previously taken decisions, in order, to 8836 // bring the VPlan to its final state. 8837 // --------------------------------------------------------------------------- 8838 8839 // Adjust the recipes for any inloop reductions. 8840 adjustRecipesForReductions(LatchVPBB, Plan, RecipeBuilder, Range.Start); 8841 8842 // Interleave memory: for each Interleave Group we marked earlier as relevant 8843 // for this VPlan, replace the Recipes widening its memory instructions with a 8844 // single VPInterleaveRecipe at its insertion point. 8845 for (const auto *IG : InterleaveGroups) { 8846 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 8847 RecipeBuilder.getRecipe(IG->getInsertPos())); 8848 SmallVector<VPValue *, 4> StoredValues; 8849 for (unsigned i = 0; i < IG->getFactor(); ++i) 8850 if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) { 8851 auto *StoreR = 8852 cast<VPWidenMemoryInstructionRecipe>(RecipeBuilder.getRecipe(SI)); 8853 StoredValues.push_back(StoreR->getStoredValue()); 8854 } 8855 8856 bool NeedsMaskForGaps = 8857 IG->requiresScalarEpilogue() && !CM.isScalarEpilogueAllowed(); 8858 auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues, 8859 Recipe->getMask(), NeedsMaskForGaps); 8860 VPIG->insertBefore(Recipe); 8861 unsigned J = 0; 8862 for (unsigned i = 0; i < IG->getFactor(); ++i) 8863 if (Instruction *Member = IG->getMember(i)) { 8864 VPRecipeBase *MemberR = RecipeBuilder.getRecipe(Member); 8865 if (!Member->getType()->isVoidTy()) { 8866 VPValue *OriginalV = 
MemberR->getVPSingleValue(); 8867 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J)); 8868 J++; 8869 } 8870 MemberR->eraseFromParent(); 8871 } 8872 } 8873 8874 for (ElementCount VF : Range) 8875 Plan->addVF(VF); 8876 Plan->setName("Initial VPlan"); 8877 8878 // Replace VPValues for known constant strides guaranteed by predicate scalar 8879 // evolution. 8880 for (auto [_, Stride] : Legal->getLAI()->getSymbolicStrides()) { 8881 auto *StrideV = cast<SCEVUnknown>(Stride)->getValue(); 8882 auto *ScevStride = dyn_cast<SCEVConstant>(PSE.getSCEV(StrideV)); 8883 // Only handle constant strides for now. 8884 if (!ScevStride) 8885 continue; 8886 Constant *CI = ConstantInt::get(Stride->getType(), ScevStride->getAPInt()); 8887 8888 auto *ConstVPV = Plan->getVPValueOrAddLiveIn(CI); 8889 // The versioned value may not be used in the loop directly, so just add a 8890 // new live-in in those cases. 8891 Plan->getVPValueOrAddLiveIn(StrideV)->replaceAllUsesWith(ConstVPV); 8892 } 8893 8894 // From this point onwards, VPlan-to-VPlan transformations may change the plan 8895 // in ways that accessing values using original IR values is incorrect. 8896 Plan->disableValue2VPValue(); 8897 8898 // Sink users of fixed-order recurrence past the recipe defining the previous 8899 // value and introduce FirstOrderRecurrenceSplice VPInstructions. 8900 if (!VPlanTransforms::adjustFixedOrderRecurrences(*Plan, Builder)) 8901 return nullptr; 8902 8903 if (useActiveLaneMask(Style)) { 8904 // TODO: Move checks to VPlanTransforms::addActiveLaneMask once 8905 // TailFoldingStyle is visible there. 
8906 bool ForControlFlow = useActiveLaneMaskForControlFlow(Style); 8907 bool WithoutRuntimeCheck = 8908 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck; 8909 VPlanTransforms::addActiveLaneMask(*Plan, ForControlFlow, 8910 WithoutRuntimeCheck); 8911 } 8912 return Plan; 8913} 8914 8915VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { 8916 // Outer loop handling: They may require CFG and instruction level 8917 // transformations before even evaluating whether vectorization is profitable. 8918 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 8919 // the vectorization pipeline. 8920 assert(!OrigLoop->isInnermost()); 8921 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 8922 8923 // Create new empty VPlan 8924 auto Plan = VPlan::createInitialVPlan( 8925 createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop), 8926 *PSE.getSE()); 8927 8928 // Build hierarchical CFG 8929 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); 8930 HCFGBuilder.buildHierarchicalCFG(); 8931 8932 for (ElementCount VF : Range) 8933 Plan->addVF(VF); 8934 8935 VPlanTransforms::VPInstructionsToVPRecipes( 8936 Plan, 8937 [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); }, 8938 *PSE.getSE(), *TLI); 8939 8940 // Remove the existing terminator of the exiting block of the top-most region. 8941 // A BranchOnCount will be added instead when adding the canonical IV recipes. 8942 auto *Term = 8943 Plan->getVectorLoopRegion()->getExitingBasicBlock()->getTerminator(); 8944 Term->eraseFromParent(); 8945 8946 // Tail folding is not supported for outer loops, so the induction increment 8947 // is guaranteed to not wrap. 8948 bool HasNUW = true; 8949 addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, 8950 DebugLoc()); 8951 return Plan; 8952} 8953 8954// Adjust the recipes for reductions. 
For in-loop reductions the chain of 8955// instructions leading from the loop exit instr to the phi need to be converted 8956// to reductions, with one operand being vector and the other being the scalar 8957// reduction chain. For other reductions, a select is introduced between the phi 8958// and live-out recipes when folding the tail. 8959// 8960// A ComputeReductionResult recipe is added to the middle block, also for 8961// in-loop reductions which compute their result in-loop, because generating 8962// the subsequent bc.merge.rdx phi is driven by ComputeReductionResult recipes. 8963void LoopVectorizationPlanner::adjustRecipesForReductions( 8964 VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, 8965 ElementCount MinVF) { 8966 VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion(); 8967 VPBasicBlock *Header = VectorLoopRegion->getEntryBasicBlock(); 8968 // Gather all VPReductionPHIRecipe and sort them so that Intermediate stores 8969 // sank outside of the loop would keep the same order as they had in the 8970 // original loop. 8971 SmallVector<VPReductionPHIRecipe *> ReductionPHIList; 8972 for (VPRecipeBase &R : Header->phis()) { 8973 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) 8974 ReductionPHIList.emplace_back(ReductionPhi); 8975 } 8976 bool HasIntermediateStore = false; 8977 stable_sort(ReductionPHIList, 8978 [this, &HasIntermediateStore](const VPReductionPHIRecipe *R1, 8979 const VPReductionPHIRecipe *R2) { 8980 auto *IS1 = R1->getRecurrenceDescriptor().IntermediateStore; 8981 auto *IS2 = R2->getRecurrenceDescriptor().IntermediateStore; 8982 HasIntermediateStore |= IS1 || IS2; 8983 8984 // If neither of the recipes has an intermediate store, keep the 8985 // order the same. 8986 if (!IS1 && !IS2) 8987 return false; 8988 8989 // If only one of the recipes has an intermediate store, then 8990 // move it towards the beginning of the list. 
8991 if (IS1 && !IS2) 8992 return true; 8993 8994 if (!IS1 && IS2) 8995 return false; 8996 8997 // If both recipes have an intermediate store, then the recipe 8998 // with the later store should be processed earlier. So it 8999 // should go to the beginning of the list. 9000 return DT->dominates(IS2, IS1); 9001 }); 9002 9003 if (HasIntermediateStore && ReductionPHIList.size() > 1) 9004 for (VPRecipeBase *R : ReductionPHIList) 9005 R->moveBefore(*Header, Header->getFirstNonPhi()); 9006 9007 for (VPRecipeBase &R : Header->phis()) { 9008 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R); 9009 if (!PhiR || !PhiR->isInLoop() || (MinVF.isScalar() && !PhiR->isOrdered())) 9010 continue; 9011 9012 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); 9013 RecurKind Kind = RdxDesc.getRecurrenceKind(); 9014 assert(!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) && 9015 "AnyOf reductions are not allowed for in-loop reductions"); 9016 9017 // Collect the chain of "link" recipes for the reduction starting at PhiR. 9018 SetVector<VPSingleDefRecipe *> Worklist; 9019 Worklist.insert(PhiR); 9020 for (unsigned I = 0; I != Worklist.size(); ++I) { 9021 VPSingleDefRecipe *Cur = Worklist[I]; 9022 for (VPUser *U : Cur->users()) { 9023 auto *UserRecipe = dyn_cast<VPSingleDefRecipe>(U); 9024 if (!UserRecipe) { 9025 assert(isa<VPLiveOut>(U) && 9026 "U must either be a VPSingleDef or VPLiveOut"); 9027 continue; 9028 } 9029 Worklist.insert(UserRecipe); 9030 } 9031 } 9032 9033 // Visit operation "Links" along the reduction chain top-down starting from 9034 // the phi until LoopExitValue. We keep track of the previous item 9035 // (PreviousLink) to tell which of the two operands of a Link will remain 9036 // scalar and which will be reduced. For minmax by select(cmp), Link will be 9037 // the select instructions. 9038 VPSingleDefRecipe *PreviousLink = PhiR; // Aka Worklist[0]. 
9039 for (VPSingleDefRecipe *CurrentLink : Worklist.getArrayRef().drop_front()) { 9040 Instruction *CurrentLinkI = CurrentLink->getUnderlyingInstr(); 9041 9042 // Index of the first operand which holds a non-mask vector operand. 9043 unsigned IndexOfFirstOperand; 9044 // Recognize a call to the llvm.fmuladd intrinsic. 9045 bool IsFMulAdd = (Kind == RecurKind::FMulAdd); 9046 VPValue *VecOp; 9047 VPBasicBlock *LinkVPBB = CurrentLink->getParent(); 9048 if (IsFMulAdd) { 9049 assert( 9050 RecurrenceDescriptor::isFMulAddIntrinsic(CurrentLinkI) && 9051 "Expected instruction to be a call to the llvm.fmuladd intrinsic"); 9052 assert(((MinVF.isScalar() && isa<VPReplicateRecipe>(CurrentLink)) || 9053 isa<VPWidenCallRecipe>(CurrentLink)) && 9054 CurrentLink->getOperand(2) == PreviousLink && 9055 "expected a call where the previous link is the added operand"); 9056 9057 // If the instruction is a call to the llvm.fmuladd intrinsic then we 9058 // need to create an fmul recipe (multiplying the first two operands of 9059 // the fmuladd together) to use as the vector operand for the fadd 9060 // reduction. 
9061 VPInstruction *FMulRecipe = new VPInstruction( 9062 Instruction::FMul, 9063 {CurrentLink->getOperand(0), CurrentLink->getOperand(1)}, 9064 CurrentLinkI->getFastMathFlags()); 9065 LinkVPBB->insert(FMulRecipe, CurrentLink->getIterator()); 9066 VecOp = FMulRecipe; 9067 } else { 9068 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9069 if (isa<VPWidenRecipe>(CurrentLink)) { 9070 assert(isa<CmpInst>(CurrentLinkI) && 9071 "need to have the compare of the select"); 9072 continue; 9073 } 9074 assert(isa<VPWidenSelectRecipe>(CurrentLink) && 9075 "must be a select recipe"); 9076 IndexOfFirstOperand = 1; 9077 } else { 9078 assert((MinVF.isScalar() || isa<VPWidenRecipe>(CurrentLink)) && 9079 "Expected to replace a VPWidenSC"); 9080 IndexOfFirstOperand = 0; 9081 } 9082 // Note that for non-commutable operands (cmp-selects), the semantics of 9083 // the cmp-select are captured in the recurrence kind. 9084 unsigned VecOpId = 9085 CurrentLink->getOperand(IndexOfFirstOperand) == PreviousLink 9086 ? IndexOfFirstOperand + 1 9087 : IndexOfFirstOperand; 9088 VecOp = CurrentLink->getOperand(VecOpId); 9089 assert(VecOp != PreviousLink && 9090 CurrentLink->getOperand(CurrentLink->getNumOperands() - 1 - 9091 (VecOpId - IndexOfFirstOperand)) == 9092 PreviousLink && 9093 "PreviousLink must be the operand other than VecOp"); 9094 } 9095 9096 BasicBlock *BB = CurrentLinkI->getParent(); 9097 VPValue *CondOp = nullptr; 9098 if (CM.blockNeedsPredicationForAnyReason(BB)) { 9099 VPBuilder::InsertPointGuard Guard(Builder); 9100 Builder.setInsertPoint(CurrentLink); 9101 CondOp = RecipeBuilder.getBlockInMask(BB); 9102 } 9103 9104 VPReductionRecipe *RedRecipe = new VPReductionRecipe( 9105 RdxDesc, CurrentLinkI, PreviousLink, VecOp, CondOp); 9106 // Append the recipe to the end of the VPBasicBlock because we need to 9107 // ensure that it comes after all of it's inputs, including CondOp. 
9108 // Note that this transformation may leave over dead recipes (including 9109 // CurrentLink), which will be cleaned by a later VPlan transform. 9110 LinkVPBB->appendRecipe(RedRecipe); 9111 CurrentLink->replaceAllUsesWith(RedRecipe); 9112 PreviousLink = RedRecipe; 9113 } 9114 } 9115 Builder.setInsertPoint(&*LatchVPBB->begin()); 9116 for (VPRecipeBase &R : 9117 Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) { 9118 VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R); 9119 if (!PhiR) 9120 continue; 9121 9122 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); 9123 // If tail is folded by masking, introduce selects between the phi 9124 // and the live-out instruction of each reduction, at the beginning of the 9125 // dedicated latch block. 9126 auto *OrigExitingVPV = PhiR->getBackedgeValue(); 9127 auto *NewExitingVPV = PhiR->getBackedgeValue(); 9128 if (!PhiR->isInLoop() && CM.foldTailByMasking()) { 9129 VPValue *Cond = RecipeBuilder.getBlockInMask(OrigLoop->getHeader()); 9130 assert(OrigExitingVPV->getDefiningRecipe()->getParent() != LatchVPBB && 9131 "reduction recipe must be defined before latch"); 9132 Type *PhiTy = PhiR->getOperand(0)->getLiveInIRValue()->getType(); 9133 std::optional<FastMathFlags> FMFs = 9134 PhiTy->isFloatingPointTy() 9135 ? 
std::make_optional(RdxDesc.getFastMathFlags()) 9136 : std::nullopt; 9137 NewExitingVPV = 9138 Builder.createSelect(Cond, OrigExitingVPV, PhiR, {}, "", FMFs); 9139 OrigExitingVPV->replaceUsesWithIf(NewExitingVPV, [](VPUser &U, unsigned) { 9140 return isa<VPInstruction>(&U) && 9141 cast<VPInstruction>(&U)->getOpcode() == 9142 VPInstruction::ComputeReductionResult; 9143 }); 9144 if (PreferPredicatedReductionSelect || 9145 TTI.preferPredicatedReductionSelect( 9146 PhiR->getRecurrenceDescriptor().getOpcode(), PhiTy, 9147 TargetTransformInfo::ReductionFlags())) 9148 PhiR->setOperand(1, NewExitingVPV); 9149 } 9150 9151 // If the vector reduction can be performed in a smaller type, we truncate 9152 // then extend the loop exit value to enable InstCombine to evaluate the 9153 // entire expression in the smaller type. 9154 Type *PhiTy = PhiR->getStartValue()->getLiveInIRValue()->getType(); 9155 if (MinVF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) { 9156 assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!"); 9157 Type *RdxTy = RdxDesc.getRecurrenceType(); 9158 auto *Trunc = 9159 new VPWidenCastRecipe(Instruction::Trunc, NewExitingVPV, RdxTy); 9160 auto *Extnd = 9161 RdxDesc.isSigned() 9162 ? new VPWidenCastRecipe(Instruction::SExt, Trunc, PhiTy) 9163 : new VPWidenCastRecipe(Instruction::ZExt, Trunc, PhiTy); 9164 9165 Trunc->insertAfter(NewExitingVPV->getDefiningRecipe()); 9166 Extnd->insertAfter(Trunc); 9167 if (PhiR->getOperand(1) == NewExitingVPV) 9168 PhiR->setOperand(1, Extnd->getVPSingleValue()); 9169 NewExitingVPV = Extnd; 9170 } 9171 9172 // We want code in the middle block to appear to execute on the location of 9173 // the scalar loop's latch terminator because: (a) it is all compiler 9174 // generated, (b) these instructions are always executed after evaluating 9175 // the latch conditional branch, and (c) other passes may add new 9176 // predecessors which terminate on this line. 
This is the easiest way to 9177 // ensure we don't accidentally cause an extra step back into the loop while 9178 // debugging. 9179 DebugLoc ExitDL = OrigLoop->getLoopLatch()->getTerminator()->getDebugLoc(); 9180 9181 // TODO: At the moment ComputeReductionResult also drives creation of the 9182 // bc.merge.rdx phi nodes, hence it needs to be created unconditionally here 9183 // even for in-loop reductions, until the reduction resume value handling is 9184 // also modeled in VPlan. 9185 auto *FinalReductionResult = new VPInstruction( 9186 VPInstruction::ComputeReductionResult, {PhiR, NewExitingVPV}, ExitDL); 9187 cast<VPBasicBlock>(VectorLoopRegion->getSingleSuccessor()) 9188 ->appendRecipe(FinalReductionResult); 9189 OrigExitingVPV->replaceUsesWithIf( 9190 FinalReductionResult, 9191 [](VPUser &User, unsigned) { return isa<VPLiveOut>(&User); }); 9192 } 9193 9194 VPlanTransforms::clearReductionWrapFlags(*Plan); 9195} 9196 9197#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 9198void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 9199 VPSlotTracker &SlotTracker) const { 9200 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 9201 IG->getInsertPos()->printAsOperand(O, false); 9202 O << ", "; 9203 getAddr()->printAsOperand(O, SlotTracker); 9204 VPValue *Mask = getMask(); 9205 if (Mask) { 9206 O << ", "; 9207 Mask->printAsOperand(O, SlotTracker); 9208 } 9209 9210 unsigned OpIdx = 0; 9211 for (unsigned i = 0; i < IG->getFactor(); ++i) { 9212 if (!IG->getMember(i)) 9213 continue; 9214 if (getNumStoreOperands() > 0) { 9215 O << "\n" << Indent << " store "; 9216 getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker); 9217 O << " to index " << i; 9218 } else { 9219 O << "\n" << Indent << " "; 9220 getVPValue(OpIdx)->printAsOperand(O, SlotTracker); 9221 O << " = load from index " << i; 9222 } 9223 ++OpIdx; 9224 } 9225} 9226#endif 9227 9228void VPWidenPointerInductionRecipe::execute(VPTransformState &State) { 9229 
assert(IndDesc.getKind() == InductionDescriptor::IK_PtrInduction && 9230 "Not a pointer induction according to InductionDescriptor!"); 9231 assert(cast<PHINode>(getUnderlyingInstr())->getType()->isPointerTy() && 9232 "Unexpected type."); 9233 9234 auto *IVR = getParent()->getPlan()->getCanonicalIV(); 9235 PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0)); 9236 9237 if (onlyScalarsGenerated(State.VF)) { 9238 // This is the normalized GEP that starts counting at zero. 9239 Value *PtrInd = State.Builder.CreateSExtOrTrunc( 9240 CanonicalIV, IndDesc.getStep()->getType()); 9241 // Determine the number of scalars we need to generate for each unroll 9242 // iteration. If the instruction is uniform, we only need to generate the 9243 // first lane. Otherwise, we generate all VF values. 9244 bool IsUniform = vputils::onlyFirstLaneUsed(this); 9245 assert((IsUniform || !State.VF.isScalable()) && 9246 "Cannot scalarize a scalable VF"); 9247 unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue(); 9248 9249 for (unsigned Part = 0; Part < State.UF; ++Part) { 9250 Value *PartStart = 9251 createStepForVF(State.Builder, PtrInd->getType(), State.VF, Part); 9252 9253 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 9254 Value *Idx = State.Builder.CreateAdd( 9255 PartStart, ConstantInt::get(PtrInd->getType(), Lane)); 9256 Value *GlobalIdx = State.Builder.CreateAdd(PtrInd, Idx); 9257 9258 Value *Step = State.get(getOperand(1), VPIteration(Part, Lane)); 9259 Value *SclrGep = emitTransformedIndex( 9260 State.Builder, GlobalIdx, IndDesc.getStartValue(), Step, 9261 IndDesc.getKind(), IndDesc.getInductionBinOp()); 9262 SclrGep->setName("next.gep"); 9263 State.set(this, SclrGep, VPIteration(Part, Lane)); 9264 } 9265 } 9266 return; 9267 } 9268 9269 Type *PhiType = IndDesc.getStep()->getType(); 9270 9271 // Build a pointer phi 9272 Value *ScalarStartValue = getStartValue()->getLiveInIRValue(); 9273 Type *ScStValueType = ScalarStartValue->getType(); 9274 PHINode *NewPointerPhi = 9275 
PHINode::Create(ScStValueType, 2, "pointer.phi", CanonicalIV); 9276 9277 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); 9278 NewPointerPhi->addIncoming(ScalarStartValue, VectorPH); 9279 9280 // A pointer induction, performed by using a gep 9281 Instruction *InductionLoc = &*State.Builder.GetInsertPoint(); 9282 9283 Value *ScalarStepValue = State.get(getOperand(1), VPIteration(0, 0)); 9284 Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF); 9285 Value *NumUnrolledElems = 9286 State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF)); 9287 Value *InductionGEP = GetElementPtrInst::Create( 9288 State.Builder.getInt8Ty(), NewPointerPhi, 9289 State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind", 9290 InductionLoc); 9291 // Add induction update using an incorrect block temporarily. The phi node 9292 // will be fixed after VPlan execution. Note that at this point the latch 9293 // block cannot be used, as it does not exist yet. 9294 // TODO: Model increment value in VPlan, by turning the recipe into a 9295 // multi-def and a subclass of VPHeaderPHIRecipe. 9296 NewPointerPhi->addIncoming(InductionGEP, VectorPH); 9297 9298 // Create UF many actual address geps that use the pointer 9299 // phi as base and a vectorized version of the step value 9300 // (<step*0, ..., step*N>) as offset. 9301 for (unsigned Part = 0; Part < State.UF; ++Part) { 9302 Type *VecPhiType = VectorType::get(PhiType, State.VF); 9303 Value *StartOffsetScalar = 9304 State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part)); 9305 Value *StartOffset = 9306 State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar); 9307 // Create a vector of consecutive numbers from zero to VF. 
9308 StartOffset = State.Builder.CreateAdd( 9309 StartOffset, State.Builder.CreateStepVector(VecPhiType)); 9310 9311 assert(ScalarStepValue == State.get(getOperand(1), VPIteration(Part, 0)) && 9312 "scalar step must be the same across all parts"); 9313 Value *GEP = State.Builder.CreateGEP( 9314 State.Builder.getInt8Ty(), NewPointerPhi, 9315 State.Builder.CreateMul( 9316 StartOffset, 9317 State.Builder.CreateVectorSplat(State.VF, ScalarStepValue), 9318 "vector.gep")); 9319 State.set(this, GEP, Part); 9320 } 9321} 9322 9323void VPDerivedIVRecipe::execute(VPTransformState &State) { 9324 assert(!State.Instance && "VPDerivedIVRecipe being replicated."); 9325 9326 // Fast-math-flags propagate from the original induction instruction. 9327 IRBuilder<>::FastMathFlagGuard FMFG(State.Builder); 9328 if (FPBinOp) 9329 State.Builder.setFastMathFlags(FPBinOp->getFastMathFlags()); 9330 9331 Value *Step = State.get(getStepValue(), VPIteration(0, 0)); 9332 Value *CanonicalIV = State.get(getCanonicalIV(), VPIteration(0, 0)); 9333 Value *DerivedIV = emitTransformedIndex( 9334 State.Builder, CanonicalIV, getStartValue()->getLiveInIRValue(), Step, 9335 Kind, cast_if_present<BinaryOperator>(FPBinOp)); 9336 DerivedIV->setName("offset.idx"); 9337 if (TruncResultTy) { 9338 assert(TruncResultTy != DerivedIV->getType() && 9339 Step->getType()->isIntegerTy() && 9340 "Truncation requires an integer step"); 9341 DerivedIV = State.Builder.CreateTrunc(DerivedIV, TruncResultTy); 9342 } 9343 assert(DerivedIV != CanonicalIV && "IV didn't need transforming?"); 9344 9345 State.set(this, DerivedIV, VPIteration(0, 0)); 9346} 9347 9348void VPInterleaveRecipe::execute(VPTransformState &State) { 9349 assert(!State.Instance && "Interleave group being replicated."); 9350 State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(), 9351 getStoredValues(), getMask(), 9352 NeedsMaskForGaps); 9353} 9354 9355void VPReductionRecipe::execute(VPTransformState &State) { 9356 assert(!State.Instance && 
"Reduction being replicated."); 9357 Value *PrevInChain = State.get(getChainOp(), 0); 9358 RecurKind Kind = RdxDesc.getRecurrenceKind(); 9359 bool IsOrdered = State.ILV->useOrderedReductions(RdxDesc); 9360 // Propagate the fast-math flags carried by the underlying instruction. 9361 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder); 9362 State.Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); 9363 for (unsigned Part = 0; Part < State.UF; ++Part) { 9364 Value *NewVecOp = State.get(getVecOp(), Part); 9365 if (VPValue *Cond = getCondOp()) { 9366 Value *NewCond = State.VF.isVector() ? State.get(Cond, Part) 9367 : State.get(Cond, {Part, 0}); 9368 VectorType *VecTy = dyn_cast<VectorType>(NewVecOp->getType()); 9369 Type *ElementTy = VecTy ? VecTy->getElementType() : NewVecOp->getType(); 9370 Value *Iden = RdxDesc.getRecurrenceIdentity(Kind, ElementTy, 9371 RdxDesc.getFastMathFlags()); 9372 if (State.VF.isVector()) { 9373 Iden = 9374 State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden); 9375 } 9376 9377 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, Iden); 9378 NewVecOp = Select; 9379 } 9380 Value *NewRed; 9381 Value *NextInChain; 9382 if (IsOrdered) { 9383 if (State.VF.isVector()) 9384 NewRed = createOrderedReduction(State.Builder, RdxDesc, NewVecOp, 9385 PrevInChain); 9386 else 9387 NewRed = State.Builder.CreateBinOp( 9388 (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), PrevInChain, 9389 NewVecOp); 9390 PrevInChain = NewRed; 9391 } else { 9392 PrevInChain = State.get(getChainOp(), Part); 9393 NewRed = createTargetReduction(State.Builder, RdxDesc, NewVecOp); 9394 } 9395 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { 9396 NextInChain = createMinMaxOp(State.Builder, RdxDesc.getRecurrenceKind(), 9397 NewRed, PrevInChain); 9398 } else if (IsOrdered) 9399 NextInChain = NewRed; 9400 else 9401 NextInChain = State.Builder.CreateBinOp( 9402 (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), NewRed, PrevInChain); 9403 State.set(this, 
NextInChain, Part); 9404 } 9405} 9406 9407void VPReplicateRecipe::execute(VPTransformState &State) { 9408 Instruction *UI = getUnderlyingInstr(); 9409 if (State.Instance) { // Generate a single instance. 9410 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); 9411 State.ILV->scalarizeInstruction(UI, this, *State.Instance, State); 9412 // Insert scalar instance packing it into a vector. 9413 if (State.VF.isVector() && shouldPack()) { 9414 // If we're constructing lane 0, initialize to start from poison. 9415 if (State.Instance->Lane.isFirstLane()) { 9416 assert(!State.VF.isScalable() && "VF is assumed to be non scalable."); 9417 Value *Poison = PoisonValue::get( 9418 VectorType::get(UI->getType(), State.VF)); 9419 State.set(this, Poison, State.Instance->Part); 9420 } 9421 State.packScalarIntoVectorValue(this, *State.Instance); 9422 } 9423 return; 9424 } 9425 9426 if (IsUniform) { 9427 // If the recipe is uniform across all parts (instead of just per VF), only 9428 // generate a single instance. 9429 if ((isa<LoadInst>(UI) || isa<StoreInst>(UI)) && 9430 all_of(operands(), [](VPValue *Op) { 9431 return Op->isDefinedOutsideVectorRegions(); 9432 })) { 9433 State.ILV->scalarizeInstruction(UI, this, VPIteration(0, 0), State); 9434 if (user_begin() != user_end()) { 9435 for (unsigned Part = 1; Part < State.UF; ++Part) 9436 State.set(this, State.get(this, VPIteration(0, 0)), 9437 VPIteration(Part, 0)); 9438 } 9439 return; 9440 } 9441 9442 // Uniform within VL means we need to generate lane 0 only for each 9443 // unrolled copy. 9444 for (unsigned Part = 0; Part < State.UF; ++Part) 9445 State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, 0), State); 9446 return; 9447 } 9448 9449 // A store of a loop varying value to a uniform address only needs the last 9450 // copy of the store. 
9451 if (isa<StoreInst>(UI) && 9452 vputils::isUniformAfterVectorization(getOperand(1))) { 9453 auto Lane = VPLane::getLastLaneForVF(State.VF); 9454 State.ILV->scalarizeInstruction(UI, this, VPIteration(State.UF - 1, Lane), 9455 State); 9456 return; 9457 } 9458 9459 // Generate scalar instances for all VF lanes of all UF parts. 9460 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); 9461 const unsigned EndLane = State.VF.getKnownMinValue(); 9462 for (unsigned Part = 0; Part < State.UF; ++Part) 9463 for (unsigned Lane = 0; Lane < EndLane; ++Lane) 9464 State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, Lane), State); 9465} 9466 9467void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { 9468 VPValue *StoredValue = isStore() ? getStoredValue() : nullptr; 9469 9470 // Attempt to issue a wide load. 9471 LoadInst *LI = dyn_cast<LoadInst>(&Ingredient); 9472 StoreInst *SI = dyn_cast<StoreInst>(&Ingredient); 9473 9474 assert((LI || SI) && "Invalid Load/Store instruction"); 9475 assert((!SI || StoredValue) && "No stored value provided for widened store"); 9476 assert((!LI || !StoredValue) && "Stored value provided for widened load"); 9477 9478 Type *ScalarDataTy = getLoadStoreType(&Ingredient); 9479 9480 auto *DataTy = VectorType::get(ScalarDataTy, State.VF); 9481 const Align Alignment = getLoadStoreAlignment(&Ingredient); 9482 bool CreateGatherScatter = !isConsecutive(); 9483 9484 auto &Builder = State.Builder; 9485 InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF); 9486 bool isMaskRequired = getMask(); 9487 if (isMaskRequired) { 9488 // Mask reversal is only needed for non-all-one (null) masks, as reverse of 9489 // a null all-one mask is a null mask. 
9490 for (unsigned Part = 0; Part < State.UF; ++Part) { 9491 Value *Mask = State.get(getMask(), Part); 9492 if (isReverse()) 9493 Mask = Builder.CreateVectorReverse(Mask, "reverse"); 9494 BlockInMaskParts[Part] = Mask; 9495 } 9496 } 9497 9498 // Handle Stores: 9499 if (SI) { 9500 State.setDebugLocFrom(SI->getDebugLoc()); 9501 9502 for (unsigned Part = 0; Part < State.UF; ++Part) { 9503 Instruction *NewSI = nullptr; 9504 Value *StoredVal = State.get(StoredValue, Part); 9505 if (CreateGatherScatter) { 9506 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 9507 Value *VectorGep = State.get(getAddr(), Part); 9508 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 9509 MaskPart); 9510 } else { 9511 if (isReverse()) { 9512 // If we store to reverse consecutive memory locations, then we need 9513 // to reverse the order of elements in the stored value. 9514 StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse"); 9515 // We don't want to update the value in the map as it might be used in 9516 // another expression. So don't call resetVectorValue(StoredVal). 9517 } 9518 auto *VecPtr = State.get(getAddr(), Part); 9519 if (isMaskRequired) 9520 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 9521 BlockInMaskParts[Part]); 9522 else 9523 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 9524 } 9525 State.addMetadata(NewSI, SI); 9526 } 9527 return; 9528 } 9529 9530 // Handle loads. 9531 assert(LI && "Must have a load instruction"); 9532 State.setDebugLocFrom(LI->getDebugLoc()); 9533 for (unsigned Part = 0; Part < State.UF; ++Part) { 9534 Value *NewLI; 9535 if (CreateGatherScatter) { 9536 Value *MaskPart = isMaskRequired ? 
BlockInMaskParts[Part] : nullptr; 9537 Value *VectorGep = State.get(getAddr(), Part); 9538 NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart, 9539 nullptr, "wide.masked.gather"); 9540 State.addMetadata(NewLI, LI); 9541 } else { 9542 auto *VecPtr = State.get(getAddr(), Part); 9543 if (isMaskRequired) 9544 NewLI = Builder.CreateMaskedLoad( 9545 DataTy, VecPtr, Alignment, BlockInMaskParts[Part], 9546 PoisonValue::get(DataTy), "wide.masked.load"); 9547 else 9548 NewLI = 9549 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 9550 9551 // Add metadata to the load, but setVectorValue to the reverse shuffle. 9552 State.addMetadata(NewLI, LI); 9553 if (Reverse) 9554 NewLI = Builder.CreateVectorReverse(NewLI, "reverse"); 9555 } 9556 9557 State.set(getVPSingleValue(), NewLI, Part); 9558 } 9559} 9560 9561// Determine how to lower the scalar epilogue, which depends on 1) optimising 9562// for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 9563// predication, and 4) a TTI hook that analyses whether the loop is suitable 9564// for predication. 9565static ScalarEpilogueLowering getScalarEpilogueLowering( 9566 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, 9567 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 9568 LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI) { 9569 // 1) OptSize takes precedence over all other options, i.e. if this is set, 9570 // don't look at hints or options, and don't request a scalar epilogue. 9571 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from 9572 // LoopAccessInfo (due to code dependency and not being able to reliably get 9573 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection 9574 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without 9575 // versioning when the vectorization is forced, unlike hasOptSize. 
So revert 9576 // back to the old way and vectorize with versioning when forced. See D81345.) 9577 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, 9578 PGSOQueryType::IRPass) && 9579 Hints.getForce() != LoopVectorizeHints::FK_Enabled)) 9580 return CM_ScalarEpilogueNotAllowedOptSize; 9581 9582 // 2) If set, obey the directives 9583 if (PreferPredicateOverEpilogue.getNumOccurrences()) { 9584 switch (PreferPredicateOverEpilogue) { 9585 case PreferPredicateTy::ScalarEpilogue: 9586 return CM_ScalarEpilogueAllowed; 9587 case PreferPredicateTy::PredicateElseScalarEpilogue: 9588 return CM_ScalarEpilogueNotNeededUsePredicate; 9589 case PreferPredicateTy::PredicateOrDontVectorize: 9590 return CM_ScalarEpilogueNotAllowedUsePredicate; 9591 }; 9592 } 9593 9594 // 3) If set, obey the hints 9595 switch (Hints.getPredicate()) { 9596 case LoopVectorizeHints::FK_Enabled: 9597 return CM_ScalarEpilogueNotNeededUsePredicate; 9598 case LoopVectorizeHints::FK_Disabled: 9599 return CM_ScalarEpilogueAllowed; 9600 }; 9601 9602 // 4) if the TTI hook indicates this is profitable, request predication. 9603 TailFoldingInfo TFI(TLI, &LVL, IAI); 9604 if (TTI->preferPredicateOverEpilogue(&TFI)) 9605 return CM_ScalarEpilogueNotNeededUsePredicate; 9606 9607 return CM_ScalarEpilogueAllowed; 9608} 9609 9610// Process the loop in the VPlan-native vectorization path. This path builds 9611// VPlan upfront in the vectorization pipeline, which allows to apply 9612// VPlan-to-VPlan transformations from the very beginning without modifying the 9613// input LLVM IR. 
9614static bool processLoopInVPlanNativePath( 9615 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, 9616 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, 9617 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, 9618 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, 9619 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints, 9620 LoopVectorizationRequirements &Requirements) { 9621 9622 if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) { 9623 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n"); 9624 return false; 9625 } 9626 assert(EnableVPlanNativePath && "VPlan-native path is disabled."); 9627 Function *F = L->getHeader()->getParent(); 9628 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); 9629 9630 ScalarEpilogueLowering SEL = 9631 getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, *LVL, &IAI); 9632 9633 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, 9634 &Hints, IAI); 9635 // Use the planner for outer loop vectorization. 9636 // TODO: CM is not used at this point inside the planner. Turn CM into an 9637 // optional argument if we don't need it in the future. 9638 LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, LVL, CM, IAI, PSE, Hints, 9639 ORE); 9640 9641 // Get user vectorization factor. 9642 ElementCount UserVF = Hints.getWidth(); 9643 9644 CM.collectElementTypesForWidening(); 9645 9646 // Plan how to best vectorize, return the best VF and its cost. 9647 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); 9648 9649 // If we are stress testing VPlan builds, do not attempt to generate vector 9650 // code. Masked vector code generation support will follow soon. 9651 // Also, do not attempt to vectorize if no vector code will be produced. 
9652 if (VPlanBuildStressTest || VectorizationFactor::Disabled() == VF) 9653 return false; 9654 9655 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); 9656 9657 { 9658 bool AddBranchWeights = 9659 hasBranchWeightMD(*L->getLoopLatch()->getTerminator()); 9660 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI, 9661 F->getParent()->getDataLayout(), AddBranchWeights); 9662 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 9663 VF.Width, 1, LVL, &CM, BFI, PSI, Checks); 9664 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" 9665 << L->getHeader()->getParent()->getName() << "\"\n"); 9666 LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false); 9667 } 9668 9669 reportVectorization(ORE, L, VF, 1); 9670 9671 // Mark the loop as already vectorized to avoid vectorizing again. 9672 Hints.setAlreadyVectorized(); 9673 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 9674 return true; 9675} 9676 9677// Emit a remark if there are stores to floats that required a floating point 9678// extension. If the vectorized loop was generated with floating point there 9679// will be a performance penalty from the conversion overhead and the change in 9680// the vector width. 9681static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) { 9682 SmallVector<Instruction *, 4> Worklist; 9683 for (BasicBlock *BB : L->getBlocks()) { 9684 for (Instruction &Inst : *BB) { 9685 if (auto *S = dyn_cast<StoreInst>(&Inst)) { 9686 if (S->getValueOperand()->getType()->isFloatTy()) 9687 Worklist.push_back(S); 9688 } 9689 } 9690 } 9691 9692 // Traverse the floating point stores upwards searching, for floating point 9693 // conversions. 
9694 SmallPtrSet<const Instruction *, 4> Visited; 9695 SmallPtrSet<const Instruction *, 4> EmittedRemark; 9696 while (!Worklist.empty()) { 9697 auto *I = Worklist.pop_back_val(); 9698 if (!L->contains(I)) 9699 continue; 9700 if (!Visited.insert(I).second) 9701 continue; 9702 9703 // Emit a remark if the floating point store required a floating 9704 // point conversion. 9705 // TODO: More work could be done to identify the root cause such as a 9706 // constant or a function return type and point the user to it. 9707 if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second) 9708 ORE->emit([&]() { 9709 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision", 9710 I->getDebugLoc(), L->getHeader()) 9711 << "floating point conversion changes vector width. " 9712 << "Mixed floating point precision requires an up/down " 9713 << "cast that will negatively impact performance."; 9714 }); 9715 9716 for (Use &Op : I->operands()) 9717 if (auto *OpI = dyn_cast<Instruction>(Op)) 9718 Worklist.push_back(OpI); 9719 } 9720} 9721 9722static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks, 9723 VectorizationFactor &VF, 9724 std::optional<unsigned> VScale, Loop *L, 9725 ScalarEvolution &SE, 9726 ScalarEpilogueLowering SEL) { 9727 InstructionCost CheckCost = Checks.getCost(); 9728 if (!CheckCost.isValid()) 9729 return false; 9730 9731 // When interleaving only scalar and vector cost will be equal, which in turn 9732 // would lead to a divide by 0. Fall back to hard threshold. 9733 if (VF.Width.isScalar()) { 9734 if (CheckCost > VectorizeMemoryCheckThreshold) { 9735 LLVM_DEBUG( 9736 dbgs() 9737 << "LV: Interleaving only is not profitable due to runtime checks\n"); 9738 return false; 9739 } 9740 return true; 9741 } 9742 9743 // The scalar cost should only be 0 when vectorizing with a user specified VF/IC. In those cases, runtime checks should always be generated. 
9744 double ScalarC = *VF.ScalarCost.getValue(); 9745 if (ScalarC == 0) 9746 return true; 9747 9748 // First, compute the minimum iteration count required so that the vector 9749 // loop outperforms the scalar loop. 9750 // The total cost of the scalar loop is 9751 // ScalarC * TC 9752 // where 9753 // * TC is the actual trip count of the loop. 9754 // * ScalarC is the cost of a single scalar iteration. 9755 // 9756 // The total cost of the vector loop is 9757 // RtC + VecC * (TC / VF) + EpiC 9758 // where 9759 // * RtC is the cost of the generated runtime checks 9760 // * VecC is the cost of a single vector iteration. 9761 // * TC is the actual trip count of the loop 9762 // * VF is the vectorization factor 9763 // * EpiCost is the cost of the generated epilogue, including the cost 9764 // of the remaining scalar operations. 9765 // 9766 // Vectorization is profitable once the total vector cost is less than the 9767 // total scalar cost: 9768 // RtC + VecC * (TC / VF) + EpiC < ScalarC * TC 9769 // 9770 // Now we can compute the minimum required trip count TC as 9771 // (RtC + EpiC) / (ScalarC - (VecC / VF)) < TC 9772 // 9773 // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that 9774 // the computations are performed on doubles, not integers and the result 9775 // is rounded up, hence we get an upper estimate of the TC. 9776 unsigned IntVF = VF.Width.getKnownMinValue(); 9777 if (VF.Width.isScalable()) { 9778 unsigned AssumedMinimumVscale = 1; 9779 if (VScale) 9780 AssumedMinimumVscale = *VScale; 9781 IntVF *= AssumedMinimumVscale; 9782 } 9783 double VecCOverVF = double(*VF.Cost.getValue()) / IntVF; 9784 double RtC = *CheckCost.getValue(); 9785 double MinTC1 = RtC / (ScalarC - VecCOverVF); 9786 9787 // Second, compute a minimum iteration count so that the cost of the 9788 // runtime checks is only a fraction of the total scalar loop cost. This 9789 // adds a loop-dependent bound on the overhead incurred if the runtime 9790 // checks fail. 
In case the runtime checks fail, the cost is RtC + ScalarC 9791 // * TC. To bound the runtime check to be a fraction 1/X of the scalar 9792 // cost, compute 9793 // RtC < ScalarC * TC * (1 / X) ==> RtC * X / ScalarC < TC 9794 double MinTC2 = RtC * 10 / ScalarC; 9795 9796 // Now pick the larger minimum. If it is not a multiple of VF and a scalar 9797 // epilogue is allowed, choose the next closest multiple of VF. This should 9798 // partly compensate for ignoring the epilogue cost. 9799 uint64_t MinTC = std::ceil(std::max(MinTC1, MinTC2)); 9800 if (SEL == CM_ScalarEpilogueAllowed) 9801 MinTC = alignTo(MinTC, IntVF); 9802 VF.MinProfitableTripCount = ElementCount::getFixed(MinTC); 9803 9804 LLVM_DEBUG( 9805 dbgs() << "LV: Minimum required TC for runtime checks to be profitable:" 9806 << VF.MinProfitableTripCount << "\n"); 9807 9808 // Skip vectorization if the expected trip count is less than the minimum 9809 // required trip count. 9810 if (auto ExpectedTC = getSmallBestKnownTC(SE, L)) { 9811 if (ElementCount::isKnownLT(ElementCount::getFixed(*ExpectedTC), 9812 VF.MinProfitableTripCount)) { 9813 LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected " 9814 "trip count < minimum profitable VF (" 9815 << *ExpectedTC << " < " << VF.MinProfitableTripCount 9816 << ")\n"); 9817 9818 return false; 9819 } 9820 } 9821 return true; 9822} 9823 9824LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) 9825 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced || 9826 !EnableLoopInterleaving), 9827 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced || 9828 !EnableLoopVectorization) {} 9829 9830bool LoopVectorizePass::processLoop(Loop *L) { 9831 assert((EnableVPlanNativePath || L->isInnermost()) && 9832 "VPlan-native path is not enabled. 
Only process inner loops."); 9833 9834#ifndef NDEBUG 9835 const std::string DebugLocStr = getDebugLocString(L); 9836#endif /* NDEBUG */ 9837 9838 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '" 9839 << L->getHeader()->getParent()->getName() << "' from " 9840 << DebugLocStr << "\n"); 9841 9842 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI); 9843 9844 LLVM_DEBUG( 9845 dbgs() << "LV: Loop hints:" 9846 << " force=" 9847 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 9848 ? "disabled" 9849 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 9850 ? "enabled" 9851 : "?")) 9852 << " width=" << Hints.getWidth() 9853 << " interleave=" << Hints.getInterleave() << "\n"); 9854 9855 // Function containing loop 9856 Function *F = L->getHeader()->getParent(); 9857 9858 // Looking at the diagnostic output is the only way to determine if a loop 9859 // was vectorized (other than looking at the IR or machine code), so it 9860 // is important to generate an optimization remark for each loop. Most of 9861 // these messages are generated as OptimizationRemarkAnalysis. Remarks 9862 // generated as OptimizationRemark and OptimizationRemarkMissed are 9863 // less verbose reporting vectorized loops and unvectorized loops that may 9864 // benefit from vectorization, respectively. 9865 9866 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { 9867 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 9868 return false; 9869 } 9870 9871 PredicatedScalarEvolution PSE(*SE, *L); 9872 9873 // Check if it is legal to vectorize the loop. 
9874 LoopVectorizationRequirements Requirements; 9875 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, F, *LAIs, LI, ORE, 9876 &Requirements, &Hints, DB, AC, BFI, PSI); 9877 if (!LVL.canVectorize(EnableVPlanNativePath)) { 9878 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 9879 Hints.emitRemarkWithHints(); 9880 return false; 9881 } 9882 9883 // Entrance to the VPlan-native vectorization path. Outer loops are processed 9884 // here. They may require CFG and instruction level transformations before 9885 // even evaluating whether vectorization is profitable. Since we cannot modify 9886 // the incoming IR, we need to build VPlan upfront in the vectorization 9887 // pipeline. 9888 if (!L->isInnermost()) 9889 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, 9890 ORE, BFI, PSI, Hints, Requirements); 9891 9892 assert(L->isInnermost() && "Inner loop expected."); 9893 9894 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI()); 9895 bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); 9896 9897 // If an override option has been passed in for interleaved accesses, use it. 9898 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) 9899 UseInterleaved = EnableInterleavedMemAccesses; 9900 9901 // Analyze interleaved memory accesses. 9902 if (UseInterleaved) 9903 IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI)); 9904 9905 // Check the function attributes and profiles to find out if this function 9906 // should be optimized for size. 9907 ScalarEpilogueLowering SEL = 9908 getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, LVL, &IAI); 9909 9910 // Check the loop for a trip count threshold: vectorize loops with a tiny trip 9911 // count by optimizing for size, to minimize overheads. 9912 auto ExpectedTC = getSmallBestKnownTC(*SE, L); 9913 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { 9914 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. 
" 9915 << "This loop is worth vectorizing only if no scalar " 9916 << "iteration overheads are incurred."); 9917 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) 9918 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); 9919 else { 9920 if (*ExpectedTC > TTI->getMinTripCountTailFoldingThreshold()) { 9921 LLVM_DEBUG(dbgs() << "\n"); 9922 // Predicate tail-folded loops are efficient even when the loop 9923 // iteration count is low. However, setting the epilogue policy to 9924 // `CM_ScalarEpilogueNotAllowedLowTripLoop` prevents vectorizing loops 9925 // with runtime checks. It's more effective to let 9926 // `areRuntimeChecksProfitable` determine if vectorization is beneficial 9927 // for the loop. 9928 if (SEL != CM_ScalarEpilogueNotNeededUsePredicate) 9929 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; 9930 } else { 9931 LLVM_DEBUG(dbgs() << " But the target considers the trip count too " 9932 "small to consider vectorizing.\n"); 9933 reportVectorizationFailure( 9934 "The trip count is below the minial threshold value.", 9935 "loop trip count is too low, avoiding vectorization", 9936 "LowTripCount", ORE, L); 9937 Hints.emitRemarkWithHints(); 9938 return false; 9939 } 9940 } 9941 } 9942 9943 // Check the function attributes to see if implicit floats or vectors are 9944 // allowed. 9945 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { 9946 reportVectorizationFailure( 9947 "Can't vectorize when the NoImplicitFloat attribute is used", 9948 "loop not vectorized due to NoImplicitFloat attribute", 9949 "NoImplicitFloat", ORE, L); 9950 Hints.emitRemarkWithHints(); 9951 return false; 9952 } 9953 9954 // Check if the target supports potentially unsafe FP vectorization. 9955 // FIXME: Add a check for the type of safety issue (denormal, signaling) 9956 // for the target we're vectorizing for, to make sure none of the 9957 // additional fp-math flags can help. 
9958 if (Hints.isPotentiallyUnsafe() && 9959 TTI->isFPVectorizationPotentiallyUnsafe()) { 9960 reportVectorizationFailure( 9961 "Potentially unsafe FP op prevents vectorization", 9962 "loop not vectorized due to unsafe FP support.", 9963 "UnsafeFP", ORE, L); 9964 Hints.emitRemarkWithHints(); 9965 return false; 9966 } 9967 9968 bool AllowOrderedReductions; 9969 // If the flag is set, use that instead and override the TTI behaviour. 9970 if (ForceOrderedReductions.getNumOccurrences() > 0) 9971 AllowOrderedReductions = ForceOrderedReductions; 9972 else 9973 AllowOrderedReductions = TTI->enableOrderedReductions(); 9974 if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) { 9975 ORE->emit([&]() { 9976 auto *ExactFPMathInst = Requirements.getExactFPInst(); 9977 return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps", 9978 ExactFPMathInst->getDebugLoc(), 9979 ExactFPMathInst->getParent()) 9980 << "loop not vectorized: cannot prove it is safe to reorder " 9981 "floating-point operations"; 9982 }); 9983 LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to " 9984 "reorder floating-point operations\n"); 9985 Hints.emitRemarkWithHints(); 9986 return false; 9987 } 9988 9989 // Use the cost model. 9990 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, 9991 F, &Hints, IAI); 9992 // Use the planner for vectorization. 9993 LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, &LVL, CM, IAI, PSE, Hints, 9994 ORE); 9995 9996 // Get user vectorization factor and interleave count. 9997 ElementCount UserVF = Hints.getWidth(); 9998 unsigned UserIC = Hints.getInterleave(); 9999 10000 // Plan how to best vectorize, return the best VF and its cost. 
10001 std::optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC); 10002 10003 VectorizationFactor VF = VectorizationFactor::Disabled(); 10004 unsigned IC = 1; 10005 10006 bool AddBranchWeights = 10007 hasBranchWeightMD(*L->getLoopLatch()->getTerminator()); 10008 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI, 10009 F->getParent()->getDataLayout(), AddBranchWeights); 10010 if (MaybeVF) { 10011 VF = *MaybeVF; 10012 // Select the interleave count. 10013 IC = CM.selectInterleaveCount(VF.Width, VF.Cost); 10014 10015 unsigned SelectedIC = std::max(IC, UserIC); 10016 // Optimistically generate runtime checks if they are needed. Drop them if 10017 // they turn out to not be profitable. 10018 if (VF.Width.isVector() || SelectedIC > 1) 10019 Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC); 10020 10021 // Check if it is profitable to vectorize with runtime checks. 10022 bool ForceVectorization = 10023 Hints.getForce() == LoopVectorizeHints::FK_Enabled; 10024 if (!ForceVectorization && 10025 !areRuntimeChecksProfitable(Checks, VF, getVScaleForTuning(L, *TTI), L, 10026 *PSE.getSE(), SEL)) { 10027 ORE->emit([&]() { 10028 return OptimizationRemarkAnalysisAliasing( 10029 DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(), 10030 L->getHeader()) 10031 << "loop not vectorized: cannot prove it is safe to reorder " 10032 "memory operations"; 10033 }); 10034 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n"); 10035 Hints.emitRemarkWithHints(); 10036 return false; 10037 } 10038 } 10039 10040 // Identify the diagnostic messages that should be produced. 
10041 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg; 10042 bool VectorizeLoop = true, InterleaveLoop = true; 10043 if (VF.Width.isScalar()) { 10044 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); 10045 VecDiagMsg = std::make_pair( 10046 "VectorizationNotBeneficial", 10047 "the cost-model indicates that vectorization is not beneficial"); 10048 VectorizeLoop = false; 10049 } 10050 10051 if (!MaybeVF && UserIC > 1) { 10052 // Tell the user interleaving was avoided up-front, despite being explicitly 10053 // requested. 10054 LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and " 10055 "interleaving should be avoided up front\n"); 10056 IntDiagMsg = std::make_pair( 10057 "InterleavingAvoided", 10058 "Ignoring UserIC, because interleaving was avoided up front"); 10059 InterleaveLoop = false; 10060 } else if (IC == 1 && UserIC <= 1) { 10061 // Tell the user interleaving is not beneficial. 10062 LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n"); 10063 IntDiagMsg = std::make_pair( 10064 "InterleavingNotBeneficial", 10065 "the cost-model indicates that interleaving is not beneficial"); 10066 InterleaveLoop = false; 10067 if (UserIC == 1) { 10068 IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled"; 10069 IntDiagMsg.second += 10070 " and is explicitly disabled or interleave count is set to 1"; 10071 } 10072 } else if (IC > 1 && UserIC == 1) { 10073 // Tell the user interleaving is beneficial, but it explicitly disabled. 10074 LLVM_DEBUG( 10075 dbgs() << "LV: Interleaving is beneficial but is explicitly disabled."); 10076 IntDiagMsg = std::make_pair( 10077 "InterleavingBeneficialButDisabled", 10078 "the cost-model indicates that interleaving is beneficial " 10079 "but is explicitly disabled or interleave count is set to 1"); 10080 InterleaveLoop = false; 10081 } 10082 10083 // Override IC if user provided an interleave count. 10084 IC = UserIC > 0 ? 
UserIC : IC; 10085 10086 // Emit diagnostic messages, if any. 10087 const char *VAPassName = Hints.vectorizeAnalysisPassName(); 10088 if (!VectorizeLoop && !InterleaveLoop) { 10089 // Do not vectorize or interleaving the loop. 10090 ORE->emit([&]() { 10091 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first, 10092 L->getStartLoc(), L->getHeader()) 10093 << VecDiagMsg.second; 10094 }); 10095 ORE->emit([&]() { 10096 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first, 10097 L->getStartLoc(), L->getHeader()) 10098 << IntDiagMsg.second; 10099 }); 10100 return false; 10101 } else if (!VectorizeLoop && InterleaveLoop) { 10102 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 10103 ORE->emit([&]() { 10104 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first, 10105 L->getStartLoc(), L->getHeader()) 10106 << VecDiagMsg.second; 10107 }); 10108 } else if (VectorizeLoop && !InterleaveLoop) { 10109 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 10110 << ") in " << DebugLocStr << '\n'); 10111 ORE->emit([&]() { 10112 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first, 10113 L->getStartLoc(), L->getHeader()) 10114 << IntDiagMsg.second; 10115 }); 10116 } else if (VectorizeLoop && InterleaveLoop) { 10117 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 10118 << ") in " << DebugLocStr << '\n'); 10119 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 10120 } 10121 10122 bool DisableRuntimeUnroll = false; 10123 MDNode *OrigLoopID = L->getLoopID(); 10124 { 10125 using namespace ore; 10126 if (!VectorizeLoop) { 10127 assert(IC > 1 && "interleave count should not be 1 or 0"); 10128 // If we decided that it is not legal to vectorize the loop, then 10129 // interleave it. 
10130 InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, 10131 &CM, BFI, PSI, Checks); 10132 10133 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); 10134 LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false); 10135 10136 ORE->emit([&]() { 10137 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), 10138 L->getHeader()) 10139 << "interleaved loop (interleaved count: " 10140 << NV("InterleaveCount", IC) << ")"; 10141 }); 10142 } else { 10143 // If we decided that it is *legal* to vectorize the loop, then do it. 10144 10145 // Consider vectorizing the epilogue too if it's profitable. 10146 VectorizationFactor EpilogueVF = 10147 LVP.selectEpilogueVectorizationFactor(VF.Width, IC); 10148 if (EpilogueVF.Width.isVector()) { 10149 10150 // The first pass vectorizes the main loop and creates a scalar epilogue 10151 // to be vectorized by executing the plan (potentially with a different 10152 // factor) again shortly afterwards. 10153 EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1); 10154 EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, 10155 EPI, &LVL, &CM, BFI, PSI, Checks); 10156 10157 VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF); 10158 const auto &[ExpandedSCEVs, ReductionResumeValues] = LVP.executePlan( 10159 EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV, DT, true); 10160 ++LoopsVectorized; 10161 10162 // Second pass vectorizes the epilogue and adjusts the control flow 10163 // edges from the first pass. 
10164 EPI.MainLoopVF = EPI.EpilogueVF; 10165 EPI.MainLoopUF = EPI.EpilogueUF; 10166 EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC, 10167 ORE, EPI, &LVL, &CM, BFI, PSI, 10168 Checks); 10169 10170 VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF); 10171 VPRegionBlock *VectorLoop = BestEpiPlan.getVectorLoopRegion(); 10172 VPBasicBlock *Header = VectorLoop->getEntryBasicBlock(); 10173 Header->setName("vec.epilog.vector.body"); 10174 10175 // Re-use the trip count and steps expanded for the main loop, as 10176 // skeleton creation needs it as a value that dominates both the scalar 10177 // and vector epilogue loops 10178 // TODO: This is a workaround needed for epilogue vectorization and it 10179 // should be removed once induction resume value creation is done 10180 // directly in VPlan. 10181 EpilogILV.setTripCount(MainILV.getTripCount()); 10182 for (auto &R : make_early_inc_range(*BestEpiPlan.getPreheader())) { 10183 auto *ExpandR = cast<VPExpandSCEVRecipe>(&R); 10184 auto *ExpandedVal = BestEpiPlan.getVPValueOrAddLiveIn( 10185 ExpandedSCEVs.find(ExpandR->getSCEV())->second); 10186 ExpandR->replaceAllUsesWith(ExpandedVal); 10187 ExpandR->eraseFromParent(); 10188 } 10189 10190 // Ensure that the start values for any VPWidenIntOrFpInductionRecipe, 10191 // VPWidenPointerInductionRecipe and VPReductionPHIRecipes are updated 10192 // before vectorizing the epilogue loop. 10193 for (VPRecipeBase &R : Header->phis()) { 10194 if (isa<VPCanonicalIVPHIRecipe>(&R)) 10195 continue; 10196 10197 Value *ResumeV = nullptr; 10198 // TODO: Move setting of resume values to prepareToExecute. 
10199 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) { 10200 ResumeV = ReductionResumeValues 10201 .find(&ReductionPhi->getRecurrenceDescriptor()) 10202 ->second; 10203 } else { 10204 // Create induction resume values for both widened pointer and 10205 // integer/fp inductions and update the start value of the induction 10206 // recipes to use the resume value. 10207 PHINode *IndPhi = nullptr; 10208 const InductionDescriptor *ID; 10209 if (auto *Ind = dyn_cast<VPWidenPointerInductionRecipe>(&R)) { 10210 IndPhi = cast<PHINode>(Ind->getUnderlyingValue()); 10211 ID = &Ind->getInductionDescriptor(); 10212 } else { 10213 auto *WidenInd = cast<VPWidenIntOrFpInductionRecipe>(&R); 10214 IndPhi = WidenInd->getPHINode(); 10215 ID = &WidenInd->getInductionDescriptor(); 10216 } 10217 10218 ResumeV = MainILV.createInductionResumeValue( 10219 IndPhi, *ID, getExpandedStep(*ID, ExpandedSCEVs), 10220 {EPI.MainLoopIterationCountCheck}); 10221 } 10222 assert(ResumeV && "Must have a resume value"); 10223 VPValue *StartVal = BestEpiPlan.getVPValueOrAddLiveIn(ResumeV); 10224 cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal); 10225 } 10226 10227 LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV, 10228 DT, true, &ExpandedSCEVs); 10229 ++LoopsEpilogueVectorized; 10230 10231 if (!MainILV.areSafetyChecksAdded()) 10232 DisableRuntimeUnroll = true; 10233 } else { 10234 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 10235 VF.MinProfitableTripCount, IC, &LVL, &CM, BFI, 10236 PSI, Checks); 10237 10238 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); 10239 LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false); 10240 ++LoopsVectorized; 10241 10242 // Add metadata to disable runtime unrolling a scalar loop when there 10243 // are no runtime checks about strides and memory. A scalar loop that is 10244 // rarely used is not worth unrolling. 
10245 if (!LB.areSafetyChecksAdded()) 10246 DisableRuntimeUnroll = true; 10247 } 10248 // Report the vectorization decision. 10249 reportVectorization(ORE, L, VF, IC); 10250 } 10251 10252 if (ORE->allowExtraAnalysis(LV_NAME)) 10253 checkMixedPrecision(L, ORE); 10254 } 10255 10256 std::optional<MDNode *> RemainderLoopID = 10257 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 10258 LLVMLoopVectorizeFollowupEpilogue}); 10259 if (RemainderLoopID) { 10260 L->setLoopID(*RemainderLoopID); 10261 } else { 10262 if (DisableRuntimeUnroll) 10263 AddRuntimeUnrollDisableMetaData(L); 10264 10265 // Mark the loop as already vectorized to avoid vectorizing again. 10266 Hints.setAlreadyVectorized(); 10267 } 10268 10269 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); 10270 return true; 10271} 10272 10273LoopVectorizeResult LoopVectorizePass::runImpl( 10274 Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_, 10275 DominatorTree &DT_, BlockFrequencyInfo *BFI_, TargetLibraryInfo *TLI_, 10276 DemandedBits &DB_, AssumptionCache &AC_, LoopAccessInfoManager &LAIs_, 10277 OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) { 10278 SE = &SE_; 10279 LI = &LI_; 10280 TTI = &TTI_; 10281 DT = &DT_; 10282 BFI = BFI_; 10283 TLI = TLI_; 10284 AC = &AC_; 10285 LAIs = &LAIs_; 10286 DB = &DB_; 10287 ORE = &ORE_; 10288 PSI = PSI_; 10289 10290 // Don't attempt if 10291 // 1. the target claims to have no vector registers, and 10292 // 2. interleaving won't help ILP. 10293 // 10294 // The second condition is necessary because, even if the target has no 10295 // vector registers, loop vectorization may still enable scalar 10296 // interleaving. 
10297 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) && 10298 TTI->getMaxInterleaveFactor(ElementCount::getFixed(1)) < 2) 10299 return LoopVectorizeResult(false, false); 10300 10301 bool Changed = false, CFGChanged = false; 10302 10303 // The vectorizer requires loops to be in simplified form. 10304 // Since simplification may add new inner loops, it has to run before the 10305 // legality and profitability checks. This means running the loop vectorizer 10306 // will simplify all loops, regardless of whether anything end up being 10307 // vectorized. 10308 for (const auto &L : *LI) 10309 Changed |= CFGChanged |= 10310 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); 10311 10312 // Build up a worklist of inner-loops to vectorize. This is necessary as 10313 // the act of vectorizing or partially unrolling a loop creates new loops 10314 // and can invalidate iterators across the loops. 10315 SmallVector<Loop *, 8> Worklist; 10316 10317 for (Loop *L : *LI) 10318 collectSupportedLoops(*L, LI, ORE, Worklist); 10319 10320 LoopsAnalyzed += Worklist.size(); 10321 10322 // Now walk the identified inner loops. 10323 while (!Worklist.empty()) { 10324 Loop *L = Worklist.pop_back_val(); 10325 10326 // For the inner loops we actually process, form LCSSA to simplify the 10327 // transform. 10328 Changed |= formLCSSARecursively(*L, *DT, LI, SE); 10329 10330 Changed |= CFGChanged |= processLoop(L); 10331 10332 if (Changed) { 10333 LAIs->clear(); 10334 10335#ifndef NDEBUG 10336 if (VerifySCEV) 10337 SE->verify(); 10338#endif 10339 } 10340 } 10341 10342 // Process each loop nest in the function. 10343 return LoopVectorizeResult(Changed, CFGChanged); 10344} 10345 10346PreservedAnalyses LoopVectorizePass::run(Function &F, 10347 FunctionAnalysisManager &AM) { 10348 auto &LI = AM.getResult<LoopAnalysis>(F); 10349 // There are no loops in the function. Return before computing other expensive 10350 // analyses. 
10351 if (LI.empty()) 10352 return PreservedAnalyses::all(); 10353 auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F); 10354 auto &TTI = AM.getResult<TargetIRAnalysis>(F); 10355 auto &DT = AM.getResult<DominatorTreeAnalysis>(F); 10356 auto &TLI = AM.getResult<TargetLibraryAnalysis>(F); 10357 auto &AC = AM.getResult<AssumptionAnalysis>(F); 10358 auto &DB = AM.getResult<DemandedBitsAnalysis>(F); 10359 auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F); 10360 10361 LoopAccessInfoManager &LAIs = AM.getResult<LoopAccessAnalysis>(F); 10362 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F); 10363 ProfileSummaryInfo *PSI = 10364 MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent()); 10365 BlockFrequencyInfo *BFI = nullptr; 10366 if (PSI && PSI->hasProfileSummary()) 10367 BFI = &AM.getResult<BlockFrequencyAnalysis>(F); 10368 LoopVectorizeResult Result = 10369 runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AC, LAIs, ORE, PSI); 10370 if (!Result.MadeAnyChange) 10371 return PreservedAnalyses::all(); 10372 PreservedAnalyses PA; 10373 10374 if (isAssignmentTrackingEnabled(*F.getParent())) { 10375 for (auto &BB : F) 10376 RemoveRedundantDbgInstrs(&BB); 10377 } 10378 10379 // We currently do not preserve loopinfo/dominator analyses with outer loop 10380 // vectorization. Until this is addressed, mark these analyses as preserved 10381 // only for non-VPlan-native path. 10382 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 10383 if (!EnableVPlanNativePath) { 10384 PA.preserve<LoopAnalysis>(); 10385 PA.preserve<DominatorTreeAnalysis>(); 10386 PA.preserve<ScalarEvolutionAnalysis>(); 10387 } 10388 10389 if (Result.MadeCFGChange) { 10390 // Making CFG changes likely means a loop got vectorized. Indicate that 10391 // extra simplification passes should be run. 10392 // TODO: MadeCFGChanges is not a prefect proxy. Extra passes should only 10393 // be run if runtime checks have been added. 
10394 AM.getResult<ShouldRunExtraVectorPasses>(F); 10395 PA.preserve<ShouldRunExtraVectorPasses>(); 10396 } else { 10397 PA.preserveSet<CFGAnalyses>(); 10398 } 10399 return PA; 10400} 10401 10402void LoopVectorizePass::printPipeline( 10403 raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) { 10404 static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline( 10405 OS, MapClassName2PassName); 10406 10407 OS << '<'; 10408 OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;"; 10409 OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;"; 10410 OS << '>'; 10411} 10412