1243789Sdim//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===// 2243789Sdim// 3243789Sdim// The LLVM Compiler Infrastructure 4243789Sdim// 5243789Sdim// This file is distributed under the University of Illinois Open Source 6243789Sdim// License. See LICENSE.TXT for details. 7243789Sdim// 8243789Sdim//===----------------------------------------------------------------------===// 9243789Sdim// 10243789Sdim// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops 11251662Sdim// and generates target-independent LLVM-IR. 12251662Sdim// The vectorizer uses the TargetTransformInfo analysis to estimate the costs 13251662Sdim// of instructions in order to estimate the profitability of vectorization. 14243789Sdim// 15249423Sdim// The loop vectorizer combines consecutive loop iterations into a single 16243789Sdim// 'wide' iteration. After this transformation the index is incremented 17243789Sdim// by the SIMD vector width, and not by one. 18243789Sdim// 19243789Sdim// This pass has three parts: 20243789Sdim// 1. The main loop pass that drives the different parts. 21243789Sdim// 2. LoopVectorizationLegality - A unit that checks for the legality 22243789Sdim// of the vectorization. 23249423Sdim// 3. InnerLoopVectorizer - A unit that performs the actual 24243789Sdim// widening of instructions. 25243789Sdim// 4. LoopVectorizationCostModel - A unit that checks for the profitability 26243789Sdim// of vectorization. It decides on the optimal vector width, which 27243789Sdim// can be one, if vectorization is not profitable. 28249423Sdim// 29243789Sdim//===----------------------------------------------------------------------===// 30243789Sdim// 31243789Sdim// The reduction-variable vectorization is based on the paper: 32243789Sdim// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization. 33243789Sdim// 34243789Sdim// Variable uniformity checks are inspired by: 35249423Sdim// Karrenberg, R. and Hack, S. 
Whole Function Vectorization. 36243789Sdim// 37288943Sdim// The interleaved access vectorization is based on the paper: 38288943Sdim// Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved 39288943Sdim// Data for SIMD 40288943Sdim// 41243789Sdim// Other ideas/concepts are from: 42243789Sdim// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later. 43243789Sdim// 44249423Sdim// S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of 45249423Sdim// Vectorizing Compilers. 46249423Sdim// 47243789Sdim//===----------------------------------------------------------------------===// 48249423Sdim 49249423Sdim#include "llvm/Transforms/Vectorize.h" 50249423Sdim#include "llvm/ADT/DenseMap.h" 51261991Sdim#include "llvm/ADT/Hashing.h" 52249423Sdim#include "llvm/ADT/MapVector.h" 53261991Sdim#include "llvm/ADT/SetVector.h" 54249423Sdim#include "llvm/ADT/SmallPtrSet.h" 55249423Sdim#include "llvm/ADT/SmallSet.h" 56243789Sdim#include "llvm/ADT/SmallVector.h" 57276479Sdim#include "llvm/ADT/Statistic.h" 58243789Sdim#include "llvm/ADT/StringExtras.h" 59243789Sdim#include "llvm/Analysis/AliasAnalysis.h" 60296417Sdim#include "llvm/Analysis/BasicAliasAnalysis.h" 61276479Sdim#include "llvm/Analysis/AliasSetTracker.h" 62280031Sdim#include "llvm/Analysis/AssumptionCache.h" 63276479Sdim#include "llvm/Analysis/BlockFrequencyInfo.h" 64280031Sdim#include "llvm/Analysis/CodeMetrics.h" 65296417Sdim#include "llvm/Analysis/DemandedBits.h" 66296417Sdim#include "llvm/Analysis/GlobalsModRef.h" 67288943Sdim#include "llvm/Analysis/LoopAccessAnalysis.h" 68249423Sdim#include "llvm/Analysis/LoopInfo.h" 69249423Sdim#include "llvm/Analysis/LoopIterator.h" 70249423Sdim#include "llvm/Analysis/LoopPass.h" 71243789Sdim#include "llvm/Analysis/ScalarEvolution.h" 72249423Sdim#include "llvm/Analysis/ScalarEvolutionExpander.h" 73243789Sdim#include "llvm/Analysis/ScalarEvolutionExpressions.h" 74249423Sdim#include "llvm/Analysis/TargetTransformInfo.h" 75243789Sdim#include 
"llvm/Analysis/ValueTracking.h" 76249423Sdim#include "llvm/IR/Constants.h" 77249423Sdim#include "llvm/IR/DataLayout.h" 78276479Sdim#include "llvm/IR/DebugInfo.h" 79249423Sdim#include "llvm/IR/DerivedTypes.h" 80276479Sdim#include "llvm/IR/DiagnosticInfo.h" 81276479Sdim#include "llvm/IR/Dominators.h" 82249423Sdim#include "llvm/IR/Function.h" 83249423Sdim#include "llvm/IR/IRBuilder.h" 84249423Sdim#include "llvm/IR/Instructions.h" 85249423Sdim#include "llvm/IR/IntrinsicInst.h" 86249423Sdim#include "llvm/IR/LLVMContext.h" 87249423Sdim#include "llvm/IR/Module.h" 88276479Sdim#include "llvm/IR/PatternMatch.h" 89249423Sdim#include "llvm/IR/Type.h" 90249423Sdim#include "llvm/IR/Value.h" 91276479Sdim#include "llvm/IR/ValueHandle.h" 92276479Sdim#include "llvm/IR/Verifier.h" 93249423Sdim#include "llvm/Pass.h" 94276479Sdim#include "llvm/Support/BranchProbability.h" 95243789Sdim#include "llvm/Support/CommandLine.h" 96243789Sdim#include "llvm/Support/Debug.h" 97243789Sdim#include "llvm/Support/raw_ostream.h" 98249423Sdim#include "llvm/Transforms/Scalar.h" 99249423Sdim#include "llvm/Transforms/Utils/BasicBlockUtils.h" 100243789Sdim#include "llvm/Transforms/Utils/Local.h" 101288943Sdim#include "llvm/Analysis/VectorUtils.h" 102288943Sdim#include "llvm/Transforms/Utils/LoopUtils.h" 103243789Sdim#include <algorithm> 104296417Sdim#include <functional> 105249423Sdim#include <map> 106276479Sdim#include <tuple> 107249423Sdim 108243789Sdimusing namespace llvm; 109251662Sdimusing namespace llvm::PatternMatch; 110243789Sdim 111276479Sdim#define LV_NAME "loop-vectorize" 112276479Sdim#define DEBUG_TYPE LV_NAME 113276479Sdim 114276479SdimSTATISTIC(LoopsVectorized, "Number of loops vectorized"); 115276479SdimSTATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization"); 116276479Sdim 117249423Sdimstatic cl::opt<bool> 118249423SdimEnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden, 119249423Sdim cl::desc("Enable if-conversion during vectorization.")); 120249423Sdim 
121243789Sdim/// We don't vectorize loops with a known constant trip count below this number. 122249423Sdimstatic cl::opt<unsigned> 123249423SdimTinyTripCountVectorThreshold("vectorizer-min-trip-count", cl::init(16), 124249423Sdim cl::Hidden, 125249423Sdim cl::desc("Don't vectorize loops with a constant " 126249423Sdim "trip count that is smaller than this " 127249423Sdim "value.")); 128243789Sdim 129296417Sdimstatic cl::opt<bool> MaximizeBandwidth( 130296417Sdim "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, 131296417Sdim cl::desc("Maximize bandwidth when selecting vectorization factor which " 132296417Sdim "will be determined by the smallest type in loop.")); 133296417Sdim 134276479Sdim/// This enables versioning on the strides of symbolically striding memory 135276479Sdim/// accesses in code like the following. 136276479Sdim/// for (i = 0; i < N; ++i) 137276479Sdim/// A[i * Stride1] += B[i * Stride2] ... 138276479Sdim/// 139276479Sdim/// Will be roughly translated to 140276479Sdim/// if (Stride1 == 1 && Stride2 == 1) { 141276479Sdim/// for (i = 0; i < N; i+=4) 142276479Sdim/// A[i:i+3] += ... 143276479Sdim/// } else 144276479Sdim/// ... 145276479Sdimstatic cl::opt<bool> EnableMemAccessVersioning( 146276479Sdim "enable-mem-access-versioning", cl::init(true), cl::Hidden, 147296417Sdim cl::desc("Enable symbolic stride memory access versioning")); 148276479Sdim 149288943Sdimstatic cl::opt<bool> EnableInterleavedMemAccesses( 150288943Sdim "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden, 151288943Sdim cl::desc("Enable vectorization on interleaved memory accesses in a loop")); 152249423Sdim 153288943Sdim/// Maximum factor for an interleaved memory access. 
154288943Sdimstatic cl::opt<unsigned> MaxInterleaveGroupFactor( 155288943Sdim "max-interleave-group-factor", cl::Hidden, 156288943Sdim cl::desc("Maximum factor for an interleaved access group (default = 8)"), 157288943Sdim cl::init(8)); 158243789Sdim 159288943Sdim/// We don't interleave loops with a known constant trip count below this 160288943Sdim/// number. 161288943Sdimstatic const unsigned TinyTripCountInterleaveThreshold = 128; 162249423Sdim 163276479Sdimstatic cl::opt<unsigned> ForceTargetNumScalarRegs( 164276479Sdim "force-target-num-scalar-regs", cl::init(0), cl::Hidden, 165276479Sdim cl::desc("A flag that overrides the target's number of scalar registers.")); 166276479Sdim 167276479Sdimstatic cl::opt<unsigned> ForceTargetNumVectorRegs( 168276479Sdim "force-target-num-vector-regs", cl::init(0), cl::Hidden, 169276479Sdim cl::desc("A flag that overrides the target's number of vector registers.")); 170276479Sdim 171280031Sdim/// Maximum vectorization interleave count. 172280031Sdimstatic const unsigned MaxInterleaveFactor = 16; 173261991Sdim 174280031Sdimstatic cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor( 175280031Sdim "force-target-max-scalar-interleave", cl::init(0), cl::Hidden, 176280031Sdim cl::desc("A flag that overrides the target's max interleave factor for " 177280031Sdim "scalar loops.")); 178261991Sdim 179280031Sdimstatic cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor( 180280031Sdim "force-target-max-vector-interleave", cl::init(0), cl::Hidden, 181280031Sdim cl::desc("A flag that overrides the target's max interleave factor for " 182276479Sdim "vectorized loops.")); 183276479Sdim 184276479Sdimstatic cl::opt<unsigned> ForceTargetInstructionCost( 185276479Sdim "force-target-instruction-cost", cl::init(0), cl::Hidden, 186276479Sdim cl::desc("A flag that overrides the target's expected cost for " 187276479Sdim "an instruction to a single constant value. 
Mostly " 188276479Sdim "useful for getting consistent testing.")); 189276479Sdim 190276479Sdimstatic cl::opt<unsigned> SmallLoopCost( 191276479Sdim "small-loop-cost", cl::init(20), cl::Hidden, 192288943Sdim cl::desc( 193288943Sdim "The cost of a loop that is considered 'small' by the interleaver.")); 194276479Sdim 195276479Sdimstatic cl::opt<bool> LoopVectorizeWithBlockFrequency( 196276479Sdim "loop-vectorize-with-block-frequency", cl::init(false), cl::Hidden, 197276479Sdim cl::desc("Enable the use of the block frequency analysis to access PGO " 198276479Sdim "heuristics minimizing code growth in cold regions and being more " 199276479Sdim "aggressive in hot regions.")); 200276479Sdim 201288943Sdim// Runtime interleave loops for load/store throughput. 202288943Sdimstatic cl::opt<bool> EnableLoadStoreRuntimeInterleave( 203288943Sdim "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden, 204288943Sdim cl::desc( 205288943Sdim "Enable runtime interleaving until load/store ports are saturated")); 206276479Sdim 207276479Sdim/// The number of stores in a loop that are allowed to need predication. 
208276479Sdimstatic cl::opt<unsigned> NumberOfStoresToPredicate( 209276479Sdim "vectorize-num-stores-pred", cl::init(1), cl::Hidden, 210276479Sdim cl::desc("Max number of stores to be predicated behind an if.")); 211276479Sdim 212276479Sdimstatic cl::opt<bool> EnableIndVarRegisterHeur( 213276479Sdim "enable-ind-var-reg-heur", cl::init(true), cl::Hidden, 214288943Sdim cl::desc("Count the induction variable only once when interleaving")); 215276479Sdim 216276479Sdimstatic cl::opt<bool> EnableCondStoresVectorization( 217276479Sdim "enable-cond-stores-vec", cl::init(false), cl::Hidden, 218276479Sdim cl::desc("Enable if predication of stores during vectorization.")); 219276479Sdim 220288943Sdimstatic cl::opt<unsigned> MaxNestedScalarReductionIC( 221288943Sdim "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden, 222288943Sdim cl::desc("The maximum interleave count to use when interleaving a scalar " 223280031Sdim "reduction in a nested loop.")); 224280031Sdim 225296417Sdimstatic cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold( 226296417Sdim "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden, 227296417Sdim cl::desc("The maximum allowed number of runtime memory checks with a " 228296417Sdim "vectorize(enable) pragma.")); 229296417Sdim 230296417Sdimstatic cl::opt<unsigned> VectorizeSCEVCheckThreshold( 231296417Sdim "vectorize-scev-check-threshold", cl::init(16), cl::Hidden, 232296417Sdim cl::desc("The maximum number of SCEV checks allowed.")); 233296417Sdim 234296417Sdimstatic cl::opt<unsigned> PragmaVectorizeSCEVCheckThreshold( 235296417Sdim "pragma-vectorize-scev-check-threshold", cl::init(128), cl::Hidden, 236296417Sdim cl::desc("The maximum number of SCEV checks allowed with a " 237296417Sdim "vectorize(enable) pragma")); 238296417Sdim 239243789Sdimnamespace { 240243789Sdim 241243789Sdim// Forward declarations. 
242296417Sdimclass LoopVectorizeHints; 243243789Sdimclass LoopVectorizationLegality; 244243789Sdimclass LoopVectorizationCostModel; 245296417Sdimclass LoopVectorizationRequirements; 246243789Sdim 247288943Sdim/// \brief This modifies LoopAccessReport to initialize message with 248288943Sdim/// loop-vectorizer-specific part. 249288943Sdimclass VectorizationReport : public LoopAccessReport { 250276479Sdimpublic: 251288943Sdim VectorizationReport(Instruction *I = nullptr) 252288943Sdim : LoopAccessReport("loop not vectorized: ", I) {} 253276479Sdim 254288943Sdim /// \brief This allows promotion of the loop-access analysis report into the 255288943Sdim /// loop-vectorizer report. It modifies the message to add the 256288943Sdim /// loop-vectorizer-specific part of the message. 257288943Sdim explicit VectorizationReport(const LoopAccessReport &R) 258288943Sdim : LoopAccessReport(Twine("loop not vectorized: ") + R.str(), 259288943Sdim R.getInstr()) {} 260288943Sdim}; 261276479Sdim 262288943Sdim/// A helper function for converting Scalar types to vector types. 263288943Sdim/// If the incoming type is void, we return void. If the VF is 1, we return 264288943Sdim/// the scalar type. 265288943Sdimstatic Type* ToVectorTy(Type *Scalar, unsigned VF) { 266288943Sdim if (Scalar->isVoidTy() || VF == 1) 267288943Sdim return Scalar; 268288943Sdim return VectorType::get(Scalar, VF); 269288943Sdim} 270276479Sdim 271296417Sdim/// A helper function that returns GEP instruction and knows to skip a 272296417Sdim/// 'bitcast'. The 'bitcast' may be skipped if the source and the destination 273296417Sdim/// pointee types of the 'bitcast' have the same size. 
274296417Sdim/// For example: 275296417Sdim/// bitcast double** %var to i64* - can be skipped 276296417Sdim/// bitcast double** %var to i8* - can not 277296417Sdimstatic GetElementPtrInst *getGEPInstruction(Value *Ptr) { 278296417Sdim 279296417Sdim if (isa<GetElementPtrInst>(Ptr)) 280296417Sdim return cast<GetElementPtrInst>(Ptr); 281296417Sdim 282296417Sdim if (isa<BitCastInst>(Ptr) && 283296417Sdim isa<GetElementPtrInst>(cast<BitCastInst>(Ptr)->getOperand(0))) { 284296417Sdim Type *BitcastTy = Ptr->getType(); 285296417Sdim Type *GEPTy = cast<BitCastInst>(Ptr)->getSrcTy(); 286296417Sdim if (!isa<PointerType>(BitcastTy) || !isa<PointerType>(GEPTy)) 287296417Sdim return nullptr; 288296417Sdim Type *Pointee1Ty = cast<PointerType>(BitcastTy)->getPointerElementType(); 289296417Sdim Type *Pointee2Ty = cast<PointerType>(GEPTy)->getPointerElementType(); 290296417Sdim const DataLayout &DL = cast<BitCastInst>(Ptr)->getModule()->getDataLayout(); 291296417Sdim if (DL.getTypeSizeInBits(Pointee1Ty) == DL.getTypeSizeInBits(Pointee2Ty)) 292296417Sdim return cast<GetElementPtrInst>(cast<BitCastInst>(Ptr)->getOperand(0)); 293296417Sdim } 294296417Sdim return nullptr; 295296417Sdim} 296296417Sdim 297249423Sdim/// InnerLoopVectorizer vectorizes loops which contain only one basic 298243789Sdim/// block to a specified vectorization factor (VF). 299243789Sdim/// This class performs the widening of scalars into vectors, or multiple 300243789Sdim/// scalars. This class also implements the following features: 301243789Sdim/// * It inserts an epilogue loop for handling loops that don't have iteration 302243789Sdim/// counts that are known to be a multiple of the vectorization factor. 303243789Sdim/// * It handles the code generation for reduction variables. 304243789Sdim/// * Scalarization (implementation using scalars) of un-vectorizable 305243789Sdim/// instructions. 
306249423Sdim/// InnerLoopVectorizer does not perform any vectorization-legality 307243789Sdim/// checks, and relies on the caller to check for the different legality 308249423Sdim/// aspects. The InnerLoopVectorizer relies on the 309243789Sdim/// LoopVectorizationLegality class to provide information about the induction 310243789Sdim/// and reduction variables that were found to a given vectorization factor. 311249423Sdimclass InnerLoopVectorizer { 312243789Sdimpublic: 313296417Sdim InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE, 314296417Sdim LoopInfo *LI, DominatorTree *DT, 315296417Sdim const TargetLibraryInfo *TLI, 316288943Sdim const TargetTransformInfo *TTI, unsigned VecWidth, 317249423Sdim unsigned UnrollFactor) 318296417Sdim : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI), 319296417Sdim VF(VecWidth), UF(UnrollFactor), Builder(PSE.getSE()->getContext()), 320276479Sdim Induction(nullptr), OldInduction(nullptr), WidenMap(UnrollFactor), 321296417Sdim TripCount(nullptr), VectorTripCount(nullptr), Legal(nullptr), 322296417Sdim AddedSafetyChecks(false) {} 323243789Sdim 324243789Sdim // Perform the actual loop widening (vectorization). 325296417Sdim // MinimumBitWidths maps scalar integer values to the smallest bitwidth they 326296417Sdim // can be validly truncated to. The cost model has assumed this truncation 327296417Sdim // will happen when vectorizing. 328296417Sdim void vectorize(LoopVectorizationLegality *L, 329296417Sdim MapVector<Instruction*,uint64_t> MinimumBitWidths) { 330296417Sdim MinBWs = MinimumBitWidths; 331276479Sdim Legal = L; 332249423Sdim // Create a new empty loop. Unlink the old loop and connect the new one. 333276479Sdim createEmptyLoop(); 334249423Sdim // Widen each instruction in the old loop to a new one in the new loop. 335249423Sdim // Use the Legality module to find the induction and reduction variables. 
336276479Sdim vectorizeLoop(); 337249423Sdim } 338243789Sdim 339288943Sdim // Return true if any runtime check is added. 340288943Sdim bool IsSafetyChecksAdded() { 341288943Sdim return AddedSafetyChecks; 342288943Sdim } 343288943Sdim 344261991Sdim virtual ~InnerLoopVectorizer() {} 345261991Sdim 346261991Sdimprotected: 347249423Sdim /// A small list of PHINodes. 348249423Sdim typedef SmallVector<PHINode*, 4> PhiVector; 349249423Sdim /// When we unroll loops we have multiple vector values for each scalar. 350249423Sdim /// This data structure holds the unrolled and vectorized values that 351249423Sdim /// originated from one scalar instruction. 352249423Sdim typedef SmallVector<Value*, 2> VectorParts; 353249423Sdim 354288943Sdim // When we if-convert we need to create edge masks. We have to cache values 355288943Sdim // so that we don't end up with exponential recursion/IR. 356261991Sdim typedef DenseMap<std::pair<BasicBlock*, BasicBlock*>, 357261991Sdim VectorParts> EdgeMaskCache; 358261991Sdim 359243789Sdim /// Create an empty loop, based on the loop ranges of the old loop. 360276479Sdim void createEmptyLoop(); 361296417Sdim /// Create a new induction variable inside L. 362296417Sdim PHINode *createInductionVariable(Loop *L, Value *Start, Value *End, 363296417Sdim Value *Step, Instruction *DL); 364243789Sdim /// Copy and widen the instructions from the old loop. 365276479Sdim virtual void vectorizeLoop(); 366249423Sdim 367261991Sdim /// \brief The Loop exit block may have single value PHI nodes where the 368261991Sdim /// incoming value is 'Undef'. While vectorizing we only handled real values 369261991Sdim /// that were defined inside the loop. Here we fix the 'undef case'. 370261991Sdim /// See PR14725. 371261991Sdim void fixLCSSAPHIs(); 372261991Sdim 373296417Sdim /// Shrinks vector element sizes based on information in "MinBWs". 
374296417Sdim void truncateToMinimalBitwidths(); 375296417Sdim 376249423Sdim /// A helper function that computes the predicate of the block BB, assuming 377249423Sdim /// that the header block of the loop is set to True. It returns the *entry* 378249423Sdim /// mask for the block BB. 379249423Sdim VectorParts createBlockInMask(BasicBlock *BB); 380249423Sdim /// A helper function that computes the predicate of the edge between SRC 381249423Sdim /// and DST. 382249423Sdim VectorParts createEdgeMask(BasicBlock *Src, BasicBlock *Dst); 383249423Sdim 384249423Sdim /// A helper function to vectorize a single BB within the innermost loop. 385276479Sdim void vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV); 386296417Sdim 387261991Sdim /// Vectorize a single PHINode in a block. This method handles the induction 388261991Sdim /// variable canonicalization. It supports both VF = 1 for unrolled loops and 389261991Sdim /// arbitrary length vectors. 390261991Sdim void widenPHIInstruction(Instruction *PN, VectorParts &Entry, 391261991Sdim unsigned UF, unsigned VF, PhiVector *PV); 392261991Sdim 393243789Sdim /// Insert the new loop to the loop hierarchy and pass manager 394243789Sdim /// and update the analysis passes. 395243789Sdim void updateAnalysis(); 396243789Sdim 397243789Sdim /// This instruction is un-vectorizable. Implement it as a sequence 398276479Sdim /// of scalars. If \p IfPredicateStore is true we need to 'hide' each 399276479Sdim /// scalarized instruction behind an if block predicated on the control 400276479Sdim /// dependence of the instruction. 401276479Sdim virtual void scalarizeInstruction(Instruction *Instr, 402276479Sdim bool IfPredicateStore=false); 403243789Sdim 404249423Sdim /// Vectorize Load and Store instructions, 405276479Sdim virtual void vectorizeMemoryInstruction(Instruction *Instr); 406249423Sdim 407243789Sdim /// Create a broadcast instruction. 
This method generates a broadcast 408243789Sdim /// instruction (shuffle) for loop invariant values and for the induction 409243789Sdim /// value. If this is the induction variable then we extend it to N, N+1, ... 410243789Sdim /// this is needed because each iteration in the loop corresponds to a SIMD 411243789Sdim /// element. 412261991Sdim virtual Value *getBroadcastInstrs(Value *V); 413243789Sdim 414288943Sdim /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...) 415288943Sdim /// to each vector element of Val. The sequence starts at StartIndex. 416288943Sdim virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step); 417243789Sdim 418243789Sdim /// When we go over instructions in the basic block we rely on previous 419243789Sdim /// values within the current basic block or on loop invariant values. 420243789Sdim /// When we widen (vectorize) values we place them in the map. If the values 421243789Sdim /// are not within the map, they have to be loop invariant, so we simply 422243789Sdim /// broadcast them into a vector. 423249423Sdim VectorParts &getVectorValue(Value *V); 424243789Sdim 425288943Sdim /// Try to vectorize the interleaved access group that \p Instr belongs to. 426288943Sdim void vectorizeInterleaveGroup(Instruction *Instr); 427288943Sdim 428249423Sdim /// Generate a shuffle sequence that will reverse the vector Vec. 429261991Sdim virtual Value *reverseVector(Value *Vec); 430243789Sdim 431296417Sdim /// Returns (and creates if needed) the original loop trip count. 432296417Sdim Value *getOrCreateTripCount(Loop *NewLoop); 433296417Sdim 434296417Sdim /// Returns (and creates if needed) the trip count of the widened loop. 435296417Sdim Value *getOrCreateVectorTripCount(Loop *NewLoop); 436296417Sdim 437296417Sdim /// Emit a bypass check to see if the trip count would overflow, or we 438296417Sdim /// wouldn't have enough iterations to execute one vector loop. 
439296417Sdim void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass); 440296417Sdim /// Emit a bypass check to see if the vector trip count is nonzero. 441296417Sdim void emitVectorLoopEnteredCheck(Loop *L, BasicBlock *Bypass); 442296417Sdim /// Emit a bypass check to see if all of the SCEV assumptions we've 443296417Sdim /// had to make are correct. 444296417Sdim void emitSCEVChecks(Loop *L, BasicBlock *Bypass); 445296417Sdim /// Emit bypass checks to check any memory assumptions we may have made. 446296417Sdim void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass); 447296417Sdim 448249423Sdim /// This is a helper class that holds the vectorizer state. It maps scalar 449249423Sdim /// instructions to vector instructions. When the code is 'unrolled' then 450249423Sdim /// then a single scalar value is mapped to multiple vector parts. The parts 451249423Sdim /// are stored in the VectorPart type. 452249423Sdim struct ValueMap { 453249423Sdim /// C'tor. UnrollFactor controls the number of vectors ('parts') that 454249423Sdim /// are mapped. 455249423Sdim ValueMap(unsigned UnrollFactor) : UF(UnrollFactor) {} 456243789Sdim 457249423Sdim /// \return True if 'Key' is saved in the Value Map. 458249423Sdim bool has(Value *Key) const { return MapStorage.count(Key); } 459249423Sdim 460249423Sdim /// Initializes a new entry in the map. Sets all of the vector parts to the 461249423Sdim /// save value in 'Val'. 462249423Sdim /// \return A reference to a vector with splat values. 463249423Sdim VectorParts &splat(Value *Key, Value *Val) { 464249423Sdim VectorParts &Entry = MapStorage[Key]; 465249423Sdim Entry.assign(UF, Val); 466249423Sdim return Entry; 467249423Sdim } 468249423Sdim 469249423Sdim ///\return A reference to the value that is stored at 'Key'. 
470249423Sdim VectorParts &get(Value *Key) { 471249423Sdim VectorParts &Entry = MapStorage[Key]; 472249423Sdim if (Entry.empty()) 473249423Sdim Entry.resize(UF); 474249423Sdim assert(Entry.size() == UF); 475249423Sdim return Entry; 476249423Sdim } 477249423Sdim 478249423Sdim private: 479249423Sdim /// The unroll factor. Each entry in the map stores this number of vector 480249423Sdim /// elements. 481249423Sdim unsigned UF; 482249423Sdim 483249423Sdim /// Map storage. We use std::map and not DenseMap because insertions to a 484249423Sdim /// dense map invalidates its iterators. 485249423Sdim std::map<Value *, VectorParts> MapStorage; 486249423Sdim }; 487249423Sdim 488243789Sdim /// The original loop. 489243789Sdim Loop *OrigLoop; 490296417Sdim /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies 491296417Sdim /// dynamic knowledge to simplify SCEV expressions and converts them to a 492296417Sdim /// more usable form. 493296417Sdim PredicatedScalarEvolution &PSE; 494249423Sdim /// Loop Info. 495243789Sdim LoopInfo *LI; 496249423Sdim /// Dominator Tree. 497243789Sdim DominatorTree *DT; 498276479Sdim /// Alias Analysis. 499276479Sdim AliasAnalysis *AA; 500249423Sdim /// Target Library Info. 501249423Sdim const TargetLibraryInfo *TLI; 502288943Sdim /// Target Transform Info. 503288943Sdim const TargetTransformInfo *TTI; 504249423Sdim 505249423Sdim /// The vectorization SIMD factor to use. Each vector will have this many 506249423Sdim /// vector elements. 507243789Sdim unsigned VF; 508261991Sdim 509261991Sdimprotected: 510249423Sdim /// The vectorization unroll factor to use. Each scalar is vectorized to this 511249423Sdim /// many different vector instructions. 512249423Sdim unsigned UF; 513243789Sdim 514249423Sdim /// The builder that we use 515243789Sdim IRBuilder<> Builder; 516243789Sdim 517243789Sdim // --- Vectorization state --- 518243789Sdim 519243789Sdim /// The vector-loop preheader. 
520243789Sdim BasicBlock *LoopVectorPreHeader; 521243789Sdim /// The scalar-loop preheader. 522243789Sdim BasicBlock *LoopScalarPreHeader; 523243789Sdim /// Middle Block between the vector and the scalar. 524243789Sdim BasicBlock *LoopMiddleBlock; 525243789Sdim ///The ExitBlock of the scalar loop. 526243789Sdim BasicBlock *LoopExitBlock; 527243789Sdim ///The vector loop body. 528276479Sdim SmallVector<BasicBlock *, 4> LoopVectorBody; 529243789Sdim ///The scalar loop body. 530243789Sdim BasicBlock *LoopScalarBody; 531249423Sdim /// A list of all bypass blocks. The first block is the entry of the loop. 532249423Sdim SmallVector<BasicBlock *, 4> LoopBypassBlocks; 533243789Sdim 534243789Sdim /// The new Induction variable which was added to the new block. 535243789Sdim PHINode *Induction; 536243789Sdim /// The induction variable of the old basic block. 537243789Sdim PHINode *OldInduction; 538249423Sdim /// Maps scalars to widened vectors. 539243789Sdim ValueMap WidenMap; 540296417Sdim /// Store instructions that should be predicated, as a pair 541296417Sdim /// <StoreInst, Predicate> 542296417Sdim SmallVector<std::pair<StoreInst*,Value*>, 4> PredicatedStores; 543261991Sdim EdgeMaskCache MaskCache; 544296417Sdim /// Trip count of the original loop. 545296417Sdim Value *TripCount; 546296417Sdim /// Trip count of the widened loop (TripCount - TripCount % (VF*UF)) 547296417Sdim Value *VectorTripCount; 548276479Sdim 549296417Sdim /// Map of scalar integer values to the smallest bitwidth they can be legally 550296417Sdim /// represented as. The vector equivalents of these values should be truncated 551296417Sdim /// to this type. 552296417Sdim MapVector<Instruction*,uint64_t> MinBWs; 553276479Sdim LoopVectorizationLegality *Legal; 554288943Sdim 555288943Sdim // Record whether runtime check is added. 
556288943Sdim bool AddedSafetyChecks; 557243789Sdim}; 558243789Sdim 559261991Sdimclass InnerLoopUnroller : public InnerLoopVectorizer { 560261991Sdimpublic: 561296417Sdim InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE, 562296417Sdim LoopInfo *LI, DominatorTree *DT, 563296417Sdim const TargetLibraryInfo *TLI, 564288943Sdim const TargetTransformInfo *TTI, unsigned UnrollFactor) 565296417Sdim : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, 1, UnrollFactor) {} 566261991Sdim 567261991Sdimprivate: 568276479Sdim void scalarizeInstruction(Instruction *Instr, 569276479Sdim bool IfPredicateStore = false) override; 570276479Sdim void vectorizeMemoryInstruction(Instruction *Instr) override; 571276479Sdim Value *getBroadcastInstrs(Value *V) override; 572288943Sdim Value *getStepVector(Value *Val, int StartIdx, Value *Step) override; 573276479Sdim Value *reverseVector(Value *Vec) override; 574261991Sdim}; 575261991Sdim 576261991Sdim/// \brief Look for a meaningful debug location on the instruction or it's 577261991Sdim/// operands. 578261991Sdimstatic Instruction *getDebugLocFromInstOrOperands(Instruction *I) { 579261991Sdim if (!I) 580261991Sdim return I; 581261991Sdim 582261991Sdim DebugLoc Empty; 583261991Sdim if (I->getDebugLoc() != Empty) 584261991Sdim return I; 585261991Sdim 586261991Sdim for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) { 587261991Sdim if (Instruction *OpInst = dyn_cast<Instruction>(*OI)) 588261991Sdim if (OpInst->getDebugLoc() != Empty) 589261991Sdim return OpInst; 590261991Sdim } 591261991Sdim 592261991Sdim return I; 593261991Sdim} 594261991Sdim 595261991Sdim/// \brief Set the debug location in the builder using the debug location in the 596261991Sdim/// instruction. 
597261991Sdimstatic void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) { 598261991Sdim if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) 599261991Sdim B.SetCurrentDebugLocation(Inst->getDebugLoc()); 600261991Sdim else 601261991Sdim B.SetCurrentDebugLocation(DebugLoc()); 602261991Sdim} 603261991Sdim 604276479Sdim#ifndef NDEBUG 605276479Sdim/// \return string containing a file name and a line # for the given loop. 606276479Sdimstatic std::string getDebugLocString(const Loop *L) { 607276479Sdim std::string Result; 608276479Sdim if (L) { 609276479Sdim raw_string_ostream OS(Result); 610288943Sdim if (const DebugLoc LoopDbgLoc = L->getStartLoc()) 611288943Sdim LoopDbgLoc.print(OS); 612276479Sdim else 613276479Sdim // Just print the module name. 614276479Sdim OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier(); 615276479Sdim OS.flush(); 616276479Sdim } 617276479Sdim return Result; 618276479Sdim} 619276479Sdim#endif 620276479Sdim 621276479Sdim/// \brief Propagate known metadata from one instruction to another. 622276479Sdimstatic void propagateMetadata(Instruction *To, const Instruction *From) { 623276479Sdim SmallVector<std::pair<unsigned, MDNode *>, 4> Metadata; 624276479Sdim From->getAllMetadataOtherThanDebugLoc(Metadata); 625276479Sdim 626276479Sdim for (auto M : Metadata) { 627276479Sdim unsigned Kind = M.first; 628276479Sdim 629276479Sdim // These are safe to transfer (this is safe for TBAA, even when we 630276479Sdim // if-convert, because should that metadata have had a control dependency 631276479Sdim // on the condition, and thus actually aliased with some other 632276479Sdim // non-speculated memory access when the condition was false, this would be 633276479Sdim // caught by the runtime overlap checks). 
634276479Sdim if (Kind != LLVMContext::MD_tbaa && 635280031Sdim Kind != LLVMContext::MD_alias_scope && 636280031Sdim Kind != LLVMContext::MD_noalias && 637296417Sdim Kind != LLVMContext::MD_fpmath && 638296417Sdim Kind != LLVMContext::MD_nontemporal) 639276479Sdim continue; 640276479Sdim 641276479Sdim To->setMetadata(Kind, M.second); 642276479Sdim } 643276479Sdim} 644276479Sdim 645276479Sdim/// \brief Propagate known metadata from one instruction to a vector of others. 646296417Sdimstatic void propagateMetadata(SmallVectorImpl<Value *> &To, 647296417Sdim const Instruction *From) { 648276479Sdim for (Value *V : To) 649276479Sdim if (Instruction *I = dyn_cast<Instruction>(V)) 650276479Sdim propagateMetadata(I, From); 651276479Sdim} 652276479Sdim 653288943Sdim/// \brief The group of interleaved loads/stores sharing the same stride and 654288943Sdim/// close to each other. 655288943Sdim/// 656288943Sdim/// Each member in this group has an index starting from 0, and the largest 657288943Sdim/// index should be less than interleaved factor, which is equal to the absolute 658288943Sdim/// value of the access's stride. 659288943Sdim/// 660288943Sdim/// E.g. An interleaved load group of factor 4: 661288943Sdim/// for (unsigned i = 0; i < 1024; i+=4) { 662288943Sdim/// a = A[i]; // Member of index 0 663288943Sdim/// b = A[i+1]; // Member of index 1 664288943Sdim/// d = A[i+3]; // Member of index 3 665288943Sdim/// ... 666288943Sdim/// } 667288943Sdim/// 668288943Sdim/// An interleaved store group of factor 4: 669288943Sdim/// for (unsigned i = 0; i < 1024; i+=4) { 670288943Sdim/// ... 671288943Sdim/// A[i] = a; // Member of index 0 672288943Sdim/// A[i+1] = b; // Member of index 1 673288943Sdim/// A[i+2] = c; // Member of index 2 674288943Sdim/// A[i+3] = d; // Member of index 3 675288943Sdim/// } 676288943Sdim/// 677288943Sdim/// Note: the interleaved load group could have gaps (missing members), but 678288943Sdim/// the interleaved store group doesn't allow gaps. 
679288943Sdimclass InterleaveGroup { 680288943Sdimpublic: 681288943Sdim InterleaveGroup(Instruction *Instr, int Stride, unsigned Align) 682288943Sdim : Align(Align), SmallestKey(0), LargestKey(0), InsertPos(Instr) { 683288943Sdim assert(Align && "The alignment should be non-zero"); 684288943Sdim 685288943Sdim Factor = std::abs(Stride); 686288943Sdim assert(Factor > 1 && "Invalid interleave factor"); 687288943Sdim 688288943Sdim Reverse = Stride < 0; 689288943Sdim Members[0] = Instr; 690288943Sdim } 691288943Sdim 692288943Sdim bool isReverse() const { return Reverse; } 693288943Sdim unsigned getFactor() const { return Factor; } 694288943Sdim unsigned getAlignment() const { return Align; } 695288943Sdim unsigned getNumMembers() const { return Members.size(); } 696288943Sdim 697288943Sdim /// \brief Try to insert a new member \p Instr with index \p Index and 698288943Sdim /// alignment \p NewAlign. The index is related to the leader and it could be 699288943Sdim /// negative if it is the new leader. 700288943Sdim /// 701288943Sdim /// \returns false if the instruction doesn't belong to the group. 702288943Sdim bool insertMember(Instruction *Instr, int Index, unsigned NewAlign) { 703288943Sdim assert(NewAlign && "The new member's alignment should be non-zero"); 704288943Sdim 705288943Sdim int Key = Index + SmallestKey; 706288943Sdim 707288943Sdim // Skip if there is already a member with the same index. 708288943Sdim if (Members.count(Key)) 709288943Sdim return false; 710288943Sdim 711288943Sdim if (Key > LargestKey) { 712288943Sdim // The largest index is always less than the interleave factor. 713288943Sdim if (Index >= static_cast<int>(Factor)) 714288943Sdim return false; 715288943Sdim 716288943Sdim LargestKey = Key; 717288943Sdim } else if (Key < SmallestKey) { 718288943Sdim // The largest index is always less than the interleave factor. 
719288943Sdim if (LargestKey - Key >= static_cast<int>(Factor)) 720288943Sdim return false; 721288943Sdim 722288943Sdim SmallestKey = Key; 723288943Sdim } 724288943Sdim 725288943Sdim // It's always safe to select the minimum alignment. 726288943Sdim Align = std::min(Align, NewAlign); 727288943Sdim Members[Key] = Instr; 728288943Sdim return true; 729288943Sdim } 730288943Sdim 731288943Sdim /// \brief Get the member with the given index \p Index 732288943Sdim /// 733288943Sdim /// \returns nullptr if contains no such member. 734288943Sdim Instruction *getMember(unsigned Index) const { 735288943Sdim int Key = SmallestKey + Index; 736288943Sdim if (!Members.count(Key)) 737288943Sdim return nullptr; 738288943Sdim 739288943Sdim return Members.find(Key)->second; 740288943Sdim } 741288943Sdim 742288943Sdim /// \brief Get the index for the given member. Unlike the key in the member 743288943Sdim /// map, the index starts from 0. 744288943Sdim unsigned getIndex(Instruction *Instr) const { 745288943Sdim for (auto I : Members) 746288943Sdim if (I.second == Instr) 747288943Sdim return I.first - SmallestKey; 748288943Sdim 749288943Sdim llvm_unreachable("InterleaveGroup contains no such member"); 750288943Sdim } 751288943Sdim 752288943Sdim Instruction *getInsertPos() const { return InsertPos; } 753288943Sdim void setInsertPos(Instruction *Inst) { InsertPos = Inst; } 754288943Sdim 755288943Sdimprivate: 756288943Sdim unsigned Factor; // Interleave Factor. 757288943Sdim bool Reverse; 758288943Sdim unsigned Align; 759288943Sdim DenseMap<int, Instruction *> Members; 760288943Sdim int SmallestKey; 761288943Sdim int LargestKey; 762288943Sdim 763288943Sdim // To avoid breaking dependences, vectorized instructions of an interleave 764288943Sdim // group should be inserted at either the first load or the last store in 765288943Sdim // program order. 766288943Sdim // 767288943Sdim // E.g. 
%even = load i32 // Insert Position 768288943Sdim // %add = add i32 %even // Use of %even 769288943Sdim // %odd = load i32 770288943Sdim // 771288943Sdim // store i32 %even 772288943Sdim // %odd = add i32 // Def of %odd 773288943Sdim // store i32 %odd // Insert Position 774288943Sdim Instruction *InsertPos; 775288943Sdim}; 776288943Sdim 777288943Sdim/// \brief Drive the analysis of interleaved memory accesses in the loop. 778288943Sdim/// 779288943Sdim/// Use this class to analyze interleaved accesses only when we can vectorize 780288943Sdim/// a loop. Otherwise it's meaningless to do analysis as the vectorization 781288943Sdim/// on interleaved accesses is unsafe. 782288943Sdim/// 783288943Sdim/// The analysis collects interleave groups and records the relationships 784288943Sdim/// between the member and the group in a map. 785288943Sdimclass InterleavedAccessInfo { 786288943Sdimpublic: 787296417Sdim InterleavedAccessInfo(PredicatedScalarEvolution &PSE, Loop *L, 788296417Sdim DominatorTree *DT) 789296417Sdim : PSE(PSE), TheLoop(L), DT(DT) {} 790288943Sdim 791288943Sdim ~InterleavedAccessInfo() { 792288943Sdim SmallSet<InterleaveGroup *, 4> DelSet; 793288943Sdim // Avoid releasing a pointer twice. 794288943Sdim for (auto &I : InterleaveGroupMap) 795288943Sdim DelSet.insert(I.second); 796288943Sdim for (auto *Ptr : DelSet) 797288943Sdim delete Ptr; 798288943Sdim } 799288943Sdim 800288943Sdim /// \brief Analyze the interleaved accesses and collect them in interleave 801288943Sdim /// groups. Substitute symbolic strides using \p Strides. 802288943Sdim void analyzeInterleaving(const ValueToValueMap &Strides); 803288943Sdim 804288943Sdim /// \brief Check if \p Instr belongs to any interleave group. 805288943Sdim bool isInterleaved(Instruction *Instr) const { 806288943Sdim return InterleaveGroupMap.count(Instr); 807288943Sdim } 808288943Sdim 809288943Sdim /// \brief Get the interleave group that \p Instr belongs to. 
810288943Sdim /// 811288943Sdim /// \returns nullptr if doesn't have such group. 812288943Sdim InterleaveGroup *getInterleaveGroup(Instruction *Instr) const { 813288943Sdim if (InterleaveGroupMap.count(Instr)) 814288943Sdim return InterleaveGroupMap.find(Instr)->second; 815288943Sdim return nullptr; 816288943Sdim } 817288943Sdim 818288943Sdimprivate: 819296417Sdim /// A wrapper around ScalarEvolution, used to add runtime SCEV checks. 820296417Sdim /// Simplifies SCEV expressions in the context of existing SCEV assumptions. 821296417Sdim /// The interleaved access analysis can also add new predicates (for example 822296417Sdim /// by versioning strides of pointers). 823296417Sdim PredicatedScalarEvolution &PSE; 824288943Sdim Loop *TheLoop; 825288943Sdim DominatorTree *DT; 826288943Sdim 827288943Sdim /// Holds the relationships between the members and the interleave group. 828288943Sdim DenseMap<Instruction *, InterleaveGroup *> InterleaveGroupMap; 829288943Sdim 830288943Sdim /// \brief The descriptor for a strided memory access. 831288943Sdim struct StrideDescriptor { 832288943Sdim StrideDescriptor(int Stride, const SCEV *Scev, unsigned Size, 833288943Sdim unsigned Align) 834288943Sdim : Stride(Stride), Scev(Scev), Size(Size), Align(Align) {} 835288943Sdim 836288943Sdim StrideDescriptor() : Stride(0), Scev(nullptr), Size(0), Align(0) {} 837288943Sdim 838288943Sdim int Stride; // The access's stride. It is negative for a reverse access. 839288943Sdim const SCEV *Scev; // The scalar expression of this access 840288943Sdim unsigned Size; // The size of the memory object. 841288943Sdim unsigned Align; // The alignment of this access. 842288943Sdim }; 843288943Sdim 844288943Sdim /// \brief Create a new interleave group with the given instruction \p Instr, 845288943Sdim /// stride \p Stride and alignment \p Align. 846288943Sdim /// 847288943Sdim /// \returns the newly created interleave group. 
848288943Sdim InterleaveGroup *createInterleaveGroup(Instruction *Instr, int Stride, 849288943Sdim unsigned Align) { 850288943Sdim assert(!InterleaveGroupMap.count(Instr) && 851288943Sdim "Already in an interleaved access group"); 852288943Sdim InterleaveGroupMap[Instr] = new InterleaveGroup(Instr, Stride, Align); 853288943Sdim return InterleaveGroupMap[Instr]; 854288943Sdim } 855288943Sdim 856288943Sdim /// \brief Release the group and remove all the relationships. 857288943Sdim void releaseGroup(InterleaveGroup *Group) { 858288943Sdim for (unsigned i = 0; i < Group->getFactor(); i++) 859288943Sdim if (Instruction *Member = Group->getMember(i)) 860288943Sdim InterleaveGroupMap.erase(Member); 861288943Sdim 862288943Sdim delete Group; 863288943Sdim } 864288943Sdim 865288943Sdim /// \brief Collect all the accesses with a constant stride in program order. 866288943Sdim void collectConstStridedAccesses( 867288943Sdim MapVector<Instruction *, StrideDescriptor> &StrideAccesses, 868288943Sdim const ValueToValueMap &Strides); 869288943Sdim}; 870288943Sdim 871296417Sdim/// Utility class for getting and setting loop vectorizer hints in the form 872296417Sdim/// of loop metadata. 873296417Sdim/// This class keeps a number of loop annotations locally (as member variables) 874296417Sdim/// and can, upon request, write them back as metadata on the loop. It will 875296417Sdim/// initially scan the loop for existing metadata, and will update the local 876296417Sdim/// values based on information in the loop. 877296417Sdim/// We cannot write all values to metadata, as the mere presence of some info, 878296417Sdim/// for example 'force', means a decision has been made. So, we need to be 879296417Sdim/// careful NOT to add them if the user hasn't specifically asked so. 
class LoopVectorizeHints {
  /// The kinds of hint this class tracks; used to pick a validation rule.
  enum HintKind {
    HK_WIDTH,
    HK_UNROLL,
    HK_FORCE
  };

  /// Hint - associates name and validation with the hint value.
  struct Hint {
    const char * Name;
    unsigned Value; // This may have to change for non-numeric values.
    HintKind Kind;

    Hint(const char * Name, unsigned Value, HintKind Kind)
      : Name(Name), Value(Value), Kind(Kind) { }

    /// Returns true when \p Val is acceptable for this hint kind: widths and
    /// unroll counts must be powers of two within the target caps; the force
    /// flag must be 0 or 1.
    bool validate(unsigned Val) {
      switch (Kind) {
      case HK_WIDTH:
        return isPowerOf2_32(Val) && Val <= VectorizerParams::MaxVectorWidth;
      case HK_UNROLL:
        return isPowerOf2_32(Val) && Val <= MaxInterleaveFactor;
      case HK_FORCE:
        return (Val <= 1);
      }
      return false;
    }
  };

  /// Vectorization width.
  Hint Width;
  /// Vectorization interleave factor.
  Hint Interleave;
  /// Vectorization forced
  Hint Force;

  /// Return the loop metadata prefix.
  static StringRef Prefix() { return "llvm.loop."; }

public:
  enum ForceKind {
    FK_Undefined = -1, ///< Not selected.
    FK_Disabled = 0,   ///< Forcing disabled.
    FK_Enabled = 1,    ///< Forcing enabled.
  };

  /// \param L the loop whose metadata is scanned for existing hint values.
  /// \param DisableInterleaving used as the initial interleave count
  ///        (i.e. 1 disables interleaving) unless metadata or the
  ///        force-vector-interleave option overrides it.
  LoopVectorizeHints(const Loop *L, bool DisableInterleaving)
      : Width("vectorize.width", VectorizerParams::VectorizationFactor,
              HK_WIDTH),
        Interleave("interleave.count", DisableInterleaving, HK_UNROLL),
        Force("vectorize.enable", FK_Undefined, HK_FORCE),
        TheLoop(L) {
    // Populate values with existing loop metadata.
    getHintsFromMetadata();

    // force-vector-interleave overrides DisableInterleaving.
    if (VectorizerParams::isInterleaveForced())
      Interleave.Value = VectorizerParams::VectorizationInterleave;

    DEBUG(if (DisableInterleaving && Interleave.Value == 1) dbgs()
          << "LV: Interleaving disabled by the pass manager\n");
  }

  /// Mark the loop L as already vectorized by setting the width to 1.
  void setAlreadyVectorized() {
    Width.Value = Interleave.Value = 1;
    Hint Hints[] = {Width, Interleave};
    writeHintsToMetadata(Hints);
  }

  /// Returns false (and emits a remark) when force-disable, missing
  /// force-enable without \p AlwaysVectorize, or width==interleave==1
  /// rules the loop out; true otherwise.
  bool allowVectorization(Function *F, Loop *L, bool AlwaysVectorize) const {
    if (getForce() == LoopVectorizeHints::FK_Disabled) {
      DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n");
      emitOptimizationRemarkAnalysis(F->getContext(),
                                     vectorizeAnalysisPassName(), *F,
                                     L->getStartLoc(), emitRemark());
      return false;
    }

    if (!AlwaysVectorize && getForce() != LoopVectorizeHints::FK_Enabled) {
      DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n");
      emitOptimizationRemarkAnalysis(F->getContext(),
                                     vectorizeAnalysisPassName(), *F,
                                     L->getStartLoc(), emitRemark());
      return false;
    }

    if (getWidth() == 1 && getInterleave() == 1) {
      // FIXME: Add a separate metadata to indicate when the loop has already
      // been vectorized instead of setting width and count to 1.
      DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n");
      // FIXME: Add interleave.disable metadata. This will allow
      // vectorize.disable to be used without disabling the pass and errors
      // to differentiate between disabled vectorization and a width of 1.
      emitOptimizationRemarkAnalysis(
          F->getContext(), vectorizeAnalysisPassName(), *F, L->getStartLoc(),
          "loop not vectorized: vectorization and interleaving are explicitly "
          "disabled, or vectorize width and interleave count are both set to "
          "1");
      return false;
    }

    return true;
  }

  /// Dumps all the hint information.
  std::string emitRemark() const {
    VectorizationReport R;
    if (Force.Value == LoopVectorizeHints::FK_Disabled)
      R << "vectorization is explicitly disabled";
    else {
      R << "use -Rpass-analysis=loop-vectorize for more info";
      if (Force.Value == LoopVectorizeHints::FK_Enabled) {
        R << " (Force=true";
        if (Width.Value != 0)
          R << ", Vector Width=" << Width.Value;
        if (Interleave.Value != 0)
          R << ", Interleave Count=" << Interleave.Value;
        R << ")";
      }
    }

    return R.str();
  }

  unsigned getWidth() const { return Width.Value; }
  unsigned getInterleave() const { return Interleave.Value; }
  enum ForceKind getForce() const { return (ForceKind)Force.Value; }

  /// Pass name to use in remarks; returns DiagnosticInfo::AlwaysPrint when
  /// the user supplied hints that do not disable vectorization, so the
  /// frontend is forced to print the diagnostic.
  const char *vectorizeAnalysisPassName() const {
    // If hints are provided that don't disable vectorization use the
    // AlwaysPrint pass name to force the frontend to print the diagnostic.
    if (getWidth() == 1)
      return LV_NAME;
    if (getForce() == LoopVectorizeHints::FK_Disabled)
      return LV_NAME;
    if (getForce() == LoopVectorizeHints::FK_Undefined && getWidth() == 0)
      return LV_NAME;
    return DiagnosticInfo::AlwaysPrint;
  }

  bool allowReordering() const {
    // When enabling loop hints are provided we allow the vectorizer to change
    // the order of operations that is given by the scalar loop. This is not
    // enabled by default because can be unsafe or inefficient. For example,
    // reordering floating-point operations will change the way round-off
    // error accumulates in the loop.
    return getForce() == LoopVectorizeHints::FK_Enabled || getWidth() > 1;
  }

private:
  /// Find hints specified in the loop metadata and update local values.
  void getHintsFromMetadata() {
    MDNode *LoopID = TheLoop->getLoopID();
    if (!LoopID)
      return;

    // First operand should refer to the loop id itself.
    assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
    assert(LoopID->getOperand(0) == LoopID && "invalid loop id");

    for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
      const MDString *S = nullptr;
      SmallVector<Metadata *, 4> Args;

      // The expected hint is either a MDString or a MDNode with the first
      // operand a MDString.
      if (const MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i))) {
        // NOTE(review): the !MD check is dead here (MD is non-null inside a
        // successful dyn_cast), and the inner `i`/`ie` shadow the outer loop
        // variables — harmless but worth cleaning up.
        if (!MD || MD->getNumOperands() == 0)
          continue;
        S = dyn_cast<MDString>(MD->getOperand(0));
        for (unsigned i = 1, ie = MD->getNumOperands(); i < ie; ++i)
          Args.push_back(MD->getOperand(i));
      } else {
        S = dyn_cast<MDString>(LoopID->getOperand(i));
        assert(Args.size() == 0 && "too many arguments for MDString");
      }

      if (!S)
        continue;

      // Check if the hint starts with the loop metadata prefix.
      StringRef Name = S->getString();
      // Only single-operand hints (name + one value) are recognized.
      if (Args.size() == 1)
        setHint(Name, Args[0]);
    }
  }

  /// Checks string hint with one operand and set value if valid.
  void setHint(StringRef Name, Metadata *Arg) {
    if (!Name.startswith(Prefix()))
      return;
    Name = Name.substr(Prefix().size(), StringRef::npos);

    const ConstantInt *C = mdconst::dyn_extract<ConstantInt>(Arg);
    if (!C) return;
    // NOTE(review): getZExtValue() returns uint64_t; values above 2^32 are
    // silently truncated here — presumably never valid hints anyway.
    unsigned Val = C->getZExtValue();

    Hint *Hints[] = {&Width, &Interleave, &Force};
    for (auto H : Hints) {
      if (Name == H->Name) {
        if (H->validate(Val))
          H->Value = Val;
        else
          DEBUG(dbgs() << "LV: ignoring invalid hint '" << Name << "'\n");
        break;
      }
    }
  }

  /// Create a new hint from name / value pair.
  MDNode *createHintMetadata(StringRef Name, unsigned V) const {
    LLVMContext &Context = TheLoop->getHeader()->getContext();
    Metadata *MDs[] = {MDString::get(Context, Name),
                       ConstantAsMetadata::get(
                           ConstantInt::get(Type::getInt32Ty(Context), V))};
    return MDNode::get(Context, MDs);
  }

  /// Matches metadata with hint name.
  bool matchesHintMetadataName(MDNode *Node, ArrayRef<Hint> HintTypes) {
    MDString* Name = dyn_cast<MDString>(Node->getOperand(0));
    if (!Name)
      return false;

    for (auto H : HintTypes)
      if (Name->getString().endswith(H.Name))
        return true;
    return false;
  }

  /// Sets current hints into loop metadata, keeping other values intact.
  void writeHintsToMetadata(ArrayRef<Hint> HintTypes) {
    if (HintTypes.size() == 0)
      return;

    // Reserve the first element to LoopID (see below).
    SmallVector<Metadata *, 4> MDs(1);
    // If the loop already has metadata, then ignore the existing operands.
    MDNode *LoopID = TheLoop->getLoopID();
    if (LoopID) {
      for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
        MDNode *Node = cast<MDNode>(LoopID->getOperand(i));
        // If node in update list, ignore old value.
        if (!matchesHintMetadataName(Node, HintTypes))
          MDs.push_back(Node);
      }
    }

    // Now, add the missing hints.
    for (auto H : HintTypes)
      MDs.push_back(createHintMetadata(Twine(Prefix(), H.Name).str(), H.Value));

    // Replace current metadata node with new one.
    LLVMContext &Context = TheLoop->getHeader()->getContext();
    MDNode *NewLoopID = MDNode::get(Context, MDs);
    // Set operand 0 to refer to the loop id itself.
    NewLoopID->replaceOperandWith(0, NewLoopID);

    TheLoop->setLoopID(NewLoopID);
  }

  /// The loop these hints belong to.
  const Loop *TheLoop;
};

/// Emit a remark under the pass name chosen by \p Hints (which may be the
/// AlwaysPrint name — see LoopVectorizeHints::vectorizeAnalysisPassName).
static void emitAnalysisDiag(const Function *TheFunction, const Loop *TheLoop,
                             const LoopVectorizeHints &Hints,
                             const LoopAccessReport &Message) {
  const char *Name = Hints.vectorizeAnalysisPassName();
  LoopAccessReport::emitAnalysis(Message, TheFunction, TheLoop, Name);
}

/// Emit a missed-optimization remark, and a warning when the user explicitly
/// forced vectorization/interleaving that could not be honored.
static void emitMissedWarning(Function *F, Loop *L,
                              const LoopVectorizeHints &LH) {
  emitOptimizationRemarkMissed(F->getContext(), LV_NAME, *F, L->getStartLoc(),
                               LH.emitRemark());

  if (LH.getForce() == LoopVectorizeHints::FK_Enabled) {
    if (LH.getWidth() != 1)
      emitLoopVectorizeWarning(
          F->getContext(), *F, L->getStartLoc(),
          "failed explicitly specified loop vectorization");
    else if (LH.getInterleave() != 1)
      emitLoopInterleaveWarning(
          F->getContext(), *F, L->getStartLoc(),
          "failed explicitly specified loop interleaving");
  }
}

/// LoopVectorizationLegality checks if it is legal to vectorize a loop, and
/// to what vectorization factor.
/// This class does not look at the profitability of vectorization, only the
/// legality. This class has two main kinds of checks:
/// * Memory checks - The code in canVectorizeMemory checks if vectorization
///   will change the order of memory accesses in a way that will change the
///   correctness of the program.
/// * Scalars checks - The code in canVectorizeInstrs and canVectorizeMemory
///   checks for a number of different conditions, such as the availability of a
///   single induction variable, that all types are supported and vectorize-able,
///   etc. This code reflects the capabilities of InnerLoopVectorizer.
/// This class is also used by InnerLoopVectorizer for identifying
/// induction variable and the different reduction variables.
class LoopVectorizationLegality {
public:
  LoopVectorizationLegality(Loop *L, PredicatedScalarEvolution &PSE,
                            DominatorTree *DT, TargetLibraryInfo *TLI,
                            AliasAnalysis *AA, Function *F,
                            const TargetTransformInfo *TTI,
                            LoopAccessAnalysis *LAA,
                            LoopVectorizationRequirements *R,
                            const LoopVectorizeHints *H)
      : NumPredStores(0), TheLoop(L), PSE(PSE), TLI(TLI), TheFunction(F),
        TTI(TTI), DT(DT), LAA(LAA), LAI(nullptr), InterleaveInfo(PSE, L, DT),
        Induction(nullptr), WidestIndTy(nullptr), HasFunNoNaNAttr(false),
        Requirements(R), Hints(H) {}

  /// ReductionList contains the reduction descriptors for all
  /// of the reductions that were found in the loop.
  typedef DenseMap<PHINode *, RecurrenceDescriptor> ReductionList;

  /// InductionList saves induction variables and maps them to the
  /// induction descriptor.
  typedef MapVector<PHINode*, InductionDescriptor> InductionList;

  /// Returns true if it is legal to vectorize this loop.
  /// This does not mean that it is profitable to vectorize this
  /// loop, only that it is legal to do so.
  bool canVectorize();

  /// Returns the Induction variable.
  PHINode *getInduction() { return Induction; }

  /// Returns the reduction variables found in the loop.
  ReductionList *getReductionVars() { return &Reductions; }

  /// Returns the induction variables found in the loop.
  InductionList *getInductionVars() { return &Inductions; }

  /// Returns the widest induction type.
  Type *getWidestInductionType() { return WidestIndTy; }

  /// Returns True if V is an induction variable in this loop.
  bool isInductionVariable(const Value *V);

  /// Returns True if PN is a reduction variable in this loop.
  bool isReductionVariable(PHINode *PN) { return Reductions.count(PN); }

  /// Return true if the block BB needs to be predicated in order for the loop
  /// to be vectorized.
  bool blockNeedsPredication(BasicBlock *BB);

  /// Check if this pointer is consecutive when vectorizing. This happens
  /// when the last index of the GEP is the induction variable, or that the
  /// pointer itself is an induction variable.
  /// This check allows us to vectorize A[idx] into a wide load/store.
  /// Returns:
  /// 0 - Stride is unknown or non-consecutive.
  /// 1 - Address is consecutive.
  /// -1 - Address is consecutive, and decreasing.
  int isConsecutivePtr(Value *Ptr);

  /// Returns true if the value V is uniform within the loop.
  bool isUniform(Value *V);

  /// Returns true if this instruction will remain scalar after vectorization.
  bool isUniformAfterVectorization(Instruction* I) { return Uniforms.count(I); }

  /// Returns the information that we collected about runtime memory check.
  const RuntimePointerChecking *getRuntimePointerChecking() const {
    return LAI->getRuntimePointerChecking();
  }

  /// Returns the loop-accesses info for this loop (null until
  /// canVectorizeMemory has run).
  const LoopAccessInfo *getLAI() const {
    return LAI;
  }

  /// \brief Check if \p Instr belongs to any interleaved access group.
  bool isAccessInterleaved(Instruction *Instr) {
    return InterleaveInfo.isInterleaved(Instr);
  }

  /// \brief Get the interleaved access group that \p Instr belongs to.
  const InterleaveGroup *getInterleavedAccessGroup(Instruction *Instr) {
    return InterleaveInfo.getInterleaveGroup(Instr);
  }

  /// Maximum dependence-safe distance in bytes, per LoopAccessInfo.
  unsigned getMaxSafeDepDistBytes() { return LAI->getMaxSafeDepDistBytes(); }

  /// Returns true if \p V is a symbolic stride collected for versioning.
  bool hasStride(Value *V) { return StrideSet.count(V); }
  bool mustCheckStrides() { return !StrideSet.empty(); }
  SmallPtrSet<Value *, 8>::iterator strides_begin() {
    return StrideSet.begin();
  }
  SmallPtrSet<Value *, 8>::iterator strides_end() { return StrideSet.end(); }

  /// Returns true if the target machine supports masked store operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedStore(Type *DataType, Value *Ptr) {
    return isConsecutivePtr(Ptr) && TTI->isLegalMaskedStore(DataType);
  }
  /// Returns true if the target machine supports masked load operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedLoad(Type *DataType, Value *Ptr) {
    return isConsecutivePtr(Ptr) && TTI->isLegalMaskedLoad(DataType);
  }
  /// Returns true if vector representation of the instruction \p I
  /// requires mask.
  bool isMaskRequired(const Instruction* I) {
    return (MaskedOp.count(I) != 0);
  }
  unsigned getNumStores() const {
    return LAI->getNumStores();
  }
  unsigned getNumLoads() const {
    return LAI->getNumLoads();
  }
  unsigned getNumPredStores() const {
    return NumPredStores;
  }
private:
  /// Check if a single basic block loop is vectorizable.
  /// At this point we know that this is a loop with a constant trip count
  /// and we only need to check individual instructions.
  bool canVectorizeInstrs();

  /// When we vectorize loops we may change the order in which
  /// we read and write from memory. This method checks if it is
  /// legal to vectorize the code, considering only memory constraints.
  /// Returns true if the loop is vectorizable.
  bool canVectorizeMemory();

  /// Return true if we can vectorize this loop using the IF-conversion
  /// transformation.
  bool canVectorizeWithIfConvert();

  /// Collect the variables that need to stay uniform after vectorization.
  void collectLoopUniforms();

  /// Return true if all of the instructions in the block can be speculatively
  /// executed. \p SafePtrs is a list of addresses that are known to be legal
  /// and we know that we can read from them without segfault.
  bool blockCanBePredicated(BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs);

  /// \brief Collect memory accesses with loop invariant strides.
  ///
  /// Looks for accesses like "a[i * StrideA]" where "StrideA" is loop
  /// invariant.
  void collectStridedAccess(Value *LoadOrStoreInst);

  /// Report an analysis message to assist the user in diagnosing loops that are
  /// not vectorized. These are handled as LoopAccessReport rather than
  /// VectorizationReport because the << operator of VectorizationReport returns
  /// LoopAccessReport.
  void emitAnalysis(const LoopAccessReport &Message) const {
    emitAnalysisDiag(TheFunction, TheLoop, *Hints, Message);
  }

  /// Number of stores that require predication (counted during legality).
  unsigned NumPredStores;

  /// The loop that we evaluate.
  Loop *TheLoop;
  /// A wrapper around ScalarEvolution used to add runtime SCEV checks.
  /// Applies dynamic knowledge to simplify SCEV expressions in the context
  /// of existing SCEV assumptions. The analysis will also add a minimal set
  /// of new predicates if this is required to enable vectorization and
  /// unrolling.
  PredicatedScalarEvolution &PSE;
  /// Target Library Info.
  TargetLibraryInfo *TLI;
  /// Parent function.
  Function *TheFunction;
  /// Target Transform Info.
  const TargetTransformInfo *TTI;
  /// Dominator Tree.
  DominatorTree *DT;
  // LoopAccess analysis.
  LoopAccessAnalysis *LAA;
  // And the loop-accesses info corresponding to this loop. This pointer is
  // null until canVectorizeMemory sets it up.
  const LoopAccessInfo *LAI;

  /// The interleave access information contains groups of interleaved accesses
  /// with the same stride and close to each other.
  InterleavedAccessInfo InterleaveInfo;

  // --- vectorization state --- //

  /// Holds the integer induction variable. This is the counter of the
  /// loop.
  PHINode *Induction;
  /// Holds the reduction variables.
  ReductionList Reductions;
  /// Holds all of the induction variables that we found in the loop.
  /// Notice that inductions don't need to start at zero and that induction
  /// variables can be pointers.
  InductionList Inductions;
  /// Holds the widest induction type encountered.
  Type *WidestIndTy;

  /// Allowed outside users. This holds the reduction
  /// vars which can be accessed from outside the loop.
  SmallPtrSet<Value*, 4> AllowedExit;
  /// This set holds the variables which are known to be uniform after
  /// vectorization.
  SmallPtrSet<Instruction*, 4> Uniforms;

  /// Can we assume the absence of NaNs.
  bool HasFunNoNaNAttr;

  /// Vectorization requirements that will go through late-evaluation.
  LoopVectorizationRequirements *Requirements;

  /// Used to emit an analysis of any legality issues.
  const LoopVectorizeHints *Hints;

  // Symbolic strides collected by collectStridedAccess, and the set of
  // stride values exposed through hasStride()/strides_begin().
  ValueToValueMap Strides;
  SmallPtrSet<Value *, 8> StrideSet;

  /// While vectorizing these instructions we have to generate a
  /// call to the appropriate masked intrinsic
  SmallPtrSet<const Instruction *, 8> MaskedOp;
};

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
/// a number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
public:
  // NOTE(review): AC is accepted but not stored in any member below —
  // confirm whether it is still needed by this constructor.
  LoopVectorizationCostModel(Loop *L, ScalarEvolution *SE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC, const Function *F,
                             const LoopVectorizeHints *Hints,
                             SmallPtrSetImpl<const Value *> &ValuesToIgnore)
      : TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB),
        TheFunction(F), Hints(Hints), ValuesToIgnore(ValuesToIgnore) {}

  /// Information about vectorization costs
  struct VectorizationFactor {
    unsigned Width; // Vector width with best cost
    unsigned Cost; // Cost of the loop with that width
  };
  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every power of two up to VF. If UserVF is not ZERO
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor selectVectorizationFactor(bool OptForSize);

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(bool OptForSize, unsigned VF,
                                 unsigned LoopCost);

  /// \return The most profitable unroll factor.
  /// This method finds the best unroll-factor based on register pressure and
  /// other parameters. VF and LoopCost are the selected vectorization factor
  /// and the cost of the selected VF.
  unsigned computeInterleaveCount(bool OptForSize, unsigned VF,
                                  unsigned LoopCost);

  /// \brief A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    unsigned LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    unsigned MaxLocalUsers;
    /// Holds the number of instructions in the loop.
    unsigned NumInstructions;
  };

  /// \return Returns information about the register usages of the loop for the
  /// given vectorization factors.
  SmallVector<RegisterUsage, 8>
  calculateRegisterUsage(const SmallVector<unsigned, 8> &VFs);

private:
  /// Returns the expected execution cost. The unit of the cost does
  /// not matter because we use the 'cost' units to compare different
  /// vector widths. The cost that is returned is *not* normalized by
  /// the factor width.
  unsigned expectedCost(unsigned VF);

  /// Returns the execution time cost of an instruction for a given vector
  /// width. Vector width of one means scalar.
  unsigned getInstructionCost(Instruction *I, unsigned VF);

  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
  bool isConsecutiveLoadOrStore(Instruction *I);

  /// Report an analysis message to assist the user in diagnosing loops that are
  /// not vectorized. These are handled as LoopAccessReport rather than
  /// VectorizationReport because the << operator of VectorizationReport returns
  /// LoopAccessReport.
  void emitAnalysis(const LoopAccessReport &Message) const {
    emitAnalysisDiag(TheFunction, TheLoop, *Hints, Message);
  }

public:
  /// Map of scalar integer values to the smallest bitwidth they can be legally
  /// represented as. The vector equivalents of these values should be truncated
  /// to this type.
  MapVector<Instruction*,uint64_t> MinBWs;

  /// The loop that we evaluate.
  Loop *TheLoop;
  /// Scev analysis.
  ScalarEvolution *SE;
  /// Loop Info analysis.
  LoopInfo *LI;
  /// Vectorization legality.
  LoopVectorizationLegality *Legal;
  /// Vector target information.
  const TargetTransformInfo &TTI;
  /// Target Library Info.
  const TargetLibraryInfo *TLI;
  /// Demanded bits analysis
  DemandedBits *DB;
  /// Function that contains the loop (used for diagnostics).
  const Function *TheFunction;
  // Loop Vectorize Hint.
  const LoopVectorizeHints *Hints;
  // Values to ignore in the cost model.
  const SmallPtrSetImpl<const Value *> &ValuesToIgnore;
};

/// \brief This holds vectorization requirements that must be verified late in
/// the process. The requirements are set by legalize and costmodel. Once
/// vectorization has been determined to be possible and profitable the
/// requirements can be verified by looking for metadata or compiler options.
/// For example, some loops require FP commutativity which is only allowed if
/// vectorization is explicitly specified or if the fast-math compiler option
/// has been provided.
/// Late evaluation of these requirements allows helpful diagnostics to be
/// composed that tells the user what needs to be done to vectorize the loop.
/// For example, by specifying #pragma clang loop vectorize or -ffast-math.
/// Late evaluation should be used only when diagnostics can be generated that
/// can be followed by a non-expert user.
class LoopVectorizationRequirements {
public:
  LoopVectorizationRequirements()
      : NumRuntimePointerChecks(0), UnsafeAlgebraInst(nullptr) {}

  /// Record an instruction with unsafe (reassociation-requiring) FP algebra.
  /// Only the first such instruction is kept, for diagnostics.
  void addUnsafeAlgebraInst(Instruction *I) {
    // First unsafe algebra instruction.
    if (!UnsafeAlgebraInst)
      UnsafeAlgebraInst = I;
  }

  /// Record how many runtime pointer checks the loop would need.
  void addRuntimePointerChecks(unsigned Num) { NumRuntimePointerChecks = Num; }

  /// Verify the recorded requirements against the hints/options for loop \p L
  /// in function \p F. Emits analysis remarks and returns true if any
  /// requirement is NOT met (i.e. vectorization must be abandoned).
  bool doesNotMeet(Function *F, Loop *L, const LoopVectorizeHints &Hints) {
    const char *Name = Hints.vectorizeAnalysisPassName();
    bool Failed = false;
    // FP reordering is only acceptable when explicitly allowed.
    if (UnsafeAlgebraInst && !Hints.allowReordering()) {
      emitOptimizationRemarkAnalysisFPCommute(
          F->getContext(), Name, *F, UnsafeAlgebraInst->getDebugLoc(),
          VectorizationReport() << "cannot prove it is safe to reorder "
                                   "floating-point operations");
      Failed = true;
    }

    // Test if runtime memcheck thresholds are exceeded.
    bool PragmaThresholdReached =
        NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold;
    bool ThresholdReached =
        NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold;
    if ((ThresholdReached && !Hints.allowReordering()) ||
        PragmaThresholdReached) {
      emitOptimizationRemarkAnalysisAliasing(
          F->getContext(), Name, *F, L->getStartLoc(),
          VectorizationReport()
              << "cannot prove it is safe to reorder memory operations");
      DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
      Failed = true;
    }

    return Failed;
  }

private:
  /// Number of runtime pointer checks the loop requires.
  unsigned NumRuntimePointerChecks;
  /// First instruction found with unsafe FP algebra, or null.
  Instruction *UnsafeAlgebraInst;
};

/// Recursively collect the innermost loops of \p L into \p V.
/// A loop with no sub-loops is appended directly.
static void addInnerLoop(Loop &L, SmallVectorImpl<Loop *> &V) {
  if (L.empty())
    return V.push_back(&L);

  for (Loop *InnerL : L)
    addInnerLoop(*InnerL, V);
}

/// The LoopVectorize Pass.
struct LoopVectorize : public FunctionPass {
  /// Pass identification, replacement for typeid
  static char ID;

  explicit LoopVectorize(bool NoUnrolling = false, bool AlwaysVectorize = true)
      : FunctionPass(ID),
        DisableUnrolling(NoUnrolling),
        AlwaysVectorize(AlwaysVectorize) {
    initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
  }

  // Cached analysis results, (re)filled at the top of runOnFunction.
  ScalarEvolution *SE;
  LoopInfo *LI;
  TargetTransformInfo *TTI;
  DominatorTree *DT;
  BlockFrequencyInfo *BFI;
  TargetLibraryInfo *TLI;
  DemandedBits *DB;
  AliasAnalysis *AA;
  AssumptionCache *AC;
  LoopAccessAnalysis *LAA;
  // Pass-construction options (see the constructor).
  bool DisableUnrolling;
  bool AlwaysVectorize;

  // Entry frequency scaled to 20%; loops entered less often than this are
  // treated as cold (see processLoop).
  BlockFrequency ColdEntryFreq;

  /// Gather analyses, collect all inner loops of \p F, and try to vectorize
  /// each one. Returns true if any loop was changed.
  bool runOnFunction(Function &F) override {
    SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
    LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
    TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
    DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
    BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
    auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
    TLI = TLIP ? &TLIP->getTLI() : nullptr;
    AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
    AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
    LAA = &getAnalysis<LoopAccessAnalysis>();
    DB = &getAnalysis<DemandedBits>();

    // Compute some weights outside of the loop over the loops. Compute this
    // using a BranchProbability to re-use its scaling math.
    const BranchProbability ColdProb(1, 5); // 20%
    ColdEntryFreq = BlockFrequency(BFI->getEntryFreq()) * ColdProb;

    // Don't attempt if
    // 1. the target claims to have no vector registers, and
    // 2. interleaving won't help ILP.
    //
    // The second condition is necessary because, even if the target has no
    // vector registers, loop vectorization may still enable scalar
    // interleaving.
    if (!TTI->getNumberOfRegisters(true) && TTI->getMaxInterleaveFactor(1) < 2)
      return false;

    // Build up a worklist of inner-loops to vectorize. This is necessary as
    // the act of vectorizing or partially unrolling a loop creates new loops
    // and can invalidate iterators across the loops.
    SmallVector<Loop *, 8> Worklist;

    for (Loop *L : *LI)
      addInnerLoop(*L, Worklist);

    LoopsAnalyzed += Worklist.size();

    // Now walk the identified inner loops.
    bool Changed = false;
    while (!Worklist.empty())
      Changed |= processLoop(Worklist.pop_back_val());

    // Process each loop nest in the function.
    return Changed;
  }

  /// Attach "llvm.loop.unroll.runtime.disable" metadata to \p L, unless the
  /// loop already carries an "llvm.loop.unroll.disable"-prefixed entry.
  static void AddRuntimeUnrollDisableMetaData(Loop *L) {
    SmallVector<Metadata *, 4> MDs;
    // Reserve first location for self reference to the LoopID metadata node.
    MDs.push_back(nullptr);
    bool IsUnrollMetadata = false;
    MDNode *LoopID = L->getLoopID();
    if (LoopID) {
      // First find existing loop unrolling disable metadata.
      for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
        MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
        if (MD) {
          const MDString *S = dyn_cast<MDString>(MD->getOperand(0));
          IsUnrollMetadata =
              S && S->getString().startswith("llvm.loop.unroll.disable");
        }
        MDs.push_back(LoopID->getOperand(i));
      }
    }

    if (!IsUnrollMetadata) {
      // Add runtime unroll disable metadata.
      LLVMContext &Context = L->getHeader()->getContext();
      SmallVector<Metadata *, 1> DisableOperands;
      DisableOperands.push_back(
          MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
      MDNode *DisableNode = MDNode::get(Context, DisableOperands);
      MDs.push_back(DisableNode);
      MDNode *NewLoopID = MDNode::get(Context, MDs);
      // Set operand 0 to refer to the loop id itself.
      NewLoopID->replaceOperandWith(0, NewLoopID);
      L->setLoopID(NewLoopID);
    }
  }

  /// Drive vectorization of a single inner loop: check hints and trip count,
  /// run legality, cost-model selection of VF and interleave count, emit
  /// diagnostics, and finally either vectorize, interleave, or bail out.
  /// Returns true if the loop was transformed.
  bool processLoop(Loop *L) {
    assert(L->empty() && "Only process inner loops.");

#ifndef NDEBUG
    const std::string DebugLocStr = getDebugLocString(L);
#endif /* NDEBUG */

    DEBUG(dbgs() << "\nLV: Checking a loop in \""
                 << L->getHeader()->getParent()->getName() << "\" from "
                 << DebugLocStr << "\n");

    LoopVectorizeHints Hints(L, DisableUnrolling);

    DEBUG(dbgs() << "LV: Loop hints:"
                 << " force="
                 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
                         ? "disabled"
                         : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
                                ? "enabled"
                                : "?")) << " width=" << Hints.getWidth()
                 << " unroll=" << Hints.getInterleave() << "\n");

    // Function containing loop
    Function *F = L->getHeader()->getParent();

    // Looking at the diagnostic output is the only way to determine if a loop
    // was vectorized (other than looking at the IR or machine code), so it
    // is important to generate an optimization remark for each loop. Most of
    // these messages are generated by emitOptimizationRemarkAnalysis. Remarks
    // generated by emitOptimizationRemark and emitOptimizationRemarkMissed are
    // less verbose reporting vectorized loops and unvectorized loops that may
    // benefit from vectorization, respectively.

    if (!Hints.allowVectorization(F, L, AlwaysVectorize)) {
      DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
      return false;
    }

    // Check the loop for a trip count threshold:
    // do not vectorize loops with a tiny trip count.
    const unsigned TC = SE->getSmallConstantTripCount(L);
    if (TC > 0u && TC < TinyTripCountVectorThreshold) {
      DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
                   << "This loop is not worth vectorizing.");
      if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
        DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
      else {
        DEBUG(dbgs() << "\n");
        emitAnalysisDiag(F, L, Hints, VectorizationReport()
                                          << "vectorization is not beneficial "
                                             "and is not explicitly forced");
        return false;
      }
    }

    PredicatedScalarEvolution PSE(*SE);

    // Check if it is legal to vectorize the loop.
    LoopVectorizationRequirements Requirements;
    LoopVectorizationLegality LVL(L, PSE, DT, TLI, AA, F, TTI, LAA,
                                  &Requirements, &Hints);
    if (!LVL.canVectorize()) {
      DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
      emitMissedWarning(F, L, Hints);
      return false;
    }

    // Collect values we want to ignore in the cost model. This includes
    // type-promoting instructions we identified during reduction detection.
    SmallPtrSet<const Value *, 32> ValuesToIgnore;
    CodeMetrics::collectEphemeralValues(L, AC, ValuesToIgnore);
    for (auto &Reduction : *LVL.getReductionVars()) {
      RecurrenceDescriptor &RedDes = Reduction.second;
      SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
      ValuesToIgnore.insert(Casts.begin(), Casts.end());
    }

    // Use the cost model.
    LoopVectorizationCostModel CM(L, PSE.getSE(), LI, &LVL, *TTI, TLI, DB, AC,
                                  F, &Hints, ValuesToIgnore);

    // Check the function attributes to find out if this function should be
    // optimized for size.
    bool OptForSize = Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
                      F->optForSize();

    // Compute the weighted frequency of this loop being executed and see if it
    // is less than 20% of the function entry baseline frequency. Note that we
    // always have a canonical loop here because we think we *can* vectorize.
    // FIXME: This is hidden behind a flag due to pervasive problems with
    // exactly what block frequency models.
    if (LoopVectorizeWithBlockFrequency) {
      BlockFrequency LoopEntryFreq = BFI->getBlockFreq(L->getLoopPreheader());
      if (Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
          LoopEntryFreq < ColdEntryFreq)
        OptForSize = true;
    }

    // Check the function attributes to see if implicit floats are allowed.
    // FIXME: This check doesn't seem possibly correct -- what if the loop is
    // an integer loop and the vector instructions selected are purely integer
    // vector instructions?
    if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
      DEBUG(dbgs() << "LV: Can't vectorize when the NoImplicitFloat"
                      "attribute is used.\n");
      emitAnalysisDiag(
          F, L, Hints,
          VectorizationReport()
              << "loop not vectorized due to NoImplicitFloat attribute");
      emitMissedWarning(F, L, Hints);
      return false;
    }

    // Select the optimal vectorization factor.
    const LoopVectorizationCostModel::VectorizationFactor VF =
        CM.selectVectorizationFactor(OptForSize);

    // Select the interleave count.
    unsigned IC = CM.selectInterleaveCount(OptForSize, VF.Width, VF.Cost);

    // Get user interleave count.
    unsigned UserIC = Hints.getInterleave();

    // Identify the diagnostic messages that should be produced.
    std::string VecDiagMsg, IntDiagMsg;
    bool VectorizeLoop = true, InterleaveLoop = true;

    if (Requirements.doesNotMeet(F, L, Hints)) {
      DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
                      "requirements.\n");
      emitMissedWarning(F, L, Hints);
      return false;
    }

    if (VF.Width == 1) {
      DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
      VecDiagMsg =
          "the cost-model indicates that vectorization is not beneficial";
      VectorizeLoop = false;
    }

    if (IC == 1 && UserIC <= 1) {
      // Tell the user interleaving is not beneficial.
      DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
      IntDiagMsg =
          "the cost-model indicates that interleaving is not beneficial";
      InterleaveLoop = false;
      if (UserIC == 1)
        IntDiagMsg +=
            " and is explicitly disabled or interleave count is set to 1";
    } else if (IC > 1 && UserIC == 1) {
      // Tell the user interleaving is beneficial, but it is explicitly
      // disabled.
      DEBUG(dbgs()
            << "LV: Interleaving is beneficial but is explicitly disabled.");
      IntDiagMsg = "the cost-model indicates that interleaving is beneficial "
                   "but is explicitly disabled or interleave count is set to 1";
      InterleaveLoop = false;
    }

    // Override IC if user provided an interleave count.
    IC = UserIC > 0 ? UserIC : IC;

    // Emit diagnostic messages, if any.
    const char *VAPassName = Hints.vectorizeAnalysisPassName();
    if (!VectorizeLoop && !InterleaveLoop) {
      // Do not vectorize or interleave the loop.
      emitOptimizationRemarkAnalysis(F->getContext(), VAPassName, *F,
                                     L->getStartLoc(), VecDiagMsg);
      emitOptimizationRemarkAnalysis(F->getContext(), LV_NAME, *F,
                                     L->getStartLoc(), IntDiagMsg);
      return false;
    } else if (!VectorizeLoop && InterleaveLoop) {
      DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
      emitOptimizationRemarkAnalysis(F->getContext(), VAPassName, *F,
                                     L->getStartLoc(), VecDiagMsg);
    } else if (VectorizeLoop && !InterleaveLoop) {
      DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in "
                   << DebugLocStr << '\n');
      emitOptimizationRemarkAnalysis(F->getContext(), LV_NAME, *F,
                                     L->getStartLoc(), IntDiagMsg);
    } else if (VectorizeLoop && InterleaveLoop) {
      DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in "
                   << DebugLocStr << '\n');
      DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
    }

    if (!VectorizeLoop) {
      assert(IC > 1 && "interleave count should not be 1 or 0");
      // If we decided that it is not legal to vectorize the loop then
      // interleave it.
      InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, IC);
      Unroller.vectorize(&LVL, CM.MinBWs);

      emitOptimizationRemark(F->getContext(), LV_NAME, *F, L->getStartLoc(),
                             Twine("interleaved loop (interleaved count: ") +
                                 Twine(IC) + ")");
    } else {
      // If we decided that it is *legal* to vectorize the loop then do it.
      InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, VF.Width, IC);
      LB.vectorize(&LVL, CM.MinBWs);
      ++LoopsVectorized;

      // Add metadata to disable runtime unrolling of the scalar loop when
      // there are no runtime checks about strides and memory: in that
      // situation the scalar loop is rarely executed and not worth unrolling.
      if (!LB.IsSafetyChecksAdded())
        AddRuntimeUnrollDisableMetaData(L);

      // Report the vectorization decision.
      emitOptimizationRemark(F->getContext(), LV_NAME, *F, L->getStartLoc(),
                             Twine("vectorized loop (vectorization width: ") +
                                 Twine(VF.Width) + ", interleaved count: " +
                                 Twine(IC) + ")");
    }

    // Mark the loop as already vectorized to avoid vectorizing again.
    Hints.setAlreadyVectorized();

    DEBUG(verifyFunction(*L->getHeader()->getParent()));
    return true;
  }

  /// Declare the analyses this pass requires and preserves.
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequiredID(LoopSimplifyID);
    AU.addRequiredID(LCSSAID);
    AU.addRequired<BlockFrequencyInfoWrapperPass>();
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addRequired<LoopInfoWrapperPass>();
    AU.addRequired<ScalarEvolutionWrapperPass>();
    AU.addRequired<TargetTransformInfoWrapperPass>();
    AU.addRequired<AAResultsWrapperPass>();
    AU.addRequired<LoopAccessAnalysis>();
    AU.addRequired<DemandedBits>();
    AU.addPreserved<LoopInfoWrapperPass>();
    AU.addPreserved<DominatorTreeWrapperPass>();
    AU.addPreserved<BasicAAWrapperPass>();
    AU.addPreserved<AAResultsWrapperPass>();
    AU.addPreserved<GlobalsAAWrapperPass>();
  }

};

} // end anonymous namespace

//===----------------------------------------------------------------------===//
// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
// LoopVectorizationCostModel.
//===----------------------------------------------------------------------===//

Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
  // We need to place the broadcast of invariant variables outside the loop.
  Instruction *Instr = dyn_cast<Instruction>(V);
  // An instruction is "new" if it lives in one of the vector-loop blocks we
  // created; those must not be hoisted to the preheader.
  bool NewInstr =
      (Instr && std::find(LoopVectorBody.begin(), LoopVectorBody.end(),
                          Instr->getParent()) != LoopVectorBody.end());
  bool Invariant = OrigLoop->isLoopInvariant(V) && !NewInstr;

  // Place the code for broadcasting invariant variables in the new preheader.
  IRBuilder<>::InsertPointGuard Guard(Builder);
  if (Invariant)
    Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());

  // Broadcast the scalar into all locations in the vector.
  Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");

  return Shuf;
}

/// Return Val + Step * <StartIdx, StartIdx+1, ..., StartIdx+VLen-1>, i.e. a
/// vector of per-lane induction values starting at offset \p StartIdx.
Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx,
                                          Value *Step) {
  assert(Val->getType()->isVectorTy() && "Must be a vector");
  assert(Val->getType()->getScalarType()->isIntegerTy() &&
         "Elem must be an integer");
  assert(Step->getType() == Val->getType()->getScalarType() &&
         "Step has wrong type");
  // Create the types.
  Type *ITy = Val->getType()->getScalarType();
  VectorType *Ty = cast<VectorType>(Val->getType());
  int VLen = Ty->getNumElements();
  SmallVector<Constant*, 8> Indices;

  // Create a vector of consecutive numbers from zero to VF.
  for (int i = 0; i < VLen; ++i)
    Indices.push_back(ConstantInt::get(ITy, StartIdx + i));

  // Add the consecutive indices to the vector value.
  Constant *Cv = ConstantVector::get(Indices);
  assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
  Step = Builder.CreateVectorSplat(VLen, Step);
  assert(Step->getType() == Val->getType() && "Invalid step vec");
  // FIXME: The newly created binary instructions should contain nsw/nuw flags,
  // which can be found from the original scalar operations.
  Step = Builder.CreateMul(Cv, Step);
  return Builder.CreateAdd(Val, Step, "induction");
}

/// Returns +1/-1 if \p Ptr advances consecutively (forward/backward) with the
/// loop induction, and 0 otherwise. (Definition continues past this view.)
int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
  assert(Ptr->getType()->isPointerTy() && "Unexpected non-ptr");
  auto *SE = PSE.getSE();
  // Make sure that the pointer does not point to structs.
  if (Ptr->getType()->getPointerElementType()->isAggregateType())
    return 0;

  // If this value is a pointer induction variable we know it is consecutive.
  PHINode *Phi = dyn_cast_or_null<PHINode>(Ptr);
  if (Phi && Inductions.count(Phi)) {
    InductionDescriptor II = Inductions[Phi];
    return II.getConsecutiveDirection();
  }

  GetElementPtrInst *Gep = getGEPInstruction(Ptr);
  if (!Gep)
    return 0;

  unsigned NumOperands = Gep->getNumOperands();
  Value *GpPtr = Gep->getPointerOperand();
  // If this GEP value is a consecutive pointer induction variable and all of
  // the indices are constant then we know it is consecutive. We can
  Phi = dyn_cast<PHINode>(GpPtr);
  if (Phi && Inductions.count(Phi)) {

    // Make sure that the pointer does not point to structs.
    PointerType *GepPtrType = cast<PointerType>(GpPtr->getType());
    if (GepPtrType->getElementType()->isAggregateType())
      return 0;

    // Make sure that all of the index operands are loop invariant.
    for (unsigned i = 1; i < NumOperands; ++i)
      if (!SE->isLoopInvariant(PSE.getSCEV(Gep->getOperand(i)), TheLoop))
        return 0;

    InductionDescriptor II = Inductions[Phi];
    return II.getConsecutiveDirection();
  }

  unsigned InductionOperand = getGEPInductionOperand(Gep);

  // Check that all of the gep indices are uniform except for our induction
  // operand.
  for (unsigned i = 0; i != NumOperands; ++i)
    if (i != InductionOperand &&
        !SE->isLoopInvariant(PSE.getSCEV(Gep->getOperand(i)), TheLoop))
      return 0;

  // We can emit wide load/stores only if the last non-zero index is the
  // induction variable.
  const SCEV *Last = nullptr;
  if (!Strides.count(Gep))
    Last = PSE.getSCEV(Gep->getOperand(InductionOperand));
  else {
    // Because of the multiplication by a stride we can have a s/zext cast.
    // We are going to replace this stride by 1 so the cast is safe to ignore.
    //
    // %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
    // %0 = trunc i64 %indvars.iv to i32
    // %mul = mul i32 %0, %Stride1
    // %idxprom = zext i32 %mul to i64  << Safe cast.
2051276479Sdim // %arrayidx = getelementptr inbounds i32* %B, i64 %idxprom 2052276479Sdim // 2053296417Sdim Last = replaceSymbolicStrideSCEV(PSE, Strides, 2054276479Sdim Gep->getOperand(InductionOperand), Gep); 2055276479Sdim if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(Last)) 2056276479Sdim Last = 2057276479Sdim (C->getSCEVType() == scSignExtend || C->getSCEVType() == scZeroExtend) 2058276479Sdim ? C->getOperand() 2059276479Sdim : Last; 2060276479Sdim } 2061243789Sdim if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Last)) { 2062243789Sdim const SCEV *Step = AR->getStepRecurrence(*SE); 2063243789Sdim 2064243789Sdim // The memory is consecutive because the last index is consecutive 2065243789Sdim // and all other indices are loop invariant. 2066243789Sdim if (Step->isOne()) 2067249423Sdim return 1; 2068249423Sdim if (Step->isAllOnesValue()) 2069249423Sdim return -1; 2070243789Sdim } 2071243789Sdim 2072249423Sdim return 0; 2073243789Sdim} 2074243789Sdim 2075243789Sdimbool LoopVectorizationLegality::isUniform(Value *V) { 2076288943Sdim return LAI->isUniform(V); 2077243789Sdim} 2078243789Sdim 2079249423SdimInnerLoopVectorizer::VectorParts& 2080249423SdimInnerLoopVectorizer::getVectorValue(Value *V) { 2081249423Sdim assert(V != Induction && "The new induction variable should not be used."); 2082243789Sdim assert(!V->getType()->isVectorTy() && "Can't widen a vector"); 2083243789Sdim 2084276479Sdim // If we have a stride that is replaced by one, do it here. 2085276479Sdim if (Legal->hasStride(V)) 2086276479Sdim V = ConstantInt::get(V->getType(), 1); 2087276479Sdim 2088249423Sdim // If we have this scalar in the map, return it. 2089249423Sdim if (WidenMap.has(V)) 2090249423Sdim return WidenMap.get(V); 2091249423Sdim 2092249423Sdim // If this scalar is unknown, assume that it is a constant or that it is 2093249423Sdim // loop invariant. Broadcast V and save the value for future uses. 
2094243789Sdim Value *B = getBroadcastInstrs(V); 2095249423Sdim return WidenMap.splat(V, B); 2096243789Sdim} 2097243789Sdim 2098249423SdimValue *InnerLoopVectorizer::reverseVector(Value *Vec) { 2099249423Sdim assert(Vec->getType()->isVectorTy() && "Invalid type"); 2100249423Sdim SmallVector<Constant*, 8> ShuffleMask; 2101243789Sdim for (unsigned i = 0; i < VF; ++i) 2102249423Sdim ShuffleMask.push_back(Builder.getInt32(VF - i - 1)); 2103243789Sdim 2104249423Sdim return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()), 2105249423Sdim ConstantVector::get(ShuffleMask), 2106249423Sdim "reverse"); 2107243789Sdim} 2108243789Sdim 2109288943Sdim// Get a mask to interleave \p NumVec vectors into a wide vector. 2110288943Sdim// I.e. <0, VF, VF*2, ..., VF*(NumVec-1), 1, VF+1, VF*2+1, ...> 2111288943Sdim// E.g. For 2 interleaved vectors, if VF is 4, the mask is: 2112288943Sdim// <0, 4, 1, 5, 2, 6, 3, 7> 2113288943Sdimstatic Constant *getInterleavedMask(IRBuilder<> &Builder, unsigned VF, 2114288943Sdim unsigned NumVec) { 2115288943Sdim SmallVector<Constant *, 16> Mask; 2116288943Sdim for (unsigned i = 0; i < VF; i++) 2117288943Sdim for (unsigned j = 0; j < NumVec; j++) 2118288943Sdim Mask.push_back(Builder.getInt32(j * VF + i)); 2119288943Sdim 2120288943Sdim return ConstantVector::get(Mask); 2121288943Sdim} 2122288943Sdim 2123288943Sdim// Get the strided mask starting from index \p Start. 2124288943Sdim// I.e. 
<Start, Start + Stride, ..., Start + Stride*(VF-1)> 2125288943Sdimstatic Constant *getStridedMask(IRBuilder<> &Builder, unsigned Start, 2126288943Sdim unsigned Stride, unsigned VF) { 2127288943Sdim SmallVector<Constant *, 16> Mask; 2128288943Sdim for (unsigned i = 0; i < VF; i++) 2129288943Sdim Mask.push_back(Builder.getInt32(Start + i * Stride)); 2130288943Sdim 2131288943Sdim return ConstantVector::get(Mask); 2132288943Sdim} 2133288943Sdim 2134288943Sdim// Get a mask of two parts: The first part consists of sequential integers 2135288943Sdim// starting from 0, The second part consists of UNDEFs. 2136288943Sdim// I.e. <0, 1, 2, ..., NumInt - 1, undef, ..., undef> 2137288943Sdimstatic Constant *getSequentialMask(IRBuilder<> &Builder, unsigned NumInt, 2138288943Sdim unsigned NumUndef) { 2139288943Sdim SmallVector<Constant *, 16> Mask; 2140288943Sdim for (unsigned i = 0; i < NumInt; i++) 2141288943Sdim Mask.push_back(Builder.getInt32(i)); 2142288943Sdim 2143288943Sdim Constant *Undef = UndefValue::get(Builder.getInt32Ty()); 2144288943Sdim for (unsigned i = 0; i < NumUndef; i++) 2145288943Sdim Mask.push_back(Undef); 2146288943Sdim 2147288943Sdim return ConstantVector::get(Mask); 2148288943Sdim} 2149288943Sdim 2150288943Sdim// Concatenate two vectors with the same element type. The 2nd vector should 2151288943Sdim// not have more elements than the 1st vector. If the 2nd vector has less 2152288943Sdim// elements, extend it with UNDEFs. 
2153288943Sdimstatic Value *ConcatenateTwoVectors(IRBuilder<> &Builder, Value *V1, 2154288943Sdim Value *V2) { 2155288943Sdim VectorType *VecTy1 = dyn_cast<VectorType>(V1->getType()); 2156288943Sdim VectorType *VecTy2 = dyn_cast<VectorType>(V2->getType()); 2157288943Sdim assert(VecTy1 && VecTy2 && 2158288943Sdim VecTy1->getScalarType() == VecTy2->getScalarType() && 2159288943Sdim "Expect two vectors with the same element type"); 2160288943Sdim 2161288943Sdim unsigned NumElts1 = VecTy1->getNumElements(); 2162288943Sdim unsigned NumElts2 = VecTy2->getNumElements(); 2163288943Sdim assert(NumElts1 >= NumElts2 && "Unexpect the first vector has less elements"); 2164288943Sdim 2165288943Sdim if (NumElts1 > NumElts2) { 2166288943Sdim // Extend with UNDEFs. 2167288943Sdim Constant *ExtMask = 2168288943Sdim getSequentialMask(Builder, NumElts2, NumElts1 - NumElts2); 2169288943Sdim V2 = Builder.CreateShuffleVector(V2, UndefValue::get(VecTy2), ExtMask); 2170288943Sdim } 2171288943Sdim 2172288943Sdim Constant *Mask = getSequentialMask(Builder, NumElts1 + NumElts2, 0); 2173288943Sdim return Builder.CreateShuffleVector(V1, V2, Mask); 2174288943Sdim} 2175288943Sdim 2176288943Sdim// Concatenate vectors in the given list. All vectors have the same type. 
static Value *ConcatenateVectors(IRBuilder<> &Builder,
                                 ArrayRef<Value *> InputList) {
  unsigned NumVec = InputList.size();
  assert(NumVec > 1 && "Should be at least two vectors");

  SmallVector<Value *, 8> ResList;
  ResList.append(InputList.begin(), InputList.end());
  // Reduce the list pairwise: each round concatenates adjacent pairs,
  // halving the number of vectors until a single wide vector remains.
  do {
    SmallVector<Value *, 8> TmpList;
    for (unsigned i = 0; i < NumVec - 1; i += 2) {
      Value *V0 = ResList[i], *V1 = ResList[i + 1];
      assert((V0->getType() == V1->getType() || i == NumVec - 2) &&
             "Only the last vector may have a different type");

      TmpList.push_back(ConcatenateTwoVectors(Builder, V0, V1));
    }

    // Push the last vector if the total number of vectors is odd.
    if (NumVec % 2 != 0)
      TmpList.push_back(ResList[NumVec - 1]);

    ResList = TmpList;
    NumVec = ResList.size();
  } while (NumVec > 1);

  return ResList[0];
}

// Try to vectorize the interleave group that \p Instr belongs to.
//
// E.g. Translate following interleaved load group (factor = 3):
//   for (i = 0; i < N; i+=3) {
//     R = Pic[i];             // Member of index 0
//     G = Pic[i+1];           // Member of index 1
//     B = Pic[i+2];           // Member of index 2
//     ... // do something to R, G, B
//   }
// To:
//   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
//   %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9>   ; R elements
//   %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10>  ; G elements
//   %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11>  ; B elements
//
// Or translate following interleaved store group (factor = 3):
//   for (i = 0; i < N; i+=3) {
//     ... do something to R, G, B
//     Pic[i]   = R;           // Member of index 0
//     Pic[i+1] = G;           // Member of index 1
//     Pic[i+2] = B;           // Member of index 2
//   }
// To:
//   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
//   %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
//   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
//        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
//   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {
  const InterleaveGroup *Group = Legal->getInterleavedAccessGroup(Instr);
  assert(Group && "Fail to get an interleaved access group.");

  // Skip if current instruction is not the insert position: the whole group
  // is emitted once, when its designated insert-position member is visited.
  if (Instr != Group->getInsertPos())
    return;

  LoadInst *LI = dyn_cast<LoadInst>(Instr);
  StoreInst *SI = dyn_cast<StoreInst>(Instr);
  Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand();

  // Prepare for the vector type of the interleaved load/store.
  Type *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
  unsigned InterleaveFactor = Group->getFactor();
  Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF);
  Type *PtrTy = VecTy->getPointerTo(Ptr->getType()->getPointerAddressSpace());

  // Prepare for the new pointers.
  setDebugLocFromInst(Builder, Ptr);
  VectorParts &PtrParts = getVectorValue(Ptr);
  SmallVector<Value *, 2> NewPtrs;
  unsigned Index = Group->getIndex(Instr);
  for (unsigned Part = 0; Part < UF; Part++) {
    // Extract the pointer for current instruction from the pointer vector. A
    // reverse access uses the pointer in the last lane.
    Value *NewPtr = Builder.CreateExtractElement(
        PtrParts[Part],
        Group->isReverse() ? Builder.getInt32(VF - 1) : Builder.getInt32(0));

    // Notice current instruction could be any index. Need to adjust the address
    // to the member of index 0.
    //
    // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
    //       b = A[i];       // Member of index 0
    // Current pointer is pointed to A[i+1], adjust it to A[i].
    //
    // E.g.  A[i+1] = a;     // Member of index 1
    //       A[i]   = b;     // Member of index 0
    //       A[i+2] = c;     // Member of index 2 (Current instruction)
    // Current pointer is pointed to A[i+2], adjust it to A[i].
    NewPtr = Builder.CreateGEP(NewPtr, Builder.getInt32(-Index));

    // Cast to the vector pointer type.
    NewPtrs.push_back(Builder.CreateBitCast(NewPtr, PtrTy));
  }

  setDebugLocFromInst(Builder, Instr);
  Value *UndefVec = UndefValue::get(VecTy);

  // Vectorize the interleaved load group: one wide load per unroll part,
  // then a strided shuffle to de-interleave each member's lanes.
  if (LI) {
    for (unsigned Part = 0; Part < UF; Part++) {
      Instruction *NewLoadInstr = Builder.CreateAlignedLoad(
          NewPtrs[Part], Group->getAlignment(), "wide.vec");

      for (unsigned i = 0; i < InterleaveFactor; i++) {
        Instruction *Member = Group->getMember(i);

        // Skip the gaps in the group.
        if (!Member)
          continue;

        Constant *StrideMask = getStridedMask(Builder, i, InterleaveFactor, VF);
        Value *StridedVec = Builder.CreateShuffleVector(
            NewLoadInstr, UndefVec, StrideMask, "strided.vec");

        // If this member has different type, cast the result type.
        if (Member->getType() != ScalarTy) {
          VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
          StridedVec = Builder.CreateBitOrPointerCast(StridedVec, OtherVTy);
        }

        VectorParts &Entry = WidenMap.get(Member);
        Entry[Part] =
            Group->isReverse() ? reverseVector(StridedVec) : StridedVec;
      }

      propagateMetadata(NewLoadInstr, Instr);
    }
    return;
  }

  // The sub vector type for current instruction.
  VectorType *SubVT = VectorType::get(ScalarTy, VF);

  // Vectorize the interleaved store group: concatenate the member vectors,
  // interleave their lanes, and emit one wide store per unroll part.
  for (unsigned Part = 0; Part < UF; Part++) {
    // Collect the stored vector from each member.
    SmallVector<Value *, 4> StoredVecs;
    for (unsigned i = 0; i < InterleaveFactor; i++) {
      // Interleaved store group doesn't allow a gap, so each index has a member
      Instruction *Member = Group->getMember(i);
      assert(Member && "Fail to get a member from an interleaved store group");

      Value *StoredVec =
          getVectorValue(dyn_cast<StoreInst>(Member)->getValueOperand())[Part];
      if (Group->isReverse())
        StoredVec = reverseVector(StoredVec);

      // If this member has different type, cast it to an unified type.
      if (StoredVec->getType() != SubVT)
        StoredVec = Builder.CreateBitOrPointerCast(StoredVec, SubVT);

      StoredVecs.push_back(StoredVec);
    }

    // Concatenate all vectors into a wide vector.
    Value *WideVec = ConcatenateVectors(Builder, StoredVecs);

    // Interleave the elements in the wide vector.
    Constant *IMask = getInterleavedMask(Builder, VF, InterleaveFactor);
    Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask,
                                              "interleaved.vec");

    Instruction *NewStoreInstr =
        Builder.CreateAlignedStore(IVec, NewPtrs[Part], Group->getAlignment());
    propagateMetadata(NewStoreInstr, Instr);
  }
}

void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
  // Attempt to issue a wide load.
  LoadInst *LI = dyn_cast<LoadInst>(Instr);
  StoreInst *SI = dyn_cast<StoreInst>(Instr);

  assert((LI || SI) && "Invalid Load/Store instruction");

  // Try to vectorize the interleave group if this access is interleaved.
  if (Legal->isAccessInterleaved(Instr))
    return vectorizeInterleaveGroup(Instr);

  Type *ScalarDataTy = LI ? LI->getType() : SI->getValueOperand()->getType();
  Type *DataTy = VectorType::get(ScalarDataTy, VF);
  Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand();
  unsigned Alignment = LI ? LI->getAlignment() : SI->getAlignment();
  // An alignment of 0 means target abi alignment. We need to use the scalar's
  // target abi alignment in such a case.
  const DataLayout &DL = Instr->getModule()->getDataLayout();
  if (!Alignment)
    Alignment = DL.getABITypeAlignment(ScalarDataTy);
  unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
  unsigned ScalarAllocatedSize = DL.getTypeAllocSize(ScalarDataTy);
  unsigned VectorElementSize = DL.getTypeStoreSize(DataTy) / VF;

  // A predicated store that cannot use a masked store must be scalarized
  // (and each scalar copy predicated individually).
  if (SI && Legal->blockNeedsPredication(SI->getParent()) &&
      !Legal->isMaskRequired(SI))
    return scalarizeInstruction(Instr, true);

  // If the element's allocated size differs from its store size, lanes in a
  // wide access would not be laid out like the scalar accesses; scalarize.
  if (ScalarAllocatedSize != VectorElementSize)
    return scalarizeInstruction(Instr);

  // If the pointer is loop invariant or if it is non-consecutive,
  // scalarize the load.
  int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
  bool Reverse = ConsecutiveStride < 0;
  bool UniformLoad = LI && Legal->isUniform(Ptr);
  if (!ConsecutiveStride || UniformLoad)
    return scalarizeInstruction(Instr);

  Constant *Zero = Builder.getInt32(0);
  VectorParts &Entry = WidenMap.get(Instr);

  // Handle consecutive loads/stores.
  GetElementPtrInst *Gep = getGEPInstruction(Ptr);
  if (Gep && Legal->isInductionVariable(Gep->getPointerOperand())) {
    setDebugLocFromInst(Builder, Gep);
    Value *PtrOperand = Gep->getPointerOperand();
    // Only lane 0 of the widened base is needed: the access is consecutive,
    // so the first lane's address determines the whole wide access.
    Value *FirstBasePtr = getVectorValue(PtrOperand)[0];
    FirstBasePtr = Builder.CreateExtractElement(FirstBasePtr, Zero);

    // Create the new GEP with the new induction variable.
    GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
    Gep2->setOperand(0, FirstBasePtr);
    Gep2->setName("gep.indvar.base");
    Ptr = Builder.Insert(Gep2);
  } else if (Gep) {
    setDebugLocFromInst(Builder, Gep);
    assert(PSE.getSE()->isLoopInvariant(PSE.getSCEV(Gep->getPointerOperand()),
                                        OrigLoop) &&
           "Base ptr must be invariant");

    // The last index does not have to be the induction. It can be
    // consecutive and be a function of the index. For example A[I+1];
    unsigned NumOperands = Gep->getNumOperands();
    unsigned InductionOperand = getGEPInductionOperand(Gep);
    // Create the new GEP with the new induction variable.
    GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());

    for (unsigned i = 0; i < NumOperands; ++i) {
      Value *GepOperand = Gep->getOperand(i);
      Instruction *GepOperandInst = dyn_cast<Instruction>(GepOperand);

      // Update last index or loop invariant instruction anchored in loop.
      if (i == InductionOperand ||
          (GepOperandInst && OrigLoop->contains(GepOperandInst))) {
        assert((i == InductionOperand ||
                PSE.getSE()->isLoopInvariant(PSE.getSCEV(GepOperandInst),
                                             OrigLoop)) &&
               "Must be last index or loop invariant");

        VectorParts &GEPParts = getVectorValue(GepOperand);
        Value *Index = GEPParts[0];
        Index = Builder.CreateExtractElement(Index, Zero);
        Gep2->setOperand(i, Index);
        Gep2->setName("gep.indvar.idx");
      }
    }
    Ptr = Builder.Insert(Gep2);
  } else {
    // Use the induction element ptr.
    assert(isa<PHINode>(Ptr) && "Invalid induction ptr");
    setDebugLocFromInst(Builder, Ptr);
    VectorParts &PtrVal = getVectorValue(Ptr);
    Ptr = Builder.CreateExtractElement(PtrVal[0], Zero);
  }

  VectorParts Mask = createBlockInMask(Instr->getParent());
  // Handle Stores:
  if (SI) {
    assert(!Legal->isUniform(SI->getPointerOperand()) &&
           "We do not allow storing to uniform addresses");
    setDebugLocFromInst(Builder, SI);
    // We don't want to update the value in the map as it might be used in
    // another expression. So don't use a reference type for "StoredVal".
    VectorParts StoredVal = getVectorValue(SI->getValueOperand());

    for (unsigned Part = 0; Part < UF; ++Part) {
      // Calculate the pointer for the specific unroll-part.
      Value *PartPtr =
          Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(Part * VF));

      if (Reverse) {
        // If we store to reverse consecutive memory locations, then we need
        // to reverse the order of elements in the stored value.
        StoredVal[Part] = reverseVector(StoredVal[Part]);
        // If the address is consecutive but reversed, then the
        // wide store needs to start at the last vector element.
        PartPtr = Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(-Part * VF));
        PartPtr = Builder.CreateGEP(nullptr, PartPtr, Builder.getInt32(1 - VF));
        Mask[Part] = reverseVector(Mask[Part]);
      }

      Value *VecPtr = Builder.CreateBitCast(PartPtr,
                                            DataTy->getPointerTo(AddressSpace));

      Instruction *NewSI;
      if (Legal->isMaskRequired(SI))
        NewSI = Builder.CreateMaskedStore(StoredVal[Part], VecPtr, Alignment,
                                          Mask[Part]);
      else
        NewSI = Builder.CreateAlignedStore(StoredVal[Part], VecPtr, Alignment);
      propagateMetadata(NewSI, SI);
    }
    return;
  }

  // Handle loads.
  assert(LI && "Must have a load instruction");
  setDebugLocFromInst(Builder, LI);
  for (unsigned Part = 0; Part < UF; ++Part) {
    // Calculate the pointer for the specific unroll-part.
    Value *PartPtr =
        Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(Part * VF));

    if (Reverse) {
      // If the address is consecutive but reversed, then the
      // wide load needs to start at the last vector element.
      PartPtr = Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(-Part * VF));
      PartPtr = Builder.CreateGEP(nullptr, PartPtr, Builder.getInt32(1 - VF));
      Mask[Part] = reverseVector(Mask[Part]);
    }

    Instruction* NewLI;
    Value *VecPtr = Builder.CreateBitCast(PartPtr,
                                          DataTy->getPointerTo(AddressSpace));
    if (Legal->isMaskRequired(LI))
      NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment, Mask[Part],
                                       UndefValue::get(DataTy),
                                       "wide.masked.load");
    else
      NewLI = Builder.CreateAlignedLoad(VecPtr, Alignment, "wide.load");
    propagateMetadata(NewLI, LI);
    Entry[Part] = Reverse ? reverseVector(NewLI) : NewLI;
  }
}

/// Widen \p Instr by cloning it once per lane and unroll part (VF * UF scalar
/// copies) and gathering the results into vectors. If \p IfPredicateStore is
/// true, each cloned store is recorded in PredicatedStores together with its
/// lane condition instead of being predicated inline.
void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
                                               bool IfPredicateStore) {
  assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
  // Holds vector parameters or scalars, in case of uniform vals.
  SmallVector<VectorParts, 4> Params;

  setDebugLocFromInst(Builder, Instr);

  // Find all of the vectorized parameters.
  for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
    Value *SrcOp = Instr->getOperand(op);

    // If we are accessing the old induction variable, use the new one.
    if (SrcOp == OldInduction) {
      Params.push_back(getVectorValue(SrcOp));
      continue;
    }

    // Try using previously calculated values.
    Instruction *SrcInst = dyn_cast<Instruction>(SrcOp);

    // If the src is an instruction that appeared earlier in the basic block,
    // then it should already be vectorized.
    if (SrcInst && OrigLoop->contains(SrcInst)) {
      assert(WidenMap.has(SrcInst) && "Source operand is unavailable");
      // The parameter is a vector value from earlier.
      Params.push_back(WidenMap.get(SrcInst));
    } else {
      // The parameter is a scalar from outside the loop. Maybe even a constant.
      VectorParts Scalars;
      Scalars.append(UF, SrcOp);
      Params.push_back(Scalars);
    }
  }

  assert(Params.size() == Instr->getNumOperands() &&
         "Invalid number of operands");

  // Does this instruction return a value ?
  bool IsVoidRetTy = Instr->getType()->isVoidTy();

  Value *UndefVec = IsVoidRetTy ? nullptr :
    UndefValue::get(VectorType::get(Instr->getType(), VF));
  // Create a new entry in the WidenMap and initialize it to Undef or Null.
  VectorParts &VecResults = WidenMap.splat(Instr, UndefVec);

  // For a predicated store, compute the mask of the single incoming edge;
  // each cloned lane is guarded by its lane of this mask.
  VectorParts Cond;
  if (IfPredicateStore) {
    assert(Instr->getParent()->getSinglePredecessor() &&
           "Only support single predecessor blocks");
    Cond = createEdgeMask(Instr->getParent()->getSinglePredecessor(),
                          Instr->getParent());
  }

  // For each vector unroll 'part':
  for (unsigned Part = 0; Part < UF; ++Part) {
    // For each scalar that we create:
    for (unsigned Width = 0; Width < VF; ++Width) {

      // Start if-block.
      Value *Cmp = nullptr;
      if (IfPredicateStore) {
        Cmp = Builder.CreateExtractElement(Cond[Part], Builder.getInt32(Width));
        Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cmp,
                                 ConstantInt::get(Cmp->getType(), 1));
      }

      Instruction *Cloned = Instr->clone();
      if (!IsVoidRetTy)
        Cloned->setName(Instr->getName() + ".cloned");
      // Replace the operands of the cloned instructions with extracted scalars.
      for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
        Value *Op = Params[op][Part];
        // Param is a vector. Need to extract the right lane.
        if (Op->getType()->isVectorTy())
          Op = Builder.CreateExtractElement(Op, Builder.getInt32(Width));
        Cloned->setOperand(op, Op);
      }

      // Place the cloned scalar in the new loop.
      Builder.Insert(Cloned);

      // If the original scalar returns a value we need to place it in a vector
      // so that future users will be able to use it.
      if (!IsVoidRetTy)
        VecResults[Part] = Builder.CreateInsertElement(VecResults[Part], Cloned,
                                                       Builder.getInt32(Width));
      // End if-block.
      if (IfPredicateStore)
        PredicatedStores.push_back(std::make_pair(cast<StoreInst>(Cloned),
                                                  Cmp));
    }
  }
}

/// Create the induction variable for loop \p L: a PHI in the header that
/// starts at \p Start, is incremented by \p Step in the latch, and branches
/// to the exit block once the incremented value equals \p End.
PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
                                                      Value *End, Value *Step,
                                                      Instruction *DL) {
  BasicBlock *Header = L->getHeader();
  BasicBlock *Latch = L->getLoopLatch();
  // As we're just creating this loop, it's possible no latch exists
  // yet. If so, use the header as this will be a single block loop.
  if (!Latch)
    Latch = Header;

  IRBuilder<> Builder(&*Header->getFirstInsertionPt());
  setDebugLocFromInst(Builder, getDebugLocFromInstOrOperands(OldInduction));
  auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");

  Builder.SetInsertPoint(Latch->getTerminator());

  // Create i+1 and fill the PHINode.
  Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
  Induction->addIncoming(Start, L->getLoopPreheader());
  Induction->addIncoming(Next, Latch);
  // Create the compare.
  Value *ICmp = Builder.CreateICmpEQ(Next, End);
  Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);

  // Now we have two terminators. Remove the old one from the block.
  Latch->getTerminator()->eraseFromParent();

  return Induction;
}

/// Compute (and cache in TripCount) the trip count of the original loop,
/// materialized as IR in the preheader of \p L.
Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
  if (TripCount)
    return TripCount;

  IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
  // Find the loop boundaries.
  ScalarEvolution *SE = PSE.getSE();
  const SCEV *BackedgeTakenCount = SE->getBackedgeTakenCount(OrigLoop);
  assert(BackedgeTakenCount != SE->getCouldNotCompute() &&
         "Invalid loop count");

  Type *IdxTy = Legal->getWidestInductionType();

  // The exit count might have the type of i64 while the phi is i32. This can
  // happen if we have an induction variable that is sign extended before the
  // compare. The only way that we get a backedge taken count is that the
  // induction variable was signed and as such will not overflow. In such a case
  // truncation is legal.
  if (BackedgeTakenCount->getType()->getPrimitiveSizeInBits() >
      IdxTy->getPrimitiveSizeInBits())
    BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
  BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);

  // Get the total trip count from the count by adding 1.
  const SCEV *ExitCount = SE->getAddExpr(
      BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));

  const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();

  // Expand the trip count and place the new instructions in the preheader.
  // Notice that the pre-header does not change, only the loop body.
  SCEVExpander Exp(*SE, DL, "induction");

  // Count holds the overall loop count (N).
  TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
                                L->getLoopPreheader()->getTerminator());

  if (TripCount->getType()->isPointerTy())
    TripCount =
        CastInst::CreatePointerCast(TripCount, IdxTy,
                                    "exitcount.ptrcnt.to.int",
                                    L->getLoopPreheader()->getTerminator());

  return TripCount;
}

/// Compute (and cache in VectorTripCount) the number of iterations executed
/// by the vector body: TC - (TC % (VF * UF)).
Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
  if (VectorTripCount)
    return VectorTripCount;

  Value *TC = getOrCreateTripCount(L);
  IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());

  // Now we need to generate the expression for N - (N % VF), which is
  // the part that the vectorized body will execute.
  // The loop step is equal to the vectorization factor (num of SIMD elements)
  // times the unroll factor (num of SIMD instructions).
  Constant *Step = ConstantInt::get(TC->getType(), VF * UF);
  Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
  VectorTripCount = Builder.CreateSub(TC, R, "n.vec");

  return VectorTripCount;
}

void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
                                                         BasicBlock *Bypass) {
  Value *Count = getOrCreateTripCount(L);
  BasicBlock *BB = L->getLoopPreheader();
  IRBuilder<> Builder(BB->getTerminator());

  // Generate code to check that the loop's trip count that we computed by
  // adding one to the backedge-taken count will not overflow.
/// Emit, in the preheader of \p L, a check that the loop runs at least
/// VF * UF iterations; branch to \p Bypass (the scalar loop) otherwise.
/// This also catches the overflow case: if the backedge-taken count was
/// uint##_max, adding one wrapped the trip count to 0, which is < VF * UF.
void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
                                                         BasicBlock *Bypass) {
  Value *Count = getOrCreateTripCount(L);
  BasicBlock *BB = L->getLoopPreheader();
  IRBuilder<> Builder(BB->getTerminator());

  // Generate code to check that the loop's trip count that we computed by
  // adding one to the backedge-taken count will not overflow.
  Value *CheckMinIters =
      Builder.CreateICmpULT(Count,
                            ConstantInt::get(Count->getType(), VF * UF),
                            "min.iters.check");

  // Split the preheader so the check gets its own block, then replace the
  // unconditional fall-through with a conditional branch to the bypass.
  BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(),
                                          "min.iters.checked");
  if (L->getParentLoop())
    L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
  ReplaceInstWithInst(BB->getTerminator(),
                      BranchInst::Create(Bypass, NewBB, CheckMinIters));
  LoopBypassBlocks.push_back(BB);
}

/// Emit, in the preheader of \p L, a check that the rounded-down vector trip
/// count N - (N % (VF * UF)) is non-zero; branch to \p Bypass (the scalar
/// loop) when the vector body would execute zero iterations.
void InnerLoopVectorizer::emitVectorLoopEnteredCheck(Loop *L,
                                                     BasicBlock *Bypass) {
  Value *TC = getOrCreateVectorTripCount(L);
  BasicBlock *BB = L->getLoopPreheader();
  IRBuilder<> Builder(BB->getTerminator());

  // Now, compare the new count to zero. If it is zero skip the vector loop and
  // jump to the scalar loop.
  Value *Cmp = Builder.CreateICmpEQ(TC, Constant::getNullValue(TC->getType()),
                                    "cmp.zero");

  // Split the preheader so the check gets its own block, then replace the
  // unconditional fall-through with a conditional branch to the bypass.
  BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(),
                                          "vector.ph");
  if (L->getParentLoop())
    L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
  ReplaceInstWithInst(BB->getTerminator(),
                      BranchInst::Create(Bypass, NewBB, Cmp));
  LoopBypassBlocks.push_back(BB);
}
/// Emit a runtime check of the SCEV predicate assumptions (from PSE) into the
/// preheader of \p L; branch to \p Bypass (the scalar loop) if a predicate
/// fails at runtime. No code is emitted when the predicate folds to
/// "always holds".
void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
  BasicBlock *BB = L->getLoopPreheader();

  // Generate the code to check that the SCEV assumptions that we made.
  // We want the new basic block to start at the first instruction in a
  // sequence of instructions that form a check.
  SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
                   "scev.check");
  Value *SCEVCheck =
      Exp.expandCodeForPredicate(&PSE.getUnionPredicate(), BB->getTerminator());

  // A constant-zero check means the predicate can never fail; nothing to do.
  if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
    if (C->isZero())
      return;

  // Create a new block containing the stride check.
  BB->setName("vector.scevcheck");
  auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
  if (L->getParentLoop())
    L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
  ReplaceInstWithInst(BB->getTerminator(),
                      BranchInst::Create(Bypass, NewBB, SCEVCheck));
  LoopBypassBlocks.push_back(BB);
  AddedSafetyChecks = true;
}

/// Emit the runtime memory-overlap (aliasing) checks computed by LoopAccess
/// analysis into the preheader of \p L; branch to \p Bypass (the scalar loop)
/// if any pointers may overlap. No code is emitted when no checks are needed.
void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L,
                                               BasicBlock *Bypass) {
  BasicBlock *BB = L->getLoopPreheader();

  // Generate the code that checks in runtime if arrays overlap. We put the
  // checks into a separate block to make the more common case of few elements
  // faster.
  Instruction *FirstCheckInst;
  Instruction *MemRuntimeCheck;
  std::tie(FirstCheckInst, MemRuntimeCheck) =
      Legal->getLAI()->addRuntimeChecks(BB->getTerminator());
  // Null result means no checks were required.
  if (!MemRuntimeCheck)
    return;

  // Create a new block containing the memory check.
  BB->setName("vector.memcheck");
  auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
  if (L->getParentLoop())
    L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
  ReplaceInstWithInst(BB->getTerminator(),
                      BranchInst::Create(Bypass, NewBB, MemRuntimeCheck));
  LoopBypassBlocks.push_back(BB);
  AddedSafetyChecks = true;
}
/// Build the skeleton CFG for vectorization: a vector loop guarded by the
/// iteration-count / SCEV / memory runtime checks, a middle block, and the
/// original loop kept as the scalar remainder. Fills in the Loop* state
/// members (LoopVectorPreHeader, LoopScalarPreHeader, ...) for later phases.
void InnerLoopVectorizer::createEmptyLoop() {
  /*
   In this function we generate a new loop. The new loop will contain
   the vectorized instructions while the old loop will continue to run the
   scalar remainder.

       [ ]     <-- loop iteration number check.
    /   |
   /    v
  |    [ ]     <-- vector loop bypass (may consist of multiple blocks).
  |  /  |
  | /   v
  ||   [ ]     <-- vector pre header.
  |/    |
  |     v
  |    [  ] \
  |    [  ]_|  <-- vector loop.
  |     |
  |     v
  |   -[ ]     <--- middle-block.
  |  /  |
  | /   v
  -|- >[ ]     <--- new preheader.
   |    |
   |    v
   |   [ ] \
   |   [ ]_|  <-- old scalar loop to handle remainder.
    \   |
     \  v
      >[ ]     <-- exit block.
   ...
   */

  BasicBlock *OldBasicBlock = OrigLoop->getHeader();
  BasicBlock *VectorPH = OrigLoop->getLoopPreheader();
  BasicBlock *ExitBlock = OrigLoop->getExitBlock();
  assert(VectorPH && "Invalid loop structure");
  assert(ExitBlock && "Must have an exit block");

  // Some loops have a single integer induction variable, while other loops
  // don't. One example is c++ iterators that often have multiple pointer
  // induction variables. In the code below we also support a case where we
  // don't have a single induction variable.
  //
  // We try to obtain an induction variable from the original loop as hard
  // as possible. However if we don't find one that:
  //   - is an integer
  //   - counts from zero, stepping by one
  //   - is the size of the widest induction variable type
  // then we create a new one.
  OldInduction = Legal->getInduction();
  Type *IdxTy = Legal->getWidestInductionType();

  // Split the single block loop into the two loop structure described above.
  BasicBlock *VecBody =
      VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body");
  BasicBlock *MiddleBlock =
      VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block");
  BasicBlock *ScalarPH =
      MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph");

  // Create and register the new vector loop.
  Loop* Lp = new Loop();
  Loop *ParentLoop = OrigLoop->getParentLoop();

  // Insert the new loop into the loop nest and register the new basic blocks
  // before calling any utilities such as SCEV that require valid LoopInfo.
  if (ParentLoop) {
    ParentLoop->addChildLoop(Lp);
    ParentLoop->addBasicBlockToLoop(ScalarPH, *LI);
    ParentLoop->addBasicBlockToLoop(MiddleBlock, *LI);
  } else {
    LI->addTopLevelLoop(Lp);
  }
  Lp->addBasicBlockToLoop(VecBody, *LI);

  // Find the loop boundaries.
  Value *Count = getOrCreateTripCount(Lp);

  Value *StartIdx = ConstantInt::get(IdxTy, 0);

  // We need to test whether the backedge-taken count is uint##_max. Adding one
  // to it will cause overflow and an incorrect loop trip count in the vector
  // body. In case of overflow we want to directly jump to the scalar remainder
  // loop.
  emitMinimumIterationCountCheck(Lp, ScalarPH);
  // Now, compare the new count to zero. If it is zero skip the vector loop and
  // jump to the scalar loop.
  emitVectorLoopEnteredCheck(Lp, ScalarPH);
  // Generate the code to check any assumptions that we've made for SCEV
  // expressions.
  emitSCEVChecks(Lp, ScalarPH);

  // Generate the code that checks in runtime if arrays overlap. We put the
  // checks into a separate block to make the more common case of few elements
  // faster.
  emitMemRuntimeChecks(Lp, ScalarPH);

  // Generate the induction variable.
  // The loop step is equal to the vectorization factor (num of SIMD elements)
  // times the unroll factor (num of SIMD instructions).
  Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
  Constant *Step = ConstantInt::get(IdxTy, VF * UF);
  Induction =
      createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
                              getDebugLocFromInstOrOperands(OldInduction));

  // We are going to resume the execution of the scalar loop.
  // Go over all of the induction variables that we found and fix the
  // PHIs that are left in the scalar version of the loop.
  // The starting values of PHI nodes depend on the counter of the last
  // iteration in the vectorized loop.
  // If we come from a bypass edge then we need to start from the original
  // start value.

  // This variable saves the new starting index for the scalar loop. It is used
  // to test if there are any tail iterations left once the vector loop has
  // completed.
  LoopVectorizationLegality::InductionList::iterator I, E;
  LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
  for (I = List->begin(), E = List->end(); I != E; ++I) {
    PHINode *OrigPhi = I->first;
    InductionDescriptor II = I->second;

    // Create phi nodes to merge from the backedge-taken check block.
    PHINode *BCResumeVal = PHINode::Create(OrigPhi->getType(), 3,
                                           "bc.resume.val",
                                           ScalarPH->getTerminator());
    Value *EndValue;
    if (OrigPhi == OldInduction) {
      // We know what the end value is.
      EndValue = CountRoundDown;
    } else {
      // Compute the end value via the induction descriptor: cast the vector
      // trip count to the step type and apply the induction's transform.
      IRBuilder<> B(LoopBypassBlocks.back()->getTerminator());
      Value *CRD = B.CreateSExtOrTrunc(CountRoundDown,
                                       II.getStepValue()->getType(),
                                       "cast.crd");
      EndValue = II.transform(B, CRD);
      EndValue->setName("ind.end");
    }

    // The new PHI merges the original incoming value, in case of a bypass,
    // or the value at the end of the vectorized loop.
    BCResumeVal->addIncoming(EndValue, MiddleBlock);

    // Fix the scalar body counter (PHI node).
    unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH);

    // The old induction's phi node in the scalar body needs the truncated
    // value.
    for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
      BCResumeVal->addIncoming(II.getStartValue(), LoopBypassBlocks[I]);
    OrigPhi->setIncomingValue(BlockIdx, BCResumeVal);
  }

  // Add a check in the middle block to see if we have completed
  // all of the iterations in the first vector loop.
  // If (N - N%VF) == N, then we *don't* need to run the remainder.
  Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
                                CountRoundDown, "cmp.n",
                                MiddleBlock->getTerminator());
  ReplaceInstWithInst(MiddleBlock->getTerminator(),
                      BranchInst::Create(ExitBlock, ScalarPH, CmpN));

  // Get ready to start creating new instructions into the vectorized body.
  Builder.SetInsertPoint(&*VecBody->getFirstInsertionPt());

  // Save the state.
  LoopVectorPreHeader = Lp->getLoopPreheader();
  LoopScalarPreHeader = ScalarPH;
  LoopMiddleBlock = MiddleBlock;
  LoopExitBlock = ExitBlock;
  LoopVectorBody.push_back(VecBody);
  LoopScalarBody = OldBasicBlock;

  // Mark the new loop as already vectorized so later runs of the vectorizer
  // leave it alone.
  LoopVectorizeHints Hints(Lp, true);
  Hints.setAlreadyVectorized();
}
2973296417Sdim LoopVectorPreHeader = Lp->getLoopPreheader(); 2974243789Sdim LoopScalarPreHeader = ScalarPH; 2975243789Sdim LoopMiddleBlock = MiddleBlock; 2976243789Sdim LoopExitBlock = ExitBlock; 2977276479Sdim LoopVectorBody.push_back(VecBody); 2978243789Sdim LoopScalarBody = OldBasicBlock; 2979261991Sdim 2980261991Sdim LoopVectorizeHints Hints(Lp, true); 2981280031Sdim Hints.setAlreadyVectorized(); 2982243789Sdim} 2983243789Sdim 2984261991Sdimnamespace { 2985261991Sdimstruct CSEDenseMapInfo { 2986261991Sdim static bool canHandle(Instruction *I) { 2987261991Sdim return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 2988261991Sdim isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 2989261991Sdim } 2990261991Sdim static inline Instruction *getEmptyKey() { 2991261991Sdim return DenseMapInfo<Instruction *>::getEmptyKey(); 2992261991Sdim } 2993261991Sdim static inline Instruction *getTombstoneKey() { 2994261991Sdim return DenseMapInfo<Instruction *>::getTombstoneKey(); 2995261991Sdim } 2996261991Sdim static unsigned getHashValue(Instruction *I) { 2997261991Sdim assert(canHandle(I) && "Unknown instruction!"); 2998261991Sdim return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 2999261991Sdim I->value_op_end())); 3000261991Sdim } 3001261991Sdim static bool isEqual(Instruction *LHS, Instruction *RHS) { 3002261991Sdim if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3003261991Sdim LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3004261991Sdim return LHS == RHS; 3005261991Sdim return LHS->isIdenticalTo(RHS); 3006261991Sdim } 3007261991Sdim}; 3008261991Sdim} 3009261991Sdim 3010276479Sdim/// \brief Check whether this block is a predicated block. 3011276479Sdim/// Due to if predication of stores we might create a sequence of "if(pred) a[i] 3012276479Sdim/// = ...; " blocks. We start with one vectorized basic block. For every 3013276479Sdim/// conditional block we split this vectorized block. 
Therefore, every second 3014276479Sdim/// block will be a predicated one. 3015276479Sdimstatic bool isPredicatedBlock(unsigned BlockNum) { 3016276479Sdim return BlockNum % 2; 3017276479Sdim} 3018276479Sdim 3019261991Sdim///\brief Perform cse of induction variable instructions. 3020276479Sdimstatic void cse(SmallVector<BasicBlock *, 4> &BBs) { 3021261991Sdim // Perform simple cse. 3022261991Sdim SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3023276479Sdim for (unsigned i = 0, e = BBs.size(); i != e; ++i) { 3024276479Sdim BasicBlock *BB = BBs[i]; 3025276479Sdim for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { 3026296417Sdim Instruction *In = &*I++; 3027261991Sdim 3028276479Sdim if (!CSEDenseMapInfo::canHandle(In)) 3029276479Sdim continue; 3030261991Sdim 3031276479Sdim // Check if we can replace this instruction with any of the 3032276479Sdim // visited instructions. 3033276479Sdim if (Instruction *V = CSEMap.lookup(In)) { 3034276479Sdim In->replaceAllUsesWith(V); 3035276479Sdim In->eraseFromParent(); 3036276479Sdim continue; 3037276479Sdim } 3038276479Sdim // Ignore instructions in conditional blocks. We create "if (pred) a[i] = 3039276479Sdim // ...;" blocks for predicated stores. Every second block is a predicated 3040276479Sdim // block. 3041276479Sdim if (isPredicatedBlock(i)) 3042276479Sdim continue; 3043276479Sdim 3044276479Sdim CSEMap[In] = In; 3045261991Sdim } 3046276479Sdim } 3047276479Sdim} 3048261991Sdim 3049276479Sdim/// \brief Adds a 'fast' flag to floating point operations. 3050276479Sdimstatic Value *addFastMathFlag(Value *V) { 3051276479Sdim if (isa<FPMathOperator>(V)){ 3052276479Sdim FastMathFlags Flags; 3053276479Sdim Flags.setUnsafeAlgebra(); 3054276479Sdim cast<Instruction>(V)->setFastMathFlags(Flags); 3055261991Sdim } 3056276479Sdim return V; 3057261991Sdim} 3058261991Sdim 3059288943Sdim/// Estimate the overhead of scalarizing a value. 
Insert and Extract are set if 3060288943Sdim/// the result needs to be inserted and/or extracted from vectors. 3061288943Sdimstatic unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract, 3062288943Sdim const TargetTransformInfo &TTI) { 3063288943Sdim if (Ty->isVoidTy()) 3064288943Sdim return 0; 3065288943Sdim 3066288943Sdim assert(Ty->isVectorTy() && "Can only scalarize vectors"); 3067288943Sdim unsigned Cost = 0; 3068288943Sdim 3069288943Sdim for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) { 3070288943Sdim if (Insert) 3071288943Sdim Cost += TTI.getVectorInstrCost(Instruction::InsertElement, Ty, i); 3072288943Sdim if (Extract) 3073288943Sdim Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, Ty, i); 3074288943Sdim } 3075288943Sdim 3076288943Sdim return Cost; 3077288943Sdim} 3078288943Sdim 3079288943Sdim// Estimate cost of a call instruction CI if it were vectorized with factor VF. 3080288943Sdim// Return the cost of the instruction, including scalarization overhead if it's 3081288943Sdim// needed. The flag NeedToScalarize shows if the call needs to be scalarized - 3082288943Sdim// i.e. either vector version isn't available, or is too expensive. 3083288943Sdimstatic unsigned getVectorCallCost(CallInst *CI, unsigned VF, 3084288943Sdim const TargetTransformInfo &TTI, 3085288943Sdim const TargetLibraryInfo *TLI, 3086288943Sdim bool &NeedToScalarize) { 3087288943Sdim Function *F = CI->getCalledFunction(); 3088288943Sdim StringRef FnName = CI->getCalledFunction()->getName(); 3089288943Sdim Type *ScalarRetTy = CI->getType(); 3090288943Sdim SmallVector<Type *, 4> Tys, ScalarTys; 3091288943Sdim for (auto &ArgOp : CI->arg_operands()) 3092288943Sdim ScalarTys.push_back(ArgOp->getType()); 3093288943Sdim 3094288943Sdim // Estimate cost of scalarized vector call. 
The source operands are assumed 3095288943Sdim // to be vectors, so we need to extract individual elements from there, 3096288943Sdim // execute VF scalar calls, and then gather the result into the vector return 3097288943Sdim // value. 3098288943Sdim unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys); 3099288943Sdim if (VF == 1) 3100288943Sdim return ScalarCallCost; 3101288943Sdim 3102288943Sdim // Compute corresponding vector type for return value and arguments. 3103288943Sdim Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3104288943Sdim for (unsigned i = 0, ie = ScalarTys.size(); i != ie; ++i) 3105288943Sdim Tys.push_back(ToVectorTy(ScalarTys[i], VF)); 3106288943Sdim 3107288943Sdim // Compute costs of unpacking argument values for the scalar calls and 3108288943Sdim // packing the return values to a vector. 3109288943Sdim unsigned ScalarizationCost = 3110288943Sdim getScalarizationOverhead(RetTy, true, false, TTI); 3111288943Sdim for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) 3112288943Sdim ScalarizationCost += getScalarizationOverhead(Tys[i], false, true, TTI); 3113288943Sdim 3114288943Sdim unsigned Cost = ScalarCallCost * VF + ScalarizationCost; 3115288943Sdim 3116288943Sdim // If we can't emit a vector call for this function, then the currently found 3117288943Sdim // cost is the cost we need to return. 3118288943Sdim NeedToScalarize = true; 3119288943Sdim if (!TLI || !TLI->isFunctionVectorizable(FnName, VF) || CI->isNoBuiltin()) 3120288943Sdim return Cost; 3121288943Sdim 3122288943Sdim // If the corresponding vector cost is cheaper, return its cost. 
3123288943Sdim unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys); 3124288943Sdim if (VectorCallCost < Cost) { 3125288943Sdim NeedToScalarize = false; 3126288943Sdim return VectorCallCost; 3127288943Sdim } 3128288943Sdim return Cost; 3129288943Sdim} 3130288943Sdim 3131288943Sdim// Estimate cost of an intrinsic call instruction CI if it were vectorized with 3132288943Sdim// factor VF. Return the cost of the instruction, including scalarization 3133288943Sdim// overhead if it's needed. 3134288943Sdimstatic unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF, 3135288943Sdim const TargetTransformInfo &TTI, 3136288943Sdim const TargetLibraryInfo *TLI) { 3137288943Sdim Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI); 3138288943Sdim assert(ID && "Expected intrinsic call!"); 3139288943Sdim 3140288943Sdim Type *RetTy = ToVectorTy(CI->getType(), VF); 3141288943Sdim SmallVector<Type *, 4> Tys; 3142288943Sdim for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) 3143288943Sdim Tys.push_back(ToVectorTy(CI->getArgOperand(i)->getType(), VF)); 3144288943Sdim 3145288943Sdim return TTI.getIntrinsicInstrCost(ID, RetTy, Tys); 3146288943Sdim} 3147288943Sdim 3148296417Sdimstatic Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3149296417Sdim IntegerType *I1 = cast<IntegerType>(T1->getVectorElementType()); 3150296417Sdim IntegerType *I2 = cast<IntegerType>(T2->getVectorElementType()); 3151296417Sdim return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3152296417Sdim} 3153296417Sdimstatic Type *largestIntegerVectorType(Type *T1, Type *T2) { 3154296417Sdim IntegerType *I1 = cast<IntegerType>(T1->getVectorElementType()); 3155296417Sdim IntegerType *I2 = cast<IntegerType>(T2->getVectorElementType()); 3156296417Sdim return I1->getBitWidth() > I2->getBitWidth() ? 
/// For each widened value recorded in MinBWs, rewrite it to operate at its
/// proven-minimal bit width: shrink the operands, re-create the instruction at
/// the truncated vector type, and zero-extend the result back to its original
/// type. InstCombine runs later and will remove any redundant ext/trunc pairs.
void InnerLoopVectorizer::truncateToMinimalBitwidths() {
  // For every instruction `I` in MinBWs, truncate the operands, create a
  // truncated version of `I` and reextend its result. InstCombine runs
  // later and will remove any ext/trunc pairs.
  //
  // `Erased` remembers instructions already deleted: Parts entries can alias,
  // and we must not touch a Value we have erased.
  SmallPtrSet<Value *, 4> Erased;
  for (auto &KV : MinBWs) {
    VectorParts &Parts = WidenMap.get(KV.first);
    for (Value *&I : Parts) {
      if (Erased.count(I) || I->use_empty())
        continue;
      Type *OriginalTy = I->getType();
      Type *ScalarTruncatedTy = IntegerType::get(OriginalTy->getContext(),
                                                 KV.second);
      Type *TruncatedTy = VectorType::get(ScalarTruncatedTy,
                                          OriginalTy->getVectorNumElements());
      if (TruncatedTy == OriginalTy)
        continue;

      // Build replacements right in front of the instruction being shrunk.
      IRBuilder<> B(cast<Instruction>(I));
      // Shrink an operand to TruncatedTy; peephole: if it is a zext *from*
      // TruncatedTy, just reuse the un-extended source.
      auto ShrinkOperand = [&](Value *V) -> Value* {
        if (auto *ZI = dyn_cast<ZExtInst>(V))
          if (ZI->getSrcTy() == TruncatedTy)
            return ZI->getOperand(0);
        return B.CreateZExtOrTrunc(V, TruncatedTy);
      };

      // The actual instruction modification depends on the instruction type,
      // unfortunately.
      Value *NewI = nullptr;
      if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I)) {
        NewI = B.CreateBinOp(BO->getOpcode(),
                             ShrinkOperand(BO->getOperand(0)),
                             ShrinkOperand(BO->getOperand(1)));
        // Preserve nsw/nuw/exact/fast-math flags on the narrow replacement.
        cast<BinaryOperator>(NewI)->copyIRFlags(I);
      } else if (ICmpInst *CI = dyn_cast<ICmpInst>(I)) {
        NewI = B.CreateICmp(CI->getPredicate(),
                            ShrinkOperand(CI->getOperand(0)),
                            ShrinkOperand(CI->getOperand(1)));
      } else if (SelectInst *SI = dyn_cast<SelectInst>(I)) {
        NewI = B.CreateSelect(SI->getCondition(),
                              ShrinkOperand(SI->getTrueValue()),
                              ShrinkOperand(SI->getFalseValue()));
      } else if (CastInst *CI = dyn_cast<CastInst>(I)) {
        switch (CI->getOpcode()) {
        default: llvm_unreachable("Unhandled cast!");
        case Instruction::Trunc:
          NewI = ShrinkOperand(CI->getOperand(0));
          break;
        case Instruction::SExt:
          // Extend/truncate the source directly to the narrower of the
          // original and truncated types.
          NewI = B.CreateSExtOrTrunc(CI->getOperand(0),
                                     smallestIntegerVectorType(OriginalTy,
                                                               TruncatedTy));
          break;
        case Instruction::ZExt:
          NewI = B.CreateZExtOrTrunc(CI->getOperand(0),
                                     smallestIntegerVectorType(OriginalTy,
                                                               TruncatedTy));
          break;
        }
      } else if (ShuffleVectorInst *SI = dyn_cast<ShuffleVectorInst>(I)) {
        // The two shuffle inputs may have different element counts; shrink
        // each to a vector of the truncated scalar type of its own length.
        auto Elements0 = SI->getOperand(0)->getType()->getVectorNumElements();
        auto *O0 =
            B.CreateZExtOrTrunc(SI->getOperand(0),
                                VectorType::get(ScalarTruncatedTy, Elements0));
        auto Elements1 = SI->getOperand(1)->getType()->getVectorNumElements();
        auto *O1 =
            B.CreateZExtOrTrunc(SI->getOperand(1),
                                VectorType::get(ScalarTruncatedTy, Elements1));

        NewI = B.CreateShuffleVector(O0, O1, SI->getMask());
      } else if (isa<LoadInst>(I)) {
        // Don't do anything with the operands, just extend the result.
        continue;
      } else {
        llvm_unreachable("Unhandled instruction type!");
      }

      // Lastly, extend the result.
      NewI->takeName(cast<Instruction>(I));
      Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
      I->replaceAllUsesWith(Res);
      cast<Instruction>(I)->eraseFromParent();
      Erased.insert(I);
      // `I` is a reference into Parts: record the replacement in the map.
      I = Res;
    }
  }

  // We'll have created a bunch of ZExts that are now parentless. Clean up.
  for (auto &KV : MinBWs) {
    VectorParts &Parts = WidenMap.get(KV.first);
    for (Value *&I : Parts) {
      ZExtInst *Inst = dyn_cast<ZExtInst>(I);
      if (Inst && Inst->use_empty()) {
        Value *NewI = Inst->getOperand(0);
        Inst->eraseFromParent();
        I = NewI;
      }
    }
  }
}
First, we create a new vector PHI node with no incoming edges. 3274243789Sdim // We use this value when we vectorize all of the instructions that use the 3275243789Sdim // PHI. Next, after all of the instructions in the block are complete we 3276243789Sdim // add the new incoming edges to the PHI. At this point all of the 3277243789Sdim // instructions in the basic block are vectorized, so we can use them to 3278243789Sdim // construct the PHI. 3279249423Sdim PhiVector RdxPHIsToFix; 3280243789Sdim 3281249423Sdim // Scan the loop in a topological order to ensure that defs are vectorized 3282249423Sdim // before users. 3283249423Sdim LoopBlocksDFS DFS(OrigLoop); 3284249423Sdim DFS.perform(LI); 3285243789Sdim 3286249423Sdim // Vectorize all of the blocks in the original loop. 3287249423Sdim for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(), 3288249423Sdim be = DFS.endRPO(); bb != be; ++bb) 3289276479Sdim vectorizeBlockInLoop(*bb, &RdxPHIsToFix); 3290243789Sdim 3291296417Sdim // Insert truncates and extends for any truncated instructions as hints to 3292296417Sdim // InstCombine. 3293296417Sdim if (VF > 1) 3294296417Sdim truncateToMinimalBitwidths(); 3295296417Sdim 3296249423Sdim // At this point every instruction in the original loop is widened to 3297243789Sdim // a vector form. We are almost done. Now, we need to fix the PHI nodes 3298243789Sdim // that we vectorized. The PHI nodes are currently empty because we did 3299243789Sdim // not want to introduce cycles. Notice that the remaining PHI nodes 3300243789Sdim // that we need to fix are reduction variables. 3301243789Sdim 3302243789Sdim // Create the 'reduced' values for each of the induction vars. 3303243789Sdim // The reduced values are the vector values that we scalarize and combine 3304243789Sdim // after the loop is finished. 
3305249423Sdim for (PhiVector::iterator it = RdxPHIsToFix.begin(), e = RdxPHIsToFix.end(); 3306243789Sdim it != e; ++it) { 3307243789Sdim PHINode *RdxPhi = *it; 3308243789Sdim assert(RdxPhi && "Unable to recover vectorized PHI"); 3309243789Sdim 3310243789Sdim // Find the reduction variable descriptor. 3311296417Sdim assert(Legal->isReductionVariable(RdxPhi) && 3312243789Sdim "Unable to find the reduction variable"); 3313288943Sdim RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[RdxPhi]; 3314243789Sdim 3315288943Sdim RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind(); 3316288943Sdim TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue(); 3317288943Sdim Instruction *LoopExitInst = RdxDesc.getLoopExitInstr(); 3318288943Sdim RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind = 3319288943Sdim RdxDesc.getMinMaxRecurrenceKind(); 3320288943Sdim setDebugLocFromInst(Builder, ReductionStartValue); 3321261991Sdim 3322243789Sdim // We need to generate a reduction vector from the incoming scalar. 3323276479Sdim // To do so, we need to generate the 'identity' vector and override 3324243789Sdim // one of the elements with the incoming scalar reduction. We need 3325243789Sdim // to do it in the vector-loop preheader. 3326276479Sdim Builder.SetInsertPoint(LoopBypassBlocks[1]->getTerminator()); 3327243789Sdim 3328243789Sdim // This is the vector-clone of the value that leaves the loop. 3329288943Sdim VectorParts &VectorExit = getVectorValue(LoopExitInst); 3330249423Sdim Type *VecTy = VectorExit[0]->getType(); 3331243789Sdim 3332243789Sdim // Find the reduction identity variable. Zero for addition, or, xor, 3333243789Sdim // one for multiplication, -1 for And. 3334251662Sdim Value *Identity; 3335251662Sdim Value *VectorStart; 3336288943Sdim if (RK == RecurrenceDescriptor::RK_IntegerMinMax || 3337288943Sdim RK == RecurrenceDescriptor::RK_FloatMinMax) { 3338251662Sdim // MinMax reduction have the start value as their identify. 
3339261991Sdim if (VF == 1) { 3340288943Sdim VectorStart = Identity = ReductionStartValue; 3341261991Sdim } else { 3342288943Sdim VectorStart = Identity = 3343288943Sdim Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident"); 3344261991Sdim } 3345251662Sdim } else { 3346261991Sdim // Handle other reduction kinds: 3347288943Sdim Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity( 3348288943Sdim RK, VecTy->getScalarType()); 3349261991Sdim if (VF == 1) { 3350261991Sdim Identity = Iden; 3351261991Sdim // This vector is the Identity vector where the first element is the 3352261991Sdim // incoming scalar reduction. 3353288943Sdim VectorStart = ReductionStartValue; 3354261991Sdim } else { 3355261991Sdim Identity = ConstantVector::getSplat(VF, Iden); 3356243789Sdim 3357261991Sdim // This vector is the Identity vector where the first element is the 3358261991Sdim // incoming scalar reduction. 3359288943Sdim VectorStart = 3360288943Sdim Builder.CreateInsertElement(Identity, ReductionStartValue, Zero); 3361261991Sdim } 3362251662Sdim } 3363243789Sdim 3364243789Sdim // Fix the vector-loop phi. 3365243789Sdim 3366243789Sdim // Reductions do not have to start at zero. They can start with 3367243789Sdim // any loop invariant values. 3368249423Sdim VectorParts &VecRdxPhi = WidenMap.get(RdxPhi); 3369249423Sdim BasicBlock *Latch = OrigLoop->getLoopLatch(); 3370249423Sdim Value *LoopVal = RdxPhi->getIncomingValueForBlock(Latch); 3371249423Sdim VectorParts &Val = getVectorValue(LoopVal); 3372249423Sdim for (unsigned part = 0; part < UF; ++part) { 3373261991Sdim // Make sure to add the reduction stat value only to the 3374249423Sdim // first unroll part. 3375249423Sdim Value *StartVal = (part == 0) ? 
VectorStart : Identity; 3376280031Sdim cast<PHINode>(VecRdxPhi[part])->addIncoming(StartVal, 3377280031Sdim LoopVectorPreHeader); 3378276479Sdim cast<PHINode>(VecRdxPhi[part])->addIncoming(Val[part], 3379276479Sdim LoopVectorBody.back()); 3380249423Sdim } 3381243789Sdim 3382243789Sdim // Before each round, move the insertion point right between 3383243789Sdim // the PHIs and the values we are going to write. 3384243789Sdim // This allows us to write both PHINodes and the extractelement 3385243789Sdim // instructions. 3386296417Sdim Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 3387243789Sdim 3388296417Sdim VectorParts RdxParts = getVectorValue(LoopExitInst); 3389288943Sdim setDebugLocFromInst(Builder, LoopExitInst); 3390296417Sdim 3391296417Sdim // If the vector reduction can be performed in a smaller type, we truncate 3392296417Sdim // then extend the loop exit value to enable InstCombine to evaluate the 3393296417Sdim // entire expression in the smaller type. 3394296417Sdim if (VF > 1 && RdxPhi->getType() != RdxDesc.getRecurrenceType()) { 3395296417Sdim Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); 3396296417Sdim Builder.SetInsertPoint(LoopVectorBody.back()->getTerminator()); 3397296417Sdim for (unsigned part = 0; part < UF; ++part) { 3398296417Sdim Value *Trunc = Builder.CreateTrunc(RdxParts[part], RdxVecTy); 3399296417Sdim Value *Extnd = RdxDesc.isSigned() ? 
Builder.CreateSExt(Trunc, VecTy) 3400296417Sdim : Builder.CreateZExt(Trunc, VecTy); 3401296417Sdim for (Value::user_iterator UI = RdxParts[part]->user_begin(); 3402296417Sdim UI != RdxParts[part]->user_end();) 3403296417Sdim if (*UI != Trunc) { 3404296417Sdim (*UI++)->replaceUsesOfWith(RdxParts[part], Extnd); 3405296417Sdim RdxParts[part] = Extnd; 3406296417Sdim } else { 3407296417Sdim ++UI; 3408296417Sdim } 3409296417Sdim } 3410296417Sdim Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 3411296417Sdim for (unsigned part = 0; part < UF; ++part) 3412296417Sdim RdxParts[part] = Builder.CreateTrunc(RdxParts[part], RdxVecTy); 3413249423Sdim } 3414243789Sdim 3415249423Sdim // Reduce all of the unrolled parts into a single vector. 3416249423Sdim Value *ReducedPartRdx = RdxParts[0]; 3417288943Sdim unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK); 3418261991Sdim setDebugLocFromInst(Builder, ReducedPartRdx); 3419249423Sdim for (unsigned part = 1; part < UF; ++part) { 3420251662Sdim if (Op != Instruction::ICmp && Op != Instruction::FCmp) 3421276479Sdim // Floating point operations had to be 'fast' to enable the reduction. 3422276479Sdim ReducedPartRdx = addFastMathFlag( 3423276479Sdim Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxParts[part], 3424276479Sdim ReducedPartRdx, "bin.rdx")); 3425251662Sdim else 3426288943Sdim ReducedPartRdx = RecurrenceDescriptor::createMinMaxOp( 3427288943Sdim Builder, MinMaxKind, ReducedPartRdx, RdxParts[part]); 3428243789Sdim } 3429243789Sdim 3430261991Sdim if (VF > 1) { 3431261991Sdim // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles 3432261991Sdim // and vector ops, reducing the set of values being computed by half each 3433261991Sdim // round. 
3434261991Sdim assert(isPowerOf2_32(VF) && 3435261991Sdim "Reduction emission only supported for pow2 vectors!"); 3436261991Sdim Value *TmpVec = ReducedPartRdx; 3437276479Sdim SmallVector<Constant*, 32> ShuffleMask(VF, nullptr); 3438261991Sdim for (unsigned i = VF; i != 1; i >>= 1) { 3439261991Sdim // Move the upper half of the vector to the lower half. 3440261991Sdim for (unsigned j = 0; j != i/2; ++j) 3441261991Sdim ShuffleMask[j] = Builder.getInt32(i/2 + j); 3442249423Sdim 3443261991Sdim // Fill the rest of the mask with undef. 3444261991Sdim std::fill(&ShuffleMask[i/2], ShuffleMask.end(), 3445261991Sdim UndefValue::get(Builder.getInt32Ty())); 3446249423Sdim 3447261991Sdim Value *Shuf = 3448249423Sdim Builder.CreateShuffleVector(TmpVec, 3449249423Sdim UndefValue::get(TmpVec->getType()), 3450249423Sdim ConstantVector::get(ShuffleMask), 3451249423Sdim "rdx.shuf"); 3452249423Sdim 3453261991Sdim if (Op != Instruction::ICmp && Op != Instruction::FCmp) 3454276479Sdim // Floating point operations had to be 'fast' to enable the reduction. 3455276479Sdim TmpVec = addFastMathFlag(Builder.CreateBinOp( 3456276479Sdim (Instruction::BinaryOps)Op, TmpVec, Shuf, "bin.rdx")); 3457261991Sdim else 3458288943Sdim TmpVec = RecurrenceDescriptor::createMinMaxOp(Builder, MinMaxKind, 3459288943Sdim TmpVec, Shuf); 3460261991Sdim } 3461261991Sdim 3462261991Sdim // The result is in the first element of the vector. 3463261991Sdim ReducedPartRdx = Builder.CreateExtractElement(TmpVec, 3464261991Sdim Builder.getInt32(0)); 3465296417Sdim 3466296417Sdim // If the reduction can be performed in a smaller type, we need to extend 3467296417Sdim // the reduction to the wider type before we branch to the original loop. 3468296417Sdim if (RdxPhi->getType() != RdxDesc.getRecurrenceType()) 3469296417Sdim ReducedPartRdx = 3470296417Sdim RdxDesc.isSigned() 3471296417Sdim ? 
Builder.CreateSExt(ReducedPartRdx, RdxPhi->getType()) 3472296417Sdim : Builder.CreateZExt(ReducedPartRdx, RdxPhi->getType()); 3473249423Sdim } 3474249423Sdim 3475276479Sdim // Create a phi node that merges control-flow from the backedge-taken check 3476276479Sdim // block and the middle block. 3477276479Sdim PHINode *BCBlockPhi = PHINode::Create(RdxPhi->getType(), 2, "bc.merge.rdx", 3478276479Sdim LoopScalarPreHeader->getTerminator()); 3479296417Sdim for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) 3480296417Sdim BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]); 3481276479Sdim BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); 3482276479Sdim 3483243789Sdim // Now, we need to fix the users of the reduction variable 3484243789Sdim // inside and outside of the scalar remainder loop. 3485243789Sdim // We know that the loop is in LCSSA form. We need to update the 3486243789Sdim // PHI nodes in the exit blocks. 3487243789Sdim for (BasicBlock::iterator LEI = LoopExitBlock->begin(), 3488243789Sdim LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) { 3489243789Sdim PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI); 3490261991Sdim if (!LCSSAPhi) break; 3491243789Sdim 3492243789Sdim // All PHINodes need to have a single entry edge, or two if 3493243789Sdim // we already fixed them. 3494243789Sdim assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI"); 3495243789Sdim 3496243789Sdim // We found our reduction value exit-PHI. Update it with the 3497243789Sdim // incoming bypass edge. 3498288943Sdim if (LCSSAPhi->getIncomingValue(0) == LoopExitInst) { 3499243789Sdim // Add an edge coming from the bypass. 3500261991Sdim LCSSAPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); 3501243789Sdim break; 3502243789Sdim } 3503243789Sdim }// end of the LCSSA phi scan. 3504243789Sdim 3505243789Sdim // Fix the scalar loop reduction variable with the incoming reduction sum 3506243789Sdim // from the vector body and from the backedge value. 
3507249423Sdim int IncomingEdgeBlockIdx = 3508249423Sdim (RdxPhi)->getBasicBlockIndex(OrigLoop->getLoopLatch()); 3509249423Sdim assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); 3510249423Sdim // Pick the other block. 3511249423Sdim int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); 3512276479Sdim (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); 3513288943Sdim (RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); 3514243789Sdim }// end of for each redux variable. 3515249423Sdim 3516261991Sdim fixLCSSAPHIs(); 3517261991Sdim 3518296417Sdim // Make sure DomTree is updated. 3519296417Sdim updateAnalysis(); 3520296417Sdim 3521296417Sdim // Predicate any stores. 3522296417Sdim for (auto KV : PredicatedStores) { 3523296417Sdim BasicBlock::iterator I(KV.first); 3524296417Sdim auto *BB = SplitBlock(I->getParent(), &*std::next(I), DT, LI); 3525296417Sdim auto *T = SplitBlockAndInsertIfThen(KV.second, &*I, /*Unreachable=*/false, 3526296417Sdim /*BranchWeights=*/nullptr, DT); 3527296417Sdim I->moveBefore(T); 3528296417Sdim I->getParent()->setName("pred.store.if"); 3529296417Sdim BB->setName("pred.store.continue"); 3530296417Sdim } 3531296417Sdim DEBUG(DT->verifyDomTree()); 3532261991Sdim // Remove redundant induction instructions. 
3533261991Sdim cse(LoopVectorBody); 3534261991Sdim} 3535261991Sdim 3536261991Sdimvoid InnerLoopVectorizer::fixLCSSAPHIs() { 3537249423Sdim for (BasicBlock::iterator LEI = LoopExitBlock->begin(), 3538249423Sdim LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) { 3539249423Sdim PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI); 3540261991Sdim if (!LCSSAPhi) break; 3541249423Sdim if (LCSSAPhi->getNumIncomingValues() == 1) 3542249423Sdim LCSSAPhi->addIncoming(UndefValue::get(LCSSAPhi->getType()), 3543249423Sdim LoopMiddleBlock); 3544249423Sdim } 3545280031Sdim} 3546243789Sdim 3547249423SdimInnerLoopVectorizer::VectorParts 3548249423SdimInnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) { 3549249423Sdim assert(std::find(pred_begin(Dst), pred_end(Dst), Src) != pred_end(Dst) && 3550249423Sdim "Invalid edge"); 3551249423Sdim 3552261991Sdim // Look for cached value. 3553261991Sdim std::pair<BasicBlock*, BasicBlock*> Edge(Src, Dst); 3554261991Sdim EdgeMaskCache::iterator ECEntryIt = MaskCache.find(Edge); 3555261991Sdim if (ECEntryIt != MaskCache.end()) 3556261991Sdim return ECEntryIt->second; 3557261991Sdim 3558249423Sdim VectorParts SrcMask = createBlockInMask(Src); 3559249423Sdim 3560249423Sdim // The terminator has to be a branch inst! 
3561249423Sdim BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 3562249423Sdim assert(BI && "Unexpected terminator found"); 3563249423Sdim 3564249423Sdim if (BI->isConditional()) { 3565249423Sdim VectorParts EdgeMask = getVectorValue(BI->getCondition()); 3566249423Sdim 3567249423Sdim if (BI->getSuccessor(0) != Dst) 3568249423Sdim for (unsigned part = 0; part < UF; ++part) 3569249423Sdim EdgeMask[part] = Builder.CreateNot(EdgeMask[part]); 3570249423Sdim 3571249423Sdim for (unsigned part = 0; part < UF; ++part) 3572249423Sdim EdgeMask[part] = Builder.CreateAnd(EdgeMask[part], SrcMask[part]); 3573261991Sdim 3574261991Sdim MaskCache[Edge] = EdgeMask; 3575249423Sdim return EdgeMask; 3576249423Sdim } 3577249423Sdim 3578261991Sdim MaskCache[Edge] = SrcMask; 3579249423Sdim return SrcMask; 3580249423Sdim} 3581249423Sdim 3582249423SdimInnerLoopVectorizer::VectorParts 3583249423SdimInnerLoopVectorizer::createBlockInMask(BasicBlock *BB) { 3584249423Sdim assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 3585249423Sdim 3586249423Sdim // Loop incoming mask is all-one. 3587249423Sdim if (OrigLoop->getHeader() == BB) { 3588249423Sdim Value *C = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 1); 3589249423Sdim return getVectorValue(C); 3590249423Sdim } 3591249423Sdim 3592249423Sdim // This is the block mask. We OR all incoming edges, and with zero. 
3593249423Sdim Value *Zero = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 0); 3594249423Sdim VectorParts BlockMask = getVectorValue(Zero); 3595249423Sdim 3596249423Sdim // For each pred: 3597249423Sdim for (pred_iterator it = pred_begin(BB), e = pred_end(BB); it != e; ++it) { 3598249423Sdim VectorParts EM = createEdgeMask(*it, BB); 3599249423Sdim for (unsigned part = 0; part < UF; ++part) 3600249423Sdim BlockMask[part] = Builder.CreateOr(BlockMask[part], EM[part]); 3601249423Sdim } 3602249423Sdim 3603249423Sdim return BlockMask; 3604249423Sdim} 3605249423Sdim 3606296417Sdimvoid InnerLoopVectorizer::widenPHIInstruction( 3607296417Sdim Instruction *PN, InnerLoopVectorizer::VectorParts &Entry, unsigned UF, 3608296417Sdim unsigned VF, PhiVector *PV) { 3609261991Sdim PHINode* P = cast<PHINode>(PN); 3610261991Sdim // Handle reduction variables: 3611296417Sdim if (Legal->isReductionVariable(P)) { 3612261991Sdim for (unsigned part = 0; part < UF; ++part) { 3613261991Sdim // This is phase one of vectorizing PHIs. 3614261991Sdim Type *VecTy = (VF == 1) ? PN->getType() : 3615261991Sdim VectorType::get(PN->getType(), VF); 3616296417Sdim Entry[part] = PHINode::Create( 3617296417Sdim VecTy, 2, "vec.phi", &*LoopVectorBody.back()->getFirstInsertionPt()); 3618261991Sdim } 3619261991Sdim PV->push_back(P); 3620261991Sdim return; 3621261991Sdim } 3622249423Sdim 3623261991Sdim setDebugLocFromInst(Builder, P); 3624261991Sdim // Check for PHI nodes that are lowered to vector selects. 3625261991Sdim if (P->getParent() != OrigLoop->getHeader()) { 3626276479Sdim // We know that all PHIs in non-header blocks are converted into 3627261991Sdim // selects, so we don't have to worry about the insertion order and we 3628261991Sdim // can just use the builder. 3629261991Sdim // At this point we generate the predication tree. There may be 3630261991Sdim // duplications since this is a simple recursive scan, but future 3631261991Sdim // optimizations will clean it up. 
3632249423Sdim 3633261991Sdim unsigned NumIncoming = P->getNumIncomingValues(); 3634251662Sdim 3635261991Sdim // Generate a sequence of selects of the form: 3636261991Sdim // SELECT(Mask3, In3, 3637261991Sdim // SELECT(Mask2, In2, 3638261991Sdim // ( ...))) 3639261991Sdim for (unsigned In = 0; In < NumIncoming; In++) { 3640261991Sdim VectorParts Cond = createEdgeMask(P->getIncomingBlock(In), 3641261991Sdim P->getParent()); 3642261991Sdim VectorParts &In0 = getVectorValue(P->getIncomingValue(In)); 3643251662Sdim 3644261991Sdim for (unsigned part = 0; part < UF; ++part) { 3645261991Sdim // We might have single edge PHIs (blocks) - use an identity 3646261991Sdim // 'select' for the first PHI operand. 3647261991Sdim if (In == 0) 3648261991Sdim Entry[part] = Builder.CreateSelect(Cond[part], In0[part], 3649261991Sdim In0[part]); 3650261991Sdim else 3651261991Sdim // Select between the current value and the previous incoming edge 3652261991Sdim // based on the incoming mask. 3653261991Sdim Entry[part] = Builder.CreateSelect(Cond[part], In0[part], 3654261991Sdim Entry[part], "predphi"); 3655249423Sdim } 3656261991Sdim } 3657261991Sdim return; 3658261991Sdim } 3659249423Sdim 3660261991Sdim // This PHINode must be an induction variable. 3661261991Sdim // Make sure that we know about it. 3662261991Sdim assert(Legal->getInductionVars()->count(P) && 3663261991Sdim "Not an induction variable"); 3664249423Sdim 3665296417Sdim InductionDescriptor II = Legal->getInductionVars()->lookup(P); 3666249423Sdim 3667288943Sdim // FIXME: The newly created binary instructions should contain nsw/nuw flags, 3668288943Sdim // which can be found from the original scalar operations. 
3669296417Sdim switch (II.getKind()) { 3670296417Sdim case InductionDescriptor::IK_NoInduction: 3671261991Sdim llvm_unreachable("Unknown induction"); 3672296417Sdim case InductionDescriptor::IK_IntInduction: { 3673296417Sdim assert(P->getType() == II.getStartValue()->getType() && 3674296417Sdim "Types must match"); 3675296417Sdim // Handle other induction variables that are now based on the 3676296417Sdim // canonical one. 3677296417Sdim Value *V = Induction; 3678296417Sdim if (P != OldInduction) { 3679296417Sdim V = Builder.CreateSExtOrTrunc(Induction, P->getType()); 3680296417Sdim V = II.transform(Builder, V); 3681296417Sdim V->setName("offset.idx"); 3682261991Sdim } 3683296417Sdim Value *Broadcasted = getBroadcastInstrs(V); 3684261991Sdim // After broadcasting the induction variable we need to make the vector 3685261991Sdim // consecutive by adding 0, 1, 2, etc. 3686261991Sdim for (unsigned part = 0; part < UF; ++part) 3687296417Sdim Entry[part] = getStepVector(Broadcasted, VF * part, II.getStepValue()); 3688261991Sdim return; 3689261991Sdim } 3690296417Sdim case InductionDescriptor::IK_PtrInduction: 3691261991Sdim // Handle the pointer induction variable case. 3692261991Sdim assert(P->getType()->isPointerTy() && "Unexpected type."); 3693288943Sdim // This is the normalized GEP that starts counting at zero. 3694296417Sdim Value *PtrInd = Induction; 3695296417Sdim PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStepValue()->getType()); 3696261991Sdim // This is the vector of results. Notice that we don't generate 3697261991Sdim // vector geps because scalar geps result in better code. 
3698261991Sdim for (unsigned part = 0; part < UF; ++part) { 3699261991Sdim if (VF == 1) { 3700288943Sdim int EltIndex = part; 3701296417Sdim Constant *Idx = ConstantInt::get(PtrInd->getType(), EltIndex); 3702296417Sdim Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); 3703288943Sdim Value *SclrGep = II.transform(Builder, GlobalIdx); 3704288943Sdim SclrGep->setName("next.gep"); 3705261991Sdim Entry[part] = SclrGep; 3706249423Sdim continue; 3707249423Sdim } 3708249423Sdim 3709261991Sdim Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF)); 3710261991Sdim for (unsigned int i = 0; i < VF; ++i) { 3711288943Sdim int EltIndex = i + part * VF; 3712296417Sdim Constant *Idx = ConstantInt::get(PtrInd->getType(), EltIndex); 3713296417Sdim Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); 3714288943Sdim Value *SclrGep = II.transform(Builder, GlobalIdx); 3715288943Sdim SclrGep->setName("next.gep"); 3716261991Sdim VecVal = Builder.CreateInsertElement(VecVal, SclrGep, 3717261991Sdim Builder.getInt32(i), 3718261991Sdim "insert.gep"); 3719249423Sdim } 3720261991Sdim Entry[part] = VecVal; 3721249423Sdim } 3722261991Sdim return; 3723261991Sdim } 3724261991Sdim} 3725249423Sdim 3726276479Sdimvoid InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { 3727261991Sdim // For each instruction in the old loop. 3728261991Sdim for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { 3729296417Sdim VectorParts &Entry = WidenMap.get(&*it); 3730296417Sdim 3731261991Sdim switch (it->getOpcode()) { 3732261991Sdim case Instruction::Br: 3733261991Sdim // Nothing to do for PHIs and BR, since we already took care of the 3734261991Sdim // loop control flow instructions. 3735261991Sdim continue; 3736288943Sdim case Instruction::PHI: { 3737261991Sdim // Vectorize PHINodes. 3738296417Sdim widenPHIInstruction(&*it, Entry, UF, VF, PV); 3739261991Sdim continue; 3740249423Sdim }// End of PHI. 
3741249423Sdim 3742249423Sdim case Instruction::Add: 3743249423Sdim case Instruction::FAdd: 3744249423Sdim case Instruction::Sub: 3745249423Sdim case Instruction::FSub: 3746249423Sdim case Instruction::Mul: 3747249423Sdim case Instruction::FMul: 3748249423Sdim case Instruction::UDiv: 3749249423Sdim case Instruction::SDiv: 3750249423Sdim case Instruction::FDiv: 3751249423Sdim case Instruction::URem: 3752249423Sdim case Instruction::SRem: 3753249423Sdim case Instruction::FRem: 3754249423Sdim case Instruction::Shl: 3755249423Sdim case Instruction::LShr: 3756249423Sdim case Instruction::AShr: 3757249423Sdim case Instruction::And: 3758249423Sdim case Instruction::Or: 3759249423Sdim case Instruction::Xor: { 3760249423Sdim // Just widen binops. 3761249423Sdim BinaryOperator *BinOp = dyn_cast<BinaryOperator>(it); 3762261991Sdim setDebugLocFromInst(Builder, BinOp); 3763249423Sdim VectorParts &A = getVectorValue(it->getOperand(0)); 3764249423Sdim VectorParts &B = getVectorValue(it->getOperand(1)); 3765249423Sdim 3766249423Sdim // Use this vector value for all users of the original instruction. 3767249423Sdim for (unsigned Part = 0; Part < UF; ++Part) { 3768249423Sdim Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A[Part], B[Part]); 3769249423Sdim 3770280031Sdim if (BinaryOperator *VecOp = dyn_cast<BinaryOperator>(V)) 3771280031Sdim VecOp->copyIRFlags(BinOp); 3772249423Sdim 3773249423Sdim Entry[Part] = V; 3774249423Sdim } 3775276479Sdim 3776296417Sdim propagateMetadata(Entry, &*it); 3777249423Sdim break; 3778249423Sdim } 3779249423Sdim case Instruction::Select: { 3780249423Sdim // Widen selects. 3781249423Sdim // If the selector is loop invariant we can create a select 3782249423Sdim // instruction with a scalar condition. Otherwise, use vector-select. 
3783296417Sdim auto *SE = PSE.getSE(); 3784296417Sdim bool InvariantCond = 3785296417Sdim SE->isLoopInvariant(PSE.getSCEV(it->getOperand(0)), OrigLoop); 3786296417Sdim setDebugLocFromInst(Builder, &*it); 3787249423Sdim 3788249423Sdim // The condition can be loop invariant but still defined inside the 3789249423Sdim // loop. This means that we can't just use the original 'cond' value. 3790249423Sdim // We have to take the 'vectorized' value and pick the first lane. 3791249423Sdim // Instcombine will make this a no-op. 3792249423Sdim VectorParts &Cond = getVectorValue(it->getOperand(0)); 3793249423Sdim VectorParts &Op0 = getVectorValue(it->getOperand(1)); 3794249423Sdim VectorParts &Op1 = getVectorValue(it->getOperand(2)); 3795296417Sdim 3796261991Sdim Value *ScalarCond = (VF == 1) ? Cond[0] : 3797261991Sdim Builder.CreateExtractElement(Cond[0], Builder.getInt32(0)); 3798261991Sdim 3799249423Sdim for (unsigned Part = 0; Part < UF; ++Part) { 3800249423Sdim Entry[Part] = Builder.CreateSelect( 3801249423Sdim InvariantCond ? ScalarCond : Cond[Part], 3802249423Sdim Op0[Part], 3803249423Sdim Op1[Part]); 3804249423Sdim } 3805276479Sdim 3806296417Sdim propagateMetadata(Entry, &*it); 3807249423Sdim break; 3808249423Sdim } 3809249423Sdim 3810249423Sdim case Instruction::ICmp: 3811249423Sdim case Instruction::FCmp: { 3812249423Sdim // Widen compares. Generate vector compares. 
3813249423Sdim bool FCmp = (it->getOpcode() == Instruction::FCmp); 3814249423Sdim CmpInst *Cmp = dyn_cast<CmpInst>(it); 3815296417Sdim setDebugLocFromInst(Builder, &*it); 3816249423Sdim VectorParts &A = getVectorValue(it->getOperand(0)); 3817249423Sdim VectorParts &B = getVectorValue(it->getOperand(1)); 3818249423Sdim for (unsigned Part = 0; Part < UF; ++Part) { 3819276479Sdim Value *C = nullptr; 3820296417Sdim if (FCmp) { 3821249423Sdim C = Builder.CreateFCmp(Cmp->getPredicate(), A[Part], B[Part]); 3822296417Sdim cast<FCmpInst>(C)->copyFastMathFlags(&*it); 3823296417Sdim } else { 3824249423Sdim C = Builder.CreateICmp(Cmp->getPredicate(), A[Part], B[Part]); 3825296417Sdim } 3826249423Sdim Entry[Part] = C; 3827249423Sdim } 3828276479Sdim 3829296417Sdim propagateMetadata(Entry, &*it); 3830249423Sdim break; 3831249423Sdim } 3832249423Sdim 3833249423Sdim case Instruction::Store: 3834249423Sdim case Instruction::Load: 3835296417Sdim vectorizeMemoryInstruction(&*it); 3836249423Sdim break; 3837249423Sdim case Instruction::ZExt: 3838249423Sdim case Instruction::SExt: 3839249423Sdim case Instruction::FPToUI: 3840249423Sdim case Instruction::FPToSI: 3841249423Sdim case Instruction::FPExt: 3842249423Sdim case Instruction::PtrToInt: 3843249423Sdim case Instruction::IntToPtr: 3844249423Sdim case Instruction::SIToFP: 3845249423Sdim case Instruction::UIToFP: 3846249423Sdim case Instruction::Trunc: 3847249423Sdim case Instruction::FPTrunc: 3848249423Sdim case Instruction::BitCast: { 3849249423Sdim CastInst *CI = dyn_cast<CastInst>(it); 3850296417Sdim setDebugLocFromInst(Builder, &*it); 3851249423Sdim /// Optimize the special case where the source is the induction 3852249423Sdim /// variable. Notice that we can only optimize the 'trunc' case 3853249423Sdim /// because: a. FP conversions lose precision, b. sext/zext may wrap, 3854249423Sdim /// c. other casts depend on pointer size. 
3855249423Sdim if (CI->getOperand(0) == OldInduction && 3856249423Sdim it->getOpcode() == Instruction::Trunc) { 3857249423Sdim Value *ScalarCast = Builder.CreateCast(CI->getOpcode(), Induction, 3858249423Sdim CI->getType()); 3859249423Sdim Value *Broadcasted = getBroadcastInstrs(ScalarCast); 3860296417Sdim InductionDescriptor II = 3861288943Sdim Legal->getInductionVars()->lookup(OldInduction); 3862296417Sdim Constant *Step = ConstantInt::getSigned( 3863296417Sdim CI->getType(), II.getStepValue()->getSExtValue()); 3864249423Sdim for (unsigned Part = 0; Part < UF; ++Part) 3865288943Sdim Entry[Part] = getStepVector(Broadcasted, VF * Part, Step); 3866296417Sdim propagateMetadata(Entry, &*it); 3867249423Sdim break; 3868249423Sdim } 3869249423Sdim /// Vectorize casts. 3870261991Sdim Type *DestTy = (VF == 1) ? CI->getType() : 3871261991Sdim VectorType::get(CI->getType(), VF); 3872249423Sdim 3873249423Sdim VectorParts &A = getVectorValue(it->getOperand(0)); 3874249423Sdim for (unsigned Part = 0; Part < UF; ++Part) 3875249423Sdim Entry[Part] = Builder.CreateCast(CI->getOpcode(), A[Part], DestTy); 3876296417Sdim propagateMetadata(Entry, &*it); 3877249423Sdim break; 3878249423Sdim } 3879249423Sdim 3880249423Sdim case Instruction::Call: { 3881249423Sdim // Ignore dbg intrinsics. 
3882249423Sdim if (isa<DbgInfoIntrinsic>(it)) 3883249423Sdim break; 3884296417Sdim setDebugLocFromInst(Builder, &*it); 3885249423Sdim 3886249423Sdim Module *M = BB->getParent()->getParent(); 3887249423Sdim CallInst *CI = cast<CallInst>(it); 3888288943Sdim 3889288943Sdim StringRef FnName = CI->getCalledFunction()->getName(); 3890288943Sdim Function *F = CI->getCalledFunction(); 3891288943Sdim Type *RetTy = ToVectorTy(CI->getType(), VF); 3892288943Sdim SmallVector<Type *, 4> Tys; 3893288943Sdim for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) 3894288943Sdim Tys.push_back(ToVectorTy(CI->getArgOperand(i)->getType(), VF)); 3895288943Sdim 3896249423Sdim Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI); 3897288943Sdim if (ID && 3898288943Sdim (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 3899288943Sdim ID == Intrinsic::lifetime_start)) { 3900296417Sdim scalarizeInstruction(&*it); 3901261991Sdim break; 3902288943Sdim } 3903288943Sdim // The flag shows whether we use Intrinsic or a usual Call for vectorized 3904288943Sdim // version of the instruction. 3905288943Sdim // Is it beneficial to perform intrinsic call compared to lib call? 3906288943Sdim bool NeedToScalarize; 3907288943Sdim unsigned CallCost = getVectorCallCost(CI, VF, *TTI, TLI, NeedToScalarize); 3908288943Sdim bool UseVectorIntrinsic = 3909288943Sdim ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost; 3910288943Sdim if (!UseVectorIntrinsic && NeedToScalarize) { 3911296417Sdim scalarizeInstruction(&*it); 3912288943Sdim break; 3913288943Sdim } 3914288943Sdim 3915288943Sdim for (unsigned Part = 0; Part < UF; ++Part) { 3916288943Sdim SmallVector<Value *, 4> Args; 3917288943Sdim for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) { 3918288943Sdim Value *Arg = CI->getArgOperand(i); 3919288943Sdim // Some intrinsics have a scalar argument - don't replace it with a 3920288943Sdim // vector. 
        // Some intrinsics demand a scalar operand (e.g. the power argument of
        // powi); leave those as-is and widen every other argument.
        if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i)) {
          VectorParts &VectorArg = getVectorValue(CI->getArgOperand(i));
          Arg = VectorArg[Part];
        }
        Args.push_back(Arg);
      }

      // Pick the callee for the widened call: the vector form of the
      // intrinsic, or the vector library routine registered in TLI.
      Function *VectorF;
      if (UseVectorIntrinsic) {
        // Use vector version of the intrinsic.
        Type *TysForDecl[] = {CI->getType()};
        if (VF > 1)
          TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
        VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
      } else {
        // Use vector version of the library call.
        StringRef VFnName = TLI->getVectorizedFunction(FnName, VF);
        assert(!VFnName.empty() && "Vector function name is empty.");
        VectorF = M->getFunction(VFnName);
        if (!VectorF) {
          // Generate a declaration
          FunctionType *FTy = FunctionType::get(RetTy, Tys, false);
          VectorF =
              Function::Create(FTy, Function::ExternalLinkage, VFnName, M);
          VectorF->copyAttributesFrom(F);
        }
      }
      assert(VectorF && "Can't create vector function.");
      Entry[Part] = Builder.CreateCall(VectorF, Args);
    }

    propagateMetadata(Entry, &*it);
    break;
  }

  default:
    // All other instructions are unsupported. Scalarize them.
    scalarizeInstruction(&*it);
    break;
  }// end of switch.
  }// end of for_each instr.
}

/// Bring the analyses up to date after the vector-loop skeleton has been
/// created: drop SCEV's cached facts about the original loop and register
/// every newly created block with its immediate dominator in DT.
void InnerLoopVectorizer::updateAnalysis() {
  // Forget the original basic block.
  PSE.getSE()->forgetLoop(OrigLoop);

  // Update the dominator tree information.
  assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) &&
         "Entry does not dominate exit.");

  // Each bypass block branches (possibly) to the next one, so each is
  // dominated by its predecessor; the vector preheader hangs off the last.
  for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
    DT->addNewBlock(LoopBypassBlocks[I], LoopBypassBlocks[I-1]);
  DT->addNewBlock(LoopVectorPreHeader, LoopBypassBlocks.back());

  // We don't predicate stores by this point, so the vector body should be a
  // single loop.
  assert(LoopVectorBody.size() == 1 && "Expected single block loop!");
  DT->addNewBlock(LoopVectorBody[0], LoopVectorPreHeader);

  DT->addNewBlock(LoopMiddleBlock, LoopVectorBody.back());
  DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]);
  DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
  DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]);

  DEBUG(DT->verifyDomTree());
}

/// \brief Check whether it is safe to if-convert this phi node.
///
/// Phi nodes with constant expressions that can trap are not safe to if
/// convert.
3993261991Sdimstatic bool canIfConvertPHINodes(BasicBlock *BB) { 3994261991Sdim for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { 3995261991Sdim PHINode *Phi = dyn_cast<PHINode>(I); 3996261991Sdim if (!Phi) 3997261991Sdim return true; 3998261991Sdim for (unsigned p = 0, e = Phi->getNumIncomingValues(); p != e; ++p) 3999261991Sdim if (Constant *C = dyn_cast<Constant>(Phi->getIncomingValue(p))) 4000261991Sdim if (C->canTrap()) 4001261991Sdim return false; 4002261991Sdim } 4003261991Sdim return true; 4004261991Sdim} 4005261991Sdim 4006249423Sdimbool LoopVectorizationLegality::canVectorizeWithIfConvert() { 4007276479Sdim if (!EnableIfConversion) { 4008288943Sdim emitAnalysis(VectorizationReport() << "if-conversion is disabled"); 4009249423Sdim return false; 4010276479Sdim } 4011249423Sdim 4012249423Sdim assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable"); 4013249423Sdim 4014261991Sdim // A list of pointers that we can safely read and write to. 4015261991Sdim SmallPtrSet<Value *, 8> SafePointes; 4016261991Sdim 4017261991Sdim // Collect safe addresses. 4018261991Sdim for (Loop::block_iterator BI = TheLoop->block_begin(), 4019261991Sdim BE = TheLoop->block_end(); BI != BE; ++BI) { 4020261991Sdim BasicBlock *BB = *BI; 4021261991Sdim 4022261991Sdim if (blockNeedsPredication(BB)) 4023261991Sdim continue; 4024261991Sdim 4025261991Sdim for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { 4026261991Sdim if (LoadInst *LI = dyn_cast<LoadInst>(I)) 4027261991Sdim SafePointes.insert(LI->getPointerOperand()); 4028261991Sdim else if (StoreInst *SI = dyn_cast<StoreInst>(I)) 4029261991Sdim SafePointes.insert(SI->getPointerOperand()); 4030261991Sdim } 4031261991Sdim } 4032261991Sdim 4033249423Sdim // Collect the blocks that need predication. 
4034261991Sdim BasicBlock *Header = TheLoop->getHeader(); 4035261991Sdim for (Loop::block_iterator BI = TheLoop->block_begin(), 4036261991Sdim BE = TheLoop->block_end(); BI != BE; ++BI) { 4037261991Sdim BasicBlock *BB = *BI; 4038249423Sdim 4039249423Sdim // We don't support switch statements inside loops. 4040276479Sdim if (!isa<BranchInst>(BB->getTerminator())) { 4041288943Sdim emitAnalysis(VectorizationReport(BB->getTerminator()) 4042276479Sdim << "loop contains a switch statement"); 4043249423Sdim return false; 4044276479Sdim } 4045249423Sdim 4046249423Sdim // We must be able to predicate all blocks that need to be predicated. 4047261991Sdim if (blockNeedsPredication(BB)) { 4048276479Sdim if (!blockCanBePredicated(BB, SafePointes)) { 4049288943Sdim emitAnalysis(VectorizationReport(BB->getTerminator()) 4050276479Sdim << "control flow cannot be substituted for a select"); 4051261991Sdim return false; 4052276479Sdim } 4053276479Sdim } else if (BB != Header && !canIfConvertPHINodes(BB)) { 4054288943Sdim emitAnalysis(VectorizationReport(BB->getTerminator()) 4055276479Sdim << "control flow cannot be substituted for a select"); 4056249423Sdim return false; 4057276479Sdim } 4058243789Sdim } 4059243789Sdim 4060249423Sdim // We can if-convert this loop. 4061249423Sdim return true; 4062249423Sdim} 4063249423Sdim 4064249423Sdimbool LoopVectorizationLegality::canVectorize() { 4065250997Sdim // We must have a loop in canonical form. Loops with indirectbr in them cannot 4066250997Sdim // be canonicalized. 4067276479Sdim if (!TheLoop->getLoopPreheader()) { 4068276479Sdim emitAnalysis( 4069288943Sdim VectorizationReport() << 4070288943Sdim "loop control flow is not understood by vectorizer"); 4071250997Sdim return false; 4072276479Sdim } 4073249423Sdim 4074249423Sdim // We can only vectorize innermost loops. 
4075288943Sdim if (!TheLoop->empty()) { 4076288943Sdim emitAnalysis(VectorizationReport() << "loop is not the innermost loop"); 4077249423Sdim return false; 4078276479Sdim } 4079249423Sdim 4080249423Sdim // We must have a single backedge. 4081276479Sdim if (TheLoop->getNumBackEdges() != 1) { 4082276479Sdim emitAnalysis( 4083288943Sdim VectorizationReport() << 4084288943Sdim "loop control flow is not understood by vectorizer"); 4085249423Sdim return false; 4086276479Sdim } 4087249423Sdim 4088249423Sdim // We must have a single exiting block. 4089276479Sdim if (!TheLoop->getExitingBlock()) { 4090276479Sdim emitAnalysis( 4091288943Sdim VectorizationReport() << 4092288943Sdim "loop control flow is not understood by vectorizer"); 4093249423Sdim return false; 4094276479Sdim } 4095249423Sdim 4096275633Sdim // We only handle bottom-tested loops, i.e. loop in which the condition is 4097275633Sdim // checked at the end of each iteration. With that we can assume that all 4098275633Sdim // instructions in the loop are executed the same number of times. 4099275633Sdim if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) { 4100276479Sdim emitAnalysis( 4101288943Sdim VectorizationReport() << 4102288943Sdim "loop control flow is not understood by vectorizer"); 4103275633Sdim return false; 4104275633Sdim } 4105275633Sdim 4106261991Sdim // We need to have a loop header. 4107261991Sdim DEBUG(dbgs() << "LV: Found a loop: " << 4108261991Sdim TheLoop->getHeader()->getName() << '\n'); 4109249423Sdim 4110276479Sdim // Check if we can if-convert non-single-bb loops. 4111261991Sdim unsigned NumBlocks = TheLoop->getNumBlocks(); 4112249423Sdim if (NumBlocks != 1 && !canVectorizeWithIfConvert()) { 4113249423Sdim DEBUG(dbgs() << "LV: Can't if-convert the loop.\n"); 4114243789Sdim return false; 4115243789Sdim } 4116243789Sdim 4117243789Sdim // ScalarEvolution needs to be able to find the exit count. 
4118296417Sdim const SCEV *ExitCount = PSE.getSE()->getBackedgeTakenCount(TheLoop); 4119296417Sdim if (ExitCount == PSE.getSE()->getCouldNotCompute()) { 4120296417Sdim emitAnalysis(VectorizationReport() 4121296417Sdim << "could not determine number of loop iterations"); 4122243789Sdim DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n"); 4123243789Sdim return false; 4124243789Sdim } 4125243789Sdim 4126249423Sdim // Check if we can vectorize the instructions and CFG in this loop. 4127249423Sdim if (!canVectorizeInstrs()) { 4128249423Sdim DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n"); 4129249423Sdim return false; 4130249423Sdim } 4131249423Sdim 4132243789Sdim // Go over each instruction and look at memory deps. 4133249423Sdim if (!canVectorizeMemory()) { 4134249423Sdim DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n"); 4135243789Sdim return false; 4136243789Sdim } 4137243789Sdim 4138249423Sdim // Collect all of the variables that remain uniform after vectorization. 4139249423Sdim collectLoopUniforms(); 4140249423Sdim 4141288943Sdim DEBUG(dbgs() << "LV: We can vectorize this loop" 4142288943Sdim << (LAI->getRuntimePointerChecking()->Need 4143288943Sdim ? " (with a runtime bound check)" 4144288943Sdim : "") 4145288943Sdim << "!\n"); 4146243789Sdim 4147296417Sdim bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); 4148296417Sdim 4149296417Sdim // If an override option has been passed in for interleaved accesses, use it. 4150296417Sdim if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) 4151296417Sdim UseInterleaved = EnableInterleavedMemAccesses; 4152296417Sdim 4153288943Sdim // Analyze interleaved memory accesses. 
4154296417Sdim if (UseInterleaved) 4155288943Sdim InterleaveInfo.analyzeInterleaving(Strides); 4156288943Sdim 4157296417Sdim unsigned SCEVThreshold = VectorizeSCEVCheckThreshold; 4158296417Sdim if (Hints->getForce() == LoopVectorizeHints::FK_Enabled) 4159296417Sdim SCEVThreshold = PragmaVectorizeSCEVCheckThreshold; 4160296417Sdim 4161296417Sdim if (PSE.getUnionPredicate().getComplexity() > SCEVThreshold) { 4162296417Sdim emitAnalysis(VectorizationReport() 4163296417Sdim << "Too many SCEV assumptions need to be made and checked " 4164296417Sdim << "at runtime"); 4165296417Sdim DEBUG(dbgs() << "LV: Too many SCEV checks needed.\n"); 4166296417Sdim return false; 4167296417Sdim } 4168296417Sdim 4169243789Sdim // Okay! We can vectorize. At this point we don't have any other mem analysis 4170243789Sdim // which may limit our maximum vectorization factor, so just return true with 4171243789Sdim // no restrictions. 4172243789Sdim return true; 4173243789Sdim} 4174243789Sdim 4175276479Sdimstatic Type *convertPointerToIntegerType(const DataLayout &DL, Type *Ty) { 4176261991Sdim if (Ty->isPointerTy()) 4177261991Sdim return DL.getIntPtrType(Ty); 4178261991Sdim 4179261991Sdim // It is possible that char's or short's overflow when we ask for the loop's 4180261991Sdim // trip count, work around this by changing the type size. 
4181261991Sdim if (Ty->getScalarSizeInBits() < 32) 4182261991Sdim return Type::getInt32Ty(Ty->getContext()); 4183261991Sdim 4184261991Sdim return Ty; 4185261991Sdim} 4186261991Sdim 4187276479Sdimstatic Type* getWiderType(const DataLayout &DL, Type *Ty0, Type *Ty1) { 4188261991Sdim Ty0 = convertPointerToIntegerType(DL, Ty0); 4189261991Sdim Ty1 = convertPointerToIntegerType(DL, Ty1); 4190261991Sdim if (Ty0->getScalarSizeInBits() > Ty1->getScalarSizeInBits()) 4191261991Sdim return Ty0; 4192261991Sdim return Ty1; 4193261991Sdim} 4194261991Sdim 4195251662Sdim/// \brief Check that the instruction has outside loop users and is not an 4196251662Sdim/// identified reduction variable. 4197251662Sdimstatic bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst, 4198280031Sdim SmallPtrSetImpl<Value *> &Reductions) { 4199251662Sdim // Reduction instructions are allowed to have exit users. All other 4200251662Sdim // instructions must not have external users. 4201251662Sdim if (!Reductions.count(Inst)) 4202251662Sdim //Check that all of the users of the loop are inside the BB. 4203276479Sdim for (User *U : Inst->users()) { 4204276479Sdim Instruction *UI = cast<Instruction>(U); 4205251662Sdim // This user may be a reduction exit value. 4206276479Sdim if (!TheLoop->contains(UI)) { 4207276479Sdim DEBUG(dbgs() << "LV: Found an outside user for : " << *UI << '\n'); 4208251662Sdim return true; 4209251662Sdim } 4210251662Sdim } 4211251662Sdim return false; 4212251662Sdim} 4213251662Sdim 4214249423Sdimbool LoopVectorizationLegality::canVectorizeInstrs() { 4215249423Sdim BasicBlock *Header = TheLoop->getHeader(); 4216243789Sdim 4217251662Sdim // Look for the attribute signaling the absence of NaNs. 
4218251662Sdim Function &F = *Header->getParent(); 4219288943Sdim const DataLayout &DL = F.getParent()->getDataLayout(); 4220251662Sdim if (F.hasFnAttribute("no-nans-fp-math")) 4221288943Sdim HasFunNoNaNAttr = 4222288943Sdim F.getFnAttribute("no-nans-fp-math").getValueAsString() == "true"; 4223251662Sdim 4224249423Sdim // For each block in the loop. 4225249423Sdim for (Loop::block_iterator bb = TheLoop->block_begin(), 4226249423Sdim be = TheLoop->block_end(); bb != be; ++bb) { 4227249423Sdim 4228249423Sdim // Scan the instructions in the block and look for hazards. 4229249423Sdim for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e; 4230249423Sdim ++it) { 4231249423Sdim 4232249423Sdim if (PHINode *Phi = dyn_cast<PHINode>(it)) { 4233261991Sdim Type *PhiTy = Phi->getType(); 4234249423Sdim // Check that this PHI type is allowed. 4235261991Sdim if (!PhiTy->isIntegerTy() && 4236261991Sdim !PhiTy->isFloatingPointTy() && 4237261991Sdim !PhiTy->isPointerTy()) { 4238296417Sdim emitAnalysis(VectorizationReport(&*it) 4239276479Sdim << "loop control flow is not understood by vectorizer"); 4240249423Sdim DEBUG(dbgs() << "LV: Found an non-int non-pointer PHI.\n"); 4241249423Sdim return false; 4242249423Sdim } 4243249423Sdim 4244249423Sdim // If this PHINode is not in the header block, then we know that we 4245249423Sdim // can convert it to select during if-conversion. No need to check if 4246249423Sdim // the PHIs in this block are induction or reduction variables. 4247251662Sdim if (*bb != Header) { 4248251662Sdim // Check that this instruction has no outside users or is an 4249251662Sdim // identified reduction value with an outside user. 
          // Non-header phis become selects under if-conversion, so the only
          // hazard is an out-of-loop user that is not a known reduction exit.
          if (!hasOutsideLoopUser(TheLoop, &*it, AllowedExit))
            continue;
          emitAnalysis(VectorizationReport(&*it) <<
                       "value could not be identified as "
                       "an induction or reduction variable");
          return false;
        }

        // We only allow if-converted PHIs with exactly two incoming values.
        if (Phi->getNumIncomingValues() != 2) {
          emitAnalysis(VectorizationReport(&*it)
                       << "control flow not understood by vectorizer");
          DEBUG(dbgs() << "LV: Found an invalid PHI.\n");
          return false;
        }

        // Try to classify the header phi as an induction variable.
        InductionDescriptor ID;
        if (InductionDescriptor::isInductionPHI(Phi, PSE.getSE(), ID)) {
          Inductions[Phi] = ID;
          // Get the widest type.
          if (!WidestIndTy)
            WidestIndTy = convertPointerToIntegerType(DL, PhiTy);
          else
            WidestIndTy = getWiderType(DL, PhiTy, WidestIndTy);

          // Int inductions are special because we only allow one IV.
          if (ID.getKind() == InductionDescriptor::IK_IntInduction &&
              ID.getStepValue()->isOne() &&
              isa<Constant>(ID.getStartValue()) &&
              cast<Constant>(ID.getStartValue())->isNullValue()) {
            // Use the phi node with the widest type as induction. Use the last
            // one if there are multiple (no good reason for doing this other
            // than it is expedient). We've checked that it begins at zero and
            // steps by one, so this is a canonical induction variable.
            if (!Induction || PhiTy == WidestIndTy)
              Induction = Phi;
          }

          DEBUG(dbgs() << "LV: Found an induction variable.\n");

          // Until we explicitly handle the case of an induction variable with
          // an outside loop user we have to give up vectorizing this loop.
          if (hasOutsideLoopUser(TheLoop, &*it, AllowedExit)) {
            emitAnalysis(VectorizationReport(&*it) <<
                         "use of induction value outside of the "
                         "loop is not handled by vectorizer");
            return false;
          }

          continue;
        }

        // Not an induction: try to classify it as a reduction instead.
        RecurrenceDescriptor RedDes;
        if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop, RedDes)) {
          if (RedDes.hasUnsafeAlgebra())
            Requirements->addUnsafeAlgebraInst(RedDes.getUnsafeAlgebraInst());
          // The reduction's exit instruction may legitimately be used outside
          // the loop.
          AllowedExit.insert(RedDes.getLoopExitInstr());
          Reductions[Phi] = RedDes;
          continue;
        }

        // Neither induction nor reduction: give up on this loop.
        emitAnalysis(VectorizationReport(&*it) <<
                     "value that could not be identified as "
                     "reduction is used outside the loop");
        DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n");
        return false;
      }// end of PHI handling

      // We handle calls that:
      //   * Are debug info intrinsics.
      //   * Have a mapping to an IR intrinsic.
      //   * Have a vector version available.
      CallInst *CI = dyn_cast<CallInst>(it);
      if (CI && !getIntrinsicIDForCall(CI, TLI) && !isa<DbgInfoIntrinsic>(CI) &&
          !(CI->getCalledFunction() && TLI &&
            TLI->isFunctionVectorizable(CI->getCalledFunction()->getName()))) {
        emitAnalysis(VectorizationReport(&*it)
                     << "call instruction cannot be vectorized");
        DEBUG(dbgs() << "LV: Found a non-intrinsic, non-libfunc callsite.\n");
        return false;
      }

      // Intrinsics such as powi,cttz and ctlz are legal to vectorize if the
      // second argument is the same (i.e. loop invariant)
      if (CI &&
          hasVectorInstrinsicScalarOpd(getIntrinsicIDForCall(CI, TLI), 1)) {
        auto *SE = PSE.getSE();
        if (!SE->isLoopInvariant(PSE.getSCEV(CI->getOperand(1)), TheLoop)) {
          emitAnalysis(VectorizationReport(&*it)
                       << "intrinsic instruction cannot be vectorized");
          DEBUG(dbgs() << "LV: Found unvectorizable intrinsic " << *CI << "\n");
          return false;
        }
      }

      // Check that the instruction return type is vectorizable.
      // Also, we can't vectorize extractelement instructions.
      if ((!VectorType::isValidElementType(it->getType()) &&
           !it->getType()->isVoidTy()) || isa<ExtractElementInst>(it)) {
        emitAnalysis(VectorizationReport(&*it)
                     << "instruction return type cannot be vectorized");
        DEBUG(dbgs() << "LV: Found unvectorizable type.\n");
        return false;
      }

      // Check that the stored type is vectorizable.
      if (StoreInst *ST = dyn_cast<StoreInst>(it)) {
        Type *T = ST->getValueOperand()->getType();
        if (!VectorType::isValidElementType(T)) {
          emitAnalysis(VectorizationReport(ST) <<
                       "store instruction cannot be vectorized");
          return false;
        }
        if (EnableMemAccessVersioning)
          collectStridedAccess(ST);
      }

      if (EnableMemAccessVersioning)
        if (LoadInst *LI = dyn_cast<LoadInst>(it))
          collectStridedAccess(LI);

      // Reduction instructions are allowed to have exit users.
      // All other instructions must not have external users.
      if (hasOutsideLoopUser(TheLoop, &*it, AllowedExit)) {
        emitAnalysis(VectorizationReport(&*it) <<
                     "value cannot be used outside the loop");
        return false;
      }

    } // next instr.

  }

  if (!Induction) {
    DEBUG(dbgs() << "LV: Did not find one integer induction var.\n");
    if (Inductions.empty()) {
      emitAnalysis(VectorizationReport()
                   << "loop induction variable could not be identified");
      return false;
    }
  }

  // Now we know the widest induction type, check if our found induction
  // is the same size. If it's not, unset it here and InnerLoopVectorizer
  // will create another.
  if (Induction && WidestIndTy != Induction->getType())
    Induction = nullptr;

  return true;
}

/// Record a load/store whose stride is only known symbolically so that
/// runtime versioning can later specialize the loop for a unit stride.
void LoopVectorizationLegality::collectStridedAccess(Value *MemAccess) {
  Value *Ptr = nullptr;
  if (LoadInst *LI = dyn_cast<LoadInst>(MemAccess))
    Ptr = LI->getPointerOperand();
  else if (StoreInst *SI = dyn_cast<StoreInst>(MemAccess))
    Ptr = SI->getPointerOperand();
  else
    return;

  // Only accesses whose pointer SCEV exposes a symbolic stride are of
  // interest here.
  Value *Stride = getStrideFromPointer(Ptr, PSE.getSE(), TheLoop);
  if (!Stride)
    return;

  DEBUG(dbgs() << "LV: Found a strided access that we can version");
  DEBUG(dbgs() << " Ptr: " << *Ptr << " Stride: " << *Stride << "\n");
  Strides[Ptr] = Stride;
  StrideSet.insert(Stride);
}

void LoopVectorizationLegality::collectLoopUniforms() {
  // We now know that the loop is vectorizable!
  // Collect variables that will remain uniform after vectorization.
  std::vector<Value*> Worklist;
  BasicBlock *Latch = TheLoop->getLoopLatch();

  // Start with the conditional branch and walk up the block.
  Worklist.push_back(Latch->getTerminator()->getOperand(0));

  // Also add all consecutive pointer values; these values will be uniform
  // after vectorization (and subsequent cleanup) and, until revectorization is
  // supported, all dependencies must also be uniform.
4432276479Sdim for (Loop::block_iterator B = TheLoop->block_begin(), 4433276479Sdim BE = TheLoop->block_end(); B != BE; ++B) 4434276479Sdim for (BasicBlock::iterator I = (*B)->begin(), IE = (*B)->end(); 4435276479Sdim I != IE; ++I) 4436296417Sdim if (I->getType()->isPointerTy() && isConsecutivePtr(&*I)) 4437276479Sdim Worklist.insert(Worklist.end(), I->op_begin(), I->op_end()); 4438276479Sdim 4439288943Sdim while (!Worklist.empty()) { 4440243789Sdim Instruction *I = dyn_cast<Instruction>(Worklist.back()); 4441243789Sdim Worklist.pop_back(); 4442243789Sdim 4443249423Sdim // Look at instructions inside this loop. 4444243789Sdim // Stop when reaching PHI nodes. 4445249423Sdim // TODO: we need to follow values all over the loop, not only in this block. 4446249423Sdim if (!I || !TheLoop->contains(I) || isa<PHINode>(I)) 4447249423Sdim continue; 4448243789Sdim 4449243789Sdim // This is a known uniform. 4450243789Sdim Uniforms.insert(I); 4451243789Sdim 4452243789Sdim // Insert all operands. 4453261991Sdim Worklist.insert(Worklist.end(), I->op_begin(), I->op_end()); 4454243789Sdim } 4455249423Sdim} 4456243789Sdim 4457288943Sdimbool LoopVectorizationLegality::canVectorizeMemory() { 4458288943Sdim LAI = &LAA->getInfo(TheLoop, Strides); 4459288943Sdim auto &OptionalReport = LAI->getReport(); 4460288943Sdim if (OptionalReport) 4461288943Sdim emitAnalysis(VectorizationReport(*OptionalReport)); 4462288943Sdim if (!LAI->canVectorizeMemory()) 4463261991Sdim return false; 4464261991Sdim 4465288943Sdim if (LAI->hasStoreToLoopInvariantAddress()) { 4466288943Sdim emitAnalysis( 4467288943Sdim VectorizationReport() 4468288943Sdim << "write to a loop invariant address could not be vectorized"); 4469288943Sdim DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n"); 4470261991Sdim return false; 4471261991Sdim } 4472261991Sdim 4473296417Sdim Requirements->addRuntimePointerChecks(LAI->getNumRuntimePointerChecks()); 4474296417Sdim PSE.addPredicate(LAI->PSE.getUnionPredicate()); 
4475296417Sdim 4476261991Sdim return true; 4477261991Sdim} 4478261991Sdim 4479249423Sdimbool LoopVectorizationLegality::isInductionVariable(const Value *V) { 4480249423Sdim Value *In0 = const_cast<Value*>(V); 4481249423Sdim PHINode *PN = dyn_cast_or_null<PHINode>(In0); 4482249423Sdim if (!PN) 4483243789Sdim return false; 4484249423Sdim 4485249423Sdim return Inductions.count(PN); 4486249423Sdim} 4487249423Sdim 4488249423Sdimbool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) { 4489288943Sdim return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT); 4490249423Sdim} 4491249423Sdim 4492261991Sdimbool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB, 4493280031Sdim SmallPtrSetImpl<Value *> &SafePtrs) { 4494280031Sdim 4495249423Sdim for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { 4496280031Sdim // Check that we don't have a constant expression that can trap as operand. 4497280031Sdim for (Instruction::op_iterator OI = it->op_begin(), OE = it->op_end(); 4498280031Sdim OI != OE; ++OI) { 4499280031Sdim if (Constant *C = dyn_cast<Constant>(*OI)) 4500280031Sdim if (C->canTrap()) 4501280031Sdim return false; 4502280031Sdim } 4503261991Sdim // We might be able to hoist the load. 4504261991Sdim if (it->mayReadFromMemory()) { 4505261991Sdim LoadInst *LI = dyn_cast<LoadInst>(it); 4506280031Sdim if (!LI) 4507261991Sdim return false; 4508280031Sdim if (!SafePtrs.count(LI->getPointerOperand())) { 4509280031Sdim if (isLegalMaskedLoad(LI->getType(), LI->getPointerOperand())) { 4510280031Sdim MaskedOp.insert(LI); 4511280031Sdim continue; 4512280031Sdim } 4513280031Sdim return false; 4514280031Sdim } 4515261991Sdim } 4516261991Sdim 4517261991Sdim // We don't predicate stores at the moment. 4518276479Sdim if (it->mayWriteToMemory()) { 4519276479Sdim StoreInst *SI = dyn_cast<StoreInst>(it); 4520276479Sdim // We only support predication of stores in basic blocks with one 4521276479Sdim // predecessor. 
      if (!SI)
        return false;

      // A store is trivially predicable when its address is known-safe and
      // the block has a single predecessor (so the mask is a simple edge
      // condition).
      bool isSafePtr = (SafePtrs.count(SI->getPointerOperand()) != 0);
      bool isSinglePredecessor = SI->getParent()->getSinglePredecessor();

      if (++NumPredStores > NumberOfStoresToPredicate || !isSafePtr ||
          !isSinglePredecessor) {
        // Build a masked store if it is legal for the target, otherwise
        // scalarize the block.
        bool isLegalMaskedOp =
            isLegalMaskedStore(SI->getValueOperand()->getType(),
                               SI->getPointerOperand());
        if (isLegalMaskedOp) {
          --NumPredStores;
          MaskedOp.insert(SI);
          continue;
        }
        return false;
      }
    }
    if (it->mayThrow())
      return false;

    // The instructions below can trap.
    switch (it->getOpcode()) {
    default: continue;
    case Instruction::UDiv:
    case Instruction::SDiv:
    case Instruction::URem:
    case Instruction::SRem:
      return false;
    }
  }

  return true;
}

/// Collect all loads/stores in the loop whose pointer advances by a constant
/// stride with |stride| in [2, MaxInterleaveGroupFactor], recording stride,
/// address SCEV, element size and alignment for interleave-group formation.
void InterleavedAccessInfo::collectConstStridedAccesses(
    MapVector<Instruction *, StrideDescriptor> &StrideAccesses,
    const ValueToValueMap &Strides) {
  // Holds load/store instructions in program order.
  SmallVector<Instruction *, 16> AccessList;

  for (auto *BB : TheLoop->getBlocks()) {
    bool IsPred = LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);

    for (auto &I : *BB) {
      if (!isa<LoadInst>(&I) && !isa<StoreInst>(&I))
        continue;
      // FIXME: Currently we can't handle mixed accesses and predicated accesses
      if (IsPred)
        return;

      AccessList.push_back(&I);
    }
  }

  if (AccessList.empty())
    return;

  auto &DL = TheLoop->getHeader()->getModule()->getDataLayout();
  for (auto I : AccessList) {
    LoadInst *LI = dyn_cast<LoadInst>(I);
    StoreInst *SI = dyn_cast<StoreInst>(I);

    Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand();
    int Stride = isStridedPtr(PSE, Ptr, TheLoop, Strides);

    // The factor of the corresponding interleave group.
    unsigned Factor = std::abs(Stride);

    // Ignore the access if the factor is too small or too large.
    if (Factor < 2 || Factor > MaxInterleaveGroupFactor)
      continue;

    const SCEV *Scev = replaceSymbolicStrideSCEV(PSE, Strides, Ptr);
    PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
    unsigned Size = DL.getTypeAllocSize(PtrTy->getElementType());

    // An alignment of 0 means target ABI alignment.
    unsigned Align = LI ? LI->getAlignment() : SI->getAlignment();
    if (!Align)
      Align = DL.getABITypeAlignment(PtrTy->getElementType());

    StrideAccesses[I] = StrideDescriptor(Stride, Scev, Size, Align);
  }
}

// Analyze interleaved accesses and collect them into interleave groups.
//
// Notice that the vectorization on interleaved groups will change instruction
// orders and may break dependences. But the memory dependence check guarantees
// that there is no overlap between two pointers of different strides, element
// sizes or underlying bases.
//
// For pointers sharing the same stride, element size and underlying base, no
// need to worry about Read-After-Write dependences and Write-After-Read
// dependences.
//
// E.g. The RAW dependence:  A[i] = a;
//                           b = A[i];
// This won't exist as it is a store-load forwarding conflict, which has
// already been checked and forbidden in the dependence check.
//
// E.g. The WAR dependence:  a = A[i];  // (1)
//                           A[i] = b;  // (2)
// The store group of (2) is always inserted at or below (2), and the load group
// of (1) is always inserted at or above (1). The dependence is safe.
void InterleavedAccessInfo::analyzeInterleaving(
    const ValueToValueMap &Strides) {
  DEBUG(dbgs() << "LV: Analyzing interleaved accesses...\n");

  // Holds all the stride accesses.
  MapVector<Instruction *, StrideDescriptor> StrideAccesses;
  collectConstStridedAccesses(StrideAccesses, Strides);

  if (StrideAccesses.empty())
    return;

  // Holds all interleaved store groups temporarily.
  SmallSetVector<InterleaveGroup *, 4> StoreGroups;
  // Holds all interleaved load groups temporarily.
  SmallSetVector<InterleaveGroup *, 4> LoadGroups;

  // Search the load-load/write-write pair B-A in bottom-up order and try to
  // insert B into the interleave group of A according to 3 rules:
  //   1. A and B have the same stride.
  //   2. A and B have the same memory object size.
  //   3. B belongs to the group according to the distance.
  //
  // The bottom-up order can avoid breaking the Write-After-Write dependences
  // between two pointers of the same base.
  // E.g.  A[i]   = a;   (1)
  //       A[i]   = b;   (2)
  //       A[i+1] = c    (3)
  // We form the group (2)+(3) in front, so (1) has to form groups with accesses
  // above (1), which guarantees that (1) is always above (2).
  for (auto I = StrideAccesses.rbegin(), E = StrideAccesses.rend(); I != E;
       ++I) {
    Instruction *A = I->first;
    StrideDescriptor DesA = I->second;

    // Each access starts in (or joins) exactly one group; make A a group
    // leader if it is not a member of one already.
    InterleaveGroup *Group = getInterleaveGroup(A);
    if (!Group) {
      DEBUG(dbgs() << "LV: Creating an interleave group with:" << *A << '\n');
      Group = createInterleaveGroup(A, DesA.Stride, DesA.Align);
    }

    if (A->mayWriteToMemory())
      StoreGroups.insert(Group);
    else
      LoadGroups.insert(Group);

    for (auto II = std::next(I); II != E; ++II) {
      Instruction *B = II->first;
      StrideDescriptor DesB = II->second;

      // Ignore if B is already in a group or B is a different memory operation.
      if (isInterleaved(B) || A->mayReadFromMemory() != B->mayReadFromMemory())
        continue;

      // Check the rule 1 and 2.
      if (DesB.Stride != DesA.Stride || DesB.Size != DesA.Size)
        continue;

      // Calculate the distance and prepare for the rule 3.
      const SCEVConstant *DistToA = dyn_cast<SCEVConstant>(
          PSE.getSE()->getMinusSCEV(DesB.Scev, DesA.Scev));
      if (!DistToA)
        continue;

      int DistanceToA = DistToA->getAPInt().getSExtValue();

      // Skip if the distance is not multiple of size as they are not in the
      // same group.
      if (DistanceToA % static_cast<int>(DesA.Size))
        continue;

      // The index of B is the index of A plus the related index to A.
4702288943Sdim int IndexB = 4703288943Sdim Group->getIndex(A) + DistanceToA / static_cast<int>(DesA.Size); 4704288943Sdim 4705288943Sdim // Try to insert B into the group. 4706288943Sdim if (Group->insertMember(B, IndexB, DesB.Align)) { 4707288943Sdim DEBUG(dbgs() << "LV: Inserted:" << *B << '\n' 4708288943Sdim << " into the interleave group with" << *A << '\n'); 4709288943Sdim InterleaveGroupMap[B] = Group; 4710288943Sdim 4711288943Sdim // Set the first load in program order as the insert position. 4712288943Sdim if (B->mayReadFromMemory()) 4713288943Sdim Group->setInsertPos(B); 4714288943Sdim } 4715288943Sdim } // Iteration on instruction B 4716288943Sdim } // Iteration on instruction A 4717288943Sdim 4718288943Sdim // Remove interleaved store groups with gaps. 4719288943Sdim for (InterleaveGroup *Group : StoreGroups) 4720288943Sdim if (Group->getNumMembers() != Group->getFactor()) 4721288943Sdim releaseGroup(Group); 4722296417Sdim 4723296417Sdim // Remove interleaved load groups that don't have the first and last member. 4724296417Sdim // This guarantees that we won't do speculative out of bounds loads. 4725296417Sdim for (InterleaveGroup *Group : LoadGroups) 4726296417Sdim if (!Group->getMember(0) || !Group->getMember(Group->getFactor() - 1)) 4727296417Sdim releaseGroup(Group); 4728288943Sdim} 4729288943Sdim 4730249423SdimLoopVectorizationCostModel::VectorizationFactor 4731280031SdimLoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) { 4732249423Sdim // Width 1 means no vectorize 4733249423Sdim VectorizationFactor Factor = { 1U, 0U }; 4734288943Sdim if (OptForSize && Legal->getRuntimePointerChecking()->Need) { 4735288943Sdim emitAnalysis(VectorizationReport() << 4736288943Sdim "runtime pointer checks needed. Enable vectorization of this " 4737288943Sdim "loop with '#pragma clang loop vectorize(enable)' when " 4738296417Sdim "compiling with -Os/-Oz"); 4739296417Sdim DEBUG(dbgs() << 4740296417Sdim "LV: Aborting. 
// Select the vectorization factor (VF) for the loop.
//
// Returns {1, 0} (i.e. "do not vectorize") for any of the abort conditions;
// otherwise picks the cheapest power-of-two width up to the register-limited
// maximum, honoring a user-forced width from loop hints. The order of the
// early-exit checks below is significant: each emits a distinct diagnostic.
LoopVectorizationCostModel::VectorizationFactor
LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {
  // Width 1 means no vectorize.
  VectorizationFactor Factor = { 1U, 0U };
  // Runtime pointer checks add code size, which conflicts with -Os/-Oz.
  if (OptForSize && Legal->getRuntimePointerChecking()->Need) {
    emitAnalysis(VectorizationReport() <<
                 "runtime pointer checks needed. Enable vectorization of this "
                 "loop with '#pragma clang loop vectorize(enable)' when "
                 "compiling with -Os/-Oz");
    DEBUG(dbgs() <<
          "LV: Aborting. Runtime ptr check is required with -Os/-Oz.\n");
    return Factor;
  }

  if (!EnableCondStoresVectorization && Legal->getNumPredStores()) {
    emitAnalysis(VectorizationReport() <<
                 "store that is conditionally executed prevents vectorization");
    DEBUG(dbgs() << "LV: No vectorization. There are conditional stores.\n");
    return Factor;
  }

  // Find the trip count (0 if it is not a compile-time constant).
  unsigned TC = SE->getSmallConstantTripCount(TheLoop);
  DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');

  MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
  unsigned SmallestType, WidestType;
  std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
  unsigned WidestRegister = TTI.getRegisterBitWidth(true);
  // Clamp the effective register width by the maximum safe dependence
  // distance (in bits) so we never vectorize across a dependence.
  unsigned MaxSafeDepDist = -1U;
  if (Legal->getMaxSafeDepDistBytes() != -1U)
    MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
  WidestRegister = ((WidestRegister < MaxSafeDepDist) ?
                    WidestRegister : MaxSafeDepDist);
  unsigned MaxVectorSize = WidestRegister / WidestType;

  DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType << " / "
               << WidestType << " bits.\n");
  DEBUG(dbgs() << "LV: The Widest register is: "
          << WidestRegister << " bits.\n");

  if (MaxVectorSize == 0) {
    DEBUG(dbgs() << "LV: The target has no vector registers.\n");
    MaxVectorSize = 1;
  }

  assert(MaxVectorSize <= 64 && "Did not expect to pack so many elements"
         " into one vector!");

  unsigned VF = MaxVectorSize;
  // Optionally widen beyond MaxVectorSize (sized by the widest type) up to
  // the limit implied by the smallest type, as long as register pressure
  // stays within the target's budget.
  if (MaximizeBandwidth && !OptForSize) {
    // Collect all viable vectorization factors.
    SmallVector<unsigned, 8> VFs;
    unsigned NewMaxVectorSize = WidestRegister / SmallestType;
    for (unsigned VS = MaxVectorSize; VS <= NewMaxVectorSize; VS *= 2)
      VFs.push_back(VS);

    // For each VF calculate its register usage.
    auto RUs = calculateRegisterUsage(VFs);

    // Select the largest VF which doesn't require more registers than existing
    // ones.
    unsigned TargetNumRegisters = TTI.getNumberOfRegisters(true);
    for (int i = RUs.size() - 1; i >= 0; --i) {
      if (RUs[i].MaxLocalUsers <= TargetNumRegisters) {
        VF = VFs[i];
        break;
      }
    }
  }

  // If we optimize the program for size, avoid creating the tail loop.
  if (OptForSize) {
    // If we are unable to calculate the trip count then don't try to vectorize.
    // (TC == 0 means unknown; TC == 1 leaves nothing to vectorize.)
    if (TC < 2) {
      emitAnalysis
        (VectorizationReport() <<
         "unable to calculate the loop count due to complex control flow");
      DEBUG(dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n");
      return Factor;
    }

    // Find the maximum SIMD width that can fit within the trip count.
    VF = TC % MaxVectorSize;

    if (VF == 0)
      VF = MaxVectorSize;
    else {
      // If the trip count that we found modulo the vectorization factor is not
      // zero then we require a tail.
      emitAnalysis(VectorizationReport() <<
                   "cannot optimize for size and vectorize at the "
                   "same time. Enable vectorization of this loop "
                   "with '#pragma clang loop vectorize(enable)' "
                   "when compiling with -Os/-Oz");
      DEBUG(dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n");
      return Factor;
    }
  }

  // A user-specified width short-circuits the cost model entirely.
  int UserVF = Hints->getWidth();
  if (UserVF != 0) {
    assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
    DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");

    Factor.Width = UserVF;
    return Factor;
  }

  // Cost-model search: compare the per-iteration cost of each candidate width
  // against the scalar cost, keeping the cheapest.
  float Cost = expectedCost(1);
#ifndef NDEBUG
  const float ScalarCost = Cost;
#endif /* NDEBUG */
  unsigned Width = 1;
  DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");

  bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
  // Ignore scalar width, because the user explicitly wants vectorization.
  // Seed the search at width 2 so the scalar cost cannot win.
  if (ForceVectorization && VF > 1) {
    Width = 2;
    Cost = expectedCost(Width) / (float)Width;
  }

  for (unsigned i=2; i <= VF; i*=2) {
    // Notice that the vector loop needs to be executed less times, so
    // we need to divide the cost of the vector loops by the width of
    // the vector elements.
    float VectorCost = expectedCost(i) / (float)i;
    DEBUG(dbgs() << "LV: Vector loop of width " << i << " costs: " <<
          (int)VectorCost << ".\n");
    if (VectorCost < Cost) {
      Cost = VectorCost;
      Width = i;
    }
  }

  DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
        << "LV: Vectorization seems to be not beneficial, "
        << "but was forced by a user.\n");
  DEBUG(dbgs() << "LV: Selecting VF: "<< Width << ".\n");
  Factor.Width = Width;
  Factor.Cost = Width * Cost;
  return Factor;
}
4891296417Sdim if (ValuesToIgnore.count(&*it)) 4892280031Sdim continue; 4893280031Sdim 4894249423Sdim // Only examine Loads, Stores and PHINodes. 4895249423Sdim if (!isa<LoadInst>(it) && !isa<StoreInst>(it) && !isa<PHINode>(it)) 4896249423Sdim continue; 4897249423Sdim 4898296417Sdim // Examine PHI nodes that are reduction variables. Update the type to 4899296417Sdim // account for the recurrence type. 4900296417Sdim if (PHINode *PN = dyn_cast<PHINode>(it)) { 4901296417Sdim if (!Legal->isReductionVariable(PN)) 4902249423Sdim continue; 4903296417Sdim RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[PN]; 4904296417Sdim T = RdxDesc.getRecurrenceType(); 4905296417Sdim } 4906249423Sdim 4907249423Sdim // Examine the stored values. 4908249423Sdim if (StoreInst *ST = dyn_cast<StoreInst>(it)) 4909249423Sdim T = ST->getValueOperand()->getType(); 4910249423Sdim 4911249423Sdim // Ignore loaded pointer types and stored pointer types that are not 4912249423Sdim // consecutive. However, we do want to take consecutive stores/loads of 4913249423Sdim // pointer vectors into account. 4914296417Sdim if (T->isPointerTy() && !isConsecutiveLoadOrStore(&*it)) 4915249423Sdim continue; 4916249423Sdim 4917296417Sdim MinWidth = std::min(MinWidth, 4918296417Sdim (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 4919249423Sdim MaxWidth = std::max(MaxWidth, 4920288943Sdim (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 4921249423Sdim } 4922249423Sdim } 4923249423Sdim 4924296417Sdim return {MinWidth, MaxWidth}; 4925249423Sdim} 4926249423Sdim 4927288943Sdimunsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, 4928288943Sdim unsigned VF, 4929288943Sdim unsigned LoopCost) { 4930249423Sdim 4931288943Sdim // -- The interleave heuristics -- 4932288943Sdim // We interleave the loop in order to expose ILP and reduce the loop overhead. 4933249423Sdim // There are many micro-architectural considerations that we can't predict 4934280031Sdim // at this level. 
// Select how many copies of the (scalar or vector) loop body to interleave.
//
// -- The interleave heuristics --
// We interleave the loop in order to expose ILP and reduce the loop overhead.
// There are many micro-architectural considerations that we can't predict
// at this level. For example, frontend pressure (on decode or fetch) due to
// code size, or the number and capabilities of the execution ports.
//
// We use the following heuristics to select the interleave count:
// 1. If the code has reductions, then we interleave to break the cross
//    iteration dependency.
// 2. If the loop is really small, then we interleave to reduce the loop
//    overhead.
// 3. We don't interleave if we think that we will spill registers to memory
//    due to the increased register pressure.
unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,
                                                           unsigned VF,
                                                           unsigned LoopCost) {

  // When we optimize for size, we don't interleave.
  if (OptForSize)
    return 1;

  // We used the distance for the interleave count.
  if (Legal->getMaxSafeDepDistBytes() != -1U)
    return 1;

  // Do not interleave loops with a relatively small trip count.
  unsigned TC = SE->getSmallConstantTripCount(TheLoop);
  if (TC > 1 && TC < TinyTripCountInterleaveThreshold)
    return 1;

  // VF > 1 means we ask about vector registers, otherwise scalar registers.
  unsigned TargetNumRegisters = TTI.getNumberOfRegisters(VF > 1);
  DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters <<
        " registers\n");

  // Command-line overrides for testing.
  if (VF == 1) {
    if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
      TargetNumRegisters = ForceTargetNumScalarRegs;
  } else {
    if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
      TargetNumRegisters = ForceTargetNumVectorRegs;
  }

  RegisterUsage R = calculateRegisterUsage({VF})[0];
  // We divide by these constants so assume that we have at least one
  // instruction that uses at least one register.
  R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U);
  R.NumInstructions = std::max(R.NumInstructions, 1U);

  // We calculate the interleave count using the following formula.
  // Subtract the number of loop invariants from the number of available
  // registers. These registers are used by all of the interleaved instances.
  // Next, divide the remaining registers by the number of registers that is
  // required by the loop, in order to estimate how many parallel instances
  // fit without causing spills. All of this is rounded down if necessary to be
  // a power of two. We want power of two interleave count to simplify any
  // addressing operations or alignment considerations.
  unsigned IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs) /
                              R.MaxLocalUsers);

  // Don't count the induction variable as interleaved.
  if (EnableIndVarRegisterHeur)
    IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs - 1) /
                       std::max(1U, (R.MaxLocalUsers - 1)));

  // Clamp the interleave ranges to reasonable counts.
  unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);

  // Check if the user has overridden the max.
  if (VF == 1) {
    if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
      MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
  } else {
    if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
      MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
  }

  // If we did not calculate the cost for VF (because the user selected the VF)
  // then we calculate the cost of VF here.
  if (LoopCost == 0)
    LoopCost = expectedCost(VF);

  // Clamp the calculated IC to be between the 1 and the max interleave count
  // that the target allows.
  if (IC > MaxInterleaveCount)
    IC = MaxInterleaveCount;
  else if (IC < 1)
    IC = 1;

  // Interleave if we vectorized this loop and there is a reduction that could
  // benefit from interleaving.
  if (VF > 1 && Legal->getReductionVars()->size()) {
    DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
    return IC;
  }

  // Note that if we've already vectorized the loop we will have done the
  // runtime check and so interleaving won't require further checks.
  bool InterleavingRequiresRuntimePointerCheck =
      (VF == 1 && Legal->getRuntimePointerChecking()->Need);

  // We want to interleave small loops in order to reduce the loop overhead and
  // potentially expose ILP opportunities.
  DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
  if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
    // We assume that the cost overhead is 1 and we use the cost model
    // to estimate the cost of the loop and interleave until the cost of the
    // loop overhead is about 5% of the cost of the loop.
    unsigned SmallIC =
        std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));

    // Interleave until store/load ports (estimated by max interleave count) are
    // saturated.
    unsigned NumStores = Legal->getNumStores();
    unsigned NumLoads = Legal->getNumLoads();
    unsigned StoresIC = IC / (NumStores ? NumStores : 1);
    unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);

    // If we have a scalar reduction (vector reductions are already dealt with
    // by this point), we can increase the critical path length if the loop
    // we're interleaving is inside another loop. Limit, by default to 2, so the
    // critical path only gets increased by one reduction operation.
    if (Legal->getReductionVars()->size() &&
        TheLoop->getLoopDepth() > 1) {
      unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
      SmallIC = std::min(SmallIC, F);
      StoresIC = std::min(StoresIC, F);
      LoadsIC = std::min(LoadsIC, F);
    }

    if (EnableLoadStoreRuntimeInterleave &&
        std::max(StoresIC, LoadsIC) > SmallIC) {
      DEBUG(dbgs() << "LV: Interleaving to saturate store or load ports.\n");
      return std::max(StoresIC, LoadsIC);
    }

    DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
    return SmallIC;
  }

  // Interleave if this is a large loop (small loops are already dealt with by
  // this point) that could benefit from interleaving.
  bool HasReductions = (Legal->getReductionVars()->size() > 0);
  if (TTI.enableAggressiveInterleaving(HasReductions)) {
    DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
    return IC;
  }

  DEBUG(dbgs() << "LV: Not Interleaving.\n");
  return 1;
}
// Estimate the register usage of the loop for each candidate VF.
//
// This function calculates the register usage by measuring the highest number
// of values that are alive at a single location. Obviously, this is a very
// rough estimation. We scan the loop in a topological order in order and
// assign a number to each instruction. We use RPO to ensure that defs are
// met before their users. We assume that each instruction that has in-loop
// users starts an interval. We record every time that an in-loop value is
// used, so we have a list of the first and last occurrences of each
// instruction. Next, we transpose this data structure into a multi map that
// holds the list of intervals that *end* at a specific location. This multi
// map allows us to perform a linear search. We scan the instructions linearly
// and record each time that a new interval starts, by placing it in a set.
// If we find this value in the multi-map then we remove it from the set.
// The max register usage is the maximum size of the set.
// We also search for instructions that are defined outside the loop, but are
// used inside the loop. We need this number separately from the max-interval
// usage number because when we unroll, loop-invariant values do not take
// more register.
SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
LoopVectorizationCostModel::calculateRegisterUsage(
    const SmallVector<unsigned, 8> &VFs) {
  LoopBlocksDFS DFS(TheLoop);
  DFS.perform(LI);

  RegisterUsage RU;
  RU.NumInstructions = 0;

  // Each 'key' in the map opens a new interval. The values
  // of the map are the index of the 'last seen' usage of the
  // instruction that is the key.
  typedef DenseMap<Instruction*, unsigned> IntervalMap;
  // Maps instruction to its index.
  DenseMap<unsigned, Instruction*> IdxToInstr;
  // Marks the end of each interval.
  IntervalMap EndPoint;
  // Saves the list of instruction indices that are used in the loop.
  SmallSet<Instruction*, 8> Ends;
  // Saves the list of values that are used in the loop but are
  // defined outside the loop, such as arguments and constants.
  SmallPtrSet<Value*, 8> LoopInvariants;

  // Number instructions in RPO and record, per instruction, the index of its
  // last in-loop use (EndPoint) and the set of loop-invariant operands.
  unsigned Index = 0;
  for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(),
       be = DFS.endRPO(); bb != be; ++bb) {
    RU.NumInstructions += (*bb)->size();
    for (Instruction &I : **bb) {
      IdxToInstr[Index++] = &I;

      // Save the end location of each USE.
      for (unsigned i = 0; i < I.getNumOperands(); ++i) {
        Value *U = I.getOperand(i);
        Instruction *Instr = dyn_cast<Instruction>(U);

        // Ignore non-instruction values such as arguments, constants, etc.
        if (!Instr) continue;

        // If this instruction is outside the loop then record it and continue.
        if (!TheLoop->contains(Instr)) {
          LoopInvariants.insert(Instr);
          continue;
        }

        // Overwrite previous end points.
        EndPoint[Instr] = Index;
        Ends.insert(Instr);
      }
    }
  }

  // Saves the list of intervals that end with the index in 'key'.
  typedef SmallVector<Instruction*, 2> InstrList;
  DenseMap<unsigned, InstrList> TransposeEnds;

  // Transpose the EndPoints to a list of values that end at each index.
  for (IntervalMap::iterator it = EndPoint.begin(), e = EndPoint.end();
       it != e; ++it)
    TransposeEnds[it->second].push_back(it->first);

  SmallSet<Instruction*, 8> OpenIntervals;

  // Get the size of the widest register, clamped by the maximum safe
  // dependence distance (in bits).
  unsigned MaxSafeDepDist = -1U;
  if (Legal->getMaxSafeDepDistBytes() != -1U)
    MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
  unsigned WidestRegister =
      std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist);
  const DataLayout &DL = TheFunction->getParent()->getDataLayout();

  SmallVector<RegisterUsage, 8> RUs(VFs.size());
  SmallVector<unsigned, 8> MaxUsages(VFs.size(), 0);

  DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");

  // A lambda that gets the register usage for the given type and VF:
  // how many registers a VF-wide vector of Ty occupies (at least one).
  auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) {
    unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType());
    return std::max<unsigned>(1, VF * TypeSize / WidestRegister);
  };

  // Linear scan: close the intervals that end here, measure the live set for
  // each VF, then open this instruction's interval.
  for (unsigned int i = 0; i < Index; ++i) {
    Instruction *I = IdxToInstr[i];
    // Ignore instructions that are never used within the loop.
    if (!Ends.count(I)) continue;

    // Skip ignored values.
    if (ValuesToIgnore.count(I))
      continue;

    // Remove all of the instructions that end at this location.
    InstrList &List = TransposeEnds[i];
    for (unsigned int j = 0, e = List.size(); j < e; ++j)
      OpenIntervals.erase(List[j]);

    // For each VF find the maximum usage of registers.
    for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
      if (VFs[j] == 1) {
        // Scalar case: one register per live value.
        MaxUsages[j] = std::max(MaxUsages[j], OpenIntervals.size());
        continue;
      }

      // Count the number of live intervals, weighted by vector register count.
      unsigned RegUsage = 0;
      for (auto Inst : OpenIntervals)
        RegUsage += GetRegUsage(Inst->getType(), VFs[j]);
      MaxUsages[j] = std::max(MaxUsages[j], RegUsage);
    }

    DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
                 << OpenIntervals.size() << '\n');

    // Add the current instruction to the list of open intervals.
    OpenIntervals.insert(I);
  }

  // Account for loop-invariant values: they occupy registers for the whole
  // loop but are shared by all interleaved instances.
  for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
    unsigned Invariant = 0;
    if (VFs[i] == 1)
      Invariant = LoopInvariants.size();
    else {
      for (auto Inst : LoopInvariants)
        Invariant += GetRegUsage(Inst->getType(), VFs[i]);
    }

    DEBUG(dbgs() << "LV(REG): VF = " << VFs[i] << '\n');
    DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsages[i] << '\n');
    DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant << '\n');
    DEBUG(dbgs() << "LV(REG): LoopSize: " << RU.NumInstructions << '\n');

    RU.LoopInvariantRegs = Invariant;
    RU.MaxLocalUsers = MaxUsages[i];
    RUs[i] = RU;
  }

  return RUs;
}
5244249423Sdim for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { 5245249423Sdim // Skip dbg intrinsics. 5246249423Sdim if (isa<DbgInfoIntrinsic>(it)) 5247249423Sdim continue; 5248249423Sdim 5249296417Sdim // Skip ignored values. 5250296417Sdim if (ValuesToIgnore.count(&*it)) 5251280031Sdim continue; 5252280031Sdim 5253296417Sdim unsigned C = getInstructionCost(&*it, VF); 5254276479Sdim 5255276479Sdim // Check if we should override the cost. 5256276479Sdim if (ForceTargetInstructionCost.getNumOccurrences() > 0) 5257276479Sdim C = ForceTargetInstructionCost; 5258276479Sdim 5259261991Sdim BlockCost += C; 5260261991Sdim DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF " << 5261261991Sdim VF << " For instruction: " << *it << '\n'); 5262249423Sdim } 5263249423Sdim 5264249423Sdim // We assume that if-converted blocks have a 50% chance of being executed. 5265249423Sdim // When the code is scalar then some of the blocks are avoided due to CF. 5266249423Sdim // When the code is vectorized we execute all code paths. 5267261991Sdim if (VF == 1 && Legal->blockNeedsPredication(*bb)) 5268249423Sdim BlockCost /= 2; 5269249423Sdim 5270249423Sdim Cost += BlockCost; 5271243789Sdim } 5272243789Sdim 5273243789Sdim return Cost; 5274243789Sdim} 5275243789Sdim 5276261991Sdim/// \brief Check whether the address computation for a non-consecutive memory 5277261991Sdim/// access looks like an unlikely candidate for being merged into the indexing 5278261991Sdim/// mode. 5279261991Sdim/// 5280261991Sdim/// We look for a GEP which has one index that is an induction variable and all 5281261991Sdim/// other indices are loop invariant. If the stride of this access is also 5282261991Sdim/// within a small bound we decide that this address computation can likely be 5283261991Sdim/// merged into the addressing mode. 5284261991Sdim/// In all other cases, we identify the address computation as complex. 
static bool isLikelyComplexAddressComputation(Value *Ptr,
                                              LoopVectorizationLegality *Legal,
                                              ScalarEvolution *SE,
                                              const Loop *TheLoop) {
  // A pointer that is not even a GEP is treated as complex.
  GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
  if (!Gep)
    return true;

  // We are looking for a gep with all loop invariant indices except for one
  // which should be an induction variable.
  unsigned NumOperands = Gep->getNumOperands();
  for (unsigned i = 1; i < NumOperands; ++i) {
    Value *Opd = Gep->getOperand(i);
    if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
        !Legal->isInductionVariable(Opd))
      return true;
  }

  // Now we know we have a GEP ptr, %inv, %ind, %inv. Make sure that the step
  // can likely be merged into the address computation.
  unsigned MaxMergeDistance = 64;

  // The access must be an affine recurrence to have a meaningful constant step.
  const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Ptr));
  if (!AddRec)
    return true;

  // Check the step is constant.
  const SCEV *Step = AddRec->getStepRecurrence(*SE);
  // Calculate the pointer stride and check if it is consecutive.
  const SCEVConstant *C = dyn_cast<SCEVConstant>(Step);
  if (!C)
    return true;

  const APInt &APStepVal = C->getAPInt();

  // Huge step value - give up.
  if (APStepVal.getBitWidth() > 64)
    return true;

  int64_t StepVal = APStepVal.getSExtValue();

  // NOTE(review): MaxMergeDistance (unsigned) is promoted to int64_t here, so
  // the comparison is well-defined; a negative step (reverse access) always
  // yields false, i.e. is treated as likely-mergeable — confirm that is the
  // intended treatment of reverse strides.
  return StepVal > MaxMergeDistance;
}

// Returns true if either operand of the multiply is a symbolic stride that
// versioning has specialized to 1 (in which case the multiply folds away and
// should be costed at zero).
static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
  return Legal->hasStride(I->getOperand(0)) ||
         Legal->hasStride(I->getOperand(1));
}
5359249423Sdim return 0; 5360249423Sdim case Instruction::Add: 5361249423Sdim case Instruction::FAdd: 5362249423Sdim case Instruction::Sub: 5363249423Sdim case Instruction::FSub: 5364249423Sdim case Instruction::Mul: 5365249423Sdim case Instruction::FMul: 5366249423Sdim case Instruction::UDiv: 5367249423Sdim case Instruction::SDiv: 5368249423Sdim case Instruction::FDiv: 5369249423Sdim case Instruction::URem: 5370249423Sdim case Instruction::SRem: 5371249423Sdim case Instruction::FRem: 5372249423Sdim case Instruction::Shl: 5373249423Sdim case Instruction::LShr: 5374249423Sdim case Instruction::AShr: 5375249423Sdim case Instruction::And: 5376249423Sdim case Instruction::Or: 5377249423Sdim case Instruction::Xor: { 5378276479Sdim // Since we will replace the stride by 1 the multiplication should go away. 5379276479Sdim if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 5380276479Sdim return 0; 5381249423Sdim // Certain instructions can be cheaper to vectorize if they have a constant 5382249423Sdim // second vector operand. One example of this are shifts on x86. 5383249423Sdim TargetTransformInfo::OperandValueKind Op1VK = 5384249423Sdim TargetTransformInfo::OK_AnyValue; 5385249423Sdim TargetTransformInfo::OperandValueKind Op2VK = 5386249423Sdim TargetTransformInfo::OK_AnyValue; 5387280031Sdim TargetTransformInfo::OperandValueProperties Op1VP = 5388280031Sdim TargetTransformInfo::OP_None; 5389280031Sdim TargetTransformInfo::OperandValueProperties Op2VP = 5390280031Sdim TargetTransformInfo::OP_None; 5391276479Sdim Value *Op2 = I->getOperand(1); 5392243789Sdim 5393276479Sdim // Check for a splat of a constant or for a non uniform vector of constants. 
5394280031Sdim if (isa<ConstantInt>(Op2)) { 5395280031Sdim ConstantInt *CInt = cast<ConstantInt>(Op2); 5396280031Sdim if (CInt && CInt->getValue().isPowerOf2()) 5397280031Sdim Op2VP = TargetTransformInfo::OP_PowerOf2; 5398249423Sdim Op2VK = TargetTransformInfo::OK_UniformConstantValue; 5399280031Sdim } else if (isa<ConstantVector>(Op2) || isa<ConstantDataVector>(Op2)) { 5400276479Sdim Op2VK = TargetTransformInfo::OK_NonUniformConstantValue; 5401280031Sdim Constant *SplatValue = cast<Constant>(Op2)->getSplatValue(); 5402280031Sdim if (SplatValue) { 5403280031Sdim ConstantInt *CInt = dyn_cast<ConstantInt>(SplatValue); 5404280031Sdim if (CInt && CInt->getValue().isPowerOf2()) 5405280031Sdim Op2VP = TargetTransformInfo::OP_PowerOf2; 5406276479Sdim Op2VK = TargetTransformInfo::OK_UniformConstantValue; 5407280031Sdim } 5408276479Sdim } 5409243789Sdim 5410280031Sdim return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK, Op2VK, 5411280031Sdim Op1VP, Op2VP); 5412249423Sdim } 5413249423Sdim case Instruction::Select: { 5414249423Sdim SelectInst *SI = cast<SelectInst>(I); 5415249423Sdim const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 5416249423Sdim bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 5417249423Sdim Type *CondTy = SI->getCondition()->getType(); 5418249423Sdim if (!ScalarCond) 5419249423Sdim CondTy = VectorType::get(CondTy, VF); 5420243789Sdim 5421249423Sdim return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy); 5422249423Sdim } 5423249423Sdim case Instruction::ICmp: 5424249423Sdim case Instruction::FCmp: { 5425249423Sdim Type *ValTy = I->getOperand(0)->getType(); 5426296417Sdim Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 5427296417Sdim auto It = MinBWs.find(Op0AsInstruction); 5428296417Sdim if (VF > 1 && It != MinBWs.end()) 5429296417Sdim ValTy = IntegerType::get(ValTy->getContext(), It->second); 5430249423Sdim VectorTy = ToVectorTy(ValTy, VF); 5431249423Sdim return 
TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy); 5432249423Sdim } 5433249423Sdim case Instruction::Store: 5434249423Sdim case Instruction::Load: { 5435249423Sdim StoreInst *SI = dyn_cast<StoreInst>(I); 5436249423Sdim LoadInst *LI = dyn_cast<LoadInst>(I); 5437249423Sdim Type *ValTy = (SI ? SI->getValueOperand()->getType() : 5438249423Sdim LI->getType()); 5439249423Sdim VectorTy = ToVectorTy(ValTy, VF); 5440243789Sdim 5441249423Sdim unsigned Alignment = SI ? SI->getAlignment() : LI->getAlignment(); 5442249423Sdim unsigned AS = SI ? SI->getPointerAddressSpace() : 5443249423Sdim LI->getPointerAddressSpace(); 5444249423Sdim Value *Ptr = SI ? SI->getPointerOperand() : LI->getPointerOperand(); 5445249423Sdim // We add the cost of address computation here instead of with the gep 5446249423Sdim // instruction because only here we know whether the operation is 5447249423Sdim // scalarized. 5448249423Sdim if (VF == 1) 5449249423Sdim return TTI.getAddressComputationCost(VectorTy) + 5450249423Sdim TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS); 5451243789Sdim 5452288943Sdim // For an interleaved access, calculate the total cost of the whole 5453288943Sdim // interleave group. 5454288943Sdim if (Legal->isAccessInterleaved(I)) { 5455288943Sdim auto Group = Legal->getInterleavedAccessGroup(I); 5456288943Sdim assert(Group && "Fail to get an interleaved access group."); 5457288943Sdim 5458288943Sdim // Only calculate the cost once at the insert position. 5459288943Sdim if (Group->getInsertPos() != I) 5460288943Sdim return 0; 5461288943Sdim 5462288943Sdim unsigned InterleaveFactor = Group->getFactor(); 5463288943Sdim Type *WideVecTy = 5464288943Sdim VectorType::get(VectorTy->getVectorElementType(), 5465288943Sdim VectorTy->getVectorNumElements() * InterleaveFactor); 5466288943Sdim 5467288943Sdim // Holds the indices of existing members in an interleaved load group. 5468288943Sdim // An interleaved store group doesn't need this as it dones't allow gaps. 
5469288943Sdim SmallVector<unsigned, 4> Indices; 5470288943Sdim if (LI) { 5471288943Sdim for (unsigned i = 0; i < InterleaveFactor; i++) 5472288943Sdim if (Group->getMember(i)) 5473288943Sdim Indices.push_back(i); 5474288943Sdim } 5475288943Sdim 5476288943Sdim // Calculate the cost of the whole interleaved group. 5477288943Sdim unsigned Cost = TTI.getInterleavedMemoryOpCost( 5478288943Sdim I->getOpcode(), WideVecTy, Group->getFactor(), Indices, 5479288943Sdim Group->getAlignment(), AS); 5480288943Sdim 5481288943Sdim if (Group->isReverse()) 5482288943Sdim Cost += 5483288943Sdim Group->getNumMembers() * 5484288943Sdim TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); 5485288943Sdim 5486288943Sdim // FIXME: The interleaved load group with a huge gap could be even more 5487288943Sdim // expensive than scalar operations. Then we could ignore such group and 5488288943Sdim // use scalar operations instead. 5489288943Sdim return Cost; 5490288943Sdim } 5491288943Sdim 5492249423Sdim // Scalarized loads/stores. 5493251662Sdim int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); 5494251662Sdim bool Reverse = ConsecutiveStride < 0; 5495288943Sdim const DataLayout &DL = I->getModule()->getDataLayout(); 5496288943Sdim unsigned ScalarAllocatedSize = DL.getTypeAllocSize(ValTy); 5497288943Sdim unsigned VectorElementSize = DL.getTypeStoreSize(VectorTy) / VF; 5498251662Sdim if (!ConsecutiveStride || ScalarAllocatedSize != VectorElementSize) { 5499261991Sdim bool IsComplexComputation = 5500261991Sdim isLikelyComplexAddressComputation(Ptr, Legal, SE, TheLoop); 5501249423Sdim unsigned Cost = 0; 5502249423Sdim // The cost of extracting from the value vector and pointer vector. 5503249423Sdim Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 5504249423Sdim for (unsigned i = 0; i < VF; ++i) { 5505249423Sdim // The cost of extracting the pointer operand. 
5506249423Sdim Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, PtrTy, i); 5507249423Sdim // In case of STORE, the cost of ExtractElement from the vector. 5508249423Sdim // In case of LOAD, the cost of InsertElement into the returned 5509249423Sdim // vector. 5510249423Sdim Cost += TTI.getVectorInstrCost(SI ? Instruction::ExtractElement : 5511249423Sdim Instruction::InsertElement, 5512249423Sdim VectorTy, i); 5513243789Sdim } 5514243789Sdim 5515249423Sdim // The cost of the scalar loads/stores. 5516261991Sdim Cost += VF * TTI.getAddressComputationCost(PtrTy, IsComplexComputation); 5517249423Sdim Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), 5518249423Sdim Alignment, AS); 5519249423Sdim return Cost; 5520243789Sdim } 5521243789Sdim 5522249423Sdim // Wide load/stores. 5523249423Sdim unsigned Cost = TTI.getAddressComputationCost(VectorTy); 5524288943Sdim if (Legal->isMaskRequired(I)) 5525288943Sdim Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, 5526288943Sdim AS); 5527288943Sdim else 5528288943Sdim Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS); 5529243789Sdim 5530249423Sdim if (Reverse) 5531249423Sdim Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, 5532249423Sdim VectorTy, 0); 5533249423Sdim return Cost; 5534249423Sdim } 5535249423Sdim case Instruction::ZExt: 5536249423Sdim case Instruction::SExt: 5537249423Sdim case Instruction::FPToUI: 5538249423Sdim case Instruction::FPToSI: 5539249423Sdim case Instruction::FPExt: 5540249423Sdim case Instruction::PtrToInt: 5541249423Sdim case Instruction::IntToPtr: 5542249423Sdim case Instruction::SIToFP: 5543249423Sdim case Instruction::UIToFP: 5544249423Sdim case Instruction::Trunc: 5545249423Sdim case Instruction::FPTrunc: 5546249423Sdim case Instruction::BitCast: { 5547249423Sdim // We optimize the truncation of induction variable. 5548249423Sdim // The cost of these is the same as the scalar operation. 
5549249423Sdim if (I->getOpcode() == Instruction::Trunc && 5550249423Sdim Legal->isInductionVariable(I->getOperand(0))) 5551249423Sdim return TTI.getCastInstrCost(I->getOpcode(), I->getType(), 5552249423Sdim I->getOperand(0)->getType()); 5553296417Sdim 5554296417Sdim Type *SrcScalarTy = I->getOperand(0)->getType(); 5555296417Sdim Type *SrcVecTy = ToVectorTy(SrcScalarTy, VF); 5556296417Sdim if (VF > 1 && MinBWs.count(I)) { 5557296417Sdim // This cast is going to be shrunk. This may remove the cast or it might 5558296417Sdim // turn it into slightly different cast. For example, if MinBW == 16, 5559296417Sdim // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 5560296417Sdim // 5561296417Sdim // Calculate the modified src and dest types. 5562296417Sdim Type *MinVecTy = VectorTy; 5563296417Sdim if (I->getOpcode() == Instruction::Trunc) { 5564296417Sdim SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 5565296417Sdim VectorTy = largestIntegerVectorType(ToVectorTy(I->getType(), VF), 5566296417Sdim MinVecTy); 5567296417Sdim } else if (I->getOpcode() == Instruction::ZExt || 5568296417Sdim I->getOpcode() == Instruction::SExt) { 5569296417Sdim SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 5570296417Sdim VectorTy = smallestIntegerVectorType(ToVectorTy(I->getType(), VF), 5571296417Sdim MinVecTy); 5572296417Sdim } 5573296417Sdim } 5574296417Sdim 5575249423Sdim return TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy); 5576249423Sdim } 5577249423Sdim case Instruction::Call: { 5578288943Sdim bool NeedToScalarize; 5579249423Sdim CallInst *CI = cast<CallInst>(I); 5580288943Sdim unsigned CallCost = getVectorCallCost(CI, VF, TTI, TLI, NeedToScalarize); 5581288943Sdim if (getIntrinsicIDForCall(CI, TLI)) 5582288943Sdim return std::min(CallCost, getVectorIntrinsicCost(CI, VF, TTI, TLI)); 5583288943Sdim return CallCost; 5584249423Sdim } 5585249423Sdim default: { 5586249423Sdim // We are scalarizing the instruction. 
Return the cost of the scalar 5587249423Sdim // instruction, plus the cost of insert and extract into vector 5588249423Sdim // elements, times the vector width. 5589249423Sdim unsigned Cost = 0; 5590243789Sdim 5591249423Sdim if (!RetTy->isVoidTy() && VF != 1) { 5592249423Sdim unsigned InsCost = TTI.getVectorInstrCost(Instruction::InsertElement, 5593249423Sdim VectorTy); 5594249423Sdim unsigned ExtCost = TTI.getVectorInstrCost(Instruction::ExtractElement, 5595249423Sdim VectorTy); 5596249423Sdim 5597243789Sdim // The cost of inserting the results plus extracting each one of the 5598243789Sdim // operands. 5599243789Sdim Cost += VF * (InsCost + ExtCost * I->getNumOperands()); 5600249423Sdim } 5601243789Sdim 5602249423Sdim // The cost of executing VF copies of the scalar instruction. This opcode 5603249423Sdim // is unknown. Assume that it is the same as 'mul'. 5604249423Sdim Cost += VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy); 5605249423Sdim return Cost; 5606249423Sdim } 5607243789Sdim }// end of switch. 
5608243789Sdim} 5609243789Sdim 5610243789Sdimchar LoopVectorize::ID = 0; 5611243789Sdimstatic const char lv_name[] = "Loop Vectorization"; 5612243789SdimINITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 5613288943SdimINITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 5614296417SdimINITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 5615296417SdimINITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 5616296417SdimINITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 5617280031SdimINITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 5618296417SdimINITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 5619276479SdimINITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 5620296417SdimINITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 5621261991SdimINITIALIZE_PASS_DEPENDENCY(LCSSA) 5622288943SdimINITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 5623243789SdimINITIALIZE_PASS_DEPENDENCY(LoopSimplify) 5624288943SdimINITIALIZE_PASS_DEPENDENCY(LoopAccessAnalysis) 5625296417SdimINITIALIZE_PASS_DEPENDENCY(DemandedBits) 5626243789SdimINITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 5627243789Sdim 5628243789Sdimnamespace llvm { 5629276479Sdim Pass *createLoopVectorizePass(bool NoUnrolling, bool AlwaysVectorize) { 5630276479Sdim return new LoopVectorize(NoUnrolling, AlwaysVectorize); 5631243789Sdim } 5632243789Sdim} 5633243789Sdim 5634249423Sdimbool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 5635249423Sdim // Check for a store. 5636249423Sdim if (StoreInst *ST = dyn_cast<StoreInst>(Inst)) 5637249423Sdim return Legal->isConsecutivePtr(ST->getPointerOperand()) != 0; 5638249423Sdim 5639249423Sdim // Check for a load. 
5640249423Sdim if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) 5641249423Sdim return Legal->isConsecutivePtr(LI->getPointerOperand()) != 0; 5642249423Sdim 5643249423Sdim return false; 5644249423Sdim} 5645261991Sdim 5646261991Sdim 5647276479Sdimvoid InnerLoopUnroller::scalarizeInstruction(Instruction *Instr, 5648276479Sdim bool IfPredicateStore) { 5649261991Sdim assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 5650261991Sdim // Holds vector parameters or scalars, in case of uniform vals. 5651261991Sdim SmallVector<VectorParts, 4> Params; 5652261991Sdim 5653261991Sdim setDebugLocFromInst(Builder, Instr); 5654261991Sdim 5655261991Sdim // Find all of the vectorized parameters. 5656261991Sdim for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { 5657261991Sdim Value *SrcOp = Instr->getOperand(op); 5658261991Sdim 5659261991Sdim // If we are accessing the old induction variable, use the new one. 5660261991Sdim if (SrcOp == OldInduction) { 5661261991Sdim Params.push_back(getVectorValue(SrcOp)); 5662261991Sdim continue; 5663261991Sdim } 5664261991Sdim 5665261991Sdim // Try using previously calculated values. 5666261991Sdim Instruction *SrcInst = dyn_cast<Instruction>(SrcOp); 5667261991Sdim 5668261991Sdim // If the src is an instruction that appeared earlier in the basic block 5669261991Sdim // then it should already be vectorized. 5670261991Sdim if (SrcInst && OrigLoop->contains(SrcInst)) { 5671261991Sdim assert(WidenMap.has(SrcInst) && "Source operand is unavailable"); 5672261991Sdim // The parameter is a vector value from earlier. 5673261991Sdim Params.push_back(WidenMap.get(SrcInst)); 5674261991Sdim } else { 5675261991Sdim // The parameter is a scalar from outside the loop. Maybe even a constant. 
5676261991Sdim VectorParts Scalars; 5677261991Sdim Scalars.append(UF, SrcOp); 5678261991Sdim Params.push_back(Scalars); 5679261991Sdim } 5680261991Sdim } 5681261991Sdim 5682261991Sdim assert(Params.size() == Instr->getNumOperands() && 5683261991Sdim "Invalid number of operands"); 5684261991Sdim 5685261991Sdim // Does this instruction return a value ? 5686261991Sdim bool IsVoidRetTy = Instr->getType()->isVoidTy(); 5687261991Sdim 5688276479Sdim Value *UndefVec = IsVoidRetTy ? nullptr : 5689261991Sdim UndefValue::get(Instr->getType()); 5690261991Sdim // Create a new entry in the WidenMap and initialize it to Undef or Null. 5691261991Sdim VectorParts &VecResults = WidenMap.splat(Instr, UndefVec); 5692261991Sdim 5693276479Sdim VectorParts Cond; 5694276479Sdim if (IfPredicateStore) { 5695276479Sdim assert(Instr->getParent()->getSinglePredecessor() && 5696276479Sdim "Only support single predecessor blocks"); 5697276479Sdim Cond = createEdgeMask(Instr->getParent()->getSinglePredecessor(), 5698276479Sdim Instr->getParent()); 5699276479Sdim } 5700276479Sdim 5701261991Sdim // For each vector unroll 'part': 5702261991Sdim for (unsigned Part = 0; Part < UF; ++Part) { 5703261991Sdim // For each scalar that we create: 5704261991Sdim 5705276479Sdim // Start an "if (pred) a[i] = ..." block. 5706276479Sdim Value *Cmp = nullptr; 5707276479Sdim if (IfPredicateStore) { 5708276479Sdim if (Cond[Part]->getType()->isVectorTy()) 5709276479Sdim Cond[Part] = 5710276479Sdim Builder.CreateExtractElement(Cond[Part], Builder.getInt32(0)); 5711276479Sdim Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cond[Part], 5712276479Sdim ConstantInt::get(Cond[Part]->getType(), 1)); 5713276479Sdim } 5714276479Sdim 5715261991Sdim Instruction *Cloned = Instr->clone(); 5716261991Sdim if (!IsVoidRetTy) 5717261991Sdim Cloned->setName(Instr->getName() + ".cloned"); 5718261991Sdim // Replace the operands of the cloned instructions with extracted scalars. 
5719261991Sdim for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { 5720261991Sdim Value *Op = Params[op][Part]; 5721261991Sdim Cloned->setOperand(op, Op); 5722261991Sdim } 5723261991Sdim 5724261991Sdim // Place the cloned scalar in the new loop. 5725261991Sdim Builder.Insert(Cloned); 5726261991Sdim 5727261991Sdim // If the original scalar returns a value we need to place it in a vector 5728261991Sdim // so that future users will be able to use it. 5729261991Sdim if (!IsVoidRetTy) 5730261991Sdim VecResults[Part] = Cloned; 5731276479Sdim 5732296417Sdim // End if-block. 5733296417Sdim if (IfPredicateStore) 5734296417Sdim PredicatedStores.push_back(std::make_pair(cast<StoreInst>(Cloned), 5735296417Sdim Cmp)); 5736261991Sdim } 5737261991Sdim} 5738261991Sdim 5739276479Sdimvoid InnerLoopUnroller::vectorizeMemoryInstruction(Instruction *Instr) { 5740276479Sdim StoreInst *SI = dyn_cast<StoreInst>(Instr); 5741276479Sdim bool IfPredicateStore = (SI && Legal->blockNeedsPredication(SI->getParent())); 5742276479Sdim 5743276479Sdim return scalarizeInstruction(Instr, IfPredicateStore); 5744261991Sdim} 5745261991Sdim 5746261991SdimValue *InnerLoopUnroller::reverseVector(Value *Vec) { 5747261991Sdim return Vec; 5748261991Sdim} 5749261991Sdim 5750261991SdimValue *InnerLoopUnroller::getBroadcastInstrs(Value *V) { 5751261991Sdim return V; 5752261991Sdim} 5753261991Sdim 5754288943SdimValue *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step) { 5755261991Sdim // When unrolling and the VF is 1, we only need to add a simple scalar. 5756261991Sdim Type *ITy = Val->getType(); 5757261991Sdim assert(!ITy->isVectorTy() && "Val must be a scalar"); 5758288943Sdim Constant *C = ConstantInt::get(ITy, StartIdx); 5759288943Sdim return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction"); 5760261991Sdim} 5761