1243789Sdim//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===// 2243789Sdim// 3243789Sdim// The LLVM Compiler Infrastructure 4243789Sdim// 5243789Sdim// This file is distributed under the University of Illinois Open Source 6243789Sdim// License. See LICENSE.TXT for details. 7243789Sdim// 8243789Sdim//===----------------------------------------------------------------------===// 9243789Sdim// 10243789Sdim// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops 11251662Sdim// and generates target-independent LLVM-IR. 12251662Sdim// The vectorizer uses the TargetTransformInfo analysis to estimate the costs 13251662Sdim// of instructions in order to estimate the profitability of vectorization. 14243789Sdim// 15249423Sdim// The loop vectorizer combines consecutive loop iterations into a single 16243789Sdim// 'wide' iteration. After this transformation the index is incremented 17243789Sdim// by the SIMD vector width, and not by one. 18243789Sdim// 19243789Sdim// This pass has three parts: 20243789Sdim// 1. The main loop pass that drives the different parts. 21243789Sdim// 2. LoopVectorizationLegality - A unit that checks for the legality 22243789Sdim// of the vectorization. 23249423Sdim// 3. InnerLoopVectorizer - A unit that performs the actual 24243789Sdim// widening of instructions. 25243789Sdim// 4. LoopVectorizationCostModel - A unit that checks for the profitability 26243789Sdim// of vectorization. It decides on the optimal vector width, which 27243789Sdim// can be one, if vectorization is not profitable. 28249423Sdim// 29243789Sdim//===----------------------------------------------------------------------===// 30243789Sdim// 31243789Sdim// The reduction-variable vectorization is based on the paper: 32243789Sdim// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization. 33243789Sdim// 34243789Sdim// Variable uniformity checks are inspired by: 35249423Sdim// Karrenberg, R. and Hack, S. 
Whole Function Vectorization. 36243789Sdim// 37243789Sdim// Other ideas/concepts are from: 38243789Sdim// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later. 39243789Sdim// 40249423Sdim// S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of 41249423Sdim// Vectorizing Compilers. 42249423Sdim// 43243789Sdim//===----------------------------------------------------------------------===// 44249423Sdim 45243789Sdim#define LV_NAME "loop-vectorize" 46243789Sdim#define DEBUG_TYPE LV_NAME 47249423Sdim 48249423Sdim#include "llvm/Transforms/Vectorize.h" 49249423Sdim#include "llvm/ADT/DenseMap.h" 50263508Sdim#include "llvm/ADT/EquivalenceClasses.h" 51263508Sdim#include "llvm/ADT/Hashing.h" 52249423Sdim#include "llvm/ADT/MapVector.h" 53263508Sdim#include "llvm/ADT/SetVector.h" 54249423Sdim#include "llvm/ADT/SmallPtrSet.h" 55249423Sdim#include "llvm/ADT/SmallSet.h" 56243789Sdim#include "llvm/ADT/SmallVector.h" 57243789Sdim#include "llvm/ADT/StringExtras.h" 58243789Sdim#include "llvm/Analysis/AliasAnalysis.h" 59249423Sdim#include "llvm/Analysis/Dominators.h" 60249423Sdim#include "llvm/Analysis/LoopInfo.h" 61249423Sdim#include "llvm/Analysis/LoopIterator.h" 62249423Sdim#include "llvm/Analysis/LoopPass.h" 63243789Sdim#include "llvm/Analysis/ScalarEvolution.h" 64249423Sdim#include "llvm/Analysis/ScalarEvolutionExpander.h" 65243789Sdim#include "llvm/Analysis/ScalarEvolutionExpressions.h" 66249423Sdim#include "llvm/Analysis/TargetTransformInfo.h" 67243789Sdim#include "llvm/Analysis/ValueTracking.h" 68249423Sdim#include "llvm/Analysis/Verifier.h" 69249423Sdim#include "llvm/IR/Constants.h" 70249423Sdim#include "llvm/IR/DataLayout.h" 71249423Sdim#include "llvm/IR/DerivedTypes.h" 72249423Sdim#include "llvm/IR/Function.h" 73249423Sdim#include "llvm/IR/IRBuilder.h" 74249423Sdim#include "llvm/IR/Instructions.h" 75249423Sdim#include "llvm/IR/IntrinsicInst.h" 76249423Sdim#include "llvm/IR/LLVMContext.h" 77249423Sdim#include "llvm/IR/Module.h" 78249423Sdim#include 
"llvm/IR/Type.h" 79249423Sdim#include "llvm/IR/Value.h" 80249423Sdim#include "llvm/Pass.h" 81243789Sdim#include "llvm/Support/CommandLine.h" 82243789Sdim#include "llvm/Support/Debug.h" 83251662Sdim#include "llvm/Support/PatternMatch.h" 84243789Sdim#include "llvm/Support/raw_ostream.h" 85251662Sdim#include "llvm/Support/ValueHandle.h" 86249423Sdim#include "llvm/Target/TargetLibraryInfo.h" 87249423Sdim#include "llvm/Transforms/Scalar.h" 88249423Sdim#include "llvm/Transforms/Utils/BasicBlockUtils.h" 89243789Sdim#include "llvm/Transforms/Utils/Local.h" 90243789Sdim#include <algorithm> 91249423Sdim#include <map> 92249423Sdim 93243789Sdimusing namespace llvm; 94251662Sdimusing namespace llvm::PatternMatch; 95243789Sdim 96243789Sdimstatic cl::opt<unsigned> 97243789SdimVectorizationFactor("force-vector-width", cl::init(0), cl::Hidden, 98249423Sdim cl::desc("Sets the SIMD width. Zero is autoselect.")); 99243789Sdim 100249423Sdimstatic cl::opt<unsigned> 101249423SdimVectorizationUnroll("force-vector-unroll", cl::init(0), cl::Hidden, 102249423Sdim cl::desc("Sets the vectorization unroll count. " 103249423Sdim "Zero is autoselect.")); 104249423Sdim 105249423Sdimstatic cl::opt<bool> 106249423SdimEnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden, 107249423Sdim cl::desc("Enable if-conversion during vectorization.")); 108249423Sdim 109243789Sdim/// We don't vectorize loops with a known constant trip count below this number. 110249423Sdimstatic cl::opt<unsigned> 111249423SdimTinyTripCountVectorThreshold("vectorizer-min-trip-count", cl::init(16), 112249423Sdim cl::Hidden, 113249423Sdim cl::desc("Don't vectorize loops with a constant " 114249423Sdim "trip count that is smaller than this " 115249423Sdim "value.")); 116243789Sdim 117249423Sdim/// We don't unroll loops with a known constant trip count below this number. 
118249423Sdimstatic const unsigned TinyTripCountUnrollThreshold = 128; 119249423Sdim 120251662Sdim/// When performing memory disambiguation checks at runtime do not make more 121251662Sdim/// than this number of comparisons. 122251662Sdimstatic const unsigned RuntimeMemoryCheckThreshold = 8; 123243789Sdim 124263508Sdim/// Maximum simd width. 125263508Sdimstatic const unsigned MaxVectorWidth = 64; 126249423Sdim 127263508Sdim/// Maximum vectorization unroll count. 128263508Sdimstatic const unsigned MaxUnrollFactor = 16; 129263508Sdim 130263508Sdim/// The cost of a loop that is considered 'small' by the unroller. 131263508Sdimstatic const unsigned SmallLoopCost = 20; 132263508Sdim 133243789Sdimnamespace { 134243789Sdim 135243789Sdim// Forward declarations. 136243789Sdimclass LoopVectorizationLegality; 137243789Sdimclass LoopVectorizationCostModel; 138243789Sdim 139249423Sdim/// InnerLoopVectorizer vectorizes loops which contain only one basic 140243789Sdim/// block to a specified vectorization factor (VF). 141243789Sdim/// This class performs the widening of scalars into vectors, or multiple 142243789Sdim/// scalars. This class also implements the following features: 143243789Sdim/// * It inserts an epilogue loop for handling loops that don't have iteration 144243789Sdim/// counts that are known to be a multiple of the vectorization factor. 145243789Sdim/// * It handles the code generation for reduction variables. 146243789Sdim/// * Scalarization (implementation using scalars) of un-vectorizable 147243789Sdim/// instructions. 148249423Sdim/// InnerLoopVectorizer does not perform any vectorization-legality 149243789Sdim/// checks, and relies on the caller to check for the different legality 150249423Sdim/// aspects. The InnerLoopVectorizer relies on the 151243789Sdim/// LoopVectorizationLegality class to provide information about the induction 152243789Sdim/// and reduction variables that were found to a given vectorization factor. 
153249423Sdimclass InnerLoopVectorizer { 154243789Sdimpublic: 155249423Sdim InnerLoopVectorizer(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI, 156249423Sdim DominatorTree *DT, DataLayout *DL, 157249423Sdim const TargetLibraryInfo *TLI, unsigned VecWidth, 158249423Sdim unsigned UnrollFactor) 159249423Sdim : OrigLoop(OrigLoop), SE(SE), LI(LI), DT(DT), DL(DL), TLI(TLI), 160249423Sdim VF(VecWidth), UF(UnrollFactor), Builder(SE->getContext()), Induction(0), 161249423Sdim OldInduction(0), WidenMap(UnrollFactor) {} 162243789Sdim 163243789Sdim // Perform the actual loop widening (vectorization). 164243789Sdim void vectorize(LoopVectorizationLegality *Legal) { 165249423Sdim // Create a new empty loop. Unlink the old loop and connect the new one. 166243789Sdim createEmptyLoop(Legal); 167249423Sdim // Widen each instruction in the old loop to a new one in the new loop. 168249423Sdim // Use the Legality module to find the induction and reduction variables. 169243789Sdim vectorizeLoop(Legal); 170243789Sdim // Register the new loop and update the analysis passes. 171243789Sdim updateAnalysis(); 172249423Sdim } 173243789Sdim 174263508Sdim virtual ~InnerLoopVectorizer() {} 175263508Sdim 176263508Sdimprotected: 177249423Sdim /// A small list of PHINodes. 178249423Sdim typedef SmallVector<PHINode*, 4> PhiVector; 179249423Sdim /// When we unroll loops we have multiple vector values for each scalar. 180249423Sdim /// This data structure holds the unrolled and vectorized values that 181249423Sdim /// originated from one scalar instruction. 182249423Sdim typedef SmallVector<Value*, 2> VectorParts; 183249423Sdim 184263508Sdim // When we if-convert we need create edge masks. We have to cache values so 185263508Sdim // that we don't end up with exponential recursion/IR. 186263508Sdim typedef DenseMap<std::pair<BasicBlock*, BasicBlock*>, 187263508Sdim VectorParts> EdgeMaskCache; 188263508Sdim 189249423Sdim /// Add code that checks at runtime if the accessed arrays overlap. 
190249423Sdim /// Returns the comparator value or NULL if no check is needed. 191249423Sdim Instruction *addRuntimeCheck(LoopVectorizationLegality *Legal, 192249423Sdim Instruction *Loc); 193243789Sdim /// Create an empty loop, based on the loop ranges of the old loop. 194243789Sdim void createEmptyLoop(LoopVectorizationLegality *Legal); 195243789Sdim /// Copy and widen the instructions from the old loop. 196263508Sdim virtual void vectorizeLoop(LoopVectorizationLegality *Legal); 197249423Sdim 198263508Sdim /// \brief The Loop exit block may have single value PHI nodes where the 199263508Sdim /// incoming value is 'Undef'. While vectorizing we only handled real values 200263508Sdim /// that were defined inside the loop. Here we fix the 'undef case'. 201263508Sdim /// See PR14725. 202263508Sdim void fixLCSSAPHIs(); 203263508Sdim 204249423Sdim /// A helper function that computes the predicate of the block BB, assuming 205249423Sdim /// that the header block of the loop is set to True. It returns the *entry* 206249423Sdim /// mask for the block BB. 207249423Sdim VectorParts createBlockInMask(BasicBlock *BB); 208249423Sdim /// A helper function that computes the predicate of the edge between SRC 209249423Sdim /// and DST. 210249423Sdim VectorParts createEdgeMask(BasicBlock *Src, BasicBlock *Dst); 211249423Sdim 212249423Sdim /// A helper function to vectorize a single BB within the innermost loop. 213249423Sdim void vectorizeBlockInLoop(LoopVectorizationLegality *Legal, BasicBlock *BB, 214249423Sdim PhiVector *PV); 215249423Sdim 216263508Sdim /// Vectorize a single PHINode in a block. This method handles the induction 217263508Sdim /// variable canonicalization. It supports both VF = 1 for unrolled loops and 218263508Sdim /// arbitrary length vectors. 
219263508Sdim void widenPHIInstruction(Instruction *PN, VectorParts &Entry, 220263508Sdim LoopVectorizationLegality *Legal, 221263508Sdim unsigned UF, unsigned VF, PhiVector *PV); 222263508Sdim 223243789Sdim /// Insert the new loop to the loop hierarchy and pass manager 224243789Sdim /// and update the analysis passes. 225243789Sdim void updateAnalysis(); 226243789Sdim 227243789Sdim /// This instruction is un-vectorizable. Implement it as a sequence 228243789Sdim /// of scalars. 229263508Sdim virtual void scalarizeInstruction(Instruction *Instr); 230243789Sdim 231249423Sdim /// Vectorize Load and Store instructions, 232263508Sdim virtual void vectorizeMemoryInstruction(Instruction *Instr, 233249423Sdim LoopVectorizationLegality *Legal); 234249423Sdim 235243789Sdim /// Create a broadcast instruction. This method generates a broadcast 236243789Sdim /// instruction (shuffle) for loop invariant values and for the induction 237243789Sdim /// value. If this is the induction variable then we extend it to N, N+1, ... 238243789Sdim /// this is needed because each iteration in the loop corresponds to a SIMD 239243789Sdim /// element. 240263508Sdim virtual Value *getBroadcastInstrs(Value *V); 241243789Sdim 242249423Sdim /// This function adds 0, 1, 2 ... to each vector element, starting at zero. 243249423Sdim /// If Negate is set then negative numbers are added e.g. (0, -1, -2, ...). 244249423Sdim /// The sequence starts at StartIndex. 245263508Sdim virtual Value *getConsecutiveVector(Value* Val, int StartIdx, bool Negate); 246243789Sdim 247243789Sdim /// When we go over instructions in the basic block we rely on previous 248243789Sdim /// values within the current basic block or on loop invariant values. 249243789Sdim /// When we widen (vectorize) values we place them in the map. If the values 250243789Sdim /// are not within the map, they have to be loop invariant, so we simply 251243789Sdim /// broadcast them into a vector. 
252249423Sdim VectorParts &getVectorValue(Value *V); 253243789Sdim 254249423Sdim /// Generate a shuffle sequence that will reverse the vector Vec. 255263508Sdim virtual Value *reverseVector(Value *Vec); 256243789Sdim 257249423Sdim /// This is a helper class that holds the vectorizer state. It maps scalar 258249423Sdim /// instructions to vector instructions. When the code is 'unrolled' then 259249423Sdim /// then a single scalar value is mapped to multiple vector parts. The parts 260249423Sdim /// are stored in the VectorPart type. 261249423Sdim struct ValueMap { 262249423Sdim /// C'tor. UnrollFactor controls the number of vectors ('parts') that 263249423Sdim /// are mapped. 264249423Sdim ValueMap(unsigned UnrollFactor) : UF(UnrollFactor) {} 265243789Sdim 266249423Sdim /// \return True if 'Key' is saved in the Value Map. 267249423Sdim bool has(Value *Key) const { return MapStorage.count(Key); } 268249423Sdim 269249423Sdim /// Initializes a new entry in the map. Sets all of the vector parts to the 270249423Sdim /// save value in 'Val'. 271249423Sdim /// \return A reference to a vector with splat values. 272249423Sdim VectorParts &splat(Value *Key, Value *Val) { 273249423Sdim VectorParts &Entry = MapStorage[Key]; 274249423Sdim Entry.assign(UF, Val); 275249423Sdim return Entry; 276249423Sdim } 277249423Sdim 278249423Sdim ///\return A reference to the value that is stored at 'Key'. 279249423Sdim VectorParts &get(Value *Key) { 280249423Sdim VectorParts &Entry = MapStorage[Key]; 281249423Sdim if (Entry.empty()) 282249423Sdim Entry.resize(UF); 283249423Sdim assert(Entry.size() == UF); 284249423Sdim return Entry; 285249423Sdim } 286249423Sdim 287249423Sdim private: 288249423Sdim /// The unroll factor. Each entry in the map stores this number of vector 289249423Sdim /// elements. 290249423Sdim unsigned UF; 291249423Sdim 292249423Sdim /// Map storage. We use std::map and not DenseMap because insertions to a 293249423Sdim /// dense map invalidates its iterators. 
294249423Sdim std::map<Value *, VectorParts> MapStorage; 295249423Sdim }; 296249423Sdim 297243789Sdim /// The original loop. 298243789Sdim Loop *OrigLoop; 299249423Sdim /// Scev analysis to use. 300243789Sdim ScalarEvolution *SE; 301249423Sdim /// Loop Info. 302243789Sdim LoopInfo *LI; 303249423Sdim /// Dominator Tree. 304243789Sdim DominatorTree *DT; 305249423Sdim /// Data Layout. 306249423Sdim DataLayout *DL; 307249423Sdim /// Target Library Info. 308249423Sdim const TargetLibraryInfo *TLI; 309249423Sdim 310249423Sdim /// The vectorization SIMD factor to use. Each vector will have this many 311249423Sdim /// vector elements. 312243789Sdim unsigned VF; 313263508Sdim 314263508Sdimprotected: 315249423Sdim /// The vectorization unroll factor to use. Each scalar is vectorized to this 316249423Sdim /// many different vector instructions. 317249423Sdim unsigned UF; 318243789Sdim 319249423Sdim /// The builder that we use 320243789Sdim IRBuilder<> Builder; 321243789Sdim 322243789Sdim // --- Vectorization state --- 323243789Sdim 324243789Sdim /// The vector-loop preheader. 325243789Sdim BasicBlock *LoopVectorPreHeader; 326243789Sdim /// The scalar-loop preheader. 327243789Sdim BasicBlock *LoopScalarPreHeader; 328243789Sdim /// Middle Block between the vector and the scalar. 329243789Sdim BasicBlock *LoopMiddleBlock; 330243789Sdim ///The ExitBlock of the scalar loop. 331243789Sdim BasicBlock *LoopExitBlock; 332243789Sdim ///The vector loop body. 333243789Sdim BasicBlock *LoopVectorBody; 334243789Sdim ///The scalar loop body. 335243789Sdim BasicBlock *LoopScalarBody; 336249423Sdim /// A list of all bypass blocks. The first block is the entry of the loop. 337249423Sdim SmallVector<BasicBlock *, 4> LoopBypassBlocks; 338243789Sdim 339243789Sdim /// The new Induction variable which was added to the new block. 340243789Sdim PHINode *Induction; 341243789Sdim /// The induction variable of the old basic block. 
342243789Sdim PHINode *OldInduction; 343263508Sdim /// Holds the extended (to the widest induction type) start index. 344263508Sdim Value *ExtendedIdx; 345249423Sdim /// Maps scalars to widened vectors. 346243789Sdim ValueMap WidenMap; 347263508Sdim EdgeMaskCache MaskCache; 348243789Sdim}; 349243789Sdim 350263508Sdimclass InnerLoopUnroller : public InnerLoopVectorizer { 351263508Sdimpublic: 352263508Sdim InnerLoopUnroller(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI, 353263508Sdim DominatorTree *DT, DataLayout *DL, 354263508Sdim const TargetLibraryInfo *TLI, unsigned UnrollFactor) : 355263508Sdim InnerLoopVectorizer(OrigLoop, SE, LI, DT, DL, TLI, 1, UnrollFactor) { } 356263508Sdim 357263508Sdimprivate: 358263508Sdim virtual void scalarizeInstruction(Instruction *Instr); 359263508Sdim virtual void vectorizeMemoryInstruction(Instruction *Instr, 360263508Sdim LoopVectorizationLegality *Legal); 361263508Sdim virtual Value *getBroadcastInstrs(Value *V); 362263508Sdim virtual Value *getConsecutiveVector(Value* Val, int StartIdx, bool Negate); 363263508Sdim virtual Value *reverseVector(Value *Vec); 364263508Sdim}; 365263508Sdim 366263508Sdim/// \brief Look for a meaningful debug location on the instruction or it's 367263508Sdim/// operands. 368263508Sdimstatic Instruction *getDebugLocFromInstOrOperands(Instruction *I) { 369263508Sdim if (!I) 370263508Sdim return I; 371263508Sdim 372263508Sdim DebugLoc Empty; 373263508Sdim if (I->getDebugLoc() != Empty) 374263508Sdim return I; 375263508Sdim 376263508Sdim for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) { 377263508Sdim if (Instruction *OpInst = dyn_cast<Instruction>(*OI)) 378263508Sdim if (OpInst->getDebugLoc() != Empty) 379263508Sdim return OpInst; 380263508Sdim } 381263508Sdim 382263508Sdim return I; 383263508Sdim} 384263508Sdim 385263508Sdim/// \brief Set the debug location in the builder using the debug location in the 386263508Sdim/// instruction. 
387263508Sdimstatic void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) { 388263508Sdim if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) 389263508Sdim B.SetCurrentDebugLocation(Inst->getDebugLoc()); 390263508Sdim else 391263508Sdim B.SetCurrentDebugLocation(DebugLoc()); 392263508Sdim} 393263508Sdim 394243789Sdim/// LoopVectorizationLegality checks if it is legal to vectorize a loop, and 395243789Sdim/// to what vectorization factor. 396243789Sdim/// This class does not look at the profitability of vectorization, only the 397243789Sdim/// legality. This class has two main kinds of checks: 398243789Sdim/// * Memory checks - The code in canVectorizeMemory checks if vectorization 399243789Sdim/// will change the order of memory accesses in a way that will change the 400243789Sdim/// correctness of the program. 401249423Sdim/// * Scalars checks - The code in canVectorizeInstrs and canVectorizeMemory 402249423Sdim/// checks for a number of different conditions, such as the availability of a 403249423Sdim/// single induction variable, that all types are supported and vectorize-able, 404249423Sdim/// etc. This code reflects the capabilities of InnerLoopVectorizer. 405249423Sdim/// This class is also used by InnerLoopVectorizer for identifying 406243789Sdim/// induction variable and the different reduction variables. 407243789Sdimclass LoopVectorizationLegality { 408243789Sdimpublic: 409249423Sdim LoopVectorizationLegality(Loop *L, ScalarEvolution *SE, DataLayout *DL, 410263508Sdim DominatorTree *DT, TargetLibraryInfo *TLI) 411263508Sdim : TheLoop(L), SE(SE), DL(DL), DT(DT), TLI(TLI), 412263508Sdim Induction(0), WidestIndTy(0), HasFunNoNaNAttr(false), 413263508Sdim MaxSafeDepDistBytes(-1U) {} 414243789Sdim 415249423Sdim /// This enum represents the kinds of reductions that we support. 416243789Sdim enum ReductionKind { 417249423Sdim RK_NoReduction, ///< Not a reduction. 418249423Sdim RK_IntegerAdd, ///< Sum of integers. 
419249423Sdim RK_IntegerMult, ///< Product of integers. 420249423Sdim RK_IntegerOr, ///< Bitwise or logical OR of numbers. 421249423Sdim RK_IntegerAnd, ///< Bitwise or logical AND of numbers. 422249423Sdim RK_IntegerXor, ///< Bitwise or logical XOR of numbers. 423251662Sdim RK_IntegerMinMax, ///< Min/max implemented in terms of select(cmp()). 424249423Sdim RK_FloatAdd, ///< Sum of floats. 425251662Sdim RK_FloatMult, ///< Product of floats. 426251662Sdim RK_FloatMinMax ///< Min/max implemented in terms of select(cmp()). 427243789Sdim }; 428243789Sdim 429249423Sdim /// This enum represents the kinds of inductions that we support. 430249423Sdim enum InductionKind { 431249423Sdim IK_NoInduction, ///< Not an induction variable. 432249423Sdim IK_IntInduction, ///< Integer induction variable. Step = 1. 433249423Sdim IK_ReverseIntInduction, ///< Reverse int induction variable. Step = -1. 434249423Sdim IK_PtrInduction, ///< Pointer induction var. Step = sizeof(elem). 435249423Sdim IK_ReversePtrInduction ///< Reverse ptr indvar. Step = - sizeof(elem). 436249423Sdim }; 437249423Sdim 438251662Sdim // This enum represents the kind of minmax reduction. 439251662Sdim enum MinMaxReductionKind { 440251662Sdim MRK_Invalid, 441251662Sdim MRK_UIntMin, 442251662Sdim MRK_UIntMax, 443251662Sdim MRK_SIntMin, 444251662Sdim MRK_SIntMax, 445251662Sdim MRK_FloatMin, 446251662Sdim MRK_FloatMax 447251662Sdim }; 448251662Sdim 449263508Sdim /// This struct holds information about reduction variables. 450243789Sdim struct ReductionDescriptor { 451249423Sdim ReductionDescriptor() : StartValue(0), LoopExitInstr(0), 452251662Sdim Kind(RK_NoReduction), MinMaxKind(MRK_Invalid) {} 453243789Sdim 454251662Sdim ReductionDescriptor(Value *Start, Instruction *Exit, ReductionKind K, 455251662Sdim MinMaxReductionKind MK) 456251662Sdim : StartValue(Start), LoopExitInstr(Exit), Kind(K), MinMaxKind(MK) {} 457243789Sdim 458243789Sdim // The starting value of the reduction. 
459243789Sdim // It does not have to be zero! 460251662Sdim TrackingVH<Value> StartValue; 461243789Sdim // The instruction who's value is used outside the loop. 462243789Sdim Instruction *LoopExitInstr; 463243789Sdim // The kind of the reduction. 464243789Sdim ReductionKind Kind; 465251662Sdim // If this a min/max reduction the kind of reduction. 466251662Sdim MinMaxReductionKind MinMaxKind; 467243789Sdim }; 468243789Sdim 469251662Sdim /// This POD struct holds information about a potential reduction operation. 470251662Sdim struct ReductionInstDesc { 471251662Sdim ReductionInstDesc(bool IsRedux, Instruction *I) : 472251662Sdim IsReduction(IsRedux), PatternLastInst(I), MinMaxKind(MRK_Invalid) {} 473251662Sdim 474251662Sdim ReductionInstDesc(Instruction *I, MinMaxReductionKind K) : 475251662Sdim IsReduction(true), PatternLastInst(I), MinMaxKind(K) {} 476251662Sdim 477251662Sdim // Is this instruction a reduction candidate. 478251662Sdim bool IsReduction; 479251662Sdim // The last instruction in a min/max pattern (select of the select(icmp()) 480251662Sdim // pattern), or the current reduction instruction otherwise. 481251662Sdim Instruction *PatternLastInst; 482251662Sdim // If this is a min/max pattern the comparison predicate. 483251662Sdim MinMaxReductionKind MinMaxKind; 484251662Sdim }; 485251662Sdim 486263508Sdim /// This struct holds information about the memory runtime legality 487263508Sdim /// check that a group of pointers do not overlap. 488243789Sdim struct RuntimePointerCheck { 489249423Sdim RuntimePointerCheck() : Need(false) {} 490249423Sdim 491249423Sdim /// Reset the state of the pointer runtime information. 492249423Sdim void reset() { 493249423Sdim Need = false; 494249423Sdim Pointers.clear(); 495249423Sdim Starts.clear(); 496249423Sdim Ends.clear(); 497263508Sdim IsWritePtr.clear(); 498263508Sdim DependencySetId.clear(); 499249423Sdim } 500249423Sdim 501249423Sdim /// Insert a pointer and calculate the start and end SCEVs. 
502263508Sdim void insert(ScalarEvolution *SE, Loop *Lp, Value *Ptr, bool WritePtr, 503263508Sdim unsigned DepSetId); 504249423Sdim 505243789Sdim /// This flag indicates if we need to add the runtime check. 506243789Sdim bool Need; 507243789Sdim /// Holds the pointers that we need to check. 508251662Sdim SmallVector<TrackingVH<Value>, 2> Pointers; 509249423Sdim /// Holds the pointer value at the beginning of the loop. 510249423Sdim SmallVector<const SCEV*, 2> Starts; 511249423Sdim /// Holds the pointer value at the end of the loop. 512249423Sdim SmallVector<const SCEV*, 2> Ends; 513251662Sdim /// Holds the information if this pointer is used for writing to memory. 514251662Sdim SmallVector<bool, 2> IsWritePtr; 515263508Sdim /// Holds the id of the set of pointers that could be dependent because of a 516263508Sdim /// shared underlying object. 517263508Sdim SmallVector<unsigned, 2> DependencySetId; 518243789Sdim }; 519243789Sdim 520263508Sdim /// A struct for saving information about induction variables. 521249423Sdim struct InductionInfo { 522249423Sdim InductionInfo(Value *Start, InductionKind K) : StartValue(Start), IK(K) {} 523249423Sdim InductionInfo() : StartValue(0), IK(IK_NoInduction) {} 524249423Sdim /// Start value. 525251662Sdim TrackingVH<Value> StartValue; 526249423Sdim /// Induction kind. 527249423Sdim InductionKind IK; 528249423Sdim }; 529249423Sdim 530243789Sdim /// ReductionList contains the reduction descriptors for all 531243789Sdim /// of the reductions that were found in the loop. 532243789Sdim typedef DenseMap<PHINode*, ReductionDescriptor> ReductionList; 533243789Sdim 534249423Sdim /// InductionList saves induction variables and maps them to the 535249423Sdim /// induction descriptor. 536249423Sdim typedef MapVector<PHINode*, InductionInfo> InductionList; 537249423Sdim 538243789Sdim /// Returns true if it is legal to vectorize this loop. 
539243789Sdim /// This does not mean that it is profitable to vectorize this 540243789Sdim /// loop, only that it is legal to do so. 541243789Sdim bool canVectorize(); 542243789Sdim 543243789Sdim /// Returns the Induction variable. 544249423Sdim PHINode *getInduction() { return Induction; } 545243789Sdim 546243789Sdim /// Returns the reduction variables found in the loop. 547243789Sdim ReductionList *getReductionVars() { return &Reductions; } 548243789Sdim 549249423Sdim /// Returns the induction variables found in the loop. 550249423Sdim InductionList *getInductionVars() { return &Inductions; } 551249423Sdim 552263508Sdim /// Returns the widest induction type. 553263508Sdim Type *getWidestInductionType() { return WidestIndTy; } 554263508Sdim 555249423Sdim /// Returns True if V is an induction variable in this loop. 556249423Sdim bool isInductionVariable(const Value *V); 557249423Sdim 558249423Sdim /// Return true if the block BB needs to be predicated in order for the loop 559249423Sdim /// to be vectorized. 560249423Sdim bool blockNeedsPredication(BasicBlock *BB); 561249423Sdim 562249423Sdim /// Check if this pointer is consecutive when vectorizing. This happens 563249423Sdim /// when the last index of the GEP is the induction variable, or that the 564249423Sdim /// pointer itself is an induction variable. 565243789Sdim /// This check allows us to vectorize A[idx] into a wide load/store. 566249423Sdim /// Returns: 567249423Sdim /// 0 - Stride is unknown or non consecutive. 568249423Sdim /// 1 - Address is consecutive. 569249423Sdim /// -1 - Address is consecutive, and decreasing. 570249423Sdim int isConsecutivePtr(Value *Ptr); 571243789Sdim 572243789Sdim /// Returns true if the value V is uniform within the loop. 573243789Sdim bool isUniform(Value *V); 574243789Sdim 575243789Sdim /// Returns true if this instruction will remain scalar after vectorization. 
  /// Returns true if \p I is known to remain scalar (uniform) after
  /// vectorization, i.e. it is in the Uniforms set collected by
  /// collectLoopUniforms().
  bool isUniformAfterVectorization(Instruction* I) { return Uniforms.count(I); }

  /// Returns the information that we collected about runtime memory check.
  RuntimePointerCheck *getRuntimePointerCheck() { return &PtrRtCheck; }

  /// This function returns the identity element (or neutral element) for
  /// the operation K.
  static Constant *getReductionIdentity(ReductionKind K, Type *Tp);

  /// Returns the maximum dependence distance (in bytes) that was proven safe
  /// by the memory-dependence analysis; bounds the legal vector width.
  unsigned getMaxSafeDepDistBytes() { return MaxSafeDepDistBytes; }

private:
  /// Check if a single basic block loop is vectorizable.
  /// At this point we know that this is a loop with a constant trip count
  /// and we only need to check individual instructions.
  bool canVectorizeInstrs();

  /// When we vectorize loops we may change the order in which
  /// we read and write from memory. This method checks if it is
  /// legal to vectorize the code, considering only memory constrains.
  /// Returns true if the loop is vectorizable.
  bool canVectorizeMemory();

  /// Return true if we can vectorize this loop using the IF-conversion
  /// transformation.
  bool canVectorizeWithIfConvert();

  /// Collect the variables that need to stay uniform after vectorization.
  void collectLoopUniforms();

  /// Return true if all of the instructions in the block can be speculatively
  /// executed. \p SafePtrs is a list of addresses that are known to be legal
  /// and we know that we can read from them without segfault.
  bool blockCanBePredicated(BasicBlock *BB, SmallPtrSet<Value *, 8>& SafePtrs);

  /// Returns True, if 'Phi' is the kind of reduction variable for type
  /// 'Kind'. If this is a reduction variable, it adds it to ReductionList.
  bool AddReductionVar(PHINode *Phi, ReductionKind Kind);
  /// Returns a struct describing if the instruction 'I' can be a reduction
  /// variable of type 'Kind'. If the reduction is a min/max pattern of
  /// select(icmp()) this function advances the instruction pointer 'I' from the
  /// compare instruction to the select instruction and stores this pointer in
  /// 'PatternLastInst' member of the returned struct.
  ReductionInstDesc isReductionInstr(Instruction *I, ReductionKind Kind,
                                     ReductionInstDesc &Desc);
  /// Returns true if the instruction is a Select(ICmp(X, Y), X, Y) instruction
  /// pattern corresponding to a min(X, Y) or max(X, Y).
  static ReductionInstDesc isMinMaxSelectCmpPattern(Instruction *I,
                                                    ReductionInstDesc &Prev);
  /// Returns the induction kind of Phi. This function may return NoInduction
  /// if the PHI is not an induction variable.
  InductionKind isInductionVariable(PHINode *Phi);

  /// The loop that we evaluate.
  Loop *TheLoop;
  /// Scev analysis.
  ScalarEvolution *SE;
  /// DataLayout analysis.
  DataLayout *DL;
  /// Dominators.
  DominatorTree *DT;
  /// Target Library Info.
  TargetLibraryInfo *TLI;

  // --- vectorization state --- //

  /// Holds the integer induction variable. This is the counter of the
  /// loop.
  PHINode *Induction;
  /// Holds the reduction variables.
  ReductionList Reductions;
  /// Holds all of the induction variables that we found in the loop.
  /// Notice that inductions don't need to start at zero and that induction
  /// variables can be pointers.
  InductionList Inductions;
  /// Holds the widest induction type encountered.
  Type *WidestIndTy;

  /// Allowed outside users. This holds the reduction
  /// vars which can be accessed from outside the loop.
  SmallPtrSet<Value*, 4> AllowedExit;
  /// This set holds the variables which are known to be uniform after
  /// vectorization.
  SmallPtrSet<Instruction*, 4> Uniforms;
  /// We need to check that all of the pointers in this list are disjoint
  /// at runtime.
  RuntimePointerCheck PtrRtCheck;
  /// Can we assume the absence of NaNs.
  bool HasFunNoNaNAttr;

  /// Maximum proven-safe dependence distance in bytes (see accessor above).
  unsigned MaxSafeDepDistBytes;
};

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
/// a number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(Loop *L, ScalarEvolution *SE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             DataLayout *DL, const TargetLibraryInfo *TLI)
      : TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI), DL(DL), TLI(TLI) {}

  /// Information about vectorization costs.
  struct VectorizationFactor {
    unsigned Width; // Vector width with best cost.
    unsigned Cost;  // Cost of the loop with that width.
  };
  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every power of two up to VF. If UserVF is not ZERO
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor selectVectorizationFactor(bool OptForSize,
                                                unsigned UserVF);

  /// \return The size (in bits) of the widest type in the code that
  /// needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  unsigned getWidestType();

  /// \return The most profitable unroll factor.
  /// If UserUF is non-zero then this method finds the best unroll-factor
  /// based on register pressure and other parameters.
  /// VF and LoopCost are the selected vectorization factor and the cost of the
  /// selected VF.
  unsigned selectUnrollFactor(bool OptForSize, unsigned UserUF, unsigned VF,
                              unsigned LoopCost);

  /// \brief A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    unsigned LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    unsigned MaxLocalUsers;
    /// Holds the number of instructions in the loop.
    unsigned NumInstructions;
  };

  /// \return information about the register usage of the loop.
  RegisterUsage calculateRegisterUsage();

private:
  /// Returns the expected execution cost. The unit of the cost does
  /// not matter because we use the 'cost' units to compare different
  /// vector widths. The cost that is returned is *not* normalized by
  /// the factor width.
  unsigned expectedCost(unsigned VF);

  /// Returns the execution time cost of an instruction for a given vector
  /// width. Vector width of one means scalar.
  unsigned getInstructionCost(Instruction *I, unsigned VF);

  /// A helper function for converting Scalar types to vector types.
  /// If the incoming type is void, we return void. If the VF is 1, we return
  /// the scalar type.
  static Type* ToVectorTy(Type *Scalar, unsigned VF);

  /// Returns whether the instruction is a load or store and will be a emitted
  /// as a vector operation.
  bool isConsecutiveLoadOrStore(Instruction *I);

  /// The loop that we evaluate.
  Loop *TheLoop;
  /// Scev analysis.
  ScalarEvolution *SE;
  /// Loop Info analysis.
  LoopInfo *LI;
  /// Vectorization legality.
  LoopVectorizationLegality *Legal;
  /// Vector target information.
  const TargetTransformInfo &TTI;
  /// Target data layout information.
  DataLayout *DL;
  /// Target Library Info.
  const TargetLibraryInfo *TLI;
};

/// Utility class for getting and setting loop vectorizer hints in the form
/// of loop metadata (strings prefixed with "llvm.vectorizer.").
struct LoopVectorizeHints {
  /// Vectorization width.
  unsigned Width;
  /// Vectorization unroll factor.
  unsigned Unroll;

  LoopVectorizeHints(const Loop *L, bool DisableUnrolling)
      : Width(VectorizationFactor)
      , Unroll(DisableUnrolling ? 1 : VectorizationUnroll)
      , LoopID(L->getLoopID()) {
    getHints(L);
    // The command line options override any loop metadata except for when
    // width == 1 which is used to indicate the loop is already vectorized.
    if (VectorizationFactor.getNumOccurrences() > 0 && Width != 1)
      Width = VectorizationFactor;
    if (VectorizationUnroll.getNumOccurrences() > 0)
      Unroll = VectorizationUnroll;

    DEBUG(if (DisableUnrolling && Unroll == 1)
            dbgs() << "LV: Unrolling disabled by the pass manager\n");
  }

  /// Return the loop vectorizer metadata prefix.
  static StringRef Prefix() { return "llvm.vectorizer."; }

  /// Build a metadata node of the form !{ !"Name", i32 V }.
  MDNode *createHint(LLVMContext &Context, StringRef Name, unsigned V) {
    SmallVector<Value*, 2> Vals;
    Vals.push_back(MDString::get(Context, Name));
    Vals.push_back(ConstantInt::get(Type::getInt32Ty(Context), V));
    return MDNode::get(Context, Vals);
  }

  /// Mark the loop L as already vectorized by setting the width to 1.
  void setAlreadyVectorized(Loop *L) {
    LLVMContext &Context = L->getHeader()->getContext();

    Width = 1;

    // Create a new loop id with one more operand for the already_vectorized
    // hint. If the loop already has a loop id then copy the existing operands.
    SmallVector<Value*, 4> Vals(1);
    if (LoopID)
      for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i)
        Vals.push_back(LoopID->getOperand(i));

    Vals.push_back(createHint(Context, Twine(Prefix(), "width").str(), Width));
    Vals.push_back(createHint(Context, Twine(Prefix(), "unroll").str(), 1));

    MDNode *NewLoopID = MDNode::get(Context, Vals);
    // Set operand 0 to refer to the loop id itself.
    NewLoopID->replaceOperandWith(0, NewLoopID);

    L->setLoopID(NewLoopID);
    if (LoopID)
      LoopID->replaceAllUsesWith(NewLoopID);

    LoopID = NewLoopID;
  }

private:
  MDNode *LoopID;

  /// Find hints specified in the loop metadata.
  void getHints(const Loop *L) {
    if (!LoopID)
      return;

    // First operand should refer to the loop id itself.
    assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
    assert(LoopID->getOperand(0) == LoopID && "invalid loop id");

    for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
      const MDString *S = 0;
      SmallVector<Value*, 4> Args;

      // The expected hint is either a MDString or a MDNode with the first
      // operand a MDString.
      if (const MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i))) {
        if (!MD || MD->getNumOperands() == 0)
          continue;
        S = dyn_cast<MDString>(MD->getOperand(0));
        for (unsigned i = 1, ie = MD->getNumOperands(); i < ie; ++i)
          Args.push_back(MD->getOperand(i));
      } else {
        S = dyn_cast<MDString>(LoopID->getOperand(i));
        assert(Args.size() == 0 && "too many arguments for MDString");
      }

      if (!S)
        continue;

      // Check if the hint starts with the vectorizer prefix.
      StringRef Hint = S->getString();
      if (!Hint.startswith(Prefix()))
        continue;
      // Remove the prefix.
      Hint = Hint.substr(Prefix().size(), StringRef::npos);

      if (Args.size() == 1)
        getHint(Hint, Args[0]);
    }
  }

  // Check string hint with one operand.
  void getHint(StringRef Hint, Value *Arg) {
    const ConstantInt *C = dyn_cast<ConstantInt>(Arg);
    if (!C) return;
    unsigned Val = C->getZExtValue();

    if (Hint == "width") {
      // Only accept sane widths; otherwise keep the current value.
      if (isPowerOf2_32(Val) && Val <= MaxVectorWidth)
        Width = Val;
      else
        DEBUG(dbgs() << "LV: ignoring invalid width hint metadata\n");
    } else if (Hint == "unroll") {
      if (isPowerOf2_32(Val) && Val <= MaxUnrollFactor)
        Unroll = Val;
      else
        DEBUG(dbgs() << "LV: ignoring invalid unroll hint metadata\n");
    } else {
      DEBUG(dbgs() << "LV: ignoring unknown hint " << Hint << '\n');
    }
  }
};

/// The LoopVectorize Pass.
struct LoopVectorize : public LoopPass {
  /// Pass identification, replacement for typeid.
  static char ID;

  explicit LoopVectorize(bool NoUnrolling = false)
      : LoopPass(ID), DisableUnrolling(NoUnrolling) {
    initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
  }

  ScalarEvolution *SE;
  DataLayout *DL;
  LoopInfo *LI;
  TargetTransformInfo *TTI;
  DominatorTree *DT;
  TargetLibraryInfo *TLI;
  bool DisableUnrolling;

  virtual bool runOnLoop(Loop *L, LPPassManager &LPM) {
    // We only vectorize innermost loops.
    if (!L->empty())
      return false;

    SE = &getAnalysis<ScalarEvolution>();
    DL = getAnalysisIfAvailable<DataLayout>();
    LI = &getAnalysis<LoopInfo>();
    TTI = &getAnalysis<TargetTransformInfo>();
    DT = &getAnalysis<DominatorTree>();
    TLI = getAnalysisIfAvailable<TargetLibraryInfo>();

    // If the target claims to have no vector registers don't attempt
    // vectorization.
    if (!TTI->getNumberOfRegisters(true))
      return false;

    if (DL == NULL) {
      DEBUG(dbgs() << "LV: Not vectorizing because of missing data layout\n");
      return false;
    }

    DEBUG(dbgs() << "LV: Checking a loop in \"" <<
          L->getHeader()->getParent()->getName() << "\"\n");

    LoopVectorizeHints Hints(L, DisableUnrolling);

    // Width == 1 && Unroll == 1 means the loop was marked as already
    // vectorized (or vectorization was explicitly disabled).
    if (Hints.Width == 1 && Hints.Unroll == 1) {
      DEBUG(dbgs() << "LV: Not vectorizing.\n");
      return false;
    }

    // Check if it is legal to vectorize the loop.
    LoopVectorizationLegality LVL(L, SE, DL, DT, TLI);
    if (!LVL.canVectorize()) {
      DEBUG(dbgs() << "LV: Not vectorizing.\n");
      return false;
    }

    // Use the cost model.
    LoopVectorizationCostModel CM(L, SE, LI, &LVL, *TTI, DL, TLI);

    // Check the function attributes to find out if this function should be
    // optimized for size.
    Function *F = L->getHeader()->getParent();
    Attribute::AttrKind SzAttr = Attribute::OptimizeForSize;
    Attribute::AttrKind FlAttr = Attribute::NoImplicitFloat;
    unsigned FnIndex = AttributeSet::FunctionIndex;
    bool OptForSize = F->getAttributes().hasAttribute(FnIndex, SzAttr);
    bool NoFloat = F->getAttributes().hasAttribute(FnIndex, FlAttr);

    if (NoFloat) {
      DEBUG(dbgs() << "LV: Can't vectorize when the NoImplicitFloat"
            "attribute is used.\n");
      return false;
    }

    // Select the optimal vectorization factor.
    LoopVectorizationCostModel::VectorizationFactor VF;
    VF = CM.selectVectorizationFactor(OptForSize, Hints.Width);
    // Select the unroll factor.
    unsigned UF = CM.selectUnrollFactor(OptForSize, Hints.Unroll, VF.Width,
                                        VF.Cost);

    DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF.Width << ") in "<<
          F->getParent()->getModuleIdentifier() << '\n');
    DEBUG(dbgs() << "LV: Unroll Factor is " << UF << '\n');

    if (VF.Width == 1) {
      DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
      if (UF == 1)
        return false;
      // We decided not to vectorize, but we may want to unroll.
      InnerLoopUnroller Unroller(L, SE, LI, DT, DL, TLI, UF);
      Unroller.vectorize(&LVL);
    } else {
      // If we decided that it is *legal* to vectorize the loop then do it.
      InnerLoopVectorizer LB(L, SE, LI, DT, DL, TLI, VF.Width, UF);
      LB.vectorize(&LVL);
    }

    // Mark the loop as already vectorized to avoid vectorizing again.
    Hints.setAlreadyVectorized(L);

    DEBUG(verifyFunction(*L->getHeader()->getParent()));
    return true;
  }

  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
    LoopPass::getAnalysisUsage(AU);
    AU.addRequiredID(LoopSimplifyID);
    AU.addRequiredID(LCSSAID);
    AU.addRequired<DominatorTree>();
    AU.addRequired<LoopInfo>();
    AU.addRequired<ScalarEvolution>();
    AU.addRequired<TargetTransformInfo>();
    AU.addPreserved<LoopInfo>();
    AU.addPreserved<DominatorTree>();
  }

};

} // end anonymous namespace

//===----------------------------------------------------------------------===//
// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
// LoopVectorizationCostModel.
1011249423Sdim//===----------------------------------------------------------------------===// 1012249423Sdim 1013249423Sdimvoid 1014249423SdimLoopVectorizationLegality::RuntimePointerCheck::insert(ScalarEvolution *SE, 1015251662Sdim Loop *Lp, Value *Ptr, 1016263508Sdim bool WritePtr, 1017263508Sdim unsigned DepSetId) { 1018249423Sdim const SCEV *Sc = SE->getSCEV(Ptr); 1019249423Sdim const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Sc); 1020249423Sdim assert(AR && "Invalid addrec expression"); 1021263508Sdim const SCEV *Ex = SE->getBackedgeTakenCount(Lp); 1022249423Sdim const SCEV *ScEnd = AR->evaluateAtIteration(Ex, *SE); 1023249423Sdim Pointers.push_back(Ptr); 1024249423Sdim Starts.push_back(AR->getStart()); 1025249423Sdim Ends.push_back(ScEnd); 1026251662Sdim IsWritePtr.push_back(WritePtr); 1027263508Sdim DependencySetId.push_back(DepSetId); 1028249423Sdim} 1029249423Sdim 1030249423SdimValue *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 1031249423Sdim // We need to place the broadcast of invariant variables outside the loop. 1032249423Sdim Instruction *Instr = dyn_cast<Instruction>(V); 1033249423Sdim bool NewInstr = (Instr && Instr->getParent() == LoopVectorBody); 1034249423Sdim bool Invariant = OrigLoop->isLoopInvariant(V) && !NewInstr; 1035249423Sdim 1036249423Sdim // Place the code for broadcasting invariant variables in the new preheader. 1037263508Sdim IRBuilder<>::InsertPointGuard Guard(Builder); 1038249423Sdim if (Invariant) 1039249423Sdim Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 1040249423Sdim 1041243789Sdim // Broadcast the scalar into all locations in the vector. 
1042249423Sdim Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 1043249423Sdim 1044243789Sdim return Shuf; 1045243789Sdim} 1046243789Sdim 1047250593SdimValue *InnerLoopVectorizer::getConsecutiveVector(Value* Val, int StartIdx, 1048249423Sdim bool Negate) { 1049243789Sdim assert(Val->getType()->isVectorTy() && "Must be a vector"); 1050243789Sdim assert(Val->getType()->getScalarType()->isIntegerTy() && 1051243789Sdim "Elem must be an integer"); 1052243789Sdim // Create the types. 1053243789Sdim Type *ITy = Val->getType()->getScalarType(); 1054243789Sdim VectorType *Ty = cast<VectorType>(Val->getType()); 1055249423Sdim int VLen = Ty->getNumElements(); 1056243789Sdim SmallVector<Constant*, 8> Indices; 1057243789Sdim 1058243789Sdim // Create a vector of consecutive numbers from zero to VF. 1059249423Sdim for (int i = 0; i < VLen; ++i) { 1060250593Sdim int64_t Idx = Negate ? (-i) : i; 1061250593Sdim Indices.push_back(ConstantInt::get(ITy, StartIdx + Idx, Negate)); 1062249423Sdim } 1063243789Sdim 1064243789Sdim // Add the consecutive indices to the vector value. 1065243789Sdim Constant *Cv = ConstantVector::get(Indices); 1066243789Sdim assert(Cv->getType() == Val->getType() && "Invalid consecutive vec"); 1067243789Sdim return Builder.CreateAdd(Val, Cv, "induction"); 1068243789Sdim} 1069243789Sdim 1070263508Sdim/// \brief Find the operand of the GEP that should be checked for consecutive 1071263508Sdim/// stores. This ignores trailing indices that have no effect on the final 1072263508Sdim/// pointer. 1073263508Sdimstatic unsigned getGEPInductionOperand(DataLayout *DL, 1074263508Sdim const GetElementPtrInst *Gep) { 1075263508Sdim unsigned LastOperand = Gep->getNumOperands() - 1; 1076263508Sdim unsigned GEPAllocSize = DL->getTypeAllocSize( 1077263508Sdim cast<PointerType>(Gep->getType()->getScalarType())->getElementType()); 1078263508Sdim 1079263508Sdim // Walk backwards and try to peel off zeros. 
1080263508Sdim while (LastOperand > 1 && match(Gep->getOperand(LastOperand), m_Zero())) { 1081263508Sdim // Find the type we're currently indexing into. 1082263508Sdim gep_type_iterator GEPTI = gep_type_begin(Gep); 1083263508Sdim std::advance(GEPTI, LastOperand - 1); 1084263508Sdim 1085263508Sdim // If it's a type with the same allocation size as the result of the GEP we 1086263508Sdim // can peel off the zero index. 1087263508Sdim if (DL->getTypeAllocSize(*GEPTI) != GEPAllocSize) 1088263508Sdim break; 1089263508Sdim --LastOperand; 1090263508Sdim } 1091263508Sdim 1092263508Sdim return LastOperand; 1093263508Sdim} 1094263508Sdim 1095249423Sdimint LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { 1096249423Sdim assert(Ptr->getType()->isPointerTy() && "Unexpected non ptr"); 1097249423Sdim // Make sure that the pointer does not point to structs. 1098263508Sdim if (Ptr->getType()->getPointerElementType()->isAggregateType()) 1099249423Sdim return 0; 1100249423Sdim 1101249423Sdim // If this value is a pointer induction variable we know it is consecutive. 1102249423Sdim PHINode *Phi = dyn_cast_or_null<PHINode>(Ptr); 1103249423Sdim if (Phi && Inductions.count(Phi)) { 1104249423Sdim InductionInfo II = Inductions[Phi]; 1105249423Sdim if (IK_PtrInduction == II.IK) 1106249423Sdim return 1; 1107249423Sdim else if (IK_ReversePtrInduction == II.IK) 1108249423Sdim return -1; 1109249423Sdim } 1110249423Sdim 1111243789Sdim GetElementPtrInst *Gep = dyn_cast_or_null<GetElementPtrInst>(Ptr); 1112243789Sdim if (!Gep) 1113249423Sdim return 0; 1114243789Sdim 1115243789Sdim unsigned NumOperands = Gep->getNumOperands(); 1116249423Sdim Value *GpPtr = Gep->getPointerOperand(); 1117249423Sdim // If this GEP value is a consecutive pointer induction variable and all of 1118249423Sdim // the indices are constant then we know it is consecutive. 
We can 1119249423Sdim Phi = dyn_cast<PHINode>(GpPtr); 1120249423Sdim if (Phi && Inductions.count(Phi)) { 1121249423Sdim 1122249423Sdim // Make sure that the pointer does not point to structs. 1123249423Sdim PointerType *GepPtrType = cast<PointerType>(GpPtr->getType()); 1124249423Sdim if (GepPtrType->getElementType()->isAggregateType()) 1125249423Sdim return 0; 1126249423Sdim 1127249423Sdim // Make sure that all of the index operands are loop invariant. 1128249423Sdim for (unsigned i = 1; i < NumOperands; ++i) 1129249423Sdim if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop)) 1130249423Sdim return 0; 1131249423Sdim 1132249423Sdim InductionInfo II = Inductions[Phi]; 1133249423Sdim if (IK_PtrInduction == II.IK) 1134249423Sdim return 1; 1135249423Sdim else if (IK_ReversePtrInduction == II.IK) 1136249423Sdim return -1; 1137249423Sdim } 1138249423Sdim 1139263508Sdim unsigned InductionOperand = getGEPInductionOperand(DL, Gep); 1140263508Sdim 1141263508Sdim // Check that all of the gep indices are uniform except for our induction 1142263508Sdim // operand. 1143263508Sdim for (unsigned i = 0; i != NumOperands; ++i) 1144263508Sdim if (i != InductionOperand && 1145263508Sdim !SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop)) 1146249423Sdim return 0; 1147243789Sdim 1148263508Sdim // We can emit wide load/stores only if the last non-zero index is the 1149263508Sdim // induction variable. 1150263508Sdim const SCEV *Last = SE->getSCEV(Gep->getOperand(InductionOperand)); 1151243789Sdim if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Last)) { 1152243789Sdim const SCEV *Step = AR->getStepRecurrence(*SE); 1153243789Sdim 1154243789Sdim // The memory is consecutive because the last index is consecutive 1155243789Sdim // and all other indices are loop invariant. 
1156243789Sdim if (Step->isOne()) 1157249423Sdim return 1; 1158249423Sdim if (Step->isAllOnesValue()) 1159249423Sdim return -1; 1160243789Sdim } 1161243789Sdim 1162249423Sdim return 0; 1163243789Sdim} 1164243789Sdim 1165243789Sdimbool LoopVectorizationLegality::isUniform(Value *V) { 1166243789Sdim return (SE->isLoopInvariant(SE->getSCEV(V), TheLoop)); 1167243789Sdim} 1168243789Sdim 1169249423SdimInnerLoopVectorizer::VectorParts& 1170249423SdimInnerLoopVectorizer::getVectorValue(Value *V) { 1171249423Sdim assert(V != Induction && "The new induction variable should not be used."); 1172243789Sdim assert(!V->getType()->isVectorTy() && "Can't widen a vector"); 1173243789Sdim 1174249423Sdim // If we have this scalar in the map, return it. 1175249423Sdim if (WidenMap.has(V)) 1176249423Sdim return WidenMap.get(V); 1177249423Sdim 1178249423Sdim // If this scalar is unknown, assume that it is a constant or that it is 1179249423Sdim // loop invariant. Broadcast V and save the value for future uses. 1180243789Sdim Value *B = getBroadcastInstrs(V); 1181249423Sdim return WidenMap.splat(V, B); 1182243789Sdim} 1183243789Sdim 1184249423SdimValue *InnerLoopVectorizer::reverseVector(Value *Vec) { 1185249423Sdim assert(Vec->getType()->isVectorTy() && "Invalid type"); 1186249423Sdim SmallVector<Constant*, 8> ShuffleMask; 1187243789Sdim for (unsigned i = 0; i < VF; ++i) 1188249423Sdim ShuffleMask.push_back(Builder.getInt32(VF - i - 1)); 1189243789Sdim 1190249423Sdim return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()), 1191249423Sdim ConstantVector::get(ShuffleMask), 1192249423Sdim "reverse"); 1193243789Sdim} 1194243789Sdim 1195249423Sdim 1196249423Sdimvoid InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, 1197249423Sdim LoopVectorizationLegality *Legal) { 1198249423Sdim // Attempt to issue a wide load. 
1199249423Sdim LoadInst *LI = dyn_cast<LoadInst>(Instr); 1200249423Sdim StoreInst *SI = dyn_cast<StoreInst>(Instr); 1201249423Sdim 1202249423Sdim assert((LI || SI) && "Invalid Load/Store instruction"); 1203249423Sdim 1204249423Sdim Type *ScalarDataTy = LI ? LI->getType() : SI->getValueOperand()->getType(); 1205249423Sdim Type *DataTy = VectorType::get(ScalarDataTy, VF); 1206249423Sdim Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand(); 1207249423Sdim unsigned Alignment = LI ? LI->getAlignment() : SI->getAlignment(); 1208263508Sdim // An alignment of 0 means target abi alignment. We need to use the scalar's 1209263508Sdim // target abi alignment in such a case. 1210263508Sdim if (!Alignment) 1211263508Sdim Alignment = DL->getABITypeAlignment(ScalarDataTy); 1212263508Sdim unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); 1213251662Sdim unsigned ScalarAllocatedSize = DL->getTypeAllocSize(ScalarDataTy); 1214251662Sdim unsigned VectorElementSize = DL->getTypeStoreSize(DataTy)/VF; 1215251662Sdim 1216251662Sdim if (ScalarAllocatedSize != VectorElementSize) 1217251662Sdim return scalarizeInstruction(Instr); 1218251662Sdim 1219249423Sdim // If the pointer is loop invariant or if it is non consecutive, 1220249423Sdim // scalarize the load. 1221251662Sdim int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); 1222251662Sdim bool Reverse = ConsecutiveStride < 0; 1223249423Sdim bool UniformLoad = LI && Legal->isUniform(Ptr); 1224251662Sdim if (!ConsecutiveStride || UniformLoad) 1225249423Sdim return scalarizeInstruction(Instr); 1226249423Sdim 1227249423Sdim Constant *Zero = Builder.getInt32(0); 1228249423Sdim VectorParts &Entry = WidenMap.get(Instr); 1229249423Sdim 1230249423Sdim // Handle consecutive loads/stores. 
1231249423Sdim GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr); 1232249423Sdim if (Gep && Legal->isInductionVariable(Gep->getPointerOperand())) { 1233263508Sdim setDebugLocFromInst(Builder, Gep); 1234249423Sdim Value *PtrOperand = Gep->getPointerOperand(); 1235249423Sdim Value *FirstBasePtr = getVectorValue(PtrOperand)[0]; 1236249423Sdim FirstBasePtr = Builder.CreateExtractElement(FirstBasePtr, Zero); 1237249423Sdim 1238249423Sdim // Create the new GEP with the new induction variable. 1239249423Sdim GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone()); 1240249423Sdim Gep2->setOperand(0, FirstBasePtr); 1241249423Sdim Gep2->setName("gep.indvar.base"); 1242249423Sdim Ptr = Builder.Insert(Gep2); 1243249423Sdim } else if (Gep) { 1244263508Sdim setDebugLocFromInst(Builder, Gep); 1245249423Sdim assert(SE->isLoopInvariant(SE->getSCEV(Gep->getPointerOperand()), 1246249423Sdim OrigLoop) && "Base ptr must be invariant"); 1247249423Sdim 1248249423Sdim // The last index does not have to be the induction. It can be 1249249423Sdim // consecutive and be a function of the index. For example A[I+1]; 1250249423Sdim unsigned NumOperands = Gep->getNumOperands(); 1251263508Sdim unsigned InductionOperand = getGEPInductionOperand(DL, Gep); 1252263508Sdim // Create the new GEP with the new induction variable. 1253263508Sdim GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone()); 1254249423Sdim 1255263508Sdim for (unsigned i = 0; i < NumOperands; ++i) { 1256263508Sdim Value *GepOperand = Gep->getOperand(i); 1257263508Sdim Instruction *GepOperandInst = dyn_cast<Instruction>(GepOperand); 1258249423Sdim 1259263508Sdim // Update last index or loop invariant instruction anchored in loop. 
1260263508Sdim if (i == InductionOperand || 1261263508Sdim (GepOperandInst && OrigLoop->contains(GepOperandInst))) { 1262263508Sdim assert((i == InductionOperand || 1263263508Sdim SE->isLoopInvariant(SE->getSCEV(GepOperandInst), OrigLoop)) && 1264263508Sdim "Must be last index or loop invariant"); 1265263508Sdim 1266263508Sdim VectorParts &GEPParts = getVectorValue(GepOperand); 1267263508Sdim Value *Index = GEPParts[0]; 1268263508Sdim Index = Builder.CreateExtractElement(Index, Zero); 1269263508Sdim Gep2->setOperand(i, Index); 1270263508Sdim Gep2->setName("gep.indvar.idx"); 1271263508Sdim } 1272263508Sdim } 1273249423Sdim Ptr = Builder.Insert(Gep2); 1274249423Sdim } else { 1275249423Sdim // Use the induction element ptr. 1276249423Sdim assert(isa<PHINode>(Ptr) && "Invalid induction ptr"); 1277263508Sdim setDebugLocFromInst(Builder, Ptr); 1278249423Sdim VectorParts &PtrVal = getVectorValue(Ptr); 1279249423Sdim Ptr = Builder.CreateExtractElement(PtrVal[0], Zero); 1280249423Sdim } 1281249423Sdim 1282249423Sdim // Handle Stores: 1283249423Sdim if (SI) { 1284249423Sdim assert(!Legal->isUniform(SI->getPointerOperand()) && 1285249423Sdim "We do not allow storing to uniform addresses"); 1286263508Sdim setDebugLocFromInst(Builder, SI); 1287263508Sdim // We don't want to update the value in the map as it might be used in 1288263508Sdim // another expression. So don't use a reference type for "StoredVal". 1289263508Sdim VectorParts StoredVal = getVectorValue(SI->getValueOperand()); 1290249423Sdim 1291249423Sdim for (unsigned Part = 0; Part < UF; ++Part) { 1292249423Sdim // Calculate the pointer for the specific unroll-part. 1293249423Sdim Value *PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF)); 1294249423Sdim 1295249423Sdim if (Reverse) { 1296249423Sdim // If we store to reverse consecutive memory locations then we need 1297249423Sdim // to reverse the order of elements in the stored value. 
1298249423Sdim StoredVal[Part] = reverseVector(StoredVal[Part]); 1299249423Sdim // If the address is consecutive but reversed, then the 1300249423Sdim // wide store needs to start at the last vector element. 1301249423Sdim PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF)); 1302249423Sdim PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF)); 1303249423Sdim } 1304249423Sdim 1305263508Sdim Value *VecPtr = Builder.CreateBitCast(PartPtr, 1306263508Sdim DataTy->getPointerTo(AddressSpace)); 1307249423Sdim Builder.CreateStore(StoredVal[Part], VecPtr)->setAlignment(Alignment); 1308249423Sdim } 1309263508Sdim return; 1310249423Sdim } 1311249423Sdim 1312263508Sdim // Handle loads. 1313263508Sdim assert(LI && "Must have a load instruction"); 1314263508Sdim setDebugLocFromInst(Builder, LI); 1315249423Sdim for (unsigned Part = 0; Part < UF; ++Part) { 1316249423Sdim // Calculate the pointer for the specific unroll-part. 1317249423Sdim Value *PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF)); 1318249423Sdim 1319249423Sdim if (Reverse) { 1320249423Sdim // If the address is consecutive but reversed, then the 1321249423Sdim // wide store needs to start at the last vector element. 1322249423Sdim PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF)); 1323249423Sdim PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF)); 1324249423Sdim } 1325249423Sdim 1326263508Sdim Value *VecPtr = Builder.CreateBitCast(PartPtr, 1327263508Sdim DataTy->getPointerTo(AddressSpace)); 1328249423Sdim Value *LI = Builder.CreateLoad(VecPtr, "wide.load"); 1329249423Sdim cast<LoadInst>(LI)->setAlignment(Alignment); 1330249423Sdim Entry[Part] = Reverse ? 
                reverseVector(LI) : LI;
  }
}

// Scalarize an instruction that cannot be widened: emit VF * UF scalar clones
// of \p Instr, extracting the per-lane operands from the already-vectorized
// values, and gather the results back into wide vectors in WidenMap.
void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
  assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
  // Holds vector parameters or scalars, in case of uniform vals.
  SmallVector<VectorParts, 4> Params;

  setDebugLocFromInst(Builder, Instr);

  // Find all of the vectorized parameters.
  for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
    Value *SrcOp = Instr->getOperand(op);

    // If we are accessing the old induction variable, use the new one.
    if (SrcOp == OldInduction) {
      Params.push_back(getVectorValue(SrcOp));
      continue;
    }

    // Try using previously calculated values.
    Instruction *SrcInst = dyn_cast<Instruction>(SrcOp);

    // If the src is an instruction that appeared earlier in the basic block
    // then it should already be vectorized.
    if (SrcInst && OrigLoop->contains(SrcInst)) {
      assert(WidenMap.has(SrcInst) && "Source operand is unavailable");
      // The parameter is a vector value from earlier.
      Params.push_back(WidenMap.get(SrcInst));
    } else {
      // The parameter is a scalar from outside the loop. Maybe even a
      // constant. Replicate the same scalar for each of the UF unroll parts.
      VectorParts Scalars;
      Scalars.append(UF, SrcOp);
      Params.push_back(Scalars);
    }
  }

  assert(Params.size() == Instr->getNumOperands() &&
         "Invalid number of operands");

  // Does this instruction return a value ?
  bool IsVoidRetTy = Instr->getType()->isVoidTy();

  Value *UndefVec = IsVoidRetTy ? 0 :
  UndefValue::get(VectorType::get(Instr->getType(), VF));
  // Create a new entry in the WidenMap and initialize it to Undef or Null.
  VectorParts &VecResults = WidenMap.splat(Instr, UndefVec);

  // For each vector unroll 'part':
  for (unsigned Part = 0; Part < UF; ++Part) {
    // For each scalar that we create:
    for (unsigned Width = 0; Width < VF; ++Width) {
      Instruction *Cloned = Instr->clone();
      if (!IsVoidRetTy)
        Cloned->setName(Instr->getName() + ".cloned");
      // Replace the operands of the cloned instructions with extracted scalars.
      for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
        Value *Op = Params[op][Part];
        // Param is a vector. Need to extract the right lane.
        if (Op->getType()->isVectorTy())
          Op = Builder.CreateExtractElement(Op, Builder.getInt32(Width));
        Cloned->setOperand(op, Op);
      }

      // Place the cloned scalar in the new loop.
      Builder.Insert(Cloned);

      // If the original scalar returns a value we need to place it in a vector
      // so that future users will be able to use it.
1400249423Sdim if (!IsVoidRetTy) 1401249423Sdim VecResults[Part] = Builder.CreateInsertElement(VecResults[Part], Cloned, 1402249423Sdim Builder.getInt32(Width)); 1403243789Sdim } 1404249423Sdim } 1405249423Sdim} 1406243789Sdim 1407249423SdimInstruction * 1408249423SdimInnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal, 1409249423Sdim Instruction *Loc) { 1410249423Sdim LoopVectorizationLegality::RuntimePointerCheck *PtrRtCheck = 1411249423Sdim Legal->getRuntimePointerCheck(); 1412243789Sdim 1413249423Sdim if (!PtrRtCheck->Need) 1414249423Sdim return NULL; 1415249423Sdim 1416249423Sdim unsigned NumPointers = PtrRtCheck->Pointers.size(); 1417263508Sdim SmallVector<TrackingVH<Value> , 2> Starts; 1418263508Sdim SmallVector<TrackingVH<Value> , 2> Ends; 1419249423Sdim 1420263508Sdim LLVMContext &Ctx = Loc->getContext(); 1421249423Sdim SCEVExpander Exp(*SE, "induction"); 1422249423Sdim 1423249423Sdim for (unsigned i = 0; i < NumPointers; ++i) { 1424249423Sdim Value *Ptr = PtrRtCheck->Pointers[i]; 1425249423Sdim const SCEV *Sc = SE->getSCEV(Ptr); 1426249423Sdim 1427249423Sdim if (SE->isLoopInvariant(Sc, OrigLoop)) { 1428249423Sdim DEBUG(dbgs() << "LV: Adding RT check for a loop invariant ptr:" << 1429249423Sdim *Ptr <<"\n"); 1430249423Sdim Starts.push_back(Ptr); 1431249423Sdim Ends.push_back(Ptr); 1432249423Sdim } else { 1433263508Sdim DEBUG(dbgs() << "LV: Adding RT check for range:" << *Ptr << '\n'); 1434263508Sdim unsigned AS = Ptr->getType()->getPointerAddressSpace(); 1435249423Sdim 1436263508Sdim // Use this type for pointer arithmetic. 
1437263508Sdim Type *PtrArithTy = Type::getInt8PtrTy(Ctx, AS); 1438263508Sdim 1439249423Sdim Value *Start = Exp.expandCodeFor(PtrRtCheck->Starts[i], PtrArithTy, Loc); 1440249423Sdim Value *End = Exp.expandCodeFor(PtrRtCheck->Ends[i], PtrArithTy, Loc); 1441249423Sdim Starts.push_back(Start); 1442249423Sdim Ends.push_back(End); 1443249423Sdim } 1444243789Sdim } 1445243789Sdim 1446249423Sdim IRBuilder<> ChkBuilder(Loc); 1447263508Sdim // Our instructions might fold to a constant. 1448263508Sdim Value *MemoryRuntimeCheck = 0; 1449249423Sdim for (unsigned i = 0; i < NumPointers; ++i) { 1450249423Sdim for (unsigned j = i+1; j < NumPointers; ++j) { 1451251662Sdim // No need to check if two readonly pointers intersect. 1452251662Sdim if (!PtrRtCheck->IsWritePtr[i] && !PtrRtCheck->IsWritePtr[j]) 1453251662Sdim continue; 1454251662Sdim 1455263508Sdim // Only need to check pointers between two different dependency sets. 1456263508Sdim if (PtrRtCheck->DependencySetId[i] == PtrRtCheck->DependencySetId[j]) 1457263508Sdim continue; 1458249423Sdim 1459263508Sdim unsigned AS0 = Starts[i]->getType()->getPointerAddressSpace(); 1460263508Sdim unsigned AS1 = Starts[j]->getType()->getPointerAddressSpace(); 1461263508Sdim 1462263508Sdim assert((AS0 == Ends[j]->getType()->getPointerAddressSpace()) && 1463263508Sdim (AS1 == Ends[i]->getType()->getPointerAddressSpace()) && 1464263508Sdim "Trying to bounds check pointers with different address spaces"); 1465263508Sdim 1466263508Sdim Type *PtrArithTy0 = Type::getInt8PtrTy(Ctx, AS0); 1467263508Sdim Type *PtrArithTy1 = Type::getInt8PtrTy(Ctx, AS1); 1468263508Sdim 1469263508Sdim Value *Start0 = ChkBuilder.CreateBitCast(Starts[i], PtrArithTy0, "bc"); 1470263508Sdim Value *Start1 = ChkBuilder.CreateBitCast(Starts[j], PtrArithTy1, "bc"); 1471263508Sdim Value *End0 = ChkBuilder.CreateBitCast(Ends[i], PtrArithTy1, "bc"); 1472263508Sdim Value *End1 = ChkBuilder.CreateBitCast(Ends[j], PtrArithTy0, "bc"); 1473263508Sdim 1474249423Sdim Value *Cmp0 = 
ChkBuilder.CreateICmpULE(Start0, End1, "bound0"); 1475249423Sdim Value *Cmp1 = ChkBuilder.CreateICmpULE(Start1, End0, "bound1"); 1476249423Sdim Value *IsConflict = ChkBuilder.CreateAnd(Cmp0, Cmp1, "found.conflict"); 1477249423Sdim if (MemoryRuntimeCheck) 1478249423Sdim IsConflict = ChkBuilder.CreateOr(MemoryRuntimeCheck, IsConflict, 1479249423Sdim "conflict.rdx"); 1480263508Sdim MemoryRuntimeCheck = IsConflict; 1481249423Sdim } 1482249423Sdim } 1483249423Sdim 1484263508Sdim // We have to do this trickery because the IRBuilder might fold the check to a 1485263508Sdim // constant expression in which case there is no Instruction anchored in a 1486263508Sdim // the block. 1487263508Sdim Instruction *Check = BinaryOperator::CreateAnd(MemoryRuntimeCheck, 1488263508Sdim ConstantInt::getTrue(Ctx)); 1489263508Sdim ChkBuilder.Insert(Check, "memcheck.conflict"); 1490263508Sdim return Check; 1491243789Sdim} 1492243789Sdim 1493243789Sdimvoid 1494249423SdimInnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { 1495243789Sdim /* 1496243789Sdim In this function we generate a new loop. The new loop will contain 1497243789Sdim the vectorized instructions while the old loop will continue to run the 1498243789Sdim scalar remainder. 1499243789Sdim 1500249423Sdim [ ] <-- vector loop bypass (may consist of multiple blocks). 1501249423Sdim / | 1502249423Sdim / v 1503249423Sdim | [ ] <-- vector pre header. 1504249423Sdim | | 1505249423Sdim | v 1506249423Sdim | [ ] \ 1507249423Sdim | [ ]_| <-- vector loop. 1508249423Sdim | | 1509249423Sdim \ v 1510249423Sdim >[ ] <--- middle-block. 1511249423Sdim / | 1512249423Sdim / v 1513249423Sdim | [ ] <--- new preheader. 1514249423Sdim | | 1515249423Sdim | v 1516249423Sdim | [ ] \ 1517249423Sdim | [ ]_| <-- old scalar loop to handle remainder. 1518249423Sdim \ | 1519249423Sdim \ v 1520249423Sdim >[ ] <-- exit block. 1521243789Sdim ... 
   */

  BasicBlock *OldBasicBlock = OrigLoop->getHeader();
  BasicBlock *BypassBlock = OrigLoop->getLoopPreheader();
  BasicBlock *ExitBlock = OrigLoop->getExitBlock();
  assert(ExitBlock && "Must have an exit block");

  // Some loops have a single integer induction variable, while other loops
  // don't. One example is c++ iterators that often have multiple pointer
  // induction variables. In the code below we also support a case where we
  // don't have a single induction variable.
  OldInduction = Legal->getInduction();
  Type *IdxTy = Legal->getWidestInductionType();

  // Find the loop boundaries.
  const SCEV *ExitCount = SE->getBackedgeTakenCount(OrigLoop);
  assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count");

  // The exit count might have the type of i64 while the phi is i32. This can
  // happen if we have an induction variable that is sign extended before the
  // compare. The only way that we get a backedge taken count is that the
  // induction variable was signed and as such will not overflow. In such a
  // case truncation is legal.
  if (ExitCount->getType()->getPrimitiveSizeInBits() >
      IdxTy->getPrimitiveSizeInBits())
    ExitCount = SE->getTruncateOrNoop(ExitCount, IdxTy);

  ExitCount = SE->getNoopOrZeroExtend(ExitCount, IdxTy);
  // Get the total trip count from the count by adding 1.
  ExitCount = SE->getAddExpr(ExitCount,
                             SE->getConstant(ExitCount->getType(), 1));

  // Expand the trip count and place the new instructions in the preheader.
  // Notice that the pre-header does not change, only the loop body.
  SCEVExpander Exp(*SE, "induction");

  // Count holds the overall loop count (N).
  Value *Count = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
                                   BypassBlock->getTerminator());

  // The loop index does not have to start at Zero. Find the original start
  // value from the induction PHI node. If we don't have an induction variable
  // then we know that it starts at zero.
  Builder.SetInsertPoint(BypassBlock->getTerminator());
  Value *StartIdx = ExtendedIdx = OldInduction ?
    Builder.CreateZExt(OldInduction->getIncomingValueForBlock(BypassBlock),
                       IdxTy):
    ConstantInt::get(IdxTy, 0);

  assert(BypassBlock && "Invalid loop structure");
  LoopBypassBlocks.push_back(BypassBlock);

  // Split the single block loop into the two loop structure described above.
  BasicBlock *VectorPH =
    BypassBlock->splitBasicBlock(BypassBlock->getTerminator(), "vector.ph");
  BasicBlock *VecBody =
    VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body");
  BasicBlock *MiddleBlock =
    VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block");
  BasicBlock *ScalarPH =
    MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph");

  // Create and register the new vector loop.
  Loop* Lp = new Loop();
  Loop *ParentLoop = OrigLoop->getParentLoop();

  // Insert the new loop into the loop nest and register the new basic blocks
  // before calling any utilities such as SCEV that require valid LoopInfo.
  if (ParentLoop) {
    ParentLoop->addChildLoop(Lp);
    ParentLoop->addBasicBlockToLoop(ScalarPH, LI->getBase());
    ParentLoop->addBasicBlockToLoop(VectorPH, LI->getBase());
    ParentLoop->addBasicBlockToLoop(MiddleBlock, LI->getBase());
  } else {
    LI->addTopLevelLoop(Lp);
  }
  Lp->addBasicBlockToLoop(VecBody, LI->getBase());

  // Use this IR builder to create the loop instructions (Phi, Br, Cmp)
  // inside the loop.
  Builder.SetInsertPoint(VecBody->getFirstNonPHI());

  // Generate the induction variable.
  setDebugLocFromInst(Builder, getDebugLocFromInstOrOperands(OldInduction));
  Induction = Builder.CreatePHI(IdxTy, 2, "index");
  // The loop step is equal to the vectorization factor (num of SIMD elements)
  // times the unroll factor (num of SIMD instructions).
  Constant *Step = ConstantInt::get(IdxTy, VF * UF);

  // This is the IR builder that we use to add all of the logic for bypassing
  // the new vector loop.
  IRBuilder<> BypassBuilder(BypassBlock->getTerminator());
  setDebugLocFromInst(BypassBuilder,
                      getDebugLocFromInstOrOperands(OldInduction));

  // We may need to extend the index in case there is a type mismatch.
  // We know that the count starts at zero and does not overflow.
  if (Count->getType() != IdxTy) {
    // The exit count can be of pointer type. Convert it to the correct
    // integer type.
    if (ExitCount->getType()->isPointerTy())
      Count = BypassBuilder.CreatePointerCast(Count, IdxTy, "ptrcnt.to.int");
    else
      Count = BypassBuilder.CreateZExtOrTrunc(Count, IdxTy, "cnt.cast");
  }

  // Add the start index to the loop count to get the new end index.
  Value *IdxEnd = BypassBuilder.CreateAdd(Count, StartIdx, "end.idx");

  // Now we need to generate the expression for N - (N % VF), which is
  // the part that the vectorized body will execute.
  Value *R = BypassBuilder.CreateURem(Count, Step, "n.mod.vf");
  Value *CountRoundDown = BypassBuilder.CreateSub(Count, R, "n.vec");
  Value *IdxEndRoundDown = BypassBuilder.CreateAdd(CountRoundDown, StartIdx,
                                                   "end.idx.rnd.down");

  // Now, compare the new count to zero. If it is zero skip the vector loop
  // and jump to the scalar loop.
  Value *Cmp = BypassBuilder.CreateICmpEQ(IdxEndRoundDown, StartIdx,
                                          "cmp.zero");

  BasicBlock *LastBypassBlock = BypassBlock;

  // Generate the code that checks in runtime if arrays overlap. We put the
  // checks into a separate block to make the more common case of few elements
  // faster.
  Instruction *MemRuntimeCheck = addRuntimeCheck(Legal,
                                                 BypassBlock->getTerminator());
  if (MemRuntimeCheck) {
    // Create a new block containing the memory check.
    BasicBlock *CheckBlock = BypassBlock->splitBasicBlock(MemRuntimeCheck,
                                                          "vector.memcheck");
    if (ParentLoop)
      ParentLoop->addBasicBlockToLoop(CheckBlock, LI->getBase());
    LoopBypassBlocks.push_back(CheckBlock);

    // Replace the branch into the memory check block with a conditional
    // branch for the "few elements case".
    Instruction *OldTerm = BypassBlock->getTerminator();
    BranchInst::Create(MiddleBlock, CheckBlock, Cmp, OldTerm);
    OldTerm->eraseFromParent();

    Cmp = MemRuntimeCheck;
    LastBypassBlock = CheckBlock;
  }

  LastBypassBlock->getTerminator()->eraseFromParent();
  BranchInst::Create(MiddleBlock, VectorPH, Cmp,
                     LastBypassBlock);

  // We are going to resume the execution of the scalar loop.
  // Go over all of the induction variables that we found and fix the
  // PHIs that are left in the scalar version of the loop.
  // The starting values of PHI nodes depend on the counter of the last
  // iteration in the vectorized loop.
  // If we come from a bypass edge then we need to start from the original
  // start value.

  // This variable saves the new starting index for the scalar loop.
  PHINode *ResumeIndex = 0;
  LoopVectorizationLegality::InductionList::iterator I, E;
  LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
  // Set builder to point to last bypass block.
  BypassBuilder.SetInsertPoint(LoopBypassBlocks.back()->getTerminator());
  for (I = List->begin(), E = List->end(); I != E; ++I) {
    PHINode *OrigPhi = I->first;
    LoopVectorizationLegality::InductionInfo II = I->second;

    Type *ResumeValTy = (OrigPhi == OldInduction) ? IdxTy : OrigPhi->getType();
    PHINode *ResumeVal = PHINode::Create(ResumeValTy, 2, "resume.val",
                                         MiddleBlock->getTerminator());
    // We might have extended the type of the induction variable but we need a
    // truncated version for the scalar loop.
    PHINode *TruncResumeVal = (OrigPhi == OldInduction) ?
      PHINode::Create(OrigPhi->getType(), 2, "trunc.resume.val",
                      MiddleBlock->getTerminator()) : 0;

    Value *EndValue = 0;
    switch (II.IK) {
    case LoopVectorizationLegality::IK_NoInduction:
      llvm_unreachable("Unknown induction");
    case LoopVectorizationLegality::IK_IntInduction: {
      // Handle the integer induction counter.
      assert(OrigPhi->getType()->isIntegerTy() && "Invalid type");

      // We have the canonical induction variable.
      if (OrigPhi == OldInduction) {
        // Create a truncated version of the resume value for the scalar loop,
        // we might have promoted the type to a larger width.
        EndValue =
          BypassBuilder.CreateTrunc(IdxEndRoundDown, OrigPhi->getType());
        // The new PHI merges the original incoming value, in case of a
        // bypass, or the value at the end of the vectorized loop.
        for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
          TruncResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]);
        TruncResumeVal->addIncoming(EndValue, VecBody);

        // We know what the end value is.
        EndValue = IdxEndRoundDown;
        // We also know which PHI node holds it.
        ResumeIndex = ResumeVal;
        break;
      }

      // Not the canonical induction variable - add the vector loop count to
      // the start value.
      Value *CRD = BypassBuilder.CreateSExtOrTrunc(CountRoundDown,
                                                   II.StartValue->getType(),
                                                   "cast.crd");
      EndValue = BypassBuilder.CreateAdd(CRD, II.StartValue , "ind.end");
      break;
    }
    case LoopVectorizationLegality::IK_ReverseIntInduction: {
      // Convert the CountRoundDown variable to the PHI size.
      Value *CRD = BypassBuilder.CreateSExtOrTrunc(CountRoundDown,
                                                   II.StartValue->getType(),
                                                   "cast.crd");
      // Handle reverse integer induction counter.
      EndValue = BypassBuilder.CreateSub(II.StartValue, CRD, "rev.ind.end");
      break;
    }
    case LoopVectorizationLegality::IK_PtrInduction: {
      // For pointer induction variables, calculate the offset using
      // the end index.
      EndValue = BypassBuilder.CreateGEP(II.StartValue, CountRoundDown,
                                         "ptr.ind.end");
      break;
    }
    case LoopVectorizationLegality::IK_ReversePtrInduction: {
      // The value at the end of the loop for the reverse pointer is
      // calculated by creating a GEP with a negative index starting from the
      // start value.
      Value *Zero = ConstantInt::get(CountRoundDown->getType(), 0);
      Value *NegIdx = BypassBuilder.CreateSub(Zero, CountRoundDown,
                                              "rev.ind.end");
      EndValue = BypassBuilder.CreateGEP(II.StartValue, NegIdx,
                                         "rev.ptr.ind.end");
      break;
    }
    }// end of case

    // The new PHI merges the original incoming value, in case of a bypass,
    // or the value at the end of the vectorized loop.
    for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) {
      if (OrigPhi == OldInduction)
        ResumeVal->addIncoming(StartIdx, LoopBypassBlocks[I]);
      else
        ResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]);
    }
    ResumeVal->addIncoming(EndValue, VecBody);

    // Fix the scalar body counter (PHI node).
    unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH);
    // The old inductions phi node in the scalar body needs the truncated
    // value.
    if (OrigPhi == OldInduction)
      OrigPhi->setIncomingValue(BlockIdx, TruncResumeVal);
    else
      OrigPhi->setIncomingValue(BlockIdx, ResumeVal);
  }

  // If we are generating a new induction variable then we also need to
  // generate the code that calculates the exit value. This value is not
  // simply the end of the counter because we may skip the vectorized body
  // in case of a runtime check.
  if (!OldInduction){
    assert(!ResumeIndex && "Unexpected resume value found");
    ResumeIndex = PHINode::Create(IdxTy, 2, "new.indc.resume.val",
                                  MiddleBlock->getTerminator());
    for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
      ResumeIndex->addIncoming(StartIdx, LoopBypassBlocks[I]);
    ResumeIndex->addIncoming(IdxEndRoundDown, VecBody);
  }

  // Make sure that we found the index where scalar loop needs to continue.
  assert(ResumeIndex && ResumeIndex->getType()->isIntegerTy() &&
         "Invalid resume Index");

  // Add a check in the middle block to see if we have completed
  // all of the iterations in the first vector loop.
  // If (N - N%VF) == N, then we *don't* need to run the remainder.
  Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, IdxEnd,
                                ResumeIndex, "cmp.n",
                                MiddleBlock->getTerminator());

  BranchInst::Create(ExitBlock, ScalarPH, CmpN, MiddleBlock->getTerminator());
  // Remove the old terminator.
  MiddleBlock->getTerminator()->eraseFromParent();

  // Create i+1 and fill the PHINode.
  Value *NextIdx = Builder.CreateAdd(Induction, Step, "index.next");
  Induction->addIncoming(StartIdx, VectorPH);
  Induction->addIncoming(NextIdx, VecBody);
  // Create the compare.
  Value *ICmp = Builder.CreateICmpEQ(NextIdx, IdxEndRoundDown);
  Builder.CreateCondBr(ICmp, MiddleBlock, VecBody);

  // Now we have two terminators. Remove the old one from the block.
  VecBody->getTerminator()->eraseFromParent();

  // Get ready to start creating new instructions into the vectorized body.
  Builder.SetInsertPoint(VecBody->getFirstInsertionPt());

  // Save the state.
  LoopVectorPreHeader = VectorPH;
  LoopScalarPreHeader = ScalarPH;
  LoopMiddleBlock = MiddleBlock;
  LoopExitBlock = ExitBlock;
  LoopVectorBody = VecBody;
  LoopScalarBody = OldBasicBlock;

  // Mark the new loop as already vectorized so we don't try to vectorize it
  // again.
  LoopVectorizeHints Hints(Lp, true);
  Hints.setAlreadyVectorized(Lp);
}

/// This function returns the identity element (or neutral element) for
/// the operation K.
Constant*
LoopVectorizationLegality::getReductionIdentity(ReductionKind K, Type *Tp) {
  switch (K) {
  case RK_IntegerXor:
  case RK_IntegerAdd:
  case RK_IntegerOr:
    // Adding, Xoring, Oring zero to a number does not change it.
    return ConstantInt::get(Tp, 0);
  case RK_IntegerMult:
    // Multiplying a number by 1 does not change it.
    return ConstantInt::get(Tp, 1);
  case RK_IntegerAnd:
    // AND-ing a number with an all-1 value does not change it.
    return ConstantInt::get(Tp, -1, true);
  case RK_FloatMult:
    // Multiplying a number by 1 does not change it.
    return ConstantFP::get(Tp, 1.0L);
  case RK_FloatAdd:
    // Adding zero to a number does not change it.
    return ConstantFP::get(Tp, 0.0L);
  default:
    llvm_unreachable("Unknown reduction kind");
  }
}

// Returns \p ValidIntrinsicID if the call \p I looks like a unary
// floating-point math function (one FP argument, matching return type, reads
// no memory); otherwise returns Intrinsic::not_intrinsic.
static Intrinsic::ID checkUnaryFloatSignature(const CallInst &I,
                                              Intrinsic::ID ValidIntrinsicID) {
  if (I.getNumArgOperands() != 1 ||
      !I.getArgOperand(0)->getType()->isFloatingPointTy() ||
      I.getType() != I.getArgOperand(0)->getType() ||
      !I.onlyReadsMemory())
    return Intrinsic::not_intrinsic;

  return ValidIntrinsicID;
}

// Same as checkUnaryFloatSignature, but for a binary floating-point math
// function: two FP arguments, both matching the return type.
static Intrinsic::ID checkBinaryFloatSignature(const CallInst &I,
                                               Intrinsic::ID ValidIntrinsicID) {
  if (I.getNumArgOperands() != 2 ||
      !I.getArgOperand(0)->getType()->isFloatingPointTy() ||
      !I.getArgOperand(1)->getType()->isFloatingPointTy() ||
      I.getType() != I.getArgOperand(0)->getType() ||
      I.getType() != I.getArgOperand(1)->getType() ||
      !I.onlyReadsMemory())
    return Intrinsic::not_intrinsic;

  return ValidIntrinsicID;
}


static Intrinsic::ID
getIntrinsicIDForCall(CallInst *CI, const TargetLibraryInfo *TLI) {
  // If we have an intrinsic call, check if it is trivially vectorizable.
  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI)) {
    switch (II->getIntrinsicID()) {
    case Intrinsic::sqrt:
    case Intrinsic::sin:
    case Intrinsic::cos:
    case Intrinsic::exp:
    case Intrinsic::exp2:
    case Intrinsic::log:
    case Intrinsic::log10:
    case Intrinsic::log2:
    case Intrinsic::fabs:
    case Intrinsic::copysign:
    case Intrinsic::floor:
    case Intrinsic::ceil:
    case Intrinsic::trunc:
    case Intrinsic::rint:
    case Intrinsic::nearbyint:
    case Intrinsic::round:
    case Intrinsic::pow:
    case Intrinsic::fma:
    case Intrinsic::fmuladd:
    case Intrinsic::lifetime_start:
    case Intrinsic::lifetime_end:
      return II->getIntrinsicID();
    default:
      return Intrinsic::not_intrinsic;
    }
  }

  // Without TargetLibraryInfo we cannot reason about library calls.
  if (!TLI)
    return Intrinsic::not_intrinsic;

  LibFunc::Func Func;
  Function *F = CI->getCalledFunction();
  // We're going to make assumptions on the semantics of the functions, check
  // that the target knows that it's available in this environment and it does
  // not have local linkage.
  if (!F || F->hasLocalLinkage() || !TLI->getLibFunc(F->getName(), Func))
    return Intrinsic::not_intrinsic;

  // Otherwise check if we have a call to a function that can be turned into a
  // vector intrinsic.
  switch (Func) {
  default:
    break;
  case LibFunc::sin:
  case LibFunc::sinf:
  case LibFunc::sinl:
    return checkUnaryFloatSignature(*CI, Intrinsic::sin);
  case LibFunc::cos:
  case LibFunc::cosf:
  case LibFunc::cosl:
    return checkUnaryFloatSignature(*CI, Intrinsic::cos);
  case LibFunc::exp:
  case LibFunc::expf:
  case LibFunc::expl:
    return checkUnaryFloatSignature(*CI, Intrinsic::exp);
  case LibFunc::exp2:
  case LibFunc::exp2f:
  case LibFunc::exp2l:
    return checkUnaryFloatSignature(*CI, Intrinsic::exp2);
  case LibFunc::log:
  case LibFunc::logf:
  case LibFunc::logl:
    return checkUnaryFloatSignature(*CI, Intrinsic::log);
  case LibFunc::log10:
  case LibFunc::log10f:
  case LibFunc::log10l:
    return checkUnaryFloatSignature(*CI, Intrinsic::log10);
  case LibFunc::log2:
  case LibFunc::log2f:
  case LibFunc::log2l:
    return checkUnaryFloatSignature(*CI, Intrinsic::log2);
  case LibFunc::fabs:
  case LibFunc::fabsf:
  case LibFunc::fabsl:
    return checkUnaryFloatSignature(*CI, Intrinsic::fabs);
  case LibFunc::copysign:
  case LibFunc::copysignf:
  case LibFunc::copysignl:
    return checkBinaryFloatSignature(*CI, Intrinsic::copysign);
  case LibFunc::floor:
  case LibFunc::floorf:
  case LibFunc::floorl:
    return checkUnaryFloatSignature(*CI, Intrinsic::floor);
  case LibFunc::ceil:
  case LibFunc::ceilf:
  case LibFunc::ceill:
    return checkUnaryFloatSignature(*CI, Intrinsic::ceil);
  case LibFunc::trunc:
  case LibFunc::truncf:
  case LibFunc::truncl:
    return checkUnaryFloatSignature(*CI, Intrinsic::trunc);
  case LibFunc::rint:
  case LibFunc::rintf:
  case LibFunc::rintl:
    return checkUnaryFloatSignature(*CI, Intrinsic::rint);
  case LibFunc::nearbyint:
  case LibFunc::nearbyintf:
  case LibFunc::nearbyintl:
    return checkUnaryFloatSignature(*CI, Intrinsic::nearbyint);
  case LibFunc::round:
  case LibFunc::roundf:
  case LibFunc::roundl:
    return checkUnaryFloatSignature(*CI, Intrinsic::round);
  case LibFunc::pow:
  case LibFunc::powf:
  case LibFunc::powl:
    return checkBinaryFloatSignature(*CI, Intrinsic::pow);
  }

  return Intrinsic::not_intrinsic;
}

/// This function translates the reduction kind to an LLVM binary operator.
2005251662Sdimstatic unsigned 2006249423SdimgetReductionBinOp(LoopVectorizationLegality::ReductionKind Kind) { 2007249423Sdim switch (Kind) { 2008249423Sdim case LoopVectorizationLegality::RK_IntegerAdd: 2009249423Sdim return Instruction::Add; 2010249423Sdim case LoopVectorizationLegality::RK_IntegerMult: 2011249423Sdim return Instruction::Mul; 2012249423Sdim case LoopVectorizationLegality::RK_IntegerOr: 2013249423Sdim return Instruction::Or; 2014249423Sdim case LoopVectorizationLegality::RK_IntegerAnd: 2015249423Sdim return Instruction::And; 2016249423Sdim case LoopVectorizationLegality::RK_IntegerXor: 2017249423Sdim return Instruction::Xor; 2018249423Sdim case LoopVectorizationLegality::RK_FloatMult: 2019249423Sdim return Instruction::FMul; 2020249423Sdim case LoopVectorizationLegality::RK_FloatAdd: 2021249423Sdim return Instruction::FAdd; 2022251662Sdim case LoopVectorizationLegality::RK_IntegerMinMax: 2023251662Sdim return Instruction::ICmp; 2024251662Sdim case LoopVectorizationLegality::RK_FloatMinMax: 2025251662Sdim return Instruction::FCmp; 2026249423Sdim default: 2027249423Sdim llvm_unreachable("Unknown reduction operation"); 2028249423Sdim } 2029249423Sdim} 2030249423Sdim 2031251662SdimValue *createMinMaxOp(IRBuilder<> &Builder, 2032251662Sdim LoopVectorizationLegality::MinMaxReductionKind RK, 2033251662Sdim Value *Left, 2034251662Sdim Value *Right) { 2035251662Sdim CmpInst::Predicate P = CmpInst::ICMP_NE; 2036251662Sdim switch (RK) { 2037251662Sdim default: 2038251662Sdim llvm_unreachable("Unknown min/max reduction kind"); 2039251662Sdim case LoopVectorizationLegality::MRK_UIntMin: 2040251662Sdim P = CmpInst::ICMP_ULT; 2041251662Sdim break; 2042251662Sdim case LoopVectorizationLegality::MRK_UIntMax: 2043251662Sdim P = CmpInst::ICMP_UGT; 2044251662Sdim break; 2045251662Sdim case LoopVectorizationLegality::MRK_SIntMin: 2046251662Sdim P = CmpInst::ICMP_SLT; 2047251662Sdim break; 2048251662Sdim case LoopVectorizationLegality::MRK_SIntMax: 2049251662Sdim P = 
CmpInst::ICMP_SGT; 2050251662Sdim break; 2051251662Sdim case LoopVectorizationLegality::MRK_FloatMin: 2052251662Sdim P = CmpInst::FCMP_OLT; 2053251662Sdim break; 2054251662Sdim case LoopVectorizationLegality::MRK_FloatMax: 2055251662Sdim P = CmpInst::FCMP_OGT; 2056251662Sdim break; 2057251662Sdim } 2058251662Sdim 2059251662Sdim Value *Cmp; 2060263508Sdim if (RK == LoopVectorizationLegality::MRK_FloatMin || 2061263508Sdim RK == LoopVectorizationLegality::MRK_FloatMax) 2062251662Sdim Cmp = Builder.CreateFCmp(P, Left, Right, "rdx.minmax.cmp"); 2063251662Sdim else 2064251662Sdim Cmp = Builder.CreateICmp(P, Left, Right, "rdx.minmax.cmp"); 2065251662Sdim 2066251662Sdim Value *Select = Builder.CreateSelect(Cmp, Left, Right, "rdx.minmax.select"); 2067251662Sdim return Select; 2068251662Sdim} 2069251662Sdim 2070263508Sdimnamespace { 2071263508Sdimstruct CSEDenseMapInfo { 2072263508Sdim static bool canHandle(Instruction *I) { 2073263508Sdim return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 2074263508Sdim isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 2075263508Sdim } 2076263508Sdim static inline Instruction *getEmptyKey() { 2077263508Sdim return DenseMapInfo<Instruction *>::getEmptyKey(); 2078263508Sdim } 2079263508Sdim static inline Instruction *getTombstoneKey() { 2080263508Sdim return DenseMapInfo<Instruction *>::getTombstoneKey(); 2081263508Sdim } 2082263508Sdim static unsigned getHashValue(Instruction *I) { 2083263508Sdim assert(canHandle(I) && "Unknown instruction!"); 2084263508Sdim return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 2085263508Sdim I->value_op_end())); 2086263508Sdim } 2087263508Sdim static bool isEqual(Instruction *LHS, Instruction *RHS) { 2088263508Sdim if (LHS == getEmptyKey() || RHS == getEmptyKey() || 2089263508Sdim LHS == getTombstoneKey() || RHS == getTombstoneKey()) 2090263508Sdim return LHS == RHS; 2091263508Sdim return LHS->isIdenticalTo(RHS); 2092263508Sdim } 2093263508Sdim}; 2094263508Sdim} 

///\brief Perform cse of induction variable instructions.
///
/// Walks \p BB once and folds duplicate insert/extract-element, shuffle and
/// GEP instructions (the kinds accepted by CSEDenseMapInfo::canHandle) into
/// the first occurrence, RAUW-ing and erasing the duplicates.
static void cse(BasicBlock *BB) {
  // Perform simple cse.
  SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
    // Advance before any potential erase so the iterator stays valid.
    Instruction *In = I++;

    if (!CSEDenseMapInfo::canHandle(In))
      continue;

    // Check if we can replace this instruction with any of the
    // visited instructions.
    if (Instruction *V = CSEMap.lookup(In)) {
      In->replaceAllUsesWith(V);
      In->eraseFromParent();
      continue;
    }

    CSEMap[In] = In;
  }
}

/// Widen every instruction of the original loop, then repair the reduction
/// PHIs (left without incoming edges during widening to avoid cycles),
/// stitch the reduced values into the middle block / scalar remainder loop,
/// patch up LCSSA PHIs, and finally CSE the vector body.
void
InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
  //===------------------------------------------------===//
  //
  // Notice: any optimization or new instruction that goes
  // into the code below should also be implemented in
  // the cost-model.
  //
  //===------------------------------------------------===//
  Constant *Zero = Builder.getInt32(0);

  // In order to support reduction variables we need to be able to vectorize
  // Phi nodes. Phi nodes have cycles, so we need to vectorize them in two
  // stages. First, we create a new vector PHI node with no incoming edges.
  // We use this value when we vectorize all of the instructions that use the
  // PHI. Next, after all of the instructions in the block are complete we
  // add the new incoming edges to the PHI. At this point all of the
  // instructions in the basic block are vectorized, so we can use them to
  // construct the PHI.
  PhiVector RdxPHIsToFix;

  // Scan the loop in a topological order to ensure that defs are vectorized
  // before users.
  LoopBlocksDFS DFS(OrigLoop);
  DFS.perform(LI);

  // Vectorize all of the blocks in the original loop.
  for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(),
       be = DFS.endRPO(); bb != be; ++bb)
    vectorizeBlockInLoop(Legal, *bb, &RdxPHIsToFix);

  // At this point every instruction in the original loop is widened to
  // a vector form. We are almost done. Now, we need to fix the PHI nodes
  // that we vectorized. The PHI nodes are currently empty because we did
  // not want to introduce cycles. Notice that the remaining PHI nodes
  // that we need to fix are reduction variables.

  // Create the 'reduced' values for each of the induction vars.
  // The reduced values are the vector values that we scalarize and combine
  // after the loop is finished.
  for (PhiVector::iterator it = RdxPHIsToFix.begin(), e = RdxPHIsToFix.end();
       it != e; ++it) {
    PHINode *RdxPhi = *it;
    assert(RdxPhi && "Unable to recover vectorized PHI");

    // Find the reduction variable descriptor.
    assert(Legal->getReductionVars()->count(RdxPhi) &&
           "Unable to find the reduction variable");
    LoopVectorizationLegality::ReductionDescriptor RdxDesc =
      (*Legal->getReductionVars())[RdxPhi];

    setDebugLocFromInst(Builder, RdxDesc.StartValue);

    // We need to generate a reduction vector from the incoming scalar.
    // To do so, we need to generate the 'identity' vector and override
    // one of the elements with the incoming scalar reduction. We need
    // to do it in the vector-loop preheader.
    Builder.SetInsertPoint(LoopBypassBlocks.front()->getTerminator());

    // This is the vector-clone of the value that leaves the loop.
    VectorParts &VectorExit = getVectorValue(RdxDesc.LoopExitInstr);
    Type *VecTy = VectorExit[0]->getType();

    // Find the reduction identity variable. Zero for addition, or, xor,
    // one for multiplication, -1 for And.
    Value *Identity;
    Value *VectorStart;
    if (RdxDesc.Kind == LoopVectorizationLegality::RK_IntegerMinMax ||
        RdxDesc.Kind == LoopVectorizationLegality::RK_FloatMinMax) {
      // MinMax reductions have the start value as their identity: taking
      // min/max against the start value again is a no-op.
      if (VF == 1) {
        VectorStart = Identity = RdxDesc.StartValue;
      } else {
        VectorStart = Identity = Builder.CreateVectorSplat(VF,
                                                           RdxDesc.StartValue,
                                                           "minmax.ident");
      }
    } else {
      // Handle other reduction kinds:
      Constant *Iden =
        LoopVectorizationLegality::getReductionIdentity(RdxDesc.Kind,
                                                        VecTy->getScalarType());
      if (VF == 1) {
        Identity = Iden;
        // This vector is the Identity vector where the first element is the
        // incoming scalar reduction.
        VectorStart = RdxDesc.StartValue;
      } else {
        Identity = ConstantVector::getSplat(VF, Iden);

        // This vector is the Identity vector where the first element is the
        // incoming scalar reduction.
        VectorStart = Builder.CreateInsertElement(Identity,
                                                  RdxDesc.StartValue, Zero);
      }
    }

    // Fix the vector-loop phi.
    // We created the induction variable so we know that the
    // preheader is the first entry.
    BasicBlock *VecPreheader = Induction->getIncomingBlock(0);

    // Reductions do not have to start at zero. They can start with
    // any loop invariant values.
    VectorParts &VecRdxPhi = WidenMap.get(RdxPhi);
    BasicBlock *Latch = OrigLoop->getLoopLatch();
    Value *LoopVal = RdxPhi->getIncomingValueForBlock(Latch);
    VectorParts &Val = getVectorValue(LoopVal);
    for (unsigned part = 0; part < UF; ++part) {
      // Make sure to add the reduction start value only to the
      // first unroll part; the other parts start at the identity so the
      // final combine does not double-count it.
      Value *StartVal = (part == 0) ? VectorStart : Identity;
      cast<PHINode>(VecRdxPhi[part])->addIncoming(StartVal, VecPreheader);
      cast<PHINode>(VecRdxPhi[part])->addIncoming(Val[part], LoopVectorBody);
    }

    // Before each round, move the insertion point right between
    // the PHIs and the values we are going to write.
    // This allows us to write both PHINodes and the extractelement
    // instructions.
    Builder.SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt());

    VectorParts RdxParts;
    setDebugLocFromInst(Builder, RdxDesc.LoopExitInstr);
    for (unsigned part = 0; part < UF; ++part) {
      // This PHINode contains the vectorized reduction variable, or
      // the initial value vector, if we bypass the vector loop.
      VectorParts &RdxExitVal = getVectorValue(RdxDesc.LoopExitInstr);
      PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi");
      Value *StartVal = (part == 0) ? VectorStart : Identity;
      for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
        NewPhi->addIncoming(StartVal, LoopBypassBlocks[I]);
      NewPhi->addIncoming(RdxExitVal[part], LoopVectorBody);
      RdxParts.push_back(NewPhi);
    }

    // Reduce all of the unrolled parts into a single vector.
    Value *ReducedPartRdx = RdxParts[0];
    unsigned Op = getReductionBinOp(RdxDesc.Kind);
    setDebugLocFromInst(Builder, ReducedPartRdx);
    for (unsigned part = 1; part < UF; ++part) {
      // Min/max kinds map to ICmp/FCmp and need a compare+select instead of
      // a plain binary operator.
      if (Op != Instruction::ICmp && Op != Instruction::FCmp)
        ReducedPartRdx = Builder.CreateBinOp((Instruction::BinaryOps)Op,
                                             RdxParts[part], ReducedPartRdx,
                                             "bin.rdx");
      else
        ReducedPartRdx = createMinMaxOp(Builder, RdxDesc.MinMaxKind,
                                        ReducedPartRdx, RdxParts[part]);
    }

    if (VF > 1) {
      // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles
      // and vector ops, reducing the set of values being computed by half each
      // round.
      assert(isPowerOf2_32(VF) &&
             "Reduction emission only supported for pow2 vectors!");
      Value *TmpVec = ReducedPartRdx;
      SmallVector<Constant*, 32> ShuffleMask(VF, 0);
      for (unsigned i = VF; i != 1; i >>= 1) {
        // Move the upper half of the vector to the lower half.
        for (unsigned j = 0; j != i/2; ++j)
          ShuffleMask[j] = Builder.getInt32(i/2 + j);

        // Fill the rest of the mask with undef.
        std::fill(&ShuffleMask[i/2], ShuffleMask.end(),
                  UndefValue::get(Builder.getInt32Ty()));

        Value *Shuf =
        Builder.CreateShuffleVector(TmpVec,
                                    UndefValue::get(TmpVec->getType()),
                                    ConstantVector::get(ShuffleMask),
                                    "rdx.shuf");

        if (Op != Instruction::ICmp && Op != Instruction::FCmp)
          TmpVec = Builder.CreateBinOp((Instruction::BinaryOps)Op, TmpVec, Shuf,
                                       "bin.rdx");
        else
          TmpVec = createMinMaxOp(Builder, RdxDesc.MinMaxKind, TmpVec, Shuf);
      }

      // The result is in the first element of the vector.
      ReducedPartRdx = Builder.CreateExtractElement(TmpVec,
                                                    Builder.getInt32(0));
    }

    // Now, we need to fix the users of the reduction variable
    // inside and outside of the scalar remainder loop.
    // We know that the loop is in LCSSA form. We need to update the
    // PHI nodes in the exit blocks.
    for (BasicBlock::iterator LEI = LoopExitBlock->begin(),
         LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) {
      PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI);
      // PHIs are clustered at the top of the block; stop at the first
      // non-PHI.
      if (!LCSSAPhi) break;

      // All PHINodes need to have a single entry edge, or two if
      // we already fixed them.
      assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI");

      // We found our reduction value exit-PHI. Update it with the
      // incoming bypass edge.
      if (LCSSAPhi->getIncomingValue(0) == RdxDesc.LoopExitInstr) {
        // Add an edge coming from the bypass.
        LCSSAPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
        break;
      }
    }// end of the LCSSA phi scan.

    // Fix the scalar loop reduction variable with the incoming reduction sum
    // from the vector body and from the backedge value.
    int IncomingEdgeBlockIdx =
      (RdxPhi)->getBasicBlockIndex(OrigLoop->getLoopLatch());
    assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
    // Pick the other block.
    int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
    (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, ReducedPartRdx);
    (RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, RdxDesc.LoopExitInstr);
  }// end of for each redux variable.

  fixLCSSAPHIs();

  // Remove redundant induction instructions.
  cse(LoopVectorBody);
}

/// Give every still-single-entry LCSSA PHI in the exit block a second,
/// undef incoming value from the middle block, keeping LCSSA form valid
/// after the vector loop was added. (Reduction PHIs were already given a
/// real incoming value in vectorizeLoop.)
void InnerLoopVectorizer::fixLCSSAPHIs() {
  for (BasicBlock::iterator LEI = LoopExitBlock->begin(),
       LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) {
    PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI);
    // PHIs are clustered at the top of the block; stop at the first non-PHI.
    if (!LCSSAPhi) break;
    if (LCSSAPhi->getNumIncomingValues() == 1)
      LCSSAPhi->addIncoming(UndefValue::get(LCSSAPhi->getType()),
                            LoopMiddleBlock);
  }
}

/// Compute the predication mask for the CFG edge Src->Dst: the mask of Src
/// AND-ed with the (possibly negated) branch condition of Src's terminator.
/// Results are memoized in MaskCache.
InnerLoopVectorizer::VectorParts
InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
  assert(std::find(pred_begin(Dst), pred_end(Dst), Src) != pred_end(Dst) &&
         "Invalid edge");

  // Look for cached value.
  std::pair<BasicBlock*, BasicBlock*> Edge(Src, Dst);
  EdgeMaskCache::iterator ECEntryIt = MaskCache.find(Edge);
  if (ECEntryIt != MaskCache.end())
    return ECEntryIt->second;

  VectorParts SrcMask = createBlockInMask(Src);

  // The terminator has to be a branch inst!
  BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
  assert(BI && "Unexpected terminator found");

  if (BI->isConditional()) {
    VectorParts EdgeMask = getVectorValue(BI->getCondition());

    // If Dst is reached on the false edge, the condition must be negated.
    if (BI->getSuccessor(0) != Dst)
      for (unsigned part = 0; part < UF; ++part)
        EdgeMask[part] = Builder.CreateNot(EdgeMask[part]);

    for (unsigned part = 0; part < UF; ++part)
      EdgeMask[part] = Builder.CreateAnd(EdgeMask[part], SrcMask[part]);

    MaskCache[Edge] = EdgeMask;
    return EdgeMask;
  }

  // Unconditional branch: the edge mask is simply the source block's mask.
  MaskCache[Edge] = SrcMask;
  return SrcMask;
}

/// Compute the predication mask for block BB: the OR of the masks of all
/// incoming edges. The loop header is unconditionally executed, so its mask
/// is all-ones.
InnerLoopVectorizer::VectorParts
InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) {
  assert(OrigLoop->contains(BB) && "Block is not a part of a loop");

  // Loop incoming mask is all-one.
  if (OrigLoop->getHeader() == BB) {
    Value *C = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 1);
    return getVectorValue(C);
  }

  // This is the block mask. We OR all incoming edges, starting with zero.
  Value *Zero = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 0);
  VectorParts BlockMask = getVectorValue(Zero);

  // For each pred:
  for (pred_iterator it = pred_begin(BB), e = pred_end(BB); it != e; ++it) {
    VectorParts EM = createEdgeMask(*it, BB);
    for (unsigned part = 0; part < UF; ++part)
      BlockMask[part] = Builder.CreateOr(BlockMask[part], EM[part]);
  }

  return BlockMask;
}

/// Widen the PHI node PN into Entry (one value per unroll part).
/// Three cases: reduction PHIs get empty vector PHIs (fixed later in
/// vectorizeLoop, recorded in PV); non-header PHIs become select chains
/// driven by edge masks; header PHIs must be known induction variables and
/// are expanded from the canonical induction.
void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
                                      InnerLoopVectorizer::VectorParts &Entry,
                                              LoopVectorizationLegality *Legal,
                                              unsigned UF, unsigned VF, PhiVector *PV) {
  PHINode* P = cast<PHINode>(PN);
  // Handle reduction variables:
  if (Legal->getReductionVars()->count(P)) {
    for (unsigned part = 0; part < UF; ++part) {
      // This is phase one of vectorizing PHIs.
      Type *VecTy = (VF == 1) ? PN->getType() :
        VectorType::get(PN->getType(), VF);
      Entry[part] = PHINode::Create(VecTy, 2, "vec.phi",
                                    LoopVectorBody->getFirstInsertionPt());
    }
    // Remember this PHI so vectorizeLoop can add its incoming edges.
    PV->push_back(P);
    return;
  }

  setDebugLocFromInst(Builder, P);
  // Check for PHI nodes that are lowered to vector selects.
  if (P->getParent() != OrigLoop->getHeader()) {
    // We know that all PHIs in non header blocks are converted into
    // selects, so we don't have to worry about the insertion order and we
    // can just use the builder.
    // At this point we generate the predication tree. There may be
    // duplications since this is a simple recursive scan, but future
    // optimizations will clean it up.

    unsigned NumIncoming = P->getNumIncomingValues();

    // Generate a sequence of selects of the form:
    // SELECT(Mask3, In3,
    //      SELECT(Mask2, In2,
    //                   ( ...)))
    for (unsigned In = 0; In < NumIncoming; In++) {
      VectorParts Cond = createEdgeMask(P->getIncomingBlock(In),
                                        P->getParent());
      VectorParts &In0 = getVectorValue(P->getIncomingValue(In));

      for (unsigned part = 0; part < UF; ++part) {
        // We might have single edge PHIs (blocks) - use an identity
        // 'select' for the first PHI operand.
        if (In == 0)
          Entry[part] = Builder.CreateSelect(Cond[part], In0[part],
                                             In0[part]);
        else
          // Select between the current value and the previous incoming edge
          // based on the incoming mask.
          Entry[part] = Builder.CreateSelect(Cond[part], In0[part],
                                             Entry[part], "predphi");
      }
    }
    return;
  }

  // This PHINode must be an induction variable.
  // Make sure that we know about it.
  assert(Legal->getInductionVars()->count(P) &&
         "Not an induction variable");

  LoopVectorizationLegality::InductionInfo II =
    Legal->getInductionVars()->lookup(P);

  switch (II.IK) {
  case LoopVectorizationLegality::IK_NoInduction:
    llvm_unreachable("Unknown induction");
  case LoopVectorizationLegality::IK_IntInduction: {
    assert(P->getType() == II.StartValue->getType() && "Types must match");
    Type *PhiTy = P->getType();
    Value *Broadcasted;
    if (P == OldInduction) {
      // Handle the canonical induction variable. We might have had to
      // extend the type.
      Broadcasted = Builder.CreateTrunc(Induction, PhiTy);
    } else {
      // Handle other induction variables that are now based on the
      // canonical one.
      Value *NormalizedIdx = Builder.CreateSub(Induction, ExtendedIdx,
                                               "normalized.idx");
      NormalizedIdx = Builder.CreateSExtOrTrunc(NormalizedIdx, PhiTy);
      Broadcasted = Builder.CreateAdd(II.StartValue, NormalizedIdx,
                                      "offset.idx");
    }
    Broadcasted = getBroadcastInstrs(Broadcasted);
    // After broadcasting the induction variable we need to make the vector
    // consecutive by adding 0, 1, 2, etc.
    for (unsigned part = 0; part < UF; ++part)
      Entry[part] = getConsecutiveVector(Broadcasted, VF * part, false);
    return;
  }
  case LoopVectorizationLegality::IK_ReverseIntInduction:
  case LoopVectorizationLegality::IK_PtrInduction:
  case LoopVectorizationLegality::IK_ReversePtrInduction:
    // Handle reverse integer and pointer inductions.
    Value *StartIdx = ExtendedIdx;
    // This is the normalized GEP that starts counting at zero.
    Value *NormalizedIdx = Builder.CreateSub(Induction, StartIdx,
                                             "normalized.idx");

    // Handle the reverse integer induction variable case.
    if (LoopVectorizationLegality::IK_ReverseIntInduction == II.IK) {
      IntegerType *DstTy = cast<IntegerType>(II.StartValue->getType());
      Value *CNI = Builder.CreateSExtOrTrunc(NormalizedIdx, DstTy,
                                             "resize.norm.idx");
      Value *ReverseInd = Builder.CreateSub(II.StartValue, CNI,
                                            "reverse.idx");

      // This is a new value so do not hoist it out.
      Value *Broadcasted = getBroadcastInstrs(ReverseInd);
      // After broadcasting the induction variable we need to make the
      // vector consecutive by adding  ... -3, -2, -1, 0.
      for (unsigned part = 0; part < UF; ++part)
        Entry[part] = getConsecutiveVector(Broadcasted, -(int)VF * part,
                                           true);
      return;
    }

    // Handle the pointer induction variable case.
    assert(P->getType()->isPointerTy() && "Unexpected type.");

    // Is this a reverse induction ptr or a consecutive induction ptr.
    bool Reverse = (LoopVectorizationLegality::IK_ReversePtrInduction ==
                    II.IK);

    // This is the vector of results. Notice that we don't generate
    // vector geps because scalar geps result in better code.
    for (unsigned part = 0; part < UF; ++part) {
      if (VF == 1) {
        // Scalar case: one GEP per unroll part.
        int EltIndex = (part) * (Reverse ? -1 : 1);
        Constant *Idx = ConstantInt::get(Induction->getType(), EltIndex);
        Value *GlobalIdx;
        if (Reverse)
          GlobalIdx = Builder.CreateSub(Idx, NormalizedIdx, "gep.ridx");
        else
          GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx, "gep.idx");

        Value *SclrGep = Builder.CreateGEP(II.StartValue, GlobalIdx,
                                           "next.gep");
        Entry[part] = SclrGep;
        continue;
      }

      // Vector case: build the vector lane by lane from scalar GEPs.
      Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF));
      for (unsigned int i = 0; i < VF; ++i) {
        int EltIndex = (i + part * VF) * (Reverse ? -1 : 1);
        Constant *Idx = ConstantInt::get(Induction->getType(), EltIndex);
        Value *GlobalIdx;
        if (!Reverse)
          GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx, "gep.idx");
        else
          GlobalIdx = Builder.CreateSub(Idx, NormalizedIdx, "gep.ridx");

        Value *SclrGep = Builder.CreateGEP(II.StartValue, GlobalIdx,
                                           "next.gep");
        VecVal = Builder.CreateInsertElement(VecVal, SclrGep,
                                             Builder.getInt32(i),
                                             "insert.gep");
      }
      Entry[part] = VecVal;
    }
    return;
  }
}

void
InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal,
                                          BasicBlock *BB, PhiVector *PV) {
  // For each instruction in the old loop.
  // Widen each instruction in the basic block: every scalar instruction is
  // mapped to UF "parts", each part a VF-wide vector value stored in WidenMap.
  // (This loop is the body of the enclosing block-vectorization routine whose
  // signature appears above this chunk.)
  for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
    VectorParts &Entry = WidenMap.get(it);
    switch (it->getOpcode()) {
    case Instruction::Br:
      // Nothing to do for PHIs and BR, since we already took care of the
      // loop control flow instructions.
      continue;
    case Instruction::PHI:{
      // Vectorize PHINodes.
      widenPHIInstruction(it, Entry, Legal, UF, VF, PV);
      continue;
    }// End of PHI.

    case Instruction::Add:
    case Instruction::FAdd:
    case Instruction::Sub:
    case Instruction::FSub:
    case Instruction::Mul:
    case Instruction::FMul:
    case Instruction::UDiv:
    case Instruction::SDiv:
    case Instruction::FDiv:
    case Instruction::URem:
    case Instruction::SRem:
    case Instruction::FRem:
    case Instruction::Shl:
    case Instruction::LShr:
    case Instruction::AShr:
    case Instruction::And:
    case Instruction::Or:
    case Instruction::Xor: {
      // Just widen binops: emit one vector binop per unroll part, using the
      // already-widened operands.
      BinaryOperator *BinOp = dyn_cast<BinaryOperator>(it);
      setDebugLocFromInst(Builder, BinOp);
      VectorParts &A = getVectorValue(it->getOperand(0));
      VectorParts &B = getVectorValue(it->getOperand(1));

      // Use this vector value for all users of the original instruction.
      for (unsigned Part = 0; Part < UF; ++Part) {
        Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A[Part], B[Part]);

        // Update the NSW, NUW and Exact flags. Notice: V can be an Undef
        // (constant folding may not return a BinaryOperator), hence the
        // dyn_cast guard below.
        BinaryOperator *VecOp = dyn_cast<BinaryOperator>(V);
        if (VecOp && isa<OverflowingBinaryOperator>(BinOp)) {
          VecOp->setHasNoSignedWrap(BinOp->hasNoSignedWrap());
          VecOp->setHasNoUnsignedWrap(BinOp->hasNoUnsignedWrap());
        }
        if (VecOp && isa<PossiblyExactOperator>(VecOp))
          VecOp->setIsExact(BinOp->isExact());

        Entry[Part] = V;
      }
      break;
    }
    case Instruction::Select: {
      // Widen selects.
      // If the selector is loop invariant we can create a select
      // instruction with a scalar condition. Otherwise, use vector-select.
      bool InvariantCond = SE->isLoopInvariant(SE->getSCEV(it->getOperand(0)),
                                               OrigLoop);
      setDebugLocFromInst(Builder, it);

      // The condition can be loop invariant but still defined inside the
      // loop. This means that we can't just use the original 'cond' value.
      // We have to take the 'vectorized' value and pick the first lane.
      // Instcombine will make this a no-op.
      VectorParts &Cond = getVectorValue(it->getOperand(0));
      VectorParts &Op0  = getVectorValue(it->getOperand(1));
      VectorParts &Op1  = getVectorValue(it->getOperand(2));

      // With VF == 1 the "vector" condition is already scalar; otherwise
      // extract lane 0 for the invariant-condition case.
      Value *ScalarCond = (VF == 1) ? Cond[0] :
        Builder.CreateExtractElement(Cond[0], Builder.getInt32(0));

      for (unsigned Part = 0; Part < UF; ++Part) {
        Entry[Part] = Builder.CreateSelect(
          InvariantCond ? ScalarCond : Cond[Part],
          Op0[Part],
          Op1[Part]);
      }
      break;
    }

    case Instruction::ICmp:
    case Instruction::FCmp: {
      // Widen compares. Generate vector compares.
      bool FCmp = (it->getOpcode() == Instruction::FCmp);
      CmpInst *Cmp = dyn_cast<CmpInst>(it);
      setDebugLocFromInst(Builder, it);
      VectorParts &A = getVectorValue(it->getOperand(0));
      VectorParts &B = getVectorValue(it->getOperand(1));
      for (unsigned Part = 0; Part < UF; ++Part) {
        Value *C = 0;
        if (FCmp)
          C = Builder.CreateFCmp(Cmp->getPredicate(), A[Part], B[Part]);
        else
          C = Builder.CreateICmp(Cmp->getPredicate(), A[Part], B[Part]);
        Entry[Part] = C;
      }
      break;
    }

    case Instruction::Store:
    case Instruction::Load:
      // Memory operations get their own widening logic (consecutive,
      // strided, or scalarized access).
      vectorizeMemoryInstruction(it, Legal);
      break;
    case Instruction::ZExt:
    case Instruction::SExt:
    case Instruction::FPToUI:
    case Instruction::FPToSI:
    case Instruction::FPExt:
    case Instruction::PtrToInt:
    case Instruction::IntToPtr:
    case Instruction::SIToFP:
    case Instruction::UIToFP:
    case Instruction::Trunc:
    case Instruction::FPTrunc:
    case Instruction::BitCast: {
      CastInst *CI = dyn_cast<CastInst>(it);
      setDebugLocFromInst(Builder, it);
      /// Optimize the special case where the source is the induction
      /// variable. Notice that we can only optimize the 'trunc' case
      /// because: a. FP conversions lose precision, b. sext/zext may wrap,
        // Skip debug intrinsics entirely; they carry no computation to widen.
        if (isa<DbgInfoIntrinsic>(it))
          break;
        setDebugLocFromInst(Builder, it);

        Module *M = BB->getParent()->getParent();
        CallInst *CI = cast<CallInst>(it);
        Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
        assert(ID && "Not an intrinsic call!");
        switch (ID) {
        case Intrinsic::lifetime_end:
        case Intrinsic::lifetime_start:
          // Lifetime markers are not widened; replicate them per scalar lane.
          scalarizeInstruction(it);
          break;
        default:
          // Widen the intrinsic: call the vector overload once per unroll
          // part, with every argument replaced by its widened counterpart.
          for (unsigned Part = 0; Part < UF; ++Part) {
            SmallVector<Value *, 4> Args;
            for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
              VectorParts &Arg = getVectorValue(CI->getArgOperand(i));
              Args.push_back(Arg[Part]);
            }
            Type *Tys[] = {CI->getType()};
            if (VF > 1)
              Tys[0] = VectorType::get(CI->getType()->getScalarType(), VF);

            Function *F = Intrinsic::getDeclaration(M, ID, Tys);
            Entry[Part] = Builder.CreateCall(F, Args);
          }
          break;
        }
        break;
      }

      default:
        // All other instructions are unsupported. Scalarize them.
        scalarizeInstruction(it);
        break;
      }// end of switch.
    }// end of for_each instr.
}

/// Patch up ScalarEvolution and the DominatorTree after the new vector-loop
/// CFG has been created: the bypass chain, vector preheader/body, middle
/// block, and scalar remainder blocks are registered in dominance order.
void InnerLoopVectorizer::updateAnalysis() {
  // Forget the original basic block.
  SE->forgetLoop(OrigLoop);

  // Update the dominator tree information.
  assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) &&
         "Entry does not dominate exit.");

  // Each bypass block is dominated by its predecessor in the chain; the
  // remaining new blocks hang off the chain ends.
  for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
    DT->addNewBlock(LoopBypassBlocks[I], LoopBypassBlocks[I-1]);
  DT->addNewBlock(LoopVectorPreHeader, LoopBypassBlocks.back());
  DT->addNewBlock(LoopVectorBody, LoopVectorPreHeader);
  DT->addNewBlock(LoopMiddleBlock, LoopBypassBlocks.front());
  DT->addNewBlock(LoopScalarPreHeader, LoopMiddleBlock);
  DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
  DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);

  DEBUG(DT->verifyAnalysis());
}

/// \brief Check whether it is safe to if-convert this phi node.
///
/// Phi nodes with constant expressions that can trap are not safe to if
/// convert.
static bool canIfConvertPHINodes(BasicBlock *BB) {
  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
    PHINode *Phi = dyn_cast<PHINode>(I);
    // PHIs are grouped at the top of a block, so the first non-PHI ends the
    // scan and the block is safe.
    if (!Phi)
      return true;
    for (unsigned p = 0, e = Phi->getNumIncomingValues(); p != e; ++p)
      if (Constant *C = dyn_cast<Constant>(Phi->getIncomingValue(p)))
        if (C->canTrap())
          return false;
  }
  return true;
}

/// Decide whether a multi-block loop can be flattened (if-converted) into a
/// single block so it becomes vectorizable. Returns false if if-conversion is
/// disabled, a block uses a non-branch terminator, or some block cannot be
/// predicated safely.
bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
  if (!EnableIfConversion)
    return false;

  assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable");

  // A list of pointers that we can safely read and write to.
  // NOTE(review): "SafePointes" is a pre-existing typo for "SafePointers";
  // kept as-is since it is only a local name.
  SmallPtrSet<Value *, 8> SafePointes;

  // Collect safe addresses: any pointer that is unconditionally loaded or
  // stored in a non-predicated block is known dereferenceable, so a
  // predicated access to it cannot introduce a new trap.
  for (Loop::block_iterator BI = TheLoop->block_begin(),
         BE = TheLoop->block_end(); BI != BE; ++BI) {
    BasicBlock *BB = *BI;

    if (blockNeedsPredication(BB))
      continue;

    for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
      if (LoadInst *LI = dyn_cast<LoadInst>(I))
        SafePointes.insert(LI->getPointerOperand());
      else if (StoreInst *SI = dyn_cast<StoreInst>(I))
        SafePointes.insert(SI->getPointerOperand());
    }
  }

  // Collect the blocks that need predication.
  BasicBlock *Header = TheLoop->getHeader();
  for (Loop::block_iterator BI = TheLoop->block_begin(),
         BE = TheLoop->block_end(); BI != BE; ++BI) {
    BasicBlock *BB = *BI;

    // We don't support switch statements inside loops.
    if (!isa<BranchInst>(BB->getTerminator()))
      return false;

    // We must be able to predicate all blocks that need to be predicated.
    // Non-predicated non-header blocks must still have if-convertible PHIs
    // (no trapping constant expressions as incoming values).
    if (blockNeedsPredication(BB)) {
      if (!blockCanBePredicated(BB, SafePointes))
        return false;
    } else if (BB != Header && !canIfConvertPHINodes(BB))
      return false;

  }

  // We can if-convert this loop.
  return true;
}

/// Top-level legality check: verifies loop shape (preheader, innermost,
/// single backedge, single bottom-tested exit), if-convertibility, a
/// SCEV-computable trip count, a worthwhile trip count, and that all
/// instructions and memory accesses are vectorizable.
bool LoopVectorizationLegality::canVectorize() {
  // We must have a loop in canonical form. Loops with indirectbr in them cannot
  // be canonicalized.
  if (!TheLoop->getLoopPreheader())
    return false;

  // We can only vectorize innermost loops.
  if (TheLoop->getSubLoopsVector().size())
    return false;

  // We must have a single backedge.
  if (TheLoop->getNumBackEdges() != 1)
    return false;

  // We must have a single exiting block.
  if (!TheLoop->getExitingBlock())
    return false;

  // We only handle bottom-tested loops, i.e. loop in which the condition is
  // checked at the end of each iteration. With that we can assume that all
  // instructions in the loop are executed the same number of times.
  if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
    DEBUG(dbgs() << "LV: loop control flow is not understood by vectorizer\n");
    return false;
  }

  // We need to have a loop header.
  DEBUG(dbgs() << "LV: Found a loop: " <<
        TheLoop->getHeader()->getName() << '\n');

  // Check if we can if-convert non single-bb loops.
  unsigned NumBlocks = TheLoop->getNumBlocks();
  if (NumBlocks != 1 && !canVectorizeWithIfConvert()) {
    DEBUG(dbgs() << "LV: Can't if-convert the loop.\n");
    return false;
  }

  // ScalarEvolution needs to be able to find the exit count.
  const SCEV *ExitCount = SE->getBackedgeTakenCount(TheLoop);
  if (ExitCount == SE->getCouldNotCompute()) {
    DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n");
    return false;
  }

  // Do not loop-vectorize loops with a tiny trip count.
  BasicBlock *Latch = TheLoop->getLoopLatch();
  unsigned TC = SE->getSmallConstantTripCount(TheLoop, Latch);
  if (TC > 0u && TC < TinyTripCountVectorThreshold) {
    DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " <<
          "This loop is not worth vectorizing.\n");
    return false;
  }

  // Check if we can vectorize the instructions and CFG in this loop.
  if (!canVectorizeInstrs()) {
    DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n");
    return false;
  }

  // Go over each instruction and look at memory deps.
  if (!canVectorizeMemory()) {
    DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n");
    return false;
  }

  // Collect all of the variables that remain uniform after vectorization.
  collectLoopUniforms();

  DEBUG(dbgs() << "LV: We can vectorize this loop" <<
        (PtrRtCheck.Need ? " (with a runtime bound check)" : "")
        <<"!\n");

  // Okay! We can vectorize. At this point we don't have any other mem analysis
  // which may limit our maximum vectorization factor, so just return true with
  // no restrictions.
  return true;
}

/// Map a pointer type to the target's pointer-sized integer, and promote
/// sub-32-bit integers to i32; other types pass through unchanged.
static Type *convertPointerToIntegerType(DataLayout &DL, Type *Ty) {
  if (Ty->isPointerTy())
    return DL.getIntPtrType(Ty);

  // It is possible that char's or short's overflow when we ask for the loop's
  // trip count, work around this by changing the type size.
  if (Ty->getScalarSizeInBits() < 32)
    return Type::getInt32Ty(Ty->getContext());

  return Ty;
}

/// Return whichever of the two types is wider, after normalizing both with
/// convertPointerToIntegerType.
static Type* getWiderType(DataLayout &DL, Type *Ty0, Type *Ty1) {
  Ty0 = convertPointerToIntegerType(DL, Ty0);
  Ty1 = convertPointerToIntegerType(DL, Ty1);
  if (Ty0->getScalarSizeInBits() > Ty1->getScalarSizeInBits())
    return Ty0;
  return Ty1;
}

/// \brief Check that the instruction has outside loop users and is not an
/// identified reduction variable.
static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst,
                               SmallPtrSet<Value *, 4> &Reductions) {
  // Reduction instructions are allowed to have exit users. All other
  // instructions must not have external users.
  if (!Reductions.count(Inst))
    // Check that all of the users of the loop are inside the BB.
    for (Value::use_iterator I = Inst->use_begin(), E = Inst->use_end();
         I != E; ++I) {
      Instruction *U = cast<Instruction>(*I);
      // This user may be a reduction exit value.
      if (!TheLoop->contains(U)) {
        DEBUG(dbgs() << "LV: Found an outside user for : " << *U << '\n');
        return true;
      }
    }
  return false;
}

/// Scan every instruction in the loop and classify PHIs as induction or
/// reduction variables; reject any instruction (unknown call, invalid type,
/// unexpected outside-loop user) that the vectorizer cannot widen.
bool LoopVectorizationLegality::canVectorizeInstrs() {
  BasicBlock *PreHeader = TheLoop->getLoopPreheader();
  BasicBlock *Header = TheLoop->getHeader();

  // Look for the attribute signaling the absence of NaNs, which enables
  // recognition of FP min/max reductions.
  Function &F = *Header->getParent();
  if (F.hasFnAttribute("no-nans-fp-math"))
    HasFunNoNaNAttr = F.getAttributes().getAttribute(
      AttributeSet::FunctionIndex,
      "no-nans-fp-math").getValueAsString() == "true";

  // For each block in the loop.
  for (Loop::block_iterator bb = TheLoop->block_begin(),
       be = TheLoop->block_end(); bb != be; ++bb) {

    // Scan the instructions in the block and look for hazards.
    for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e;
         ++it) {

      if (PHINode *Phi = dyn_cast<PHINode>(it)) {
        Type *PhiTy = Phi->getType();
        // Check that this PHI type is allowed.
        if (!PhiTy->isIntegerTy() &&
            !PhiTy->isFloatingPointTy() &&
            !PhiTy->isPointerTy()) {
          DEBUG(dbgs() << "LV: Found an non-int non-pointer PHI.\n");
          return false;
        }

        // If this PHINode is not in the header block, then we know that we
        // can convert it to select during if-conversion. No need to check if
        // the PHIs in this block are induction or reduction variables.
        if (*bb != Header) {
          // Check that this instruction has no outside users or is an
          // identified reduction value with an outside user.
          if(!hasOutsideLoopUser(TheLoop, it, AllowedExit))
            continue;
          return false;
        }

        // We only allow header PHIs with exactly two incoming values
        // (preheader and latch).
        if (Phi->getNumIncomingValues() != 2) {
          DEBUG(dbgs() << "LV: Found an invalid PHI.\n");
          return false;
        }

        // This is the value coming from the preheader.
        Value *StartValue = Phi->getIncomingValueForBlock(PreHeader);
        // Check if this is an induction variable.
        InductionKind IK = isInductionVariable(Phi);

        if (IK_NoInduction != IK) {
          // Get the widest type.
          if (!WidestIndTy)
            WidestIndTy = convertPointerToIntegerType(*DL, PhiTy);
          else
            WidestIndTy = getWiderType(*DL, PhiTy, WidestIndTy);

          // Int inductions are special because we only allow one IV.
          if (IK == IK_IntInduction) {
            // Use the phi node with the widest type as induction. Use the last
            // one if there are multiple (no good reason for doing this other
            // than it is expedient).
            if (!Induction || PhiTy == WidestIndTy)
              Induction = Phi;
          }

          DEBUG(dbgs() << "LV: Found an induction variable.\n");
          Inductions[Phi] = InductionInfo(StartValue, IK);

          // Until we explicitly handle the case of an induction variable with
          // an outside loop user we have to give up vectorizing this loop.
          if (hasOutsideLoopUser(TheLoop, it, AllowedExit))
            return false;

          continue;
        }

        // Not an induction: try every supported reduction kind in turn.
        if (AddReductionVar(Phi, RK_IntegerAdd)) {
          DEBUG(dbgs() << "LV: Found an ADD reduction PHI."<< *Phi <<"\n");
          continue;
        }
        if (AddReductionVar(Phi, RK_IntegerMult)) {
          DEBUG(dbgs() << "LV: Found a MUL reduction PHI."<< *Phi <<"\n");
          continue;
        }
        if (AddReductionVar(Phi, RK_IntegerOr)) {
          DEBUG(dbgs() << "LV: Found an OR reduction PHI."<< *Phi <<"\n");
          continue;
        }
        if (AddReductionVar(Phi, RK_IntegerAnd)) {
          DEBUG(dbgs() << "LV: Found an AND reduction PHI."<< *Phi <<"\n");
          continue;
        }
        if (AddReductionVar(Phi, RK_IntegerXor)) {
          DEBUG(dbgs() << "LV: Found a XOR reduction PHI."<< *Phi <<"\n");
          continue;
        }
        if (AddReductionVar(Phi, RK_IntegerMinMax)) {
          DEBUG(dbgs() << "LV: Found a MINMAX reduction PHI."<< *Phi <<"\n");
          continue;
        }
        if (AddReductionVar(Phi, RK_FloatMult)) {
          DEBUG(dbgs() << "LV: Found an FMult reduction PHI."<< *Phi <<"\n");
          continue;
        }
        if (AddReductionVar(Phi, RK_FloatAdd)) {
          DEBUG(dbgs() << "LV: Found an FAdd reduction PHI."<< *Phi <<"\n");
          continue;
        }
        if (AddReductionVar(Phi, RK_FloatMinMax)) {
          DEBUG(dbgs() << "LV: Found an float MINMAX reduction PHI."<< *Phi <<
                "\n");
          continue;
        }

        DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n");
        return false;
      }// end of PHI handling

      // We still don't handle functions. However, we can ignore dbg intrinsic
      // calls and we do handle certain intrinsic and libm functions.
      CallInst *CI = dyn_cast<CallInst>(it);
      if (CI && !getIntrinsicIDForCall(CI, TLI) && !isa<DbgInfoIntrinsic>(CI)) {
        DEBUG(dbgs() << "LV: Found a call site.\n");
        return false;
      }

      // Check that the instruction return type is vectorizable.
      // Also, we can't vectorize extractelement instructions.
      if ((!VectorType::isValidElementType(it->getType()) &&
           !it->getType()->isVoidTy()) || isa<ExtractElementInst>(it)) {
        DEBUG(dbgs() << "LV: Found unvectorizable type.\n");
        return false;
      }

      // Check that the stored type is vectorizable.
      if (StoreInst *ST = dyn_cast<StoreInst>(it)) {
        Type *T = ST->getValueOperand()->getType();
        if (!VectorType::isValidElementType(T))
          return false;
      }

      // Reduction instructions are allowed to have exit users.
      // All other instructions must not have external users.
      if (hasOutsideLoopUser(TheLoop, it, AllowedExit))
        return false;

    } // next instr.

  }

  // No integer induction was found; that is still acceptable as long as some
  // induction variable (e.g. a pointer induction) exists.
  if (!Induction) {
    DEBUG(dbgs() << "LV: Did not find one integer induction var.\n");
    if (Inductions.empty())
      return false;
  }

  return true;
}

void LoopVectorizationLegality::collectLoopUniforms() {
  // We now know that the loop is vectorizable!
  // Collect variables that will remain uniform after vectorization.
  std::vector<Value*> Worklist;
  BasicBlock *Latch = TheLoop->getLoopLatch();

  // Start with the conditional branch and walk up the block: everything the
  // latch's exit condition depends on (transitively, within the loop) is
  // uniform across vector lanes.
  Worklist.push_back(Latch->getTerminator()->getOperand(0));

  while (Worklist.size()) {
    Instruction *I = dyn_cast<Instruction>(Worklist.back());
    Worklist.pop_back();

    // Look at instructions inside this loop.
    // Stop when reaching PHI nodes.
    // TODO: we need to follow values all over the loop, not only in this block.
    if (!I || !TheLoop->contains(I) || isa<PHINode>(I))
      continue;

    // This is a known uniform.
    Uniforms.insert(I);

    // Insert all operands.
    Worklist.insert(Worklist.end(), I->op_begin(), I->op_end());
  }
}

namespace {
/// \brief Analyses memory accesses in a loop.
///
/// Checks whether run time pointer checks are needed and builds sets for data
/// dependence checking.
class AccessAnalysis {
public:
  /// \brief Read or write access location. The bool is true for a write.
  typedef PointerIntPair<Value *, 1, bool> MemAccessInfo;
  typedef SmallPtrSet<MemAccessInfo, 8> MemAccessInfoSet;

  /// \brief Set of potential dependent memory accesses.
  typedef EquivalenceClasses<MemAccessInfo> DepCandidates;

  AccessAnalysis(DataLayout *Dl, DepCandidates &DA) :
    DL(Dl), DepCands(DA), AreAllWritesIdentified(true),
    AreAllReadsIdentified(true), IsRTCheckNeeded(false) {}

  /// \brief Register a load and whether it is only read from.
  void addLoad(Value *Ptr, bool IsReadOnly) {
    Accesses.insert(MemAccessInfo(Ptr, false));
    if (IsReadOnly)
      ReadOnlyPtr.insert(Ptr);
  }

  /// \brief Register a store.
  void addStore(Value *Ptr) {
    Accesses.insert(MemAccessInfo(Ptr, true));
  }

  /// \brief Check whether we can check the pointers at runtime for
  /// non-intersection.
  bool canCheckPtrAtRT(LoopVectorizationLegality::RuntimePointerCheck &RtCheck,
                       unsigned &NumComparisons, ScalarEvolution *SE,
                       Loop *TheLoop, bool ShouldCheckStride = false);

  /// \brief Goes over all memory accesses, checks whether a RT check is needed
  /// and builds sets of dependent accesses.
  void buildDependenceSets() {
    // Process read-write pointers first.
    processMemAccesses(false);
    // Next, process read pointers.
    processMemAccesses(true);
  }

  bool isRTCheckNeeded() { return IsRTCheckNeeded; }

  bool isDependencyCheckNeeded() { return !CheckDeps.empty(); }
  void resetDepChecks() { CheckDeps.clear(); }

  MemAccessInfoSet &getDependenciesToCheck() { return CheckDeps; }

private:
  typedef SetVector<MemAccessInfo> PtrAccessSet;
  typedef DenseMap<Value*, MemAccessInfo> UnderlyingObjToAccessMap;

  /// \brief Go over all memory access or only the deferred ones if
  /// \p UseDeferred is true and check whether runtime pointer checks are needed
  /// and build sets of dependency check candidates.
  void processMemAccesses(bool UseDeferred);

  /// Set of all accesses.
  PtrAccessSet Accesses;

  /// Set of access to check after all writes have been processed.
  PtrAccessSet DeferredAccesses;

  /// Map of pointers to last access encountered.
  UnderlyingObjToAccessMap ObjToLastAccess;

  /// Set of accesses that need a further dependence check.
  MemAccessInfoSet CheckDeps;

  /// Set of pointers that are read only.
  SmallPtrSet<Value*, 16> ReadOnlyPtr;

  /// Set of underlying objects already written to.
  SmallPtrSet<Value*, 16> WriteObjects;

  DataLayout *DL;

  /// Sets of potentially dependent accesses - members of one set share an
  /// underlying pointer. The set "CheckDeps" identifies which sets really need
  /// a dependence check.
  DepCandidates &DepCands;

  bool AreAllWritesIdentified;
  bool AreAllReadsIdentified;
  bool IsRTCheckNeeded;
};

} // end anonymous namespace

/// \brief Check whether a pointer can participate in a runtime bounds check,
/// i.e. its SCEV is an affine add-recurrence.
static bool hasComputableBounds(ScalarEvolution *SE, Value *Ptr) {
  const SCEV *PtrScev = SE->getSCEV(Ptr);
  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrScev);
  if (!AR)
    return false;

  return AR->isAffine();
}

/// \brief Check the stride of the pointer and ensure that it does not wrap in
/// the address space.
static int isStridedPtr(ScalarEvolution *SE, DataLayout *DL, Value *Ptr,
                        const Loop *Lp);

/// Decide whether all recorded accesses can be covered by runtime bounds
/// checks. Fills \p RtCheck with one entry per pointer (tagged with a
/// dependence-set id) and sets \p NumComparisons to the number of pairwise
/// comparisons the check will cost. Returns false if any pointer lacks
/// computable bounds or if pointers span different address spaces.
bool AccessAnalysis::canCheckPtrAtRT(
                       LoopVectorizationLegality::RuntimePointerCheck &RtCheck,
                        unsigned &NumComparisons, ScalarEvolution *SE,
                        Loop *TheLoop, bool ShouldCheckStride) {
  // Find pointers with computable bounds. We are going to use this information
  // to place a runtime bound check.
  unsigned NumReadPtrChecks = 0;
  unsigned NumWritePtrChecks = 0;
  bool CanDoRT = true;

  bool IsDepCheckNeeded = isDependencyCheckNeeded();
  // We assign consecutive id to access from different dependence sets.
  // Accesses within the same set don't need a runtime check.
  unsigned RunningDepId = 1;
  DenseMap<Value *, unsigned> DepSetId;

  for (PtrAccessSet::iterator AI = Accesses.begin(), AE = Accesses.end();
       AI != AE; ++AI) {
    const MemAccessInfo &Access = *AI;
    Value *Ptr = Access.getPointer();
    bool IsWrite = Access.getInt();

    // Just add write checks if we have both.
    if (!IsWrite && Accesses.count(MemAccessInfo(Ptr, true)))
      continue;

    if (IsWrite)
      ++NumWritePtrChecks;
    else
      ++NumReadPtrChecks;

    if (hasComputableBounds(SE, Ptr) &&
        // When we run after a failing dependency check we have to make sure we
        // don't have wrapping pointers.
        (!ShouldCheckStride || isStridedPtr(SE, DL, Ptr, TheLoop) == 1)) {
      // The id of the dependence set.
      unsigned DepId;

      if (IsDepCheckNeeded) {
        // All members of a dependence-candidate class share the id of the
        // class leader, so intra-class pairs are skipped by the RT check.
        Value *Leader = DepCands.getLeaderValue(Access).getPointer();
        unsigned &LeaderId = DepSetId[Leader];
        if (!LeaderId)
          LeaderId = RunningDepId++;
        DepId = LeaderId;
      } else
        // Each access has its own dependence set.
        DepId = RunningDepId++;

      RtCheck.insert(SE, TheLoop, Ptr, IsWrite, DepId);

      DEBUG(dbgs() << "LV: Found a runtime check ptr:" << *Ptr << '\n');
    } else {
      CanDoRT = false;
    }
  }

  // RunningDepId == 2 means a single dependence set was created, so no
  // cross-set comparisons are needed at all.
  if (IsDepCheckNeeded && CanDoRT && RunningDepId == 2)
    NumComparisons = 0; // Only one dependence set.
  else {
    NumComparisons = (NumWritePtrChecks * (NumReadPtrChecks +
                                           NumWritePtrChecks - 1));
  }

  // If the pointers that we would use for the bounds comparison have different
  // address spaces, assume the values aren't directly comparable, so we can't
  // use them for the runtime check. We also have to assume they could
  // overlap. In the future there should be metadata for whether address spaces
  // are disjoint.
  unsigned NumPointers = RtCheck.Pointers.size();
  for (unsigned i = 0; i < NumPointers; ++i) {
    for (unsigned j = i + 1; j < NumPointers; ++j) {
      // Only need to check pointers between two different dependency sets.
      if (RtCheck.DependencySetId[i] == RtCheck.DependencySetId[j])
        continue;

      Value *PtrI = RtCheck.Pointers[i];
      Value *PtrJ = RtCheck.Pointers[j];

      unsigned ASi = PtrI->getType()->getPointerAddressSpace();
      unsigned ASj = PtrJ->getType()->getPointerAddressSpace();
      if (ASi != ASj) {
        DEBUG(dbgs() << "LV: Runtime check would require comparison between"
                     " different address spaces\n");
        return false;
      }
    }
  }

  return CanDoRT;
}

/// True if \p Ptr is known to name distinct storage within this function:
/// a noalias argument, a noalias call result, or an alloca.
static bool isFunctionScopeIdentifiedObject(Value *Ptr) {
  return isNoAliasArgument(Ptr) || isNoAliasCall(Ptr) || isa<AllocaInst>(Ptr);
}

void AccessAnalysis::processMemAccesses(bool UseDeferred) {
  // We process the set twice: first we process read-write pointers, last we
  // process read-only pointers. This allows us to skip dependence tests for
  // read-only pointers.

  PtrAccessSet &S = UseDeferred ? DeferredAccesses : Accesses;
  for (PtrAccessSet::iterator AI = S.begin(), AE = S.end(); AI != AE; ++AI) {
    const MemAccessInfo &Access = *AI;
    Value *Ptr = Access.getPointer();
    bool IsWrite = Access.getInt();

    DepCands.insert(Access);

    // Memorize read-only pointers for later processing and skip them in the
    // first round (they need to be checked after we have seen all write
    // pointers). Note: we also mark pointer that are not consecutive as
    // "read-only" pointers (so that we check "a[b[i]] +="). Hence, we need the
    // second check for "!IsWrite".
3376263508Sdim bool IsReadOnlyPtr = ReadOnlyPtr.count(Ptr) && !IsWrite; 3377263508Sdim if (!UseDeferred && IsReadOnlyPtr) { 3378263508Sdim DeferredAccesses.insert(Access); 3379263508Sdim continue; 3380263508Sdim } 3381263508Sdim 3382263508Sdim bool NeedDepCheck = false; 3383263508Sdim // Check whether there is the possiblity of dependency because of underlying 3384263508Sdim // objects being the same. 3385263508Sdim typedef SmallVector<Value*, 16> ValueVector; 3386263508Sdim ValueVector TempObjects; 3387263508Sdim GetUnderlyingObjects(Ptr, TempObjects, DL); 3388263508Sdim for (ValueVector::iterator UI = TempObjects.begin(), UE = TempObjects.end(); 3389263508Sdim UI != UE; ++UI) { 3390263508Sdim Value *UnderlyingObj = *UI; 3391263508Sdim 3392263508Sdim // If this is a write then it needs to be an identified object. If this a 3393263508Sdim // read and all writes (so far) are identified function scope objects we 3394263508Sdim // don't need an identified underlying object but only an Argument (the 3395263508Sdim // next write is going to invalidate this assumption if it is 3396263508Sdim // unidentified). 3397263508Sdim // This is a micro-optimization for the case where all writes are 3398263508Sdim // identified and we have one argument pointer. 3399263508Sdim // Otherwise, we do need a runtime check. 3400263508Sdim if ((IsWrite && !isFunctionScopeIdentifiedObject(UnderlyingObj)) || 3401263508Sdim (!IsWrite && (!AreAllWritesIdentified || 3402263508Sdim !isa<Argument>(UnderlyingObj)) && 3403263508Sdim !isIdentifiedObject(UnderlyingObj))) { 3404263508Sdim DEBUG(dbgs() << "LV: Found an unidentified " << 3405263508Sdim (IsWrite ? 
"write" : "read" ) << " ptr: " << *UnderlyingObj << 3406263508Sdim "\n"); 3407263508Sdim IsRTCheckNeeded = (IsRTCheckNeeded || 3408263508Sdim !isIdentifiedObject(UnderlyingObj) || 3409263508Sdim !AreAllReadsIdentified); 3410263508Sdim 3411263508Sdim if (IsWrite) 3412263508Sdim AreAllWritesIdentified = false; 3413263508Sdim if (!IsWrite) 3414263508Sdim AreAllReadsIdentified = false; 3415263508Sdim } 3416263508Sdim 3417263508Sdim // If this is a write - check other reads and writes for conflicts. If 3418263508Sdim // this is a read only check other writes for conflicts (but only if there 3419263508Sdim // is no other write to the ptr - this is an optimization to catch "a[i] = 3420263508Sdim // a[i] + " without having to do a dependence check). 3421263508Sdim if ((IsWrite || IsReadOnlyPtr) && WriteObjects.count(UnderlyingObj)) 3422263508Sdim NeedDepCheck = true; 3423263508Sdim 3424263508Sdim if (IsWrite) 3425263508Sdim WriteObjects.insert(UnderlyingObj); 3426263508Sdim 3427263508Sdim // Create sets of pointers connected by shared underlying objects. 3428263508Sdim UnderlyingObjToAccessMap::iterator Prev = 3429263508Sdim ObjToLastAccess.find(UnderlyingObj); 3430263508Sdim if (Prev != ObjToLastAccess.end()) 3431263508Sdim DepCands.unionSets(Access, Prev->second); 3432263508Sdim 3433263508Sdim ObjToLastAccess[UnderlyingObj] = Access; 3434263508Sdim } 3435263508Sdim 3436263508Sdim if (NeedDepCheck) 3437263508Sdim CheckDeps.insert(Access); 3438263508Sdim } 3439263508Sdim} 3440263508Sdim 3441263508Sdimnamespace { 3442263508Sdim/// \brief Checks memory dependences among accesses to the same underlying 3443263508Sdim/// object to determine whether there vectorization is legal or not (and at 3444263508Sdim/// which vectorization factor). 3445263508Sdim/// 3446263508Sdim/// This class works under the assumption that we already checked that memory 3447263508Sdim/// locations with different underlying pointers are "must-not alias". 
3448263508Sdim/// We use the ScalarEvolution framework to symbolically evalutate access 3449263508Sdim/// functions pairs. Since we currently don't restructure the loop we can rely 3450263508Sdim/// on the program order of memory accesses to determine their safety. 3451263508Sdim/// At the moment we will only deem accesses as safe for: 3452263508Sdim/// * A negative constant distance assuming program order. 3453263508Sdim/// 3454263508Sdim/// Safe: tmp = a[i + 1]; OR a[i + 1] = x; 3455263508Sdim/// a[i] = tmp; y = a[i]; 3456263508Sdim/// 3457263508Sdim/// The latter case is safe because later checks guarantuee that there can't 3458263508Sdim/// be a cycle through a phi node (that is, we check that "x" and "y" is not 3459263508Sdim/// the same variable: a header phi can only be an induction or a reduction, a 3460263508Sdim/// reduction can't have a memory sink, an induction can't have a memory 3461263508Sdim/// source). This is important and must not be violated (or we have to 3462263508Sdim/// resort to checking for cycles through memory). 3463263508Sdim/// 3464263508Sdim/// * A positive constant distance assuming program order that is bigger 3465263508Sdim/// than the biggest memory access. 3466263508Sdim/// 3467263508Sdim/// tmp = a[i] OR b[i] = x 3468263508Sdim/// a[i+2] = tmp y = b[i+2]; 3469263508Sdim/// 3470263508Sdim/// Safe distance: 2 x sizeof(a[0]), and 2 x sizeof(b[0]), respectively. 3471263508Sdim/// 3472263508Sdim/// * Zero distances and all accesses have the same size. 
3473263508Sdim/// 3474263508Sdimclass MemoryDepChecker { 3475263508Sdimpublic: 3476263508Sdim typedef PointerIntPair<Value *, 1, bool> MemAccessInfo; 3477263508Sdim typedef SmallPtrSet<MemAccessInfo, 8> MemAccessInfoSet; 3478263508Sdim 3479263508Sdim MemoryDepChecker(ScalarEvolution *Se, DataLayout *Dl, const Loop *L) 3480263508Sdim : SE(Se), DL(Dl), InnermostLoop(L), AccessIdx(0), 3481263508Sdim ShouldRetryWithRuntimeCheck(false) {} 3482263508Sdim 3483263508Sdim /// \brief Register the location (instructions are given increasing numbers) 3484263508Sdim /// of a write access. 3485263508Sdim void addAccess(StoreInst *SI) { 3486263508Sdim Value *Ptr = SI->getPointerOperand(); 3487263508Sdim Accesses[MemAccessInfo(Ptr, true)].push_back(AccessIdx); 3488263508Sdim InstMap.push_back(SI); 3489263508Sdim ++AccessIdx; 3490263508Sdim } 3491263508Sdim 3492263508Sdim /// \brief Register the location (instructions are given increasing numbers) 3493263508Sdim /// of a write access. 3494263508Sdim void addAccess(LoadInst *LI) { 3495263508Sdim Value *Ptr = LI->getPointerOperand(); 3496263508Sdim Accesses[MemAccessInfo(Ptr, false)].push_back(AccessIdx); 3497263508Sdim InstMap.push_back(LI); 3498263508Sdim ++AccessIdx; 3499263508Sdim } 3500263508Sdim 3501263508Sdim /// \brief Check whether the dependencies between the accesses are safe. 3502263508Sdim /// 3503263508Sdim /// Only checks sets with elements in \p CheckDeps. 3504263508Sdim bool areDepsSafe(AccessAnalysis::DepCandidates &AccessSets, 3505263508Sdim MemAccessInfoSet &CheckDeps); 3506263508Sdim 3507263508Sdim /// \brief The maximum number of bytes of a vector register we can vectorize 3508263508Sdim /// the accesses safely with. 3509263508Sdim unsigned getMaxSafeDepDistBytes() { return MaxSafeDepDistBytes; } 3510263508Sdim 3511263508Sdim /// \brief In same cases when the dependency check fails we can still 3512263508Sdim /// vectorize the loop with a dynamic array access check. 
3513263508Sdim bool shouldRetryWithRuntimeCheck() { return ShouldRetryWithRuntimeCheck; } 3514263508Sdim 3515263508Sdimprivate: 3516263508Sdim ScalarEvolution *SE; 3517263508Sdim DataLayout *DL; 3518263508Sdim const Loop *InnermostLoop; 3519263508Sdim 3520263508Sdim /// \brief Maps access locations (ptr, read/write) to program order. 3521263508Sdim DenseMap<MemAccessInfo, std::vector<unsigned> > Accesses; 3522263508Sdim 3523263508Sdim /// \brief Memory access instructions in program order. 3524263508Sdim SmallVector<Instruction *, 16> InstMap; 3525263508Sdim 3526263508Sdim /// \brief The program order index to be used for the next instruction. 3527263508Sdim unsigned AccessIdx; 3528263508Sdim 3529263508Sdim // We can access this many bytes in parallel safely. 3530263508Sdim unsigned MaxSafeDepDistBytes; 3531263508Sdim 3532263508Sdim /// \brief If we see a non constant dependence distance we can still try to 3533263508Sdim /// vectorize this loop with runtime checks. 3534263508Sdim bool ShouldRetryWithRuntimeCheck; 3535263508Sdim 3536263508Sdim /// \brief Check whether there is a plausible dependence between the two 3537263508Sdim /// accesses. 3538263508Sdim /// 3539263508Sdim /// Access \p A must happen before \p B in program order. The two indices 3540263508Sdim /// identify the index into the program order map. 3541263508Sdim /// 3542263508Sdim /// This function checks whether there is a plausible dependence (or the 3543263508Sdim /// absence of such can't be proved) between the two accesses. If there is a 3544263508Sdim /// plausible dependence but the dependence distance is bigger than one 3545263508Sdim /// element access it records this distance in \p MaxSafeDepDistBytes (if this 3546263508Sdim /// distance is smaller than any other distance encountered so far). 3547263508Sdim /// Otherwise, this function returns true signaling a possible dependence. 
3548263508Sdim bool isDependent(const MemAccessInfo &A, unsigned AIdx, 3549263508Sdim const MemAccessInfo &B, unsigned BIdx); 3550263508Sdim 3551263508Sdim /// \brief Check whether the data dependence could prevent store-load 3552263508Sdim /// forwarding. 3553263508Sdim bool couldPreventStoreLoadForward(unsigned Distance, unsigned TypeByteSize); 3554263508Sdim}; 3555263508Sdim 3556263508Sdim} // end anonymous namespace 3557263508Sdim 3558263508Sdimstatic bool isInBoundsGep(Value *Ptr) { 3559263508Sdim if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr)) 3560263508Sdim return GEP->isInBounds(); 3561263508Sdim return false; 3562263508Sdim} 3563263508Sdim 3564263508Sdim/// \brief Check whether the access through \p Ptr has a constant stride. 3565263508Sdimstatic int isStridedPtr(ScalarEvolution *SE, DataLayout *DL, Value *Ptr, 3566263508Sdim const Loop *Lp) { 3567263508Sdim const Type *Ty = Ptr->getType(); 3568263508Sdim assert(Ty->isPointerTy() && "Unexpected non ptr"); 3569263508Sdim 3570263508Sdim // Make sure that the pointer does not point to aggregate types. 3571263508Sdim const PointerType *PtrTy = cast<PointerType>(Ty); 3572263508Sdim if (PtrTy->getElementType()->isAggregateType()) { 3573263508Sdim DEBUG(dbgs() << "LV: Bad stride - Not a pointer to a scalar type" << *Ptr << 3574263508Sdim "\n"); 3575263508Sdim return 0; 3576263508Sdim } 3577263508Sdim 3578263508Sdim const SCEV *PtrScev = SE->getSCEV(Ptr); 3579263508Sdim const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrScev); 3580263508Sdim if (!AR) { 3581263508Sdim DEBUG(dbgs() << "LV: Bad stride - Not an AddRecExpr pointer " 3582263508Sdim << *Ptr << " SCEV: " << *PtrScev << "\n"); 3583263508Sdim return 0; 3584263508Sdim } 3585263508Sdim 3586263508Sdim // The accesss function must stride over the innermost loop. 
3587263508Sdim if (Lp != AR->getLoop()) { 3588263508Sdim DEBUG(dbgs() << "LV: Bad stride - Not striding over innermost loop " << 3589263508Sdim *Ptr << " SCEV: " << *PtrScev << "\n"); 3590263508Sdim } 3591263508Sdim 3592263508Sdim // The address calculation must not wrap. Otherwise, a dependence could be 3593263508Sdim // inverted. 3594263508Sdim // An inbounds getelementptr that is a AddRec with a unit stride 3595263508Sdim // cannot wrap per definition. The unit stride requirement is checked later. 3596263508Sdim // An getelementptr without an inbounds attribute and unit stride would have 3597263508Sdim // to access the pointer value "0" which is undefined behavior in address 3598263508Sdim // space 0, therefore we can also vectorize this case. 3599263508Sdim bool IsInBoundsGEP = isInBoundsGep(Ptr); 3600263508Sdim bool IsNoWrapAddRec = AR->getNoWrapFlags(SCEV::NoWrapMask); 3601263508Sdim bool IsInAddressSpaceZero = PtrTy->getAddressSpace() == 0; 3602263508Sdim if (!IsNoWrapAddRec && !IsInBoundsGEP && !IsInAddressSpaceZero) { 3603263508Sdim DEBUG(dbgs() << "LV: Bad stride - Pointer may wrap in the address space " 3604263508Sdim << *Ptr << " SCEV: " << *PtrScev << "\n"); 3605263508Sdim return 0; 3606263508Sdim } 3607263508Sdim 3608263508Sdim // Check the step is constant. 3609263508Sdim const SCEV *Step = AR->getStepRecurrence(*SE); 3610263508Sdim 3611263508Sdim // Calculate the pointer stride and check if it is consecutive. 3612263508Sdim const SCEVConstant *C = dyn_cast<SCEVConstant>(Step); 3613263508Sdim if (!C) { 3614263508Sdim DEBUG(dbgs() << "LV: Bad stride - Not a constant strided " << *Ptr << 3615263508Sdim " SCEV: " << *PtrScev << "\n"); 3616263508Sdim return 0; 3617263508Sdim } 3618263508Sdim 3619263508Sdim int64_t Size = DL->getTypeAllocSize(PtrTy->getElementType()); 3620263508Sdim const APInt &APStepVal = C->getValue()->getValue(); 3621263508Sdim 3622263508Sdim // Huge step value - give up. 
3623263508Sdim if (APStepVal.getBitWidth() > 64) 3624263508Sdim return 0; 3625263508Sdim 3626263508Sdim int64_t StepVal = APStepVal.getSExtValue(); 3627263508Sdim 3628263508Sdim // Strided access. 3629263508Sdim int64_t Stride = StepVal / Size; 3630263508Sdim int64_t Rem = StepVal % Size; 3631263508Sdim if (Rem) 3632263508Sdim return 0; 3633263508Sdim 3634263508Sdim // If the SCEV could wrap but we have an inbounds gep with a unit stride we 3635263508Sdim // know we can't "wrap around the address space". In case of address space 3636263508Sdim // zero we know that this won't happen without triggering undefined behavior. 3637263508Sdim if (!IsNoWrapAddRec && (IsInBoundsGEP || IsInAddressSpaceZero) && 3638263508Sdim Stride != 1 && Stride != -1) 3639263508Sdim return 0; 3640263508Sdim 3641263508Sdim return Stride; 3642263508Sdim} 3643263508Sdim 3644263508Sdimbool MemoryDepChecker::couldPreventStoreLoadForward(unsigned Distance, 3645263508Sdim unsigned TypeByteSize) { 3646263508Sdim // If loads occur at a distance that is not a multiple of a feasible vector 3647263508Sdim // factor store-load forwarding does not take place. 3648263508Sdim // Positive dependences might cause troubles because vectorizing them might 3649263508Sdim // prevent store-load forwarding making vectorized code run a lot slower. 3650263508Sdim // a[i] = a[i-3] ^ a[i-8]; 3651263508Sdim // The stores to a[i:i+1] don't align with the stores to a[i-3:i-2] and 3652263508Sdim // hence on your typical architecture store-load forwarding does not take 3653263508Sdim // place. Vectorizing in such cases does not make sense. 3654263508Sdim // Store-load forwarding distance. 3655263508Sdim const unsigned NumCyclesForStoreLoadThroughMemory = 8*TypeByteSize; 3656263508Sdim // Maximum vector factor. 
  unsigned MaxVFWithoutSLForwardIssues = MaxVectorWidth*TypeByteSize;
  if (MaxSafeDepDistBytes < MaxVFWithoutSLForwardIssues)
    MaxVFWithoutSLForwardIssues = MaxSafeDepDistBytes;

  // Find the smallest power-of-two vector width (in bytes) for which the
  // dependence distance would split a store-load pair; clamp the maximum
  // forwarding-safe width just below it.
  for (unsigned vf = 2*TypeByteSize; vf <= MaxVFWithoutSLForwardIssues;
       vf *= 2) {
    if (Distance % vf && Distance / vf < NumCyclesForStoreLoadThroughMemory) {
      MaxVFWithoutSLForwardIssues = (vf >>=1);
      break;
    }
  }

  if (MaxVFWithoutSLForwardIssues < 2*TypeByteSize) {
    // Even the smallest vector factor would hurt forwarding - report a
    // conflict.
    DEBUG(dbgs() << "LV: Distance " << Distance <<
          " that could cause a store-load forwarding conflict\n");
    return true;
  }

  if (MaxVFWithoutSLForwardIssues < MaxSafeDepDistBytes &&
      MaxVFWithoutSLForwardIssues != MaxVectorWidth*TypeByteSize)
    MaxSafeDepDistBytes = MaxVFWithoutSLForwardIssues;
  return false;
}

// Returns true when a (possible) dependence between A and B prevents
// vectorization; see the declaration's documentation for the full contract.
bool MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
                                   const MemAccessInfo &B, unsigned BIdx) {
  assert (AIdx < BIdx && "Must pass arguments in program order");

  Value *APtr = A.getPointer();
  Value *BPtr = B.getPointer();
  bool AIsWrite = A.getInt();
  bool BIsWrite = B.getInt();

  // Two reads are independent.
  if (!AIsWrite && !BIsWrite)
    return false;

  const SCEV *AScev = SE->getSCEV(APtr);
  const SCEV *BScev = SE->getSCEV(BPtr);

  int StrideAPtr = isStridedPtr(SE, DL, APtr, InnermostLoop);
  int StrideBPtr = isStridedPtr(SE, DL, BPtr, InnermostLoop);

  const SCEV *Src = AScev;
  const SCEV *Sink = BScev;

  // If the induction step is negative we have to invert source and sink of the
  // dependence.
  if (StrideAPtr < 0) {
    //Src = BScev;
    //Sink = AScev;
    std::swap(APtr, BPtr);
    std::swap(Src, Sink);
    std::swap(AIsWrite, BIsWrite);
    std::swap(AIdx, BIdx);
    std::swap(StrideAPtr, StrideBPtr);
  }

  // Distance is Sink - Src, i.e. measured from the (possibly swapped) source.
  const SCEV *Dist = SE->getMinusSCEV(Sink, Src);

  DEBUG(dbgs() << "LV: Src Scev: " << *Src << "Sink Scev: " << *Sink
        << "(Induction step: " << StrideAPtr << ")\n");
  DEBUG(dbgs() << "LV: Distance for " << *InstMap[AIdx] << " to "
        << *InstMap[BIdx] << ": " << *Dist << "\n");

  // Need consecutive accesses. We don't want to vectorize
  // "A[B[i]] += ..." and similar code or pointer arithmetic that could wrap in
  // the address space.
  if (!StrideAPtr || !StrideBPtr || StrideAPtr != StrideBPtr) {
    DEBUG(dbgs() << "Non-consecutive pointer access\n");
    return true;
  }

  const SCEVConstant *C = dyn_cast<SCEVConstant>(Dist);
  if (!C) {
    // Distance is symbolic: conservatively a dependence, but a runtime
    // pointer check may still let us vectorize.
    DEBUG(dbgs() << "LV: Dependence because of non constant distance\n");
    ShouldRetryWithRuntimeCheck = true;
    return true;
  }

  Type *ATy = APtr->getType()->getPointerElementType();
  Type *BTy = BPtr->getType()->getPointerElementType();
  unsigned TypeByteSize = DL->getTypeAllocSize(ATy);

  // Negative distances are not plausible dependencies.
  const APInt &Val = C->getValue()->getValue();
  if (Val.isNegative()) {
    bool IsTrueDataDependence = (AIsWrite && !BIsWrite);
    if (IsTrueDataDependence &&
        (couldPreventStoreLoadForward(Val.abs().getZExtValue(), TypeByteSize) ||
         ATy != BTy))
      return true;

    DEBUG(dbgs() << "LV: Dependence is negative: NoDep\n");
    return false;
  }

  // Write to the same location with the same size.
  // Could be improved to assert type sizes are the same (i32 == float, etc).
  if (Val == 0) {
    if (ATy == BTy)
      return false;
    DEBUG(dbgs() << "LV: Zero dependence difference but different types\n");
    return true;
  }

  assert(Val.isStrictlyPositive() && "Expect a positive value");

  // Positive distance bigger than max vectorization factor.
  // NOTE(review): returning false here means "no dependence" even though the
  // element types (and so sizes) differ and the byte-distance checks below
  // would be unreliable; later LLVM conservatively treats this case as an
  // unknown dependence. Confirm this optimistic answer is intended.
  if (ATy != BTy) {
    DEBUG(dbgs() <<
          "LV: ReadWrite-Write positive dependency with different types\n");
    return false;
  }

  unsigned Distance = (unsigned) Val.getZExtValue();

  // Bail out early if passed-in parameters make vectorization not feasible.
  unsigned ForcedFactor = VectorizationFactor ? VectorizationFactor : 1;
  unsigned ForcedUnroll = VectorizationUnroll ? VectorizationUnroll : 1;

  // The distance must be bigger than the size needed for a vectorized version
  // of the operation and the size of the vectorized operation must not be
  // bigger than the current maximum size.
  if (Distance < 2*TypeByteSize ||
      2*TypeByteSize > MaxSafeDepDistBytes ||
      Distance < TypeByteSize * ForcedUnroll * ForcedFactor) {
    DEBUG(dbgs() << "LV: Failure because of Positive distance "
          << Val.getSExtValue() << '\n');
    return true;
  }

  // Shrink the running maximum safe distance if this pair is tighter.
  MaxSafeDepDistBytes = Distance < MaxSafeDepDistBytes ?
    Distance : MaxSafeDepDistBytes;

  bool IsTrueDataDependence = (!AIsWrite && BIsWrite);
  if (IsTrueDataDependence &&
      couldPreventStoreLoadForward(Distance, TypeByteSize))
    return true;

  DEBUG(dbgs() << "LV: Positive distance " << Val.getSExtValue() <<
        " with max VF = " << MaxSafeDepDistBytes / TypeByteSize << '\n');

  return false;
}

// Returns true iff every dependence in the sets touched by CheckDeps is safe;
// also computes MaxSafeDepDistBytes as a side effect.
bool
MemoryDepChecker::areDepsSafe(AccessAnalysis::DepCandidates &AccessSets,
                              MemAccessInfoSet &CheckDeps) {

  MaxSafeDepDistBytes = -1U;
  while (!CheckDeps.empty()) {
    MemAccessInfo CurAccess = *CheckDeps.begin();

    // Get the relevant memory access set.
    EquivalenceClasses<MemAccessInfo>::iterator I =
      AccessSets.findValue(AccessSets.getLeaderValue(CurAccess));

    // Check accesses within this set.
    EquivalenceClasses<MemAccessInfo>::member_iterator AI, AE;
    AI = AccessSets.member_begin(I), AE = AccessSets.member_end();

    // Check every access pair.
    while (AI != AE) {
      CheckDeps.erase(*AI);
      EquivalenceClasses<MemAccessInfo>::member_iterator OI = llvm::next(AI);
      while (OI != AE) {
        // Check every accessing instruction pair in program order.
        for (std::vector<unsigned>::iterator I1 = Accesses[*AI].begin(),
             I1E = Accesses[*AI].end(); I1 != I1E; ++I1)
          for (std::vector<unsigned>::iterator I2 = Accesses[*OI].begin(),
               I2E = Accesses[*OI].end(); I2 != I2E; ++I2) {
            // isDependent requires program order, so order each pair by index.
            if (*I1 < *I2 && isDependent(*AI, *I1, *OI, *I2))
              return false;
            if (*I2 < *I1 && isDependent(*OI, *I2, *AI, *I1))
              return false;
          }
        ++OI;
      }
      AI++;
    }
  }
  return true;
}

bool LoopVectorizationLegality::canVectorizeMemory() {

  typedef SmallVector<Value*, 16> ValueVector;
  typedef SmallPtrSet<Value*, 16> ValueSet;

  // Holds the Load and Store *instructions*.
  ValueVector Loads;
  ValueVector Stores;

  // Holds all the different accesses in the loop.
  unsigned NumReads = 0;
  unsigned NumReadWrites = 0;

  PtrRtCheck.Pointers.clear();
  PtrRtCheck.Need = false;

  const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel();
  MemoryDepChecker DepChecker(SE, DL, TheLoop);

  // For each block.
  for (Loop::block_iterator bb = TheLoop->block_begin(),
       be = TheLoop->block_end(); bb != be; ++bb) {

    // Scan the BB and collect legal loads and stores.
    for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e;
         ++it) {

      // If this is a load, save it. If this instruction can read from memory
      // but is not a load, then we quit. Notice that we don't handle function
      // calls that read or write.
      if (it->mayReadFromMemory()) {
        // Many math library functions read the rounding mode. We will only
        // vectorize a loop if it contains known function calls that don't set
        // the flag. Therefore, it is safe to ignore this read from memory.
        CallInst *Call = dyn_cast<CallInst>(it);
        if (Call && getIntrinsicIDForCall(Call, TLI))
          continue;

        LoadInst *Ld = dyn_cast<LoadInst>(it);
        if (!Ld) return false;
        if (!Ld->isSimple() && !IsAnnotatedParallel) {
          DEBUG(dbgs() << "LV: Found a non-simple load.\n");
          return false;
        }
        Loads.push_back(Ld);
        DepChecker.addAccess(Ld);
        continue;
      }

      // Save 'store' instructions. Abort if other instructions write to memory.
      if (it->mayWriteToMemory()) {
        StoreInst *St = dyn_cast<StoreInst>(it);
        if (!St) return false;
        if (!St->isSimple() && !IsAnnotatedParallel) {
          DEBUG(dbgs() << "LV: Found a non-simple store.\n");
          return false;
        }
        Stores.push_back(St);
        DepChecker.addAccess(St);
      }
    } // Next instr.
  } // Next block.

  // Now we have two lists that hold the loads and the stores.
  // Next, we find the pointers that they use.

  // Check if we see any stores. If there are no stores, then we don't
  // care if the pointers are *restrict*.
  if (!Stores.size()) {
    DEBUG(dbgs() << "LV: Found a read-only loop!\n");
    return true;
  }

  AccessAnalysis::DepCandidates DependentAccesses;
  AccessAnalysis Accesses(DL, DependentAccesses);

  // Holds the analyzed pointers. We don't want to call GetUnderlyingObjects
  // multiple times on the same object. If the ptr is accessed twice, once
  // for read and once for write, it will only appear once (on the write
  // list). This is okay, since we are going to check for conflicts between
  // writes and between reads and writes, but not between reads and reads.
  ValueSet Seen;

  ValueVector::iterator I, IE;
  for (I = Stores.begin(), IE = Stores.end(); I != IE; ++I) {
    StoreInst *ST = cast<StoreInst>(*I);
    Value* Ptr = ST->getPointerOperand();

    if (isUniform(Ptr)) {
      DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n");
      return false;
    }

    // If we did *not* see this pointer before, insert it to the read-write
    // list. At this phase it is only a 'write' list.
    if (Seen.insert(Ptr)) {
      ++NumReadWrites;
      Accesses.addStore(Ptr);
    }
  }

  if (IsAnnotatedParallel) {
    DEBUG(dbgs()
          << "LV: A loop annotated parallel, ignore memory dependency "
          << "checks.\n");
    return true;
  }

  for (I = Loads.begin(), IE = Loads.end(); I != IE; ++I) {
    LoadInst *LD = cast<LoadInst>(*I);
    Value* Ptr = LD->getPointerOperand();
    // If we did *not* see this pointer before, insert it to the
    // read list. If we *did* see it before, then it is already in
    // the read-write list. This allows us to vectorize expressions
    // such as A[i] += x;  Because the address of A[i] is a read-write
    // pointer. This only works if the index of A[i] is consecutive.
    // If the address of i is unknown (for example A[B[i]]) then we may
    // read a few words, modify, and write a few words, and some of the
    // words may be written to the same address.
    bool IsReadOnlyPtr = false;
    if (Seen.insert(Ptr) || !isStridedPtr(SE, DL, Ptr, TheLoop)) {
      ++NumReads;
      IsReadOnlyPtr = true;
    }
    Accesses.addLoad(Ptr, IsReadOnlyPtr);
  }

  // If we write (or read-write) to a single destination and there are no
  // other reads in this loop then it is safe to vectorize.
  if (NumReadWrites == 1 && NumReads == 0) {
    DEBUG(dbgs() << "LV: Found a write-only loop!\n");
    return true;
  }

  // Build dependence sets and check whether we need a runtime pointer bounds
  // check.
  Accesses.buildDependenceSets();
  bool NeedRTCheck = Accesses.isRTCheckNeeded();

  // Find pointers with computable bounds. We are going to use this information
  // to place a runtime bound check.
  unsigned NumComparisons = 0;
  bool CanDoRT = false;
  if (NeedRTCheck)
    CanDoRT = Accesses.canCheckPtrAtRT(PtrRtCheck, NumComparisons, SE, TheLoop);


  DEBUG(dbgs() << "LV: We need to do " << NumComparisons <<
        " pointer comparisons.\n");

  // If we only have one set of dependences to check pointers among we don't
  // need a runtime check.
  if (NumComparisons == 0 && NeedRTCheck)
    NeedRTCheck = false;

  // Check that we did not collect too many pointers or found an unsizeable
  // pointer.
  if (!CanDoRT || NumComparisons > RuntimeMemoryCheckThreshold) {
    PtrRtCheck.reset();
    CanDoRT = false;
  }

  if (CanDoRT) {
    DEBUG(dbgs() << "LV: We can perform a memory runtime check if needed.\n");
  }

  if (NeedRTCheck && !CanDoRT) {
    DEBUG(dbgs() << "LV: We can't vectorize because we can't find " <<
          "the array bounds.\n");
    PtrRtCheck.reset();
    return false;
  }

  PtrRtCheck.Need = NeedRTCheck;

  bool CanVecMem = true;
  if (Accesses.isDependencyCheckNeeded()) {
    DEBUG(dbgs() << "LV: Checking memory dependencies\n");
    CanVecMem = DepChecker.areDepsSafe(DependentAccesses,
                                       Accesses.getDependenciesToCheck());
    MaxSafeDepDistBytes = DepChecker.getMaxSafeDepDistBytes();

    // The static dependence check failed, but a non-constant distance was the
    // culprit: fall back to runtime pointer checks.
    if (!CanVecMem && DepChecker.shouldRetryWithRuntimeCheck()) {
      DEBUG(dbgs() << "LV: Retrying with memory checks\n");
      NeedRTCheck = true;

      // Clear the dependency checks. We assume they are not needed.
      Accesses.resetDepChecks();

      PtrRtCheck.reset();
      PtrRtCheck.Need = true;

      CanDoRT = Accesses.canCheckPtrAtRT(PtrRtCheck, NumComparisons, SE,
                                         TheLoop, true);
      // Check that we did not collect too many pointers or found an unsizeable
      // pointer.
      if (!CanDoRT || NumComparisons > RuntimeMemoryCheckThreshold) {
        DEBUG(dbgs() << "LV: Can't vectorize with memory checks\n");
        PtrRtCheck.reset();
        return false;
      }

      CanVecMem = true;
    }
  }

  DEBUG(dbgs() << "LV: We" << (NeedRTCheck ? "" : " don't") <<
        " need a runtime memory check.\n");

  return CanVecMem;
}

// NOTE(review): despite the name, this walks I's *operands* (op_begin/op_end),
// not its uses. Returns true iff more than one operand of \p I is in \p Insts.
static bool hasMultipleUsesOf(Instruction *I,
                              SmallPtrSet<Instruction *, 8> &Insts) {
  unsigned NumUses = 0;
  for(User::op_iterator Use = I->op_begin(), E = I->op_end(); Use != E; ++Use) {
    if (Insts.count(dyn_cast<Instruction>(*Use)))
      ++NumUses;
    if (NumUses > 1)
      return true;
  }

  return false;
}

// NOTE(review): like hasMultipleUsesOf, this inspects operands, not uses.
// Returns true iff every operand of \p I is an instruction contained in \p Set.
static bool areAllUsesIn(Instruction *I, SmallPtrSet<Instruction *, 8> &Set) {
  for(User::op_iterator Use = I->op_begin(), E = I->op_end(); Use != E; ++Use)
    if (!Set.count(dyn_cast<Instruction>(*Use)))
      return false;
  return true;
}

bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
                                                ReductionKind Kind) {
  if (Phi->getNumIncomingValues() != 2)
    return false;

  // Reduction variables are only found in the loop header block.
  if (Phi->getParent() != TheLoop->getHeader())
    return false;

  // Obtain the reduction start value from the value that comes from the loop
  // preheader.
  Value *RdxStart = Phi->getIncomingValueForBlock(TheLoop->getLoopPreheader());

  // ExitInstruction is the single value which is used outside the loop.
  // We only allow for a single reduction value to be used outside the loop.
  // This includes users of the reduction, variables (which form a cycle
  // which ends in the phi node).
  Instruction *ExitInstruction = 0;
  // Indicates that we found a reduction operation in our scan.
  bool FoundReduxOp = false;

  // We start with the PHI node and scan for all of the users of this
  // instruction. All users must be instructions that can be used as reduction
  // variables (such as ADD). We must have a single out-of-block user. The cycle
  // must include the original PHI.
  bool FoundStartPHI = false;

  // To recognize min/max patterns formed by an icmp/select sequence, we store
  // the number of instructions we saw from the recognized min/max pattern,
  // to make sure we only see exactly the two instructions.
  unsigned NumCmpSelectPatternInst = 0;
  ReductionInstDesc ReduxDesc(false, 0);

  SmallPtrSet<Instruction *, 8> VisitedInsts;
  SmallVector<Instruction *, 8> Worklist;
  Worklist.push_back(Phi);
  VisitedInsts.insert(Phi);

  // A value in the reduction can be used:
  //  - By the reduction:
  //      - Reduction operation:
  //        - One use of reduction value (safe).
  //        - Multiple use of reduction value (not safe).
  //      - PHI:
  //        - All uses of the PHI must be the reduction (safe).
  //        - Otherwise, not safe.
  //  - By one instruction outside of the loop (safe).
  //  - By further instructions outside of the loop (not safe).
  //  - By an instruction that is not part of the reduction (not safe).
  //    This is either:
  //      * An instruction type other than PHI or the reduction operation.
  //      * A PHI in the header other than the initial PHI.
  while (!Worklist.empty()) {
    Instruction *Cur = Worklist.back();
    Worklist.pop_back();

    // No users.
    // If the instruction has no users then this is a broken chain and can't be
    // a reduction variable.
    if (Cur->use_empty())
      return false;

    bool IsAPhi = isa<PHINode>(Cur);

    // A header PHI use other than the original PHI.
    if (Cur != Phi && IsAPhi && Cur->getParent() == Phi->getParent())
      return false;

    // Reductions of instructions such as Div, and Sub are only possible if the
    // LHS is the reduction variable.
    if (!Cur->isCommutative() && !IsAPhi && !isa<SelectInst>(Cur) &&
        !isa<ICmpInst>(Cur) && !isa<FCmpInst>(Cur) &&
        !VisitedInsts.count(dyn_cast<Instruction>(Cur->getOperand(0))))
      return false;

    // Any reduction instruction must be of one of the allowed kinds.
    ReduxDesc = isReductionInstr(Cur, Kind, ReduxDesc);
    if (!ReduxDesc.IsReduction)
      return false;

    // A reduction operation must only have one use of the reduction value.
    // (Min/max is exempt: the cmp and the select both use it.)
    if (!IsAPhi && Kind != RK_IntegerMinMax && Kind != RK_FloatMinMax &&
        hasMultipleUsesOf(Cur, VisitedInsts))
      return false;

    // All inputs to a PHI node must be a reduction value.
    if (IsAPhi && Cur != Phi && !areAllUsesIn(Cur, VisitedInsts))
      return false;

    if (Kind == RK_IntegerMinMax && (isa<ICmpInst>(Cur) ||
                                     isa<SelectInst>(Cur)))
      ++NumCmpSelectPatternInst;
    if (Kind == RK_FloatMinMax && (isa<FCmpInst>(Cur) ||
                                   isa<SelectInst>(Cur)))
      ++NumCmpSelectPatternInst;

    // Check whether we found a reduction operator.
    FoundReduxOp |= !IsAPhi;

    // Process users of current instruction. Push non PHI nodes after PHI nodes
    // onto the stack. This way we are going to have seen all inputs to PHI
    // nodes once we get to them.
    SmallVector<Instruction *, 8> NonPHIs;
    SmallVector<Instruction *, 8> PHIs;
    for (Value::use_iterator UI = Cur->use_begin(), E = Cur->use_end(); UI != E;
         ++UI) {
      Instruction *Usr = cast<Instruction>(*UI);

      // Check if we found the exit user.
      BasicBlock *Parent = Usr->getParent();
      if (!TheLoop->contains(Parent)) {
        // Exit if you find multiple outside users or if the header phi node is
        // being used. In this case the user uses the value of the previous
        // iteration, in which case we would lose "VF-1" iterations of the
        // reduction operation if we vectorize.
        if (ExitInstruction != 0 || Cur == Phi)
          return false;

        // The instruction used by an outside user must be the last instruction
        // before we feed back to the reduction phi. Otherwise, we lose VF-1
        // operations on the value.
        if (std::find(Phi->op_begin(), Phi->op_end(), Cur) == Phi->op_end())
          return false;

        ExitInstruction = Cur;
        continue;
      }

      // Process instructions only once (termination). Each reduction cycle
      // value must only be used once, except by phi nodes and min/max
      // reductions which are represented as a cmp followed by a select.
      ReductionInstDesc IgnoredVal(false, 0);
      if (VisitedInsts.insert(Usr)) {
        if (isa<PHINode>(Usr))
          PHIs.push_back(Usr);
        else
          NonPHIs.push_back(Usr);
      } else if (!isa<PHINode>(Usr) &&
                 ((!isa<FCmpInst>(Usr) &&
                   !isa<ICmpInst>(Usr) &&
                   !isa<SelectInst>(Usr)) ||
                  !isMinMaxSelectCmpPattern(Usr, IgnoredVal).IsReduction))
        return false;

      // Remember that we completed the cycle.
      if (Usr == Phi)
        FoundStartPHI = true;
    }
    Worklist.append(PHIs.begin(), PHIs.end());
    Worklist.append(NonPHIs.begin(), NonPHIs.end());
  }

  // This means we have seen one but not the other instruction of the
  // pattern or more than just a select and cmp.
  if ((Kind == RK_IntegerMinMax || Kind == RK_FloatMinMax) &&
      NumCmpSelectPatternInst != 2)
    return false;

  if (!FoundStartPHI || !FoundReduxOp || !ExitInstruction)
    return false;

  // We found a reduction var if we have reached the original phi node and we
  // only have a single instruction with out-of-loop users.

  // This instruction is allowed to have out-of-loop users.
  AllowedExit.insert(ExitInstruction);

  // Save the description of this reduction variable.
  ReductionDescriptor RD(RdxStart, ExitInstruction, Kind,
                         ReduxDesc.MinMaxKind);
  Reductions[Phi] = RD;
  // We've ended the cycle. This is a reduction variable if we have an
  // outside user and it has a binary op.

  return true;
}

/// Returns true if the instruction is a Select(ICmp(X, Y), X, Y) instruction
/// pattern corresponding to a min(X, Y) or max(X, Y).
LoopVectorizationLegality::ReductionInstDesc
LoopVectorizationLegality::isMinMaxSelectCmpPattern(Instruction *I,
                                                    ReductionInstDesc &Prev) {

  assert((isa<ICmpInst>(I) || isa<FCmpInst>(I) || isa<SelectInst>(I)) &&
         "Expect a select instruction");
  Instruction *Cmp = 0;
  SelectInst *Select = 0;

  // We must handle the select(cmp()) as a single instruction. Advance to the
  // select.
  if ((Cmp = dyn_cast<ICmpInst>(I)) || (Cmp = dyn_cast<FCmpInst>(I))) {
    if (!Cmp->hasOneUse() || !(Select = dyn_cast<SelectInst>(*I->use_begin())))
      return ReductionInstDesc(false, I);
    return ReductionInstDesc(Select, Prev.MinMaxKind);
  }

  // Only handle single use cases for now.
  if (!(Select = dyn_cast<SelectInst>(I)))
    return ReductionInstDesc(false, I);
  if (!(Cmp = dyn_cast<ICmpInst>(I->getOperand(0))) &&
      !(Cmp = dyn_cast<FCmpInst>(I->getOperand(0))))
    return ReductionInstDesc(false, I);
  if (!Cmp->hasOneUse())
    return ReductionInstDesc(false, I);

  Value *CmpLeft;
  Value *CmpRight;

  // Look for a min/max pattern.
  if (m_UMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
    return ReductionInstDesc(Select, MRK_UIntMin);
  else if (m_UMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
    return ReductionInstDesc(Select, MRK_UIntMax);
  else if (m_SMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
    return ReductionInstDesc(Select, MRK_SIntMax);
  else if (m_SMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
    return ReductionInstDesc(Select, MRK_SIntMin);
  else if (m_OrdFMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
    return ReductionInstDesc(Select, MRK_FloatMin);
  else if (m_OrdFMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
    return ReductionInstDesc(Select, MRK_FloatMax);
  // Ordered and unordered comparisons map to the same min/max kind here.
  else if (m_UnordFMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
    return ReductionInstDesc(Select, MRK_FloatMin);
  else if (m_UnordFMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
    return ReductionInstDesc(Select, MRK_FloatMax);

  return ReductionInstDesc(false, I);
}

/// Returns a descriptor saying whether \p I is a reduction operation of kind
/// \p Kind. \p Prev carries state from the previously matched instruction
/// (used to thread the min/max kind through a cmp/select pair).
LoopVectorizationLegality::ReductionInstDesc
LoopVectorizationLegality::isReductionInstr(Instruction *I,
                                            ReductionKind Kind,
                                            ReductionInstDesc &Prev) {
  bool FP = I->getType()->isFloatingPointTy();
  // FP reductions are only safe when reassociation is allowed for this op.
  bool FastMath = (FP && I->isCommutative() && I->isAssociative());
  switch (I->getOpcode()) {
  default:
    return ReductionInstDesc(false, I);
  case Instruction::PHI:
      if (FP && (Kind != RK_FloatMult && Kind != RK_FloatAdd &&
                 Kind != RK_FloatMinMax))
        return ReductionInstDesc(false, I);
    return ReductionInstDesc(I, Prev.MinMaxKind);
  case Instruction::Sub:
  case Instruction::Add:
    return ReductionInstDesc(Kind == RK_IntegerAdd, I);
  case Instruction::Mul:
    return ReductionInstDesc(Kind == RK_IntegerMult, I);
  case Instruction::And:
    return ReductionInstDesc(Kind == RK_IntegerAnd, I);
  case Instruction::Or:
    return ReductionInstDesc(Kind == RK_IntegerOr, I);
  case Instruction::Xor:
    return ReductionInstDesc(Kind == RK_IntegerXor, I);
  case Instruction::FMul:
    return ReductionInstDesc(Kind == RK_FloatMult && FastMath, I);
  case Instruction::FAdd:
    return ReductionInstDesc(Kind == RK_FloatAdd && FastMath, I);
  case Instruction::FCmp:
  case Instruction::ICmp:
  case Instruction::Select:
    // Float min/max additionally requires the no-NaNs function attribute.
    if (Kind != RK_IntegerMinMax &&
        (!HasFunNoNaNAttr || Kind != RK_FloatMinMax))
      return ReductionInstDesc(false, I);
    return isMinMaxSelectCmpPattern(I, Prev);
  }
}

/// Classifies \p Phi as an integer/pointer induction variable (forward or
/// reverse) or IK_NoInduction if it is not a recognized induction.
LoopVectorizationLegality::InductionKind
LoopVectorizationLegality::isInductionVariable(PHINode *Phi) {
  Type *PhiTy = Phi->getType();
  // We only handle integer and pointer induction variables.
  if (!PhiTy->isIntegerTy() && !PhiTy->isPointerTy())
    return IK_NoInduction;

  // Check that the PHI is consecutive.
  const SCEV *PhiScev = SE->getSCEV(Phi);
  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev);
  if (!AR) {
    DEBUG(dbgs() << "LV: PHI is not a poly recurrence.\n");
    return IK_NoInduction;
  }
  const SCEV *Step = AR->getStepRecurrence(*SE);

  // Integer inductions need to have a stride of one.
  if (PhiTy->isIntegerTy()) {
    if (Step->isOne())
      return IK_IntInduction;
    if (Step->isAllOnesValue())
      return IK_ReverseIntInduction;
    return IK_NoInduction;
  }

  // Calculate the pointer stride and check if it is consecutive.
  const SCEVConstant *C = dyn_cast<SCEVConstant>(Step);
  if (!C)
    return IK_NoInduction;

  assert(PhiTy->isPointerTy() && "The PHI must be a pointer");
  // A pointer induction must step by exactly one element (forward or
  // backward) to be considered consecutive.
  uint64_t Size = DL->getTypeAllocSize(PhiTy->getPointerElementType());
  if (C->getValue()->equalsInt(Size))
    return IK_PtrInduction;
  else if (C->getValue()->equalsInt(0 - Size))
    return IK_ReversePtrInduction;

  return IK_NoInduction;
}

/// Returns true if \p V is a PHI node previously recorded as an induction
/// variable of this loop.
bool LoopVectorizationLegality::isInductionVariable(const Value *V) {
  Value *In0 = const_cast<Value*>(V);
  PHINode *PN = dyn_cast_or_null<PHINode>(In0);
  if (!PN)
    return false;

  return Inductions.count(PN);
}

/// Returns true if \p BB only executes conditionally within the loop and
/// would therefore need predication when if-converted.
bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB)  {
  assert(TheLoop->contains(BB) && "Unknown block used");

  // Blocks that do not dominate the latch need predication.
  BasicBlock* Latch = TheLoop->getLoopLatch();
  return !DT->dominates(BB, Latch);
}

/// Returns true if every instruction in \p BB is safe to speculate/predicate.
/// \p SafePtrs contains pointers known to be dereferenceable, allowing their
/// loads to be hoisted.
bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB,
                                            SmallPtrSet<Value *, 8>& SafePtrs) {
  for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
    // We might be able to hoist the load.
    if (it->mayReadFromMemory()) {
      LoadInst *LI = dyn_cast<LoadInst>(it);
      if (!LI || !SafePtrs.count(LI->getPointerOperand()))
        return false;
    }

    // We don't predicate stores at the moment.
    if (it->mayWriteToMemory() || it->mayThrow())
      return false;

    // Check that we don't have a constant expression that can trap as operand.
    for (Instruction::op_iterator OI = it->op_begin(), OE = it->op_end();
         OI != OE; ++OI) {
      if (Constant *C = dyn_cast<Constant>(*OI))
        if (C->canTrap())
          return false;
    }

    // The instructions below can trap.
    switch (it->getOpcode()) {
    default: continue;
    case Instruction::UDiv:
    case Instruction::SDiv:
    case Instruction::URem:
    case Instruction::SRem:
      return false;
    }
  }

  return true;
}

/// Picks the widest profitable vectorization factor (and its estimated cost)
/// for the loop, honoring a user-forced VF and the -Os constraints.
LoopVectorizationCostModel::VectorizationFactor
LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize,
                                                      unsigned UserVF) {
  // Width 1 means no vectorization.
  VectorizationFactor Factor = { 1U, 0U };
  if (OptForSize && Legal->getRuntimePointerCheck()->Need) {
    DEBUG(dbgs() << "LV: Aborting. Runtime ptr check is required in Os.\n");
    return Factor;
  }

  // Find the trip count.
  unsigned TC = SE->getSmallConstantTripCount(TheLoop, TheLoop->getLoopLatch());
  DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');

  unsigned WidestType = getWidestType();
  unsigned WidestRegister = TTI.getRegisterBitWidth(true);
  // Clamp the register width by the maximum safe dependence distance, so the
  // vector width never exceeds what the dependences allow.
  unsigned MaxSafeDepDist = -1U;
  if (Legal->getMaxSafeDepDistBytes() != -1U)
    MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
  WidestRegister = ((WidestRegister < MaxSafeDepDist) ?
                    WidestRegister : MaxSafeDepDist);
  unsigned MaxVectorSize = WidestRegister / WidestType;
  DEBUG(dbgs() << "LV: The Widest type: " << WidestType << " bits.\n");
  DEBUG(dbgs() << "LV: The Widest register is: "
          << WidestRegister << " bits.\n");

  if (MaxVectorSize == 0) {
    DEBUG(dbgs() << "LV: The target has no vector registers.\n");
    MaxVectorSize = 1;
  }

  assert(MaxVectorSize <= 32 && "Did not expect to pack so many elements"
         " into one vector!");

  unsigned VF = MaxVectorSize;

  // If we optimize the program for size, avoid creating the tail loop.
  if (OptForSize) {
    // If we are unable to calculate the trip count then don't try to vectorize.
    if (TC < 2) {
      DEBUG(dbgs() << "LV: Aborting. A tail loop is required in Os.\n");
      return Factor;
    }

    // Find the maximum SIMD width that can fit within the trip count.
    VF = TC % MaxVectorSize;

    if (VF == 0)
      VF = MaxVectorSize;

    // If the trip count that we found modulo the vectorization factor is not
    // zero then we require a tail.
    if (VF < 2) {
      DEBUG(dbgs() << "LV: Aborting. A tail loop is required in Os.\n");
      return Factor;
    }
  }

  if (UserVF != 0) {
    assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
    DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");

    Factor.Width = UserVF;
    return Factor;
  }

  // Cost-model search: try every power-of-two width up to VF and keep the
  // cheapest per-lane cost.
  float Cost = expectedCost(1);
  unsigned Width = 1;
  DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)Cost << ".\n");
  for (unsigned i=2; i <= VF; i*=2) {
    // Notice that the vector loop needs to be executed less times, so
    // we need to divide the cost of the vector loops by the width of
    // the vector elements.
    float VectorCost = expectedCost(i) / (float)i;
    DEBUG(dbgs() << "LV: Vector loop of width " << i << " costs: " <<
          (int)VectorCost << ".\n");
    if (VectorCost < Cost) {
      Cost = VectorCost;
      Width = i;
    }
  }

  DEBUG(dbgs() << "LV: Selecting VF = : "<< Width << ".\n");
  Factor.Width = Width;
  Factor.Cost = Width * Cost;
  return Factor;
}

/// Returns the widest scalar type (in bits) among the loads, stores and
/// reduction PHIs of the loop; used to bound the vectorization factor.
unsigned LoopVectorizationCostModel::getWidestType() {
  unsigned MaxWidth = 8;

  // For each block.
  for (Loop::block_iterator bb = TheLoop->block_begin(),
       be = TheLoop->block_end(); bb != be; ++bb) {
    BasicBlock *BB = *bb;

    // For each instruction in the loop.
    for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
      Type *T = it->getType();

      // Only examine Loads, Stores and PHINodes.
      if (!isa<LoadInst>(it) && !isa<StoreInst>(it) && !isa<PHINode>(it))
        continue;

      // Examine PHI nodes that are reduction variables.
      if (PHINode *PN = dyn_cast<PHINode>(it))
        if (!Legal->getReductionVars()->count(PN))
          continue;

      // Examine the stored values.
      if (StoreInst *ST = dyn_cast<StoreInst>(it))
        T = ST->getValueOperand()->getType();

      // Ignore loaded pointer types and stored pointer types that are not
      // consecutive. However, we do want to take consecutive stores/loads of
      // pointer vectors into account.
      if (T->isPointerTy() && !isConsecutiveLoadOrStore(it))
        continue;

      MaxWidth = std::max(MaxWidth,
                          (unsigned)DL->getTypeSizeInBits(T->getScalarType()));
    }
  }

  return MaxWidth;
}

/// Picks the unroll (interleave) factor for the loop, honoring a user-forced
/// factor and avoiding unrolling when it would cause register spills or when
/// optimizing for size.
unsigned
LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize,
                                               unsigned UserUF,
                                               unsigned VF,
                                               unsigned LoopCost) {

  // -- The unroll heuristics --
  // We unroll the loop in order to expose ILP and reduce the loop overhead.
  // There are many micro-architectural considerations that we can't predict
  // at this level. For example frontend pressure (on decode or fetch) due to
  // code size, or the number and capabilities of the execution ports.
  //
  // We use the following heuristics to select the unroll factor:
  // 1. If the code has reductions then we unroll in order to break the cross
  // iteration dependency.
  // 2. If the loop is really small then we unroll in order to reduce the loop
  // overhead.
  // 3. We don't unroll if we think that we will spill registers to memory due
  // to the increased register pressure.

  // Use the user preference, unless 'auto' is selected.
  if (UserUF != 0)
    return UserUF;

  // When we optimize for size we don't unroll.
  if (OptForSize)
    return 1;

  // We used the distance for the unroll factor.
  if (Legal->getMaxSafeDepDistBytes() != -1U)
    return 1;

  // Do not unroll loops with a relatively small trip count.
  unsigned TC = SE->getSmallConstantTripCount(TheLoop,
                                              TheLoop->getLoopLatch());
  if (TC > 1 && TC < TinyTripCountUnrollThreshold)
    return 1;

  unsigned TargetVectorRegisters = TTI.getNumberOfRegisters(true);
  DEBUG(dbgs() << "LV: The target has " << TargetVectorRegisters <<
        " vector registers\n");

  LoopVectorizationCostModel::RegisterUsage R = calculateRegisterUsage();
  // We divide by these constants so assume that we have at least one
  // instruction that uses at least one register.
  R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U);
  R.NumInstructions = std::max(R.NumInstructions, 1U);

  // We calculate the unroll factor using the following formula.
  // Subtract the number of loop invariants from the number of available
  // registers. These registers are used by all of the unrolled instances.
  // Next, divide the remaining registers by the number of registers that is
  // required by the loop, in order to estimate how many parallel instances
  // fit without causing spills.
  unsigned UF = (TargetVectorRegisters - R.LoopInvariantRegs) / R.MaxLocalUsers;

  // Clamp the unroll factor ranges to reasonable factors.
  unsigned MaxUnrollSize = TTI.getMaximumUnrollFactor();

  // If we did not calculate the cost for VF (because the user selected the VF)
  // then we calculate the cost of VF here.
  if (LoopCost == 0)
    LoopCost = expectedCost(VF);

  // Clamp the calculated UF to be between the 1 and the max unroll factor
  // that the target allows.
  if (UF > MaxUnrollSize)
    UF = MaxUnrollSize;
  else if (UF < 1)
    UF = 1;

  bool HasReductions = Legal->getReductionVars()->size();

  // Decide if we want to unroll if we decided that it is legal to vectorize
  // but not profitable.
  if (VF == 1) {
    if (TheLoop->getNumBlocks() > 1 || !HasReductions ||
        LoopCost > SmallLoopCost)
      return 1;

    return UF;
  }

  if (HasReductions) {
    DEBUG(dbgs() << "LV: Unrolling because of reductions.\n");
    return UF;
  }

  // We want to unroll tiny loops in order to reduce the loop overhead.
  // We assume that the cost overhead is 1 and we use the cost model
  // to estimate the cost of the loop and unroll until the cost of the
  // loop overhead is about 5% of the cost of the loop.
  DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
  if (LoopCost < SmallLoopCost) {
    DEBUG(dbgs() << "LV: Unrolling to reduce branch cost.\n");
    unsigned NewUF = SmallLoopCost / (LoopCost + 1);
    return std::min(NewUF, UF);
  }

  DEBUG(dbgs() << "LV: Not Unrolling.\n");
  return 1;
}

LoopVectorizationCostModel::RegisterUsage
LoopVectorizationCostModel::calculateRegisterUsage() {
  // This function calculates the register usage by measuring the highest number
  // of values that are alive at a single location. Obviously, this is a very
  // rough estimation. We scan the loop in a topological order in order and
  // assign a number to each instruction. We use RPO to ensure that defs are
  // met before their users. We assume that each instruction that has in-loop
  // users starts an interval. We record every time that an in-loop value is
  // used, so we have a list of the first and last occurrences of each
  // instruction. Next, we transpose this data structure into a multi map that
  // holds the list of intervals that *end* at a specific location. This multi
  // map allows us to perform a linear search. We scan the instructions linearly
  // and record each time that a new interval starts, by placing it in a set.
  // If we find this value in the multi-map then we remove it from the set.
  // The max register usage is the maximum size of the set.
  // We also search for instructions that are defined outside the loop, but are
  // used inside the loop. We need this number separately from the max-interval
  // usage number because when we unroll, loop-invariant values do not take
  // more registers.
  LoopBlocksDFS DFS(TheLoop);
  DFS.perform(LI);

  RegisterUsage R;
  R.NumInstructions = 0;

  // Each 'key' in the map opens a new interval. The values
  // of the map are the index of the 'last seen' usage of the
  // instruction that is the key.
  typedef DenseMap<Instruction*, unsigned> IntervalMap;
  // Maps instruction to its index.
  DenseMap<unsigned, Instruction*> IdxToInstr;
  // Marks the end of each interval.
  IntervalMap EndPoint;
  // Saves the list of instruction indices that are used in the loop.
  SmallSet<Instruction*, 8> Ends;
  // Saves the list of values that are used in the loop but are
  // defined outside the loop, such as arguments and constants.
  SmallPtrSet<Value*, 8> LoopInvariants;

  unsigned Index = 0;
  for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(),
       be = DFS.endRPO(); bb != be; ++bb) {
    R.NumInstructions += (*bb)->size();
    for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e;
         ++it) {
      Instruction *I = it;
      IdxToInstr[Index++] = I;

      // Save the end location of each USE.
      for (unsigned i = 0; i < I->getNumOperands(); ++i) {
        Value *U = I->getOperand(i);
        Instruction *Instr = dyn_cast<Instruction>(U);

        // Ignore non-instruction values such as arguments, constants, etc.
        if (!Instr) continue;

        // If this instruction is outside the loop then record it and continue.
        if (!TheLoop->contains(Instr)) {
          LoopInvariants.insert(Instr);
          continue;
        }

        // Overwrite previous end points.
        EndPoint[Instr] = Index;
        Ends.insert(Instr);
      }
    }
  }

  // Saves the list of intervals that end with the index in 'key'.
  typedef SmallVector<Instruction*, 2> InstrList;
  DenseMap<unsigned, InstrList> TransposeEnds;

  // Transpose the EndPoints to a list of values that end at each index.
  for (IntervalMap::iterator it = EndPoint.begin(), e = EndPoint.end();
       it != e; ++it)
    TransposeEnds[it->second].push_back(it->first);

  SmallSet<Instruction*, 8> OpenIntervals;
  unsigned MaxUsage = 0;


  DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
  for (unsigned int i = 0; i < Index; ++i) {
    Instruction *I = IdxToInstr[i];
    // Ignore instructions that are never used within the loop.
    if (!Ends.count(I)) continue;

    // Remove all of the instructions that end at this location.
    InstrList &List = TransposeEnds[i];
    for (unsigned int j=0, e = List.size(); j < e; ++j)
      OpenIntervals.erase(List[j]);

    // Count the number of live intervals.
    MaxUsage = std::max(MaxUsage, OpenIntervals.size());

    DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " <<
          OpenIntervals.size() << '\n');

    // Add the current instruction to the list of open intervals.
    OpenIntervals.insert(I);
  }

  unsigned Invariant = LoopInvariants.size();
  DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsage << '\n');
  DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant << '\n');
  DEBUG(dbgs() << "LV(REG): LoopSize: " << R.NumInstructions << '\n');

  R.LoopInvariantRegs = Invariant;
  R.MaxLocalUsers = MaxUsage;
  return R;
}

/// Returns the estimated cost of executing one iteration of the loop when
/// vectorized with factor \p VF.
unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) {
  unsigned Cost = 0;

  // For each block.
  for (Loop::block_iterator bb = TheLoop->block_begin(),
       be = TheLoop->block_end(); bb != be; ++bb) {
    unsigned BlockCost = 0;
    BasicBlock *BB = *bb;

    // For each instruction in the old loop.
    for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
      // Skip dbg intrinsics.
      if (isa<DbgInfoIntrinsic>(it))
        continue;

      unsigned C = getInstructionCost(it, VF);
      BlockCost += C;
      DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF " <<
            VF << " For instruction: " << *it << '\n');
    }

    // We assume that if-converted blocks have a 50% chance of being executed.
    // When the code is scalar then some of the blocks are avoided due to CF.
4801249423Sdim // When the code is vectorized we execute all code paths. 4802263508Sdim if (VF == 1 && Legal->blockNeedsPredication(*bb)) 4803249423Sdim BlockCost /= 2; 4804249423Sdim 4805249423Sdim Cost += BlockCost; 4806243789Sdim } 4807243789Sdim 4808243789Sdim return Cost; 4809243789Sdim} 4810243789Sdim 4811263508Sdim/// \brief Check whether the address computation for a non-consecutive memory 4812263508Sdim/// access looks like an unlikely candidate for being merged into the indexing 4813263508Sdim/// mode. 4814263508Sdim/// 4815263508Sdim/// We look for a GEP which has one index that is an induction variable and all 4816263508Sdim/// other indices are loop invariant. If the stride of this access is also 4817263508Sdim/// within a small bound we decide that this address computation can likely be 4818263508Sdim/// merged into the addressing mode. 4819263508Sdim/// In all other cases, we identify the address computation as complex. 4820263508Sdimstatic bool isLikelyComplexAddressComputation(Value *Ptr, 4821263508Sdim LoopVectorizationLegality *Legal, 4822263508Sdim ScalarEvolution *SE, 4823263508Sdim const Loop *TheLoop) { 4824263508Sdim GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr); 4825263508Sdim if (!Gep) 4826263508Sdim return true; 4827263508Sdim 4828263508Sdim // We are looking for a gep with all loop invariant indices except for one 4829263508Sdim // which should be an induction variable. 4830263508Sdim unsigned NumOperands = Gep->getNumOperands(); 4831263508Sdim for (unsigned i = 1; i < NumOperands; ++i) { 4832263508Sdim Value *Opd = Gep->getOperand(i); 4833263508Sdim if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 4834263508Sdim !Legal->isInductionVariable(Opd)) 4835263508Sdim return true; 4836263508Sdim } 4837263508Sdim 4838263508Sdim // Now we know we have a GEP ptr, %inv, %ind, %inv. Make sure that the step 4839263508Sdim // can likely be merged into the address computation. 
4840263508Sdim unsigned MaxMergeDistance = 64; 4841263508Sdim 4842263508Sdim const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Ptr)); 4843263508Sdim if (!AddRec) 4844263508Sdim return true; 4845263508Sdim 4846263508Sdim // Check the step is constant. 4847263508Sdim const SCEV *Step = AddRec->getStepRecurrence(*SE); 4848263508Sdim // Calculate the pointer stride and check if it is consecutive. 4849263508Sdim const SCEVConstant *C = dyn_cast<SCEVConstant>(Step); 4850263508Sdim if (!C) 4851263508Sdim return true; 4852263508Sdim 4853263508Sdim const APInt &APStepVal = C->getValue()->getValue(); 4854263508Sdim 4855263508Sdim // Huge step value - give up. 4856263508Sdim if (APStepVal.getBitWidth() > 64) 4857263508Sdim return true; 4858263508Sdim 4859263508Sdim int64_t StepVal = APStepVal.getSExtValue(); 4860263508Sdim 4861263508Sdim return StepVal > MaxMergeDistance; 4862263508Sdim} 4863263508Sdim 4864243789Sdimunsigned 4865243789SdimLoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { 4866243789Sdim // If we know that this instruction will remain uniform, check the cost of 4867243789Sdim // the scalar version. 4868243789Sdim if (Legal->isUniformAfterVectorization(I)) 4869243789Sdim VF = 1; 4870243789Sdim 4871243789Sdim Type *RetTy = I->getType(); 4872243789Sdim Type *VectorTy = ToVectorTy(RetTy, VF); 4873243789Sdim 4874243789Sdim // TODO: We need to estimate the cost of intrinsic calls. 4875243789Sdim switch (I->getOpcode()) { 4876249423Sdim case Instruction::GetElementPtr: 4877249423Sdim // We mark this instruction as zero-cost because the cost of GEPs in 4878249423Sdim // vectorized code depends on whether the corresponding memory instruction 4879249423Sdim // is scalarized or not. Therefore, we handle GEPs with the memory 4880249423Sdim // instruction cost. 
4881249423Sdim return 0; 4882249423Sdim case Instruction::Br: { 4883249423Sdim return TTI.getCFInstrCost(I->getOpcode()); 4884249423Sdim } 4885249423Sdim case Instruction::PHI: 4886249423Sdim //TODO: IF-converted IFs become selects. 4887249423Sdim return 0; 4888249423Sdim case Instruction::Add: 4889249423Sdim case Instruction::FAdd: 4890249423Sdim case Instruction::Sub: 4891249423Sdim case Instruction::FSub: 4892249423Sdim case Instruction::Mul: 4893249423Sdim case Instruction::FMul: 4894249423Sdim case Instruction::UDiv: 4895249423Sdim case Instruction::SDiv: 4896249423Sdim case Instruction::FDiv: 4897249423Sdim case Instruction::URem: 4898249423Sdim case Instruction::SRem: 4899249423Sdim case Instruction::FRem: 4900249423Sdim case Instruction::Shl: 4901249423Sdim case Instruction::LShr: 4902249423Sdim case Instruction::AShr: 4903249423Sdim case Instruction::And: 4904249423Sdim case Instruction::Or: 4905249423Sdim case Instruction::Xor: { 4906249423Sdim // Certain instructions can be cheaper to vectorize if they have a constant 4907249423Sdim // second vector operand. One example of this are shifts on x86. 
4908249423Sdim TargetTransformInfo::OperandValueKind Op1VK = 4909249423Sdim TargetTransformInfo::OK_AnyValue; 4910249423Sdim TargetTransformInfo::OperandValueKind Op2VK = 4911249423Sdim TargetTransformInfo::OK_AnyValue; 4912243789Sdim 4913249423Sdim if (isa<ConstantInt>(I->getOperand(1))) 4914249423Sdim Op2VK = TargetTransformInfo::OK_UniformConstantValue; 4915243789Sdim 4916249423Sdim return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK, Op2VK); 4917249423Sdim } 4918249423Sdim case Instruction::Select: { 4919249423Sdim SelectInst *SI = cast<SelectInst>(I); 4920249423Sdim const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 4921249423Sdim bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 4922249423Sdim Type *CondTy = SI->getCondition()->getType(); 4923249423Sdim if (!ScalarCond) 4924249423Sdim CondTy = VectorType::get(CondTy, VF); 4925243789Sdim 4926249423Sdim return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy); 4927249423Sdim } 4928249423Sdim case Instruction::ICmp: 4929249423Sdim case Instruction::FCmp: { 4930249423Sdim Type *ValTy = I->getOperand(0)->getType(); 4931249423Sdim VectorTy = ToVectorTy(ValTy, VF); 4932249423Sdim return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy); 4933249423Sdim } 4934249423Sdim case Instruction::Store: 4935249423Sdim case Instruction::Load: { 4936249423Sdim StoreInst *SI = dyn_cast<StoreInst>(I); 4937249423Sdim LoadInst *LI = dyn_cast<LoadInst>(I); 4938249423Sdim Type *ValTy = (SI ? SI->getValueOperand()->getType() : 4939249423Sdim LI->getType()); 4940249423Sdim VectorTy = ToVectorTy(ValTy, VF); 4941243789Sdim 4942249423Sdim unsigned Alignment = SI ? SI->getAlignment() : LI->getAlignment(); 4943249423Sdim unsigned AS = SI ? SI->getPointerAddressSpace() : 4944249423Sdim LI->getPointerAddressSpace(); 4945249423Sdim Value *Ptr = SI ? 
SI->getPointerOperand() : LI->getPointerOperand(); 4946249423Sdim // We add the cost of address computation here instead of with the gep 4947249423Sdim // instruction because only here we know whether the operation is 4948249423Sdim // scalarized. 4949249423Sdim if (VF == 1) 4950249423Sdim return TTI.getAddressComputationCost(VectorTy) + 4951249423Sdim TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS); 4952243789Sdim 4953249423Sdim // Scalarized loads/stores. 4954251662Sdim int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); 4955251662Sdim bool Reverse = ConsecutiveStride < 0; 4956251662Sdim unsigned ScalarAllocatedSize = DL->getTypeAllocSize(ValTy); 4957251662Sdim unsigned VectorElementSize = DL->getTypeStoreSize(VectorTy)/VF; 4958251662Sdim if (!ConsecutiveStride || ScalarAllocatedSize != VectorElementSize) { 4959263508Sdim bool IsComplexComputation = 4960263508Sdim isLikelyComplexAddressComputation(Ptr, Legal, SE, TheLoop); 4961249423Sdim unsigned Cost = 0; 4962249423Sdim // The cost of extracting from the value vector and pointer vector. 4963249423Sdim Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 4964249423Sdim for (unsigned i = 0; i < VF; ++i) { 4965249423Sdim // The cost of extracting the pointer operand. 4966249423Sdim Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, PtrTy, i); 4967249423Sdim // In case of STORE, the cost of ExtractElement from the vector. 4968249423Sdim // In case of LOAD, the cost of InsertElement into the returned 4969249423Sdim // vector. 4970249423Sdim Cost += TTI.getVectorInstrCost(SI ? Instruction::ExtractElement : 4971249423Sdim Instruction::InsertElement, 4972249423Sdim VectorTy, i); 4973243789Sdim } 4974243789Sdim 4975249423Sdim // The cost of the scalar loads/stores. 
4976263508Sdim Cost += VF * TTI.getAddressComputationCost(PtrTy, IsComplexComputation); 4977249423Sdim Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), 4978249423Sdim Alignment, AS); 4979249423Sdim return Cost; 4980243789Sdim } 4981243789Sdim 4982249423Sdim // Wide load/stores. 4983249423Sdim unsigned Cost = TTI.getAddressComputationCost(VectorTy); 4984249423Sdim Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS); 4985243789Sdim 4986249423Sdim if (Reverse) 4987249423Sdim Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, 4988249423Sdim VectorTy, 0); 4989249423Sdim return Cost; 4990249423Sdim } 4991249423Sdim case Instruction::ZExt: 4992249423Sdim case Instruction::SExt: 4993249423Sdim case Instruction::FPToUI: 4994249423Sdim case Instruction::FPToSI: 4995249423Sdim case Instruction::FPExt: 4996249423Sdim case Instruction::PtrToInt: 4997249423Sdim case Instruction::IntToPtr: 4998249423Sdim case Instruction::SIToFP: 4999249423Sdim case Instruction::UIToFP: 5000249423Sdim case Instruction::Trunc: 5001249423Sdim case Instruction::FPTrunc: 5002249423Sdim case Instruction::BitCast: { 5003249423Sdim // We optimize the truncation of induction variable. 5004249423Sdim // The cost of these is the same as the scalar operation. 
5005249423Sdim if (I->getOpcode() == Instruction::Trunc && 5006249423Sdim Legal->isInductionVariable(I->getOperand(0))) 5007249423Sdim return TTI.getCastInstrCost(I->getOpcode(), I->getType(), 5008249423Sdim I->getOperand(0)->getType()); 5009243789Sdim 5010249423Sdim Type *SrcVecTy = ToVectorTy(I->getOperand(0)->getType(), VF); 5011249423Sdim return TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy); 5012249423Sdim } 5013249423Sdim case Instruction::Call: { 5014249423Sdim CallInst *CI = cast<CallInst>(I); 5015249423Sdim Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI); 5016249423Sdim assert(ID && "Not an intrinsic call!"); 5017249423Sdim Type *RetTy = ToVectorTy(CI->getType(), VF); 5018249423Sdim SmallVector<Type*, 4> Tys; 5019249423Sdim for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) 5020249423Sdim Tys.push_back(ToVectorTy(CI->getArgOperand(i)->getType(), VF)); 5021249423Sdim return TTI.getIntrinsicInstrCost(ID, RetTy, Tys); 5022249423Sdim } 5023249423Sdim default: { 5024249423Sdim // We are scalarizing the instruction. Return the cost of the scalar 5025249423Sdim // instruction, plus the cost of insert and extract into vector 5026249423Sdim // elements, times the vector width. 5027249423Sdim unsigned Cost = 0; 5028243789Sdim 5029249423Sdim if (!RetTy->isVoidTy() && VF != 1) { 5030249423Sdim unsigned InsCost = TTI.getVectorInstrCost(Instruction::InsertElement, 5031249423Sdim VectorTy); 5032249423Sdim unsigned ExtCost = TTI.getVectorInstrCost(Instruction::ExtractElement, 5033249423Sdim VectorTy); 5034249423Sdim 5035243789Sdim // The cost of inserting the results plus extracting each one of the 5036243789Sdim // operands. 5037243789Sdim Cost += VF * (InsCost + ExtCost * I->getNumOperands()); 5038249423Sdim } 5039243789Sdim 5040249423Sdim // The cost of executing VF copies of the scalar instruction. This opcode 5041249423Sdim // is unknown. Assume that it is the same as 'mul'. 
5042249423Sdim Cost += VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy); 5043249423Sdim return Cost; 5044249423Sdim } 5045243789Sdim }// end of switch. 5046243789Sdim} 5047243789Sdim 5048243789SdimType* LoopVectorizationCostModel::ToVectorTy(Type *Scalar, unsigned VF) { 5049243789Sdim if (Scalar->isVoidTy() || VF == 1) 5050243789Sdim return Scalar; 5051243789Sdim return VectorType::get(Scalar, VF); 5052243789Sdim} 5053243789Sdim 5054243789Sdimchar LoopVectorize::ID = 0; 5055243789Sdimstatic const char lv_name[] = "Loop Vectorization"; 5056243789SdimINITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 5057249423SdimINITIALIZE_AG_DEPENDENCY(TargetTransformInfo) 5058263508SdimINITIALIZE_PASS_DEPENDENCY(DominatorTree) 5059243789SdimINITIALIZE_PASS_DEPENDENCY(ScalarEvolution) 5060263508SdimINITIALIZE_PASS_DEPENDENCY(LCSSA) 5061263508SdimINITIALIZE_PASS_DEPENDENCY(LoopInfo) 5062243789SdimINITIALIZE_PASS_DEPENDENCY(LoopSimplify) 5063243789SdimINITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 5064243789Sdim 5065243789Sdimnamespace llvm { 5066263508Sdim Pass *createLoopVectorizePass(bool NoUnrolling) { 5067263508Sdim return new LoopVectorize(NoUnrolling); 5068243789Sdim } 5069243789Sdim} 5070243789Sdim 5071249423Sdimbool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 5072249423Sdim // Check for a store. 5073249423Sdim if (StoreInst *ST = dyn_cast<StoreInst>(Inst)) 5074249423Sdim return Legal->isConsecutivePtr(ST->getPointerOperand()) != 0; 5075249423Sdim 5076249423Sdim // Check for a load. 
5077249423Sdim if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) 5078249423Sdim return Legal->isConsecutivePtr(LI->getPointerOperand()) != 0; 5079249423Sdim 5080249423Sdim return false; 5081249423Sdim} 5082263508Sdim 5083263508Sdim 5084263508Sdimvoid InnerLoopUnroller::scalarizeInstruction(Instruction *Instr) { 5085263508Sdim assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 5086263508Sdim // Holds vector parameters or scalars, in case of uniform vals. 5087263508Sdim SmallVector<VectorParts, 4> Params; 5088263508Sdim 5089263508Sdim setDebugLocFromInst(Builder, Instr); 5090263508Sdim 5091263508Sdim // Find all of the vectorized parameters. 5092263508Sdim for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { 5093263508Sdim Value *SrcOp = Instr->getOperand(op); 5094263508Sdim 5095263508Sdim // If we are accessing the old induction variable, use the new one. 5096263508Sdim if (SrcOp == OldInduction) { 5097263508Sdim Params.push_back(getVectorValue(SrcOp)); 5098263508Sdim continue; 5099263508Sdim } 5100263508Sdim 5101263508Sdim // Try using previously calculated values. 5102263508Sdim Instruction *SrcInst = dyn_cast<Instruction>(SrcOp); 5103263508Sdim 5104263508Sdim // If the src is an instruction that appeared earlier in the basic block 5105263508Sdim // then it should already be vectorized. 5106263508Sdim if (SrcInst && OrigLoop->contains(SrcInst)) { 5107263508Sdim assert(WidenMap.has(SrcInst) && "Source operand is unavailable"); 5108263508Sdim // The parameter is a vector value from earlier. 5109263508Sdim Params.push_back(WidenMap.get(SrcInst)); 5110263508Sdim } else { 5111263508Sdim // The parameter is a scalar from outside the loop. Maybe even a constant. 
5112263508Sdim VectorParts Scalars; 5113263508Sdim Scalars.append(UF, SrcOp); 5114263508Sdim Params.push_back(Scalars); 5115263508Sdim } 5116263508Sdim } 5117263508Sdim 5118263508Sdim assert(Params.size() == Instr->getNumOperands() && 5119263508Sdim "Invalid number of operands"); 5120263508Sdim 5121263508Sdim // Does this instruction return a value ? 5122263508Sdim bool IsVoidRetTy = Instr->getType()->isVoidTy(); 5123263508Sdim 5124263508Sdim Value *UndefVec = IsVoidRetTy ? 0 : 5125263508Sdim UndefValue::get(Instr->getType()); 5126263508Sdim // Create a new entry in the WidenMap and initialize it to Undef or Null. 5127263508Sdim VectorParts &VecResults = WidenMap.splat(Instr, UndefVec); 5128263508Sdim 5129263508Sdim // For each vector unroll 'part': 5130263508Sdim for (unsigned Part = 0; Part < UF; ++Part) { 5131263508Sdim // For each scalar that we create: 5132263508Sdim 5133263508Sdim Instruction *Cloned = Instr->clone(); 5134263508Sdim if (!IsVoidRetTy) 5135263508Sdim Cloned->setName(Instr->getName() + ".cloned"); 5136263508Sdim // Replace the operands of the cloned instructions with extracted scalars. 5137263508Sdim for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { 5138263508Sdim Value *Op = Params[op][Part]; 5139263508Sdim Cloned->setOperand(op, Op); 5140263508Sdim } 5141263508Sdim 5142263508Sdim // Place the cloned scalar in the new loop. 5143263508Sdim Builder.Insert(Cloned); 5144263508Sdim 5145263508Sdim // If the original scalar returns a value we need to place it in a vector 5146263508Sdim // so that future users will be able to use it. 
5147263508Sdim if (!IsVoidRetTy) 5148263508Sdim VecResults[Part] = Cloned; 5149263508Sdim } 5150263508Sdim} 5151263508Sdim 5152263508Sdimvoid 5153263508SdimInnerLoopUnroller::vectorizeMemoryInstruction(Instruction *Instr, 5154263508Sdim LoopVectorizationLegality*) { 5155263508Sdim return scalarizeInstruction(Instr); 5156263508Sdim} 5157263508Sdim 5158263508SdimValue *InnerLoopUnroller::reverseVector(Value *Vec) { 5159263508Sdim return Vec; 5160263508Sdim} 5161263508Sdim 5162263508SdimValue *InnerLoopUnroller::getBroadcastInstrs(Value *V) { 5163263508Sdim return V; 5164263508Sdim} 5165263508Sdim 5166263508SdimValue *InnerLoopUnroller::getConsecutiveVector(Value* Val, int StartIdx, 5167263508Sdim bool Negate) { 5168263508Sdim // When unrolling and the VF is 1, we only need to add a simple scalar. 5169263508Sdim Type *ITy = Val->getType(); 5170263508Sdim assert(!ITy->isVectorTy() && "Val must be a scalar"); 5171263508Sdim Constant *C = ConstantInt::get(ITy, StartIdx, Negate); 5172263508Sdim return Builder.CreateAdd(Val, C, "induction"); 5173263508Sdim} 5174263508Sdim 5175