178189Sbrian//===- LoadStoreVectorizer.cpp - GPU Load & Store Vectorizer --------------===// 278189Sbrian// 378189Sbrian// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 478189Sbrian// See https://llvm.org/LICENSE.txt for license information. 578189Sbrian// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 66059Samurai// 778189Sbrian//===----------------------------------------------------------------------===// 878189Sbrian// 978189Sbrian// This pass merges loads/stores to/from sequential memory addresses into vector 1078189Sbrian// loads/stores. Although there's nothing GPU-specific in here, this pass is 1178189Sbrian// motivated by the microarchitectural quirks of nVidia and AMD GPUs. 1278189Sbrian// 1378189Sbrian// (For simplicity below we talk about loads only, but everything also applies 1478189Sbrian// to stores.) 156059Samurai// 1678189Sbrian// This pass is intended to be run late in the pipeline, after other 1778189Sbrian// vectorization opportunities have been exploited. So the assumption here is 1878189Sbrian// that immediately following our new vector load we'll need to extract out the 1978189Sbrian// individual elements of the load, so we can operate on them individually. 2078189Sbrian// 2178189Sbrian// On CPUs this transformation is usually not beneficial, because extracting the 2278189Sbrian// elements of a vector register is expensive on most architectures. It's 2378189Sbrian// usually better just to load each element individually into its own scalar 2478189Sbrian// register. 2578189Sbrian// 2678189Sbrian// However, nVidia and AMD GPUs don't have proper vector registers. Instead, a 276059Samurai// "vector load" loads directly into a series of scalar registers. In effect, 2850479Speter// extracting the elements of the vector is free. It's therefore always 296059Samurai// beneficial to vectorize a sequence of loads on these architectures. 3078189Sbrian// 3143313Sbrian// Vectorizing (perhaps a better name might be "coalescing") loads can have 3231195Sbrian// large performance impacts on GPU kernels, and opportunities for vectorizing 3330715Sbrian// are common in GPU code. This pass tries very hard to find such 346059Samurai// opportunities; its runtime is quadratic in the number of loads in a BB. 356059Samurai// 3681634Sbrian// Some CPU architectures, such as ARM, have instructions that load into 3781634Sbrian// multiple scalar registers, similar to a GPU vectorized load. In theory ARM 3881634Sbrian// could use this pass (with some modifications), but currently it implements 3981634Sbrian// its own pass to do something similar to what we do here. 406059Samurai 416059Samurai#include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h" 426059Samurai#include "llvm/ADT/APInt.h" 4336285Sbrian#include "llvm/ADT/ArrayRef.h" 4430715Sbrian#include "llvm/ADT/MapVector.h" 4530092Sbrian#include "llvm/ADT/PostOrderIterator.h" 4681634Sbrian#include "llvm/ADT/STLExtras.h" 4730715Sbrian#include "llvm/ADT/SmallPtrSet.h" 4830715Sbrian#include "llvm/ADT/SmallVector.h" 4946686Sbrian#include "llvm/ADT/Statistic.h" 5030715Sbrian#include "llvm/ADT/iterator_range.h" 5130715Sbrian#include "llvm/Analysis/AliasAnalysis.h" 5246686Sbrian#include "llvm/Analysis/MemoryLocation.h" 5346686Sbrian#include "llvm/Analysis/OrderedBasicBlock.h" 5430715Sbrian#include "llvm/Analysis/ScalarEvolution.h" 5530715Sbrian#include "llvm/Analysis/TargetTransformInfo.h" 5630715Sbrian#include "llvm/Analysis/ValueTracking.h" 5730715Sbrian#include "llvm/Analysis/VectorUtils.h" 5830715Sbrian#include "llvm/IR/Attributes.h" 5936285Sbrian#include "llvm/IR/BasicBlock.h" 6030715Sbrian#include "llvm/IR/Constants.h" 6136285Sbrian#include "llvm/IR/DataLayout.h" 6236285Sbrian#include "llvm/IR/DerivedTypes.h" 6336285Sbrian#include "llvm/IR/Dominators.h" 6481634Sbrian#include "llvm/IR/Function.h" 6581634Sbrian#include "llvm/IR/IRBuilder.h" 6636285Sbrian#include "llvm/IR/InstrTypes.h" 676059Samurai#include "llvm/IR/Instruction.h" 6836285Sbrian#include "llvm/IR/Instructions.h" 6936285Sbrian#include "llvm/IR/IntrinsicInst.h" 7036285Sbrian#include "llvm/IR/Module.h" 7136285Sbrian#include "llvm/IR/Type.h" 7236285Sbrian#include "llvm/IR/User.h" 7343313Sbrian#include "llvm/IR/Value.h" 7443313Sbrian#include "llvm/InitializePasses.h" 7543313Sbrian#include "llvm/Pass.h" 7681634Sbrian#include "llvm/Support/Casting.h" 7781634Sbrian#include "llvm/Support/Debug.h" 7836285Sbrian#include "llvm/Support/KnownBits.h" 7931195Sbrian#include "llvm/Support/MathExtras.h" 806059Samurai#include "llvm/Support/raw_ostream.h" 8158033Sbrian#include "llvm/Transforms/Utils/Local.h" 8258033Sbrian#include "llvm/Transforms/Vectorize.h" 8358033Sbrian#include <algorithm> 8458033Sbrian#include <cassert> 8558033Sbrian#include <cstdlib> 8658033Sbrian#include <tuple> 8758033Sbrian#include <utility> 8858033Sbrian 8958033Sbrianusing namespace llvm; 9058033Sbrian 9158033Sbrian#define DEBUG_TYPE "load-store-vectorizer" 9258033Sbrian 9358033SbrianSTATISTIC(NumVectorInstructions, "Number of vector accesses generated"); 9458033SbrianSTATISTIC(NumScalarsVectorized, "Number of scalar accesses vectorized"); 9558033Sbrian 9658033Sbrian// FIXME: Assuming stack alignment of 4 is always good enough 9758033Sbrianstatic const unsigned StackAdjustedAlignment = 4; 9858033Sbrian 9958033Sbriannamespace { 10055146Sbrian 1016059Samurai/// ChainID is an arbitrary token that is allowed to be different only for the 10258033Sbrian/// accesses that are guaranteed to be considered non-consecutive by 10358033Sbrian/// Vectorizer::isConsecutiveAccess. It's used for grouping instructions 10458033Sbrian/// together and reducing the number of instructions the main search operates on 10558033Sbrian/// at a time, i.e. this is to reduce compile time and nothing else as the main 10658033Sbrian/// search has O(n^2) time complexity. The underlying type of ChainID should not 10758033Sbrian/// be relied upon. 10858033Sbrianusing ChainID = const Value *; 10958033Sbrianusing InstrList = SmallVector<Instruction *, 8>; 11058033Sbrianusing InstrListMap = MapVector<ChainID, InstrList>; 11158033Sbrian 11258033Sbrianclass Vectorizer { 11358033Sbrian Function &F; 11458033Sbrian AliasAnalysis &AA; 11558033Sbrian DominatorTree &DT; 11658033Sbrian ScalarEvolution &SE; 11758033Sbrian TargetTransformInfo &TTI; 11858033Sbrian const DataLayout &DL; 11958034Sbrian IRBuilder<> Builder; 12058033Sbrian 12158033Sbrianpublic: 12258033Sbrian Vectorizer(Function &F, AliasAnalysis &AA, DominatorTree &DT, 12358033Sbrian ScalarEvolution &SE, TargetTransformInfo &TTI) 12458033Sbrian : F(F), AA(AA), DT(DT), SE(SE), TTI(TTI), 12558033Sbrian DL(F.getParent()->getDataLayout()), Builder(SE.getContext()) {} 12658033Sbrian 12758033Sbrian bool run(); 12858033Sbrian 12958033Sbrianprivate: 13058033Sbrian unsigned getPointerAddressSpace(Value *I); 13158033Sbrian 13258033Sbrian unsigned getAlignment(LoadInst *LI) const { 13358033Sbrian unsigned Align = LI->getAlignment(); 13458033Sbrian if (Align != 0) 13558033Sbrian return Align; 13658033Sbrian 13758033Sbrian return DL.getABITypeAlignment(LI->getType()); 13858033Sbrian } 13958033Sbrian 14058033Sbrian unsigned getAlignment(StoreInst *SI) const { 14158033Sbrian unsigned Align = SI->getAlignment(); 14258033Sbrian if (Align != 0) 14358033Sbrian return Align; 14458033Sbrian 14558033Sbrian return DL.getABITypeAlignment(SI->getValueOperand()->getType()); 14658034Sbrian } 14758033Sbrian 14858033Sbrian static const unsigned MaxDepth = 3; 14949140Sbrian 15028679Sbrian bool isConsecutiveAccess(Value *A, Value *B); 1516059Samurai bool areConsecutivePointers(Value *PtrA, Value *PtrB, APInt PtrDelta, 1526059Samurai unsigned Depth = 0) const; 15349140Sbrian bool lookThroughComplexAddresses(Value *PtrA, Value *PtrB, APInt PtrDelta, 15462977Sbrian unsigned Depth) const; 1556059Samurai bool lookThroughSelects(Value *PtrA, Value *PtrB, const APInt &PtrDelta, 15662977Sbrian unsigned Depth) const; 1576059Samurai 15862977Sbrian /// After vectorization, reorder the instructions that I depends on 1596059Samurai /// (the instructions defining its operands), to ensure they dominate I. 16062977Sbrian void reorder(Instruction *I); 1616059Samurai 1626059Samurai /// Returns the first and the last instructions in Chain. 1636059Samurai std::pair<BasicBlock::iterator, BasicBlock::iterator> 1646059Samurai getBoundaryInstrs(ArrayRef<Instruction *> Chain); 165129175Sdds 166129175Sdds /// Erases the original instructions after vectorizing. 167129175Sdds void eraseInstructions(ArrayRef<Instruction *> Chain); 168129175Sdds 169129175Sdds /// "Legalize" the vector type that would be produced by combining \p 170129175Sdds /// ElementSizeBits elements in \p Chain. Break into two pieces such that the 171129175Sdds /// total size of each piece is 1, 2 or a multiple of 4 bytes. \p Chain is 172129175Sdds /// expected to have more than 4 elements. 173129175Sdds std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>> 174129175Sdds splitOddVectorElts(ArrayRef<Instruction *> Chain, unsigned ElementSizeBits); 175129175Sdds 176129175Sdds /// Finds the largest prefix of Chain that's vectorizable, checking for 177129175Sdds /// intervening instructions which may affect the memory accessed by the 178129175Sdds /// instructions within Chain. 179129175Sdds /// 180129175Sdds /// The elements of \p Chain must be all loads or all stores and must be in 181129175Sdds /// address order. 182129175Sdds ArrayRef<Instruction *> getVectorizablePrefix(ArrayRef<Instruction *> Chain); 183129175Sdds 184129175Sdds /// Collects load and store instructions to vectorize. 185129175Sdds std::pair<InstrListMap, InstrListMap> collectInstructions(BasicBlock *BB); 186129175Sdds 187129175Sdds /// Processes the collected instructions, the \p Map. The values of \p Map 188129175Sdds /// should be all loads or all stores. 189129175Sdds bool vectorizeChains(InstrListMap &Map); 190129175Sdds 191129175Sdds /// Finds the load/stores to consecutive memory addresses and vectorizes them. 192129175Sdds bool vectorizeInstructions(ArrayRef<Instruction *> Instrs); 193129175Sdds 194129175Sdds /// Vectorizes the load instructions in Chain. 195129175Sdds bool 196129175Sdds vectorizeLoadChain(ArrayRef<Instruction *> Chain, 19781634Sbrian SmallPtrSet<Instruction *, 16> *InstructionsProcessed); 19881634Sbrian 19981634Sbrian /// Vectorizes the store instructions in Chain. 20081634Sbrian bool 20149140Sbrian vectorizeStoreChain(ArrayRef<Instruction *> Chain, 20281634Sbrian SmallPtrSet<Instruction *, 16> *InstructionsProcessed); 20381634Sbrian 20481634Sbrian /// Check if this load/store access is misaligned accesses. 20581634Sbrian bool accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace, 20681634Sbrian unsigned Alignment); 2076059Samurai}; 20881634Sbrian 20981634Sbrianclass LoadStoreVectorizerLegacyPass : public FunctionPass { 21081634Sbrianpublic: 2116059Samurai static char ID; 21249140Sbrian 21381634Sbrian LoadStoreVectorizerLegacyPass() : FunctionPass(ID) { 21449140Sbrian initializeLoadStoreVectorizerLegacyPassPass(*PassRegistry::getPassRegistry()); 21549140Sbrian } 21649140Sbrian 21749140Sbrian bool runOnFunction(Function &F) override; 21849140Sbrian 21949140Sbrian StringRef getPassName() const override { 22081634Sbrian return "GPU Load and Store Vectorizer"; 22149140Sbrian } 222129175Sdds 22381634Sbrian void getAnalysisUsage(AnalysisUsage &AU) const override { 22481634Sbrian AU.addRequired<AAResultsWrapperPass>(); 22581634Sbrian AU.addRequired<ScalarEvolutionWrapperPass>(); 2266059Samurai AU.addRequired<DominatorTreeWrapperPass>(); 22749140Sbrian AU.addRequired<TargetTransformInfoWrapperPass>(); 22862977Sbrian AU.setPreservesCFG(); 22936285Sbrian } 23081634Sbrian}; 23181634Sbrian 23281634Sbrian} // end anonymous namespace 23381634Sbrian 23481634Sbrianchar LoadStoreVectorizerLegacyPass::ID = 0; 23581634Sbrian 23681738SbrianINITIALIZE_PASS_BEGIN(LoadStoreVectorizerLegacyPass, DEBUG_TYPE, 23781738Sbrian "Vectorize load and Store instructions", false, false) 23881634SbrianINITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass) 23981634SbrianINITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 24081634SbrianINITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 24181634SbrianINITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 24281634SbrianINITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 24381634SbrianINITIALIZE_PASS_END(LoadStoreVectorizerLegacyPass, DEBUG_TYPE, 24481634Sbrian "Vectorize load and store instructions", false, false) 24581634Sbrian 24681634SbrianPass *llvm::createLoadStoreVectorizerPass() { 24781634Sbrian return new LoadStoreVectorizerLegacyPass(); 24881634Sbrian} 24981634Sbrian 25081634Sbrianbool LoadStoreVectorizerLegacyPass::runOnFunction(Function &F) { 25181634Sbrian // Don't vectorize when the attribute NoImplicitFloat is used. 25298243Sbrian if (skipFunction(F) || F.hasFnAttribute(Attribute::NoImplicitFloat)) 25381634Sbrian return false; 25481634Sbrian 25581634Sbrian AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults(); 25681634Sbrian DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 25781634Sbrian ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 25881634Sbrian TargetTransformInfo &TTI = 25981634Sbrian getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 26081634Sbrian 26181634Sbrian Vectorizer V(F, AA, DT, SE, TTI); 26281634Sbrian return V.run(); 26381634Sbrian} 26465181Sbrian 26598243SbrianPreservedAnalyses LoadStoreVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) { 26681634Sbrian // Don't vectorize when the attribute NoImplicitFloat is used. 26781634Sbrian if (F.hasFnAttribute(Attribute::NoImplicitFloat)) 26881634Sbrian return PreservedAnalyses::all(); 26981634Sbrian 27081634Sbrian AliasAnalysis &AA = AM.getResult<AAManager>(F); 27149140Sbrian DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F); 27281634Sbrian ScalarEvolution &SE = AM.getResult<ScalarEvolutionAnalysis>(F); 273129175Sdds TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F); 27481634Sbrian 27549140Sbrian Vectorizer V(F, AA, DT, SE, TTI); 27681634Sbrian bool Changed = V.run(); 27749140Sbrian PreservedAnalyses PA; 27849140Sbrian PA.preserveSet<CFGAnalyses>(); 27949140Sbrian return Changed ? PA : PreservedAnalyses::all(); 28049140Sbrian} 28149140Sbrian 28249140Sbrian// The real propagateMetadata expects a SmallVector<Value*>, but we deal in 28336285Sbrian// vectors of Instructions. 28449140Sbrianstatic void propagateMetadata(Instruction *I, ArrayRef<Instruction *> IL) { 28549140Sbrian SmallVector<Value *, 8> VL(IL.begin(), IL.end()); 28649140Sbrian propagateMetadata(I, VL); 28749140Sbrian} 2886059Samurai 28949140Sbrian// Vectorizer Implementation 29081634Sbrianbool Vectorizer::run() { 29181634Sbrian bool Changed = false; 29281634Sbrian 29381634Sbrian // Scan the blocks in the function in post order. 29481634Sbrian for (BasicBlock *BB : post_order(&F)) { 29581634Sbrian InstrListMap LoadRefs, StoreRefs; 29662977Sbrian std::tie(LoadRefs, StoreRefs) = collectInstructions(BB); 29762977Sbrian Changed |= vectorizeChains(LoadRefs); 29862977Sbrian Changed |= vectorizeChains(StoreRefs); 29962977Sbrian } 30081634Sbrian 30181634Sbrian return Changed; 30281634Sbrian} 30381634Sbrian 30481634Sbrianunsigned Vectorizer::getPointerAddressSpace(Value *I) { 30581634Sbrian if (LoadInst *L = dyn_cast<LoadInst>(I)) 30649140Sbrian return L->getPointerAddressSpace(); 30781634Sbrian if (StoreInst *S = dyn_cast<StoreInst>(I)) 30862977Sbrian return S->getPointerAddressSpace(); 30981634Sbrian return -1; 31081634Sbrian} 311112660Sbrian 31281634Sbrian// FIXME: Merge with llvm::isConsecutiveAccess 31381634Sbrianbool Vectorizer::isConsecutiveAccess(Value *A, Value *B) { 31481634Sbrian Value *PtrA = getLoadStorePointerOperand(A); 31565181Sbrian Value *PtrB = getLoadStorePointerOperand(B); 31681634Sbrian unsigned ASA = getPointerAddressSpace(A); 31781634Sbrian unsigned ASB = getPointerAddressSpace(B); 31881634Sbrian 31981634Sbrian // Check that the address spaces match and that the pointers are valid. 320112660Sbrian if (!PtrA || !PtrB || (ASA != ASB)) 32162977Sbrian return false; 32262977Sbrian 32362977Sbrian // Make sure that A and B are different pointers of the same size type. 32481634Sbrian Type *PtrATy = PtrA->getType()->getPointerElementType(); 32581634Sbrian Type *PtrBTy = PtrB->getType()->getPointerElementType(); 32662977Sbrian if (PtrA == PtrB || 32781634Sbrian PtrATy->isVectorTy() != PtrBTy->isVectorTy() || 32862977Sbrian DL.getTypeStoreSize(PtrATy) != DL.getTypeStoreSize(PtrBTy) || 32981634Sbrian DL.getTypeStoreSize(PtrATy->getScalarType()) != 33051809Sbrian DL.getTypeStoreSize(PtrBTy->getScalarType())) 33151809Sbrian return false; 33281634Sbrian 33351809Sbrian unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA); 33451809Sbrian APInt Size(PtrBitWidth, DL.getTypeStoreSize(PtrATy)); 33549374Sbrian 33662977Sbrian return areConsecutivePointers(PtrA, PtrB, Size); 33781634Sbrian} 33862977Sbrian 33949374Sbrianbool Vectorizer::areConsecutivePointers(Value *PtrA, Value *PtrB, 34081634Sbrian APInt PtrDelta, unsigned Depth) const { 34181634Sbrian unsigned PtrBitWidth = DL.getPointerTypeSizeInBits(PtrA->getType()); 34281634Sbrian APInt OffsetA(PtrBitWidth, 0); 34365846Sbrian APInt OffsetB(PtrBitWidth, 0); 34481634Sbrian PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA); 34581634Sbrian PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB); 34662977Sbrian 34781634Sbrian unsigned NewPtrBitWidth = DL.getTypeStoreSizeInBits(PtrA->getType()); 34881634Sbrian 34962977Sbrian if (NewPtrBitWidth != DL.getTypeStoreSizeInBits(PtrB->getType())) 35062977Sbrian return false; 35162977Sbrian 35262977Sbrian // In case if we have to shrink the pointer 35362977Sbrian // stripAndAccumulateInBoundsConstantOffsets should properly handle a 35462977Sbrian // possible overflow and the value should fit into a smallest data type 35581634Sbrian // used in the cast/gep chain. 35662977Sbrian assert(OffsetA.getMinSignedBits() <= NewPtrBitWidth && 35781634Sbrian OffsetB.getMinSignedBits() <= NewPtrBitWidth); 35881634Sbrian 35981634Sbrian OffsetA = OffsetA.sextOrTrunc(NewPtrBitWidth); 36062977Sbrian OffsetB = OffsetB.sextOrTrunc(NewPtrBitWidth); 36162977Sbrian PtrDelta = PtrDelta.sextOrTrunc(NewPtrBitWidth); 36262977Sbrian 36365181Sbrian APInt OffsetDelta = OffsetB - OffsetA; 36465181Sbrian 36562977Sbrian // Check if they are based on the same pointer. That makes the offsets 36665181Sbrian // sufficient. 36762977Sbrian if (PtrA == PtrB) 36862977Sbrian return OffsetDelta == PtrDelta; 36962977Sbrian 37062977Sbrian // Compute the necessary base pointer delta to have the necessary final delta 37162977Sbrian // equal to the pointer delta requested. 37262977Sbrian APInt BaseDelta = PtrDelta - OffsetDelta; 37362977Sbrian 37462977Sbrian // Compute the distance with SCEV between the base pointers. 37562977Sbrian const SCEV *PtrSCEVA = SE.getSCEV(PtrA); 37662977Sbrian const SCEV *PtrSCEVB = SE.getSCEV(PtrB); 37762977Sbrian const SCEV *C = SE.getConstant(BaseDelta); 37862977Sbrian const SCEV *X = SE.getAddExpr(PtrSCEVA, C); 37962977Sbrian if (X == PtrSCEVB) 38062977Sbrian return true; 38162977Sbrian 38281634Sbrian // The above check will not catch the cases where one of the pointers is 38362977Sbrian // factorized but the other one is not, such as (C + (S * (A + B))) vs 38426516Sbrian // (AS + BS). Get the minus scev. That will allow re-combining the expresions 38581634Sbrian // and getting the simplified difference. 38681634Sbrian const SCEV *Dist = SE.getMinusSCEV(PtrSCEVB, PtrSCEVA); 387129175Sdds if (C == Dist) 38881634Sbrian return true; 38981634Sbrian 39081634Sbrian // Sometimes even this doesn't work, because SCEV can't always see through 39162977Sbrian // patterns that look like (gep (ext (add (shl X, C1), C2))). Try checking 39262977Sbrian // things the hard way. 39362977Sbrian return lookThroughComplexAddresses(PtrA, PtrB, BaseDelta, Depth); 39462977Sbrian} 39562977Sbrian 39662977Sbrianbool Vectorizer::lookThroughComplexAddresses(Value *PtrA, Value *PtrB, 39762977Sbrian APInt PtrDelta, 398129175Sdds unsigned Depth) const { 399129175Sdds auto *GEPA = dyn_cast<GetElementPtrInst>(PtrA); 40062977Sbrian auto *GEPB = dyn_cast<GetElementPtrInst>(PtrB); 40162977Sbrian if (!GEPA || !GEPB) 40262977Sbrian return lookThroughSelects(PtrA, PtrB, PtrDelta, Depth); 40381634Sbrian 40462977Sbrian // Look through GEPs after checking they're the same except for the last 40562977Sbrian // index. 40662977Sbrian if (GEPA->getNumOperands() != GEPB->getNumOperands() || 40762977Sbrian GEPA->getPointerOperand() != GEPB->getPointerOperand()) 40862977Sbrian return false; 40962977Sbrian gep_type_iterator GTIA = gep_type_begin(GEPA); 41062977Sbrian gep_type_iterator GTIB = gep_type_begin(GEPB); 41162977Sbrian for (unsigned I = 0, E = GEPA->getNumIndices() - 1; I < E; ++I) { 41262977Sbrian if (GTIA.getOperand() != GTIB.getOperand()) 41362977Sbrian return false; 41462977Sbrian ++GTIA; 41562977Sbrian ++GTIB; 41662977Sbrian } 41749140Sbrian 41862977Sbrian Instruction *OpA = dyn_cast<Instruction>(GTIA.getOperand()); 41981634Sbrian Instruction *OpB = dyn_cast<Instruction>(GTIB.getOperand()); 42081634Sbrian if (!OpA || !OpB || OpA->getOpcode() != OpB->getOpcode() || 42162977Sbrian OpA->getType() != OpB->getType()) 42249140Sbrian return false; 42362977Sbrian 42462977Sbrian if (PtrDelta.isNegative()) { 42562977Sbrian if (PtrDelta.isMinSignedValue()) 42662977Sbrian return false; 42762977Sbrian PtrDelta.negate(); 42862977Sbrian std::swap(OpA, OpB); 42962977Sbrian } 43062977Sbrian uint64_t Stride = DL.getTypeAllocSize(GTIA.getIndexedType()); 43162977Sbrian if (PtrDelta.urem(Stride) != 0) 43262977Sbrian return false; 43362977Sbrian unsigned IdxBitWidth = OpA->getType()->getScalarSizeInBits(); 43449140Sbrian APInt IdxDiff = PtrDelta.udiv(Stride).zextOrSelf(IdxBitWidth); 43562977Sbrian 43662977Sbrian // Only look through a ZExt/SExt. 43762977Sbrian if (!isa<SExtInst>(OpA) && !isa<ZExtInst>(OpA)) 43862977Sbrian return false; 4396059Samurai 44049140Sbrian bool Signed = isa<SExtInst>(OpA); 44149140Sbrian 44249140Sbrian // At this point A could be a function parameter, i.e. not an instruction 44349140Sbrian Value *ValA = OpA->getOperand(0); 44449140Sbrian OpB = dyn_cast<Instruction>(OpB->getOperand(0)); 44549140Sbrian if (!OpB || ValA->getType() != OpB->getType()) 44662977Sbrian return false; 44765181Sbrian 44862977Sbrian // Now we need to prove that adding IdxDiff to ValA won't overflow. 44962977Sbrian bool Safe = false; 45062977Sbrian // First attempt: if OpB is an add with NSW/NUW, and OpB is IdxDiff added to 45165181Sbrian // ValA, we're okay. 45265181Sbrian if (OpB->getOpcode() == Instruction::Add && 45365181Sbrian isa<ConstantInt>(OpB->getOperand(1)) && 45481634Sbrian IdxDiff.sle(cast<ConstantInt>(OpB->getOperand(1))->getSExtValue())) { 45565181Sbrian if (Signed) 456129175Sdds Safe = cast<BinaryOperator>(OpB)->hasNoSignedWrap(); 457129175Sdds else 458129175Sdds Safe = cast<BinaryOperator>(OpB)->hasNoUnsignedWrap(); 45965181Sbrian } 46065181Sbrian 46162977Sbrian unsigned BitWidth = ValA->getType()->getScalarSizeInBits(); 46265181Sbrian 46365181Sbrian // Second attempt: 46481634Sbrian // If all set bits of IdxDiff or any higher order bit other than the sign bit 46598243Sbrian // are known to be zero in ValA, we can add Diff to it while guaranteeing no 46698243Sbrian // overflow of any sort. 467129175Sdds if (!Safe) { 46881634Sbrian OpA = dyn_cast<Instruction>(ValA); 46965181Sbrian if (!OpA) 47098243Sbrian return false; 47181634Sbrian KnownBits Known(BitWidth); 47265181Sbrian computeKnownBits(OpA, Known, DL, 0, nullptr, OpA, &DT); 47349140Sbrian APInt BitsAllowedToBeSet = Known.Zero.zext(IdxDiff.getBitWidth()); 47449140Sbrian if (Signed) 4756059Samurai BitsAllowedToBeSet.clearBit(BitWidth - 1); 4766059Samurai if (BitsAllowedToBeSet.ult(IdxDiff)) 4776059Samurai return false; 47865181Sbrian } 47965181Sbrian 48081634Sbrian const SCEV *OffsetSCEVA = SE.getSCEV(ValA); 48198243Sbrian const SCEV *OffsetSCEVB = SE.getSCEV(OpB); 48298243Sbrian const SCEV *C = SE.getConstant(IdxDiff.trunc(BitWidth)); 483129175Sdds const SCEV *X = SE.getAddExpr(OffsetSCEVA, C); 484129175Sdds return X == OffsetSCEVB; 48565181Sbrian} 48665181Sbrian 48781634Sbrianbool Vectorizer::lookThroughSelects(Value *PtrA, Value *PtrB, 4886059Samurai const APInt &PtrDelta, 4896059Samurai unsigned Depth) const { 4906059Samurai if (Depth++ == MaxDepth) 49158033Sbrian return false; 49258033Sbrian 49358033Sbrian if (auto *SelectA = dyn_cast<SelectInst>(PtrA)) { 49458033Sbrian if (auto *SelectB = dyn_cast<SelectInst>(PtrB)) { 49558033Sbrian return SelectA->getCondition() == SelectB->getCondition() && 49677487Sbrian areConsecutivePointers(SelectA->getTrueValue(), 49758033Sbrian SelectB->getTrueValue(), PtrDelta, Depth) && 49858033Sbrian areConsecutivePointers(SelectA->getFalseValue(), 49958033Sbrian SelectB->getFalseValue(), PtrDelta, Depth); 50058033Sbrian } 50158033Sbrian } 50258033Sbrian return false; 50358033Sbrian} 50458033Sbrian 50558033Sbrianvoid Vectorizer::reorder(Instruction *I) { 50658033Sbrian OrderedBasicBlock OBB(I->getParent()); 50758033Sbrian SmallPtrSet<Instruction *, 16> InstructionsToMove; 50858033Sbrian SmallVector<Instruction *, 16> Worklist; 50958033Sbrian 51062977Sbrian Worklist.push_back(I); 51158033Sbrian while (!Worklist.empty()) { 51258033Sbrian Instruction *IW = Worklist.pop_back_val(); 51358033Sbrian int NumOperands = IW->getNumOperands(); 51458033Sbrian for (int i = 0; i < NumOperands; i++) { 51558033Sbrian Instruction *IM = dyn_cast<Instruction>(IW->getOperand(i)); 51674049Sbrian if (!IM || IM->getOpcode() == Instruction::PHI) 51758033Sbrian continue; 51858033Sbrian 51958033Sbrian // If IM is in another BB, no need to move it, because this pass only 52074049Sbrian // vectorizes instructions within one BB. 52158033Sbrian if (IM->getParent() != I->getParent()) 52274049Sbrian continue; 52374049Sbrian 52458033Sbrian if (!OBB.dominates(IM, I)) { 52558033Sbrian InstructionsToMove.insert(IM); 52658033Sbrian Worklist.push_back(IM); 52758033Sbrian } 52874049Sbrian } 52958033Sbrian } 53058033Sbrian 53158033Sbrian // All instructions to move should follow I. Start from I, not from begin(). 53258033Sbrian for (auto BBI = I->getIterator(), E = I->getParent()->end(); BBI != E; 53358033Sbrian ++BBI) { 53458033Sbrian if (!InstructionsToMove.count(&*BBI)) 53558033Sbrian continue; 53677487Sbrian Instruction *IM = &*BBI; 53777487Sbrian --BBI; 53877487Sbrian IM->removeFromParent(); 53977487Sbrian IM->insertBefore(I); 54077487Sbrian } 54177487Sbrian} 54277487Sbrian 54377487Sbrianstd::pair<BasicBlock::iterator, BasicBlock::iterator> 54477487SbrianVectorizer::getBoundaryInstrs(ArrayRef<Instruction *> Chain) { 54558033Sbrian Instruction *C0 = Chain[0]; 54658033Sbrian BasicBlock::iterator FirstInstr = C0->getIterator(); 54758033Sbrian BasicBlock::iterator LastInstr = C0->getIterator(); 5486059Samurai 54981634Sbrian BasicBlock *BB = C0->getParent(); 55081634Sbrian unsigned NumFound = 0; 5516059Samurai for (Instruction &I : *BB) { 5526059Samurai if (!is_contained(Chain, &I)) 55381634Sbrian continue; 55481634Sbrian 55581634Sbrian ++NumFound; 5566059Samurai if (NumFound == 1) { 55758033Sbrian FirstInstr = I.getIterator(); 55858033Sbrian } 55958033Sbrian if (NumFound == Chain.size()) { 56081634Sbrian LastInstr = I.getIterator(); 56181634Sbrian break; 56281634Sbrian } 56381634Sbrian } 56481634Sbrian 56581634Sbrian // Range is [first, last). 56681634Sbrian return std::make_pair(FirstInstr, ++LastInstr); 56781634Sbrian} 56881634Sbrian 56937010Sbrianvoid Vectorizer::eraseInstructions(ArrayRef<Instruction *> Chain) { 57081634Sbrian SmallVector<Instruction *, 16> Instrs; 57181634Sbrian for (Instruction *I : Chain) { 5726059Samurai Value *PtrOperand = getLoadStorePointerOperand(I); 57358776Sbrian assert(PtrOperand && "Instruction must have a pointer operand."); 57458776Sbrian Instrs.push_back(I); 57526692Sbrian if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(PtrOperand)) 57658033Sbrian Instrs.push_back(GEP); 5776059Samurai } 57881634Sbrian 57981634Sbrian // Erase instructions. 58081634Sbrian for (Instruction *I : Instrs) 58181634Sbrian if (I->use_empty()) 58281634Sbrian I->eraseFromParent(); 58381634Sbrian} 58481634Sbrian 58581634Sbrianstd::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>> 58681634SbrianVectorizer::splitOddVectorElts(ArrayRef<Instruction *> Chain, 58781738Sbrian unsigned ElementSizeBits) { 58881634Sbrian unsigned ElementSizeBytes = ElementSizeBits / 8; 58981634Sbrian unsigned SizeBytes = ElementSizeBytes * Chain.size(); 59081634Sbrian unsigned NumLeft = (SizeBytes - (SizeBytes % 4)) / ElementSizeBytes; 59181634Sbrian if (NumLeft == Chain.size()) { 59281634Sbrian if ((NumLeft & 1) == 0) 59381634Sbrian NumLeft /= 2; // Split even in half 59481634Sbrian else 59581634Sbrian --NumLeft; // Split off last element 59681634Sbrian } else if (NumLeft == 0) 59781634Sbrian NumLeft = 1; 59881634Sbrian return std::make_pair(Chain.slice(0, NumLeft), Chain.slice(NumLeft)); 59981634Sbrian} 60081634Sbrian 60181634SbrianArrayRef<Instruction *> 60281634SbrianVectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) { 60358033Sbrian // These are in BB order, unlike Chain, which is in address order. 6048857Srgrimes SmallVector<Instruction *, 16> MemoryInstrs; 60526692Sbrian SmallVector<Instruction *, 16> ChainInstrs; 60662778Sbrian 60762778Sbrian bool IsLoadChain = isa<LoadInst>(Chain[0]); 60862778Sbrian LLVM_DEBUG({ 60958776Sbrian for (Instruction *I : Chain) { 61058776Sbrian if (IsLoadChain) 61158776Sbrian assert(isa<LoadInst>(I) && 61228679Sbrian "All elements of Chain must be loads, or all must be stores."); 61326692Sbrian else 6146059Samurai assert(isa<StoreInst>(I) && 61581634Sbrian "All elements of Chain must be loads, or all must be stores."); 6166059Samurai } 61726692Sbrian }); 61881634Sbrian 61981634Sbrian for (Instruction &I : make_range(getBoundaryInstrs(Chain))) { 62028679Sbrian if (isa<LoadInst>(I) || isa<StoreInst>(I)) { 62181634Sbrian if (!is_contained(Chain, &I)) 62228679Sbrian MemoryInstrs.push_back(&I); 62328679Sbrian else 62481634Sbrian ChainInstrs.push_back(&I); 62528679Sbrian } else if (isa<IntrinsicInst>(&I) && 6266059Samurai cast<IntrinsicInst>(&I)->getIntrinsicID() == 6276059Samurai Intrinsic::sideeffect) { 62851048Sbrian // Ignore llvm.sideeffect calls. 62981634Sbrian } else if (IsLoadChain && (I.mayWriteToMemory() || I.mayThrow())) { 63081634Sbrian LLVM_DEBUG(dbgs() << "LSV: Found may-write/throw operation: " << I 63181634Sbrian << '\n'); 63281634Sbrian break; 63381634Sbrian } else if (!IsLoadChain && (I.mayReadOrWriteMemory() || I.mayThrow())) { 63481634Sbrian LLVM_DEBUG(dbgs() << "LSV: Found may-read/write/throw operation: " << I 63581634Sbrian << '\n'); 63681634Sbrian break; 63781634Sbrian } 63881634Sbrian } 63981634Sbrian 64081634Sbrian OrderedBasicBlock OBB(Chain[0]->getParent()); 64181634Sbrian 64281634Sbrian // Loop until we find an instruction in ChainInstrs that we can't vectorize. 64381634Sbrian unsigned ChainInstrIdx = 0; 6446059Samurai Instruction *BarrierMemoryInstr = nullptr; 64581634Sbrian 64681634Sbrian for (unsigned E = ChainInstrs.size(); ChainInstrIdx < E; ++ChainInstrIdx) { 64751048Sbrian Instruction *ChainInstr = ChainInstrs[ChainInstrIdx]; 64851048Sbrian 64981634Sbrian // If a barrier memory instruction was found, chain instructions that follow 65081634Sbrian // will not be added to the valid prefix. 65151048Sbrian if (BarrierMemoryInstr && OBB.dominates(BarrierMemoryInstr, ChainInstr)) 65251048Sbrian break; 65326692Sbrian 65481634Sbrian // Check (in BB order) if any instruction prevents ChainInstr from being 65528679Sbrian // vectorized. Find and store the first such "conflicting" instruction. 65681634Sbrian for (Instruction *MemInstr : MemoryInstrs) { 65728679Sbrian // If a barrier memory instruction was found, do not check past it. 65828679Sbrian if (BarrierMemoryInstr && OBB.dominates(BarrierMemoryInstr, MemInstr)) 65981634Sbrian break; 66062977Sbrian 66128679Sbrian auto *MemLoad = dyn_cast<LoadInst>(MemInstr); 6626059Samurai auto *ChainLoad = dyn_cast<LoadInst>(ChainInstr); 66362778Sbrian if (MemLoad && ChainLoad) 66462778Sbrian continue; 66581634Sbrian 66681634Sbrian // We can ignore the alias if the we have a load store pair and the load 66762778Sbrian // is known to be invariant. The load cannot be clobbered by the store. 66862778Sbrian auto IsInvariantLoad = [](const LoadInst *LI) -> bool { 66962778Sbrian return LI->hasMetadata(LLVMContext::MD_invariant_load); 67081634Sbrian }; 67162778Sbrian 67262778Sbrian // We can ignore the alias as long as the load comes before the store, 67362778Sbrian // because that means we won't be moving the load past the store to 67462778Sbrian // vectorize it (the vectorized load is inserted at the location of the 67562778Sbrian // first load in the chain). 67681634Sbrian if (isa<StoreInst>(MemInstr) && ChainLoad && 67781634Sbrian (IsInvariantLoad(ChainLoad) || OBB.dominates(ChainLoad, MemInstr))) 67862977Sbrian continue; 67962778Sbrian 68062778Sbrian // Same case, but in reverse. 68162778Sbrian if (MemLoad && isa<StoreInst>(ChainInstr) && 68262778Sbrian (IsInvariantLoad(MemLoad) || OBB.dominates(MemLoad, ChainInstr))) 68362778Sbrian continue; 68462778Sbrian 68562778Sbrian if (!AA.isNoAlias(MemoryLocation::get(MemInstr), 68662778Sbrian MemoryLocation::get(ChainInstr))) { 68762778Sbrian LLVM_DEBUG({ 68862778Sbrian dbgs() << "LSV: Found alias:\n" 68962778Sbrian " Aliasing instruction and pointer:\n" 69062778Sbrian << " " << *MemInstr << '\n' 69162778Sbrian << " " << *getLoadStorePointerOperand(MemInstr) << '\n' 69262778Sbrian << " Aliased instruction and pointer:\n" 69362778Sbrian << " " << *ChainInstr << '\n' 69462778Sbrian << " " << *getLoadStorePointerOperand(ChainInstr) << '\n'; 69562778Sbrian }); 69662778Sbrian // Save this aliasing memory instruction as a barrier, but allow other 69762778Sbrian // instructions that precede the barrier to be vectorized with this one. 69862778Sbrian BarrierMemoryInstr = MemInstr; 69962778Sbrian break; 70062778Sbrian } 70162778Sbrian } 70262778Sbrian // Continue the search only for store chains, since vectorizing stores that 7036059Samurai // precede an aliasing load is valid. Conversely, vectorizing loads is valid 70451048Sbrian // up to an aliasing store, but should not pull loads from further down in 70551809Sbrian // the basic block. 70651809Sbrian if (IsLoadChain && BarrierMemoryInstr) { 70751809Sbrian // The BarrierMemoryInstr is a store that precedes ChainInstr. 70851809Sbrian assert(OBB.dominates(BarrierMemoryInstr, ChainInstr)); 70981634Sbrian break; 71051809Sbrian } 71151809Sbrian } 71281634Sbrian 71351809Sbrian // Find the largest prefix of Chain whose elements are all in 71451809Sbrian // ChainInstrs[0, ChainInstrIdx). This is the largest vectorizable prefix of 71551809Sbrian // Chain. (Recall that Chain is in address order, but ChainInstrs is in BB 71651809Sbrian // order.) 71751809Sbrian SmallPtrSet<Instruction *, 8> VectorizableChainInstrs( 71849374Sbrian ChainInstrs.begin(), ChainInstrs.begin() + ChainInstrIdx); 71949372Sbrian unsigned ChainIdx = 0; 72049372Sbrian for (unsigned ChainLen = Chain.size(); ChainIdx < ChainLen; ++ChainIdx) { 72149372Sbrian if (!VectorizableChainInstrs.count(Chain[ChainIdx])) 72281634Sbrian break; 72349372Sbrian } 72449372Sbrian return Chain.slice(0, ChainIdx); 72581634Sbrian} 72649372Sbrian 72749372Sbrianstatic ChainID getChainID(const Value *Ptr, const DataLayout &DL) { 72849372Sbrian const Value *ObjPtr = GetUnderlyingObject(Ptr, DL); 72949374Sbrian if (const auto *Sel = dyn_cast<SelectInst>(ObjPtr)) { 73051048Sbrian // The select's themselves are distinct instructions even if they share the 73181634Sbrian // same condition and evaluate to consecutive pointers for true and false 73281634Sbrian // values of the condition. Therefore using the select's themselves for 73381634Sbrian // grouping instructions would put consecutive accesses into different lists 73481634Sbrian // and they won't be even checked for being consecutive, and won't be 73581634Sbrian // vectorized. 73681634Sbrian return Sel->getCondition(); 73781634Sbrian } 73881634Sbrian return ObjPtr; 73981634Sbrian} 74081634Sbrian 74181634Sbrianstd::pair<InstrListMap, InstrListMap> 74281634SbrianVectorizer::collectInstructions(BasicBlock *BB) { 74381634Sbrian InstrListMap LoadRefs; 74481634Sbrian InstrListMap StoreRefs; 74581634Sbrian 74681634Sbrian for (Instruction &I : *BB) { 74781634Sbrian if (!I.mayReadOrWriteMemory()) 74881634Sbrian continue; 74981634Sbrian 75081634Sbrian if (LoadInst *LI = dyn_cast<LoadInst>(&I)) { 75181634Sbrian if (!LI->isSimple()) 75236961Sbrian continue; 75336961Sbrian 75436961Sbrian // Skip if it's not legal. 75581634Sbrian if (!TTI.isLegalToVectorizeLoad(LI)) 75636961Sbrian continue; 75736961Sbrian 75881634Sbrian Type *Ty = LI->getType(); 75936961Sbrian if (!VectorType::isValidElementType(Ty->getScalarType())) 76081634Sbrian continue; 76175894Sbrian 76281634Sbrian // Skip weird non-byte sizes. They probably aren't worth the effort of 76381634Sbrian // handling correctly. 76481634Sbrian unsigned TySize = DL.getTypeSizeInBits(Ty); 76581634Sbrian if ((TySize % 8) != 0) 76681634Sbrian continue; 76781634Sbrian 76881634Sbrian // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain 76936961Sbrian // functions are currently using an integer type for the vectorized 77036961Sbrian // load/store, and does not support casting between the integer type and a 77151048Sbrian // vector of pointers (e.g. i64 to <2 x i16*>) 77265846Sbrian if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy()) 77365846Sbrian continue; 77465846Sbrian 77581634Sbrian Value *Ptr = LI->getPointerOperand(); 77665846Sbrian unsigned AS = Ptr->getType()->getPointerAddressSpace(); 77771781Sbrian unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS); 77881634Sbrian 77965846Sbrian unsigned VF = VecRegSize / TySize; 78065846Sbrian VectorType *VecTy = dyn_cast<VectorType>(Ty); 78165846Sbrian 78265846Sbrian // No point in looking at these if they're too big to vectorize. 78365846Sbrian if (TySize > VecRegSize / 2 || 78465846Sbrian (VecTy && TTI.getLoadVectorFactor(VF, TySize, TySize / 8, VecTy) == 0)) 78565846Sbrian continue; 78681634Sbrian 78765846Sbrian // Make sure all the users of a vector are constant-index extracts. 78871781Sbrian if (isa<VectorType>(Ty) && !llvm::all_of(LI->users(), [](const User *U) { 78981634Sbrian const ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(U); 79065846Sbrian return EEI && isa<ConstantInt>(EEI->getOperand(1)); 79165846Sbrian })) 79265846Sbrian continue; 79365846Sbrian 79436961Sbrian // Save the load locations. 79536961Sbrian const ChainID ID = getChainID(Ptr, DL); 79681634Sbrian LoadRefs[ID].push_back(LI); 79736961Sbrian } else if (StoreInst *SI = dyn_cast<StoreInst>(&I)) { 79881634Sbrian if (!SI->isSimple()) 79962977Sbrian continue; 80036961Sbrian 80136961Sbrian // Skip if it's not legal. 80281634Sbrian if (!TTI.isLegalToVectorizeStore(SI)) 80336961Sbrian continue; 80436961Sbrian 80536961Sbrian Type *Ty = SI->getValueOperand()->getType(); 80651048Sbrian if (!VectorType::isValidElementType(Ty->getScalarType())) 8076059Samurai continue; 80881634Sbrian 80981634Sbrian // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain 81050867Sbrian // functions are currently using an integer type for the vectorized 81151048Sbrian // load/store, and does not support casting between the integer type and a 81281634Sbrian // vector of pointers (e.g. i64 to <2 x i16*>) 81381634Sbrian if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy()) 81450867Sbrian continue; 81550867Sbrian 81626692Sbrian // Skip weird non-byte sizes. They probably aren't worth the effort of 81781634Sbrian // handling correctly. 81828679Sbrian unsigned TySize = DL.getTypeSizeInBits(Ty); 81981634Sbrian if ((TySize % 8) != 0) 82028679Sbrian continue; 82128679Sbrian 82281634Sbrian Value *Ptr = SI->getPointerOperand(); 82328679Sbrian unsigned AS = Ptr->getType()->getPointerAddressSpace(); 8246059Samurai unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS); 8256059Samurai 82662977Sbrian unsigned VF = VecRegSize / TySize; 82762977Sbrian VectorType *VecTy = dyn_cast<VectorType>(Ty); 82862977Sbrian 82962977Sbrian // No point in looking at these if they're too big to vectorize. 83062977Sbrian if (TySize > VecRegSize / 2 || 8316059Samurai (VecTy && TTI.getStoreVectorFactor(VF, TySize, TySize / 8, VecTy) == 0)) 83228679Sbrian continue; 83362977Sbrian 83462977Sbrian if (isa<VectorType>(Ty) && !llvm::all_of(SI->users(), [](const User *U) { 83528679Sbrian const ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(U); 8366059Samurai return EEI && isa<ConstantInt>(EEI->getOperand(1)); 83781634Sbrian })) 8386059Samurai continue; 83981634Sbrian 84062977Sbrian // Save store location. 84162977Sbrian const ChainID ID = getChainID(Ptr, DL); 84262977Sbrian StoreRefs[ID].push_back(SI); 84362977Sbrian } 84462977Sbrian } 8456059Samurai 8466059Samurai return {LoadRefs, StoreRefs}; 8476059Samurai} 84862778Sbrian 84962778Sbrianbool Vectorizer::vectorizeChains(InstrListMap &Map) { 85062778Sbrian bool Changed = false; 85162778Sbrian 85281634Sbrian for (const std::pair<ChainID, InstrList> &Chain : Map) { 85381634Sbrian unsigned Size = Chain.second.size(); 85481634Sbrian if (Size < 2) 85581634Sbrian continue; 85681634Sbrian 85781634Sbrian LLVM_DEBUG(dbgs() << "LSV: Analyzing a chain of length " << Size << ".\n"); 85881634Sbrian 85981634Sbrian // Process the stores in chunks of 64. 86081634Sbrian for (unsigned CI = 0, CE = Size; CI < CE; CI += 64) { 86181634Sbrian unsigned Len = std::min<unsigned>(CE - CI, 64); 8626059Samurai ArrayRef<Instruction *> Chunk(&Chain.second[CI], Len); 86326692Sbrian Changed |= vectorizeInstructions(Chunk); 86481634Sbrian } 86531142Sbrian } 86636285Sbrian 86758033Sbrian return Changed; 8686059Samurai} 86936285Sbrian 87058033Sbrianbool Vectorizer::vectorizeInstructions(ArrayRef<Instruction *> Instrs) { 87162977Sbrian LLVM_DEBUG(dbgs() << "LSV: Vectorizing " << Instrs.size() 87262977Sbrian << " instructions.\n"); 87362977Sbrian SmallVector<int, 16> Heads, Tails; 87481634Sbrian int ConsecutiveChain[64]; 87581634Sbrian 87636285Sbrian // Do a quadratic search on all of the given loads/stores and find all of the 87762977Sbrian // pairs of loads/stores that follow each other. 87862977Sbrian for (int i = 0, e = Instrs.size(); i < e; ++i) { 87962977Sbrian ConsecutiveChain[i] = -1; 88062977Sbrian for (int j = e - 1; j >= 0; --j) { 88162977Sbrian if (i == j) 88298243Sbrian continue; 88362977Sbrian 88462977Sbrian if (isConsecutiveAccess(Instrs[i], Instrs[j])) { 88562977Sbrian if (ConsecutiveChain[i] != -1) { 88662977Sbrian int CurDistance = std::abs(ConsecutiveChain[i] - i); 88762977Sbrian int NewDistance = std::abs(ConsecutiveChain[i] - j); 88862977Sbrian if (j < i || NewDistance > CurDistance) 8896735Samurai continue; // Should not insert. 89058033Sbrian } 8916059Samurai 89258033Sbrian Tails.push_back(j); 89358776Sbrian Heads.push_back(i); 89458033Sbrian ConsecutiveChain[i] = j; 89558033Sbrian } 89658033Sbrian } 8976059Samurai } 8986059Samurai 89981634Sbrian bool Changed = false; 90081634Sbrian SmallPtrSet<Instruction *, 16> InstructionsProcessed; 90136285Sbrian 9026059Samurai for (int Head : Heads) { 90331343Sbrian if (InstructionsProcessed.count(Instrs[Head])) 90456413Sbrian continue; 90562977Sbrian bool LongerChainExists = false; 9066059Samurai for (unsigned TIt = 0; TIt < Tails.size(); TIt++) 90754912Sbrian if (Head == Tails[TIt] && 90847168Sbrian !InstructionsProcessed.count(Instrs[Heads[TIt]])) { 90947168Sbrian LongerChainExists = true; 91047168Sbrian break; 91154912Sbrian } 91281634Sbrian if (LongerChainExists) 91347168Sbrian continue; 91446686Sbrian 91526031Sbrian // We found an instr that starts a chain. Now follow the chain and try to 91662977Sbrian // vectorize it. 91781634Sbrian SmallVector<Instruction *, 16> Operands; 91881634Sbrian int I = Head; 91981634Sbrian while (I != -1 && (is_contained(Tails, I) || is_contained(Heads, I))) { 92026031Sbrian if (InstructionsProcessed.count(Instrs[I])) 92162977Sbrian break; 92281634Sbrian 92362977Sbrian Operands.push_back(Instrs[I]); 92462977Sbrian I = ConsecutiveChain[I]; 92562977Sbrian } 92662977Sbrian 92726031Sbrian bool Vectorized = false; 92856413Sbrian if (isa<LoadInst>(*Operands.begin())) 92981634Sbrian Vectorized = vectorizeLoadChain(Operands, &InstructionsProcessed); 93056413Sbrian else 93156413Sbrian Vectorized = vectorizeStoreChain(Operands, &InstructionsProcessed); 93256413Sbrian 93356413Sbrian Changed |= Vectorized; 93456413Sbrian } 93556413Sbrian 93646686Sbrian return Changed; 93746686Sbrian} 93847168Sbrian 93947168Sbrianbool Vectorizer::vectorizeStoreChain( 94046686Sbrian ArrayRef<Instruction *> Chain, 94147168Sbrian SmallPtrSet<Instruction *, 16> *InstructionsProcessed) { 94246686Sbrian StoreInst *S0 = cast<StoreInst>(Chain[0]); 94336285Sbrian 94481634Sbrian // If the vector has an int element, default to int for the whole store. 9456059Samurai Type *StoreTy = nullptr; 9466059Samurai for (Instruction *I : Chain) { 94781634Sbrian StoreTy = cast<StoreInst>(I)->getValueOperand()->getType(); 94881634Sbrian if (StoreTy->isIntOrIntVectorTy()) 9496059Samurai break; 95081634Sbrian 9516059Samurai if (StoreTy->isPtrOrPtrVectorTy()) { 95281634Sbrian StoreTy = Type::getIntNTy(F.getParent()->getContext(), 95381634Sbrian DL.getTypeSizeInBits(StoreTy)); 95481634Sbrian break; 95581634Sbrian } 95638557Sbrian } 9576059Samurai assert(StoreTy && "Failed to find store type"); 95881634Sbrian 95938544Sbrian unsigned Sz = DL.getTypeSizeInBits(StoreTy); 96081634Sbrian unsigned AS = S0->getPointerAddressSpace(); 96181634Sbrian unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS); 96238544Sbrian unsigned VF = VecRegSize / Sz; 96381634Sbrian unsigned ChainSize = Chain.size(); 9647001Samurai unsigned Alignment = getAlignment(S0); 9657001Samurai 96681634Sbrian if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) { 96781634Sbrian InstructionsProcessed->insert(Chain.begin(), Chain.end()); 96881634Sbrian return false; 9696059Samurai } 97081634Sbrian 9716059Samurai ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain); 97281897Sbrian if (NewChain.empty()) { 97381634Sbrian // No vectorization possible. 97481634Sbrian InstructionsProcessed->insert(Chain.begin(), Chain.end()); 97581634Sbrian return false; 97678411Sbrian } 97778411Sbrian if (NewChain.size() == 1) { 97881634Sbrian // Failed after the first instruction. Discard it and try the smaller chain. 97936285Sbrian InstructionsProcessed->insert(NewChain.front()); 98081634Sbrian return false; 98181634Sbrian } 98281634Sbrian 98381634Sbrian // Update Chain to the valid vectorizable subchain. 9846059Samurai Chain = NewChain; 98581634Sbrian ChainSize = Chain.size(); 986 987 // Check if it's legal to vectorize this chain. If not, split the chain and 988 // try again. 989 unsigned EltSzInBytes = Sz / 8; 990 unsigned SzInBytes = EltSzInBytes * ChainSize; 991 992 VectorType *VecTy; 993 VectorType *VecStoreTy = dyn_cast<VectorType>(StoreTy); 994 if (VecStoreTy) 995 VecTy = VectorType::get(StoreTy->getScalarType(), 996 Chain.size() * VecStoreTy->getNumElements()); 997 else 998 VecTy = VectorType::get(StoreTy, Chain.size()); 999 1000 // If it's more than the max vector size or the target has a better 1001 // vector factor, break it into two pieces. 1002 unsigned TargetVF = TTI.getStoreVectorFactor(VF, Sz, SzInBytes, VecTy); 1003 if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) { 1004 LLVM_DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor." 1005 " Creating two separate arrays.\n"); 1006 return vectorizeStoreChain(Chain.slice(0, TargetVF), 1007 InstructionsProcessed) | 1008 vectorizeStoreChain(Chain.slice(TargetVF), InstructionsProcessed); 1009 } 1010 1011 LLVM_DEBUG({ 1012 dbgs() << "LSV: Stores to vectorize:\n"; 1013 for (Instruction *I : Chain) 1014 dbgs() << " " << *I << "\n"; 1015 }); 1016 1017 // We won't try again to vectorize the elements of the chain, regardless of 1018 // whether we succeed below. 1019 InstructionsProcessed->insert(Chain.begin(), Chain.end()); 1020 1021 // If the store is going to be misaligned, don't vectorize it. 1022 if (accessIsMisaligned(SzInBytes, AS, Alignment)) { 1023 if (S0->getPointerAddressSpace() != DL.getAllocaAddrSpace()) { 1024 auto Chains = splitOddVectorElts(Chain, Sz); 1025 return vectorizeStoreChain(Chains.first, InstructionsProcessed) | 1026 vectorizeStoreChain(Chains.second, InstructionsProcessed); 1027 } 1028 1029 unsigned NewAlign = getOrEnforceKnownAlignment(S0->getPointerOperand(), 1030 StackAdjustedAlignment, 1031 DL, S0, nullptr, &DT); 1032 if (NewAlign != 0) 1033 Alignment = NewAlign; 1034 } 1035 1036 if (!TTI.isLegalToVectorizeStoreChain(SzInBytes, Alignment, AS)) { 1037 auto Chains = splitOddVectorElts(Chain, Sz); 1038 return vectorizeStoreChain(Chains.first, InstructionsProcessed) | 1039 vectorizeStoreChain(Chains.second, InstructionsProcessed); 1040 } 1041 1042 BasicBlock::iterator First, Last; 1043 std::tie(First, Last) = getBoundaryInstrs(Chain); 1044 Builder.SetInsertPoint(&*Last); 1045 1046 Value *Vec = UndefValue::get(VecTy); 1047 1048 if (VecStoreTy) { 1049 unsigned VecWidth = VecStoreTy->getNumElements(); 1050 for (unsigned I = 0, E = Chain.size(); I != E; ++I) { 1051 StoreInst *Store = cast<StoreInst>(Chain[I]); 1052 for (unsigned J = 0, NE = VecStoreTy->getNumElements(); J != NE; ++J) { 1053 unsigned NewIdx = J + I * VecWidth; 1054 Value *Extract = Builder.CreateExtractElement(Store->getValueOperand(), 1055 Builder.getInt32(J)); 1056 if (Extract->getType() != StoreTy->getScalarType()) 1057 Extract = Builder.CreateBitCast(Extract, StoreTy->getScalarType()); 1058 1059 Value *Insert = 1060 Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(NewIdx)); 1061 Vec = Insert; 1062 } 1063 } 1064 } else { 1065 for (unsigned I = 0, E = Chain.size(); I != E; ++I) { 1066 StoreInst *Store = cast<StoreInst>(Chain[I]); 1067 Value *Extract = Store->getValueOperand(); 1068 if (Extract->getType() != StoreTy->getScalarType()) 1069 Extract = 1070 Builder.CreateBitOrPointerCast(Extract, StoreTy->getScalarType()); 1071 1072 Value *Insert = 1073 Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(I)); 1074 Vec = Insert; 1075 } 1076 } 1077 1078 StoreInst *SI = Builder.CreateAlignedStore( 1079 Vec, 1080 Builder.CreateBitCast(S0->getPointerOperand(), VecTy->getPointerTo(AS)), 1081 Alignment); 1082 propagateMetadata(SI, Chain); 1083 1084 eraseInstructions(Chain); 1085 ++NumVectorInstructions; 1086 NumScalarsVectorized += Chain.size(); 1087 return true; 1088} 1089 1090bool Vectorizer::vectorizeLoadChain( 1091 ArrayRef<Instruction *> Chain, 1092 SmallPtrSet<Instruction *, 16> *InstructionsProcessed) { 1093 LoadInst *L0 = cast<LoadInst>(Chain[0]); 1094 1095 // If the vector has an int element, default to int for the whole load. 1096 Type *LoadTy = nullptr; 1097 for (const auto &V : Chain) { 1098 LoadTy = cast<LoadInst>(V)->getType(); 1099 if (LoadTy->isIntOrIntVectorTy()) 1100 break; 1101 1102 if (LoadTy->isPtrOrPtrVectorTy()) { 1103 LoadTy = Type::getIntNTy(F.getParent()->getContext(), 1104 DL.getTypeSizeInBits(LoadTy)); 1105 break; 1106 } 1107 } 1108 assert(LoadTy && "Can't determine LoadInst type from chain"); 1109 1110 unsigned Sz = DL.getTypeSizeInBits(LoadTy); 1111 unsigned AS = L0->getPointerAddressSpace(); 1112 unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS); 1113 unsigned VF = VecRegSize / Sz; 1114 unsigned ChainSize = Chain.size(); 1115 unsigned Alignment = getAlignment(L0); 1116 1117 if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) { 1118 InstructionsProcessed->insert(Chain.begin(), Chain.end()); 1119 return false; 1120 } 1121 1122 ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain); 1123 if (NewChain.empty()) { 1124 // No vectorization possible. 1125 InstructionsProcessed->insert(Chain.begin(), Chain.end()); 1126 return false; 1127 } 1128 if (NewChain.size() == 1) { 1129 // Failed after the first instruction. Discard it and try the smaller chain. 1130 InstructionsProcessed->insert(NewChain.front()); 1131 return false; 1132 } 1133 1134 // Update Chain to the valid vectorizable subchain. 1135 Chain = NewChain; 1136 ChainSize = Chain.size(); 1137 1138 // Check if it's legal to vectorize this chain. If not, split the chain and 1139 // try again. 1140 unsigned EltSzInBytes = Sz / 8; 1141 unsigned SzInBytes = EltSzInBytes * ChainSize; 1142 VectorType *VecTy; 1143 VectorType *VecLoadTy = dyn_cast<VectorType>(LoadTy); 1144 if (VecLoadTy) 1145 VecTy = VectorType::get(LoadTy->getScalarType(), 1146 Chain.size() * VecLoadTy->getNumElements()); 1147 else 1148 VecTy = VectorType::get(LoadTy, Chain.size()); 1149 1150 // If it's more than the max vector size or the target has a better 1151 // vector factor, break it into two pieces. 1152 unsigned TargetVF = TTI.getLoadVectorFactor(VF, Sz, SzInBytes, VecTy); 1153 if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) { 1154 LLVM_DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor." 1155 " Creating two separate arrays.\n"); 1156 return vectorizeLoadChain(Chain.slice(0, TargetVF), InstructionsProcessed) | 1157 vectorizeLoadChain(Chain.slice(TargetVF), InstructionsProcessed); 1158 } 1159 1160 // We won't try again to vectorize the elements of the chain, regardless of 1161 // whether we succeed below. 1162 InstructionsProcessed->insert(Chain.begin(), Chain.end()); 1163 1164 // If the load is going to be misaligned, don't vectorize it. 1165 if (accessIsMisaligned(SzInBytes, AS, Alignment)) { 1166 if (L0->getPointerAddressSpace() != DL.getAllocaAddrSpace()) { 1167 auto Chains = splitOddVectorElts(Chain, Sz); 1168 return vectorizeLoadChain(Chains.first, InstructionsProcessed) | 1169 vectorizeLoadChain(Chains.second, InstructionsProcessed); 1170 } 1171 1172 Alignment = getOrEnforceKnownAlignment( 1173 L0->getPointerOperand(), StackAdjustedAlignment, DL, L0, nullptr, &DT); 1174 } 1175 1176 if (!TTI.isLegalToVectorizeLoadChain(SzInBytes, Alignment, AS)) { 1177 auto Chains = splitOddVectorElts(Chain, Sz); 1178 return vectorizeLoadChain(Chains.first, InstructionsProcessed) | 1179 vectorizeLoadChain(Chains.second, InstructionsProcessed); 1180 } 1181 1182 LLVM_DEBUG({ 1183 dbgs() << "LSV: Loads to vectorize:\n"; 1184 for (Instruction *I : Chain) 1185 I->dump(); 1186 }); 1187 1188 // getVectorizablePrefix already computed getBoundaryInstrs. The value of 1189 // Last may have changed since then, but the value of First won't have. If it 1190 // matters, we could compute getBoundaryInstrs only once and reuse it here. 1191 BasicBlock::iterator First, Last; 1192 std::tie(First, Last) = getBoundaryInstrs(Chain); 1193 Builder.SetInsertPoint(&*First); 1194 1195 Value *Bitcast = 1196 Builder.CreateBitCast(L0->getPointerOperand(), VecTy->getPointerTo(AS)); 1197 LoadInst *LI = Builder.CreateAlignedLoad(VecTy, Bitcast, Alignment); 1198 propagateMetadata(LI, Chain); 1199 1200 if (VecLoadTy) { 1201 SmallVector<Instruction *, 16> InstrsToErase; 1202 1203 unsigned VecWidth = VecLoadTy->getNumElements(); 1204 for (unsigned I = 0, E = Chain.size(); I != E; ++I) { 1205 for (auto Use : Chain[I]->users()) { 1206 // All users of vector loads are ExtractElement instructions with 1207 // constant indices, otherwise we would have bailed before now. 1208 Instruction *UI = cast<Instruction>(Use); 1209 unsigned Idx = cast<ConstantInt>(UI->getOperand(1))->getZExtValue(); 1210 unsigned NewIdx = Idx + I * VecWidth; 1211 Value *V = Builder.CreateExtractElement(LI, Builder.getInt32(NewIdx), 1212 UI->getName()); 1213 if (V->getType() != UI->getType()) 1214 V = Builder.CreateBitCast(V, UI->getType()); 1215 1216 // Replace the old instruction. 1217 UI->replaceAllUsesWith(V); 1218 InstrsToErase.push_back(UI); 1219 } 1220 } 1221 1222 // Bitcast might not be an Instruction, if the value being loaded is a 1223 // constant. In that case, no need to reorder anything. 1224 if (Instruction *BitcastInst = dyn_cast<Instruction>(Bitcast)) 1225 reorder(BitcastInst); 1226 1227 for (auto I : InstrsToErase) 1228 I->eraseFromParent(); 1229 } else { 1230 for (unsigned I = 0, E = Chain.size(); I != E; ++I) { 1231 Value *CV = Chain[I]; 1232 Value *V = 1233 Builder.CreateExtractElement(LI, Builder.getInt32(I), CV->getName()); 1234 if (V->getType() != CV->getType()) { 1235 V = Builder.CreateBitOrPointerCast(V, CV->getType()); 1236 } 1237 1238 // Replace the old instruction. 1239 CV->replaceAllUsesWith(V); 1240 } 1241 1242 if (Instruction *BitcastInst = dyn_cast<Instruction>(Bitcast)) 1243 reorder(BitcastInst); 1244 } 1245 1246 eraseInstructions(Chain); 1247 1248 ++NumVectorInstructions; 1249 NumScalarsVectorized += Chain.size(); 1250 return true; 1251} 1252 1253bool Vectorizer::accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace, 1254 unsigned Alignment) { 1255 if (Alignment % SzInBytes == 0) 1256 return false; 1257 1258 bool Fast = false; 1259 bool Allows = TTI.allowsMisalignedMemoryAccesses(F.getParent()->getContext(), 1260 SzInBytes * 8, AddressSpace, 1261 Alignment, &Fast); 1262 LLVM_DEBUG(dbgs() << "LSV: Target said misaligned is allowed? " << Allows 1263 << " and fast? " << Fast << "\n";); 1264 return !Allows || !Fast; 1265} 1266