//===- LoadStoreVectorizer.cpp - GPU Load & Store Vectorizer --------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass merges loads/stores to/from sequential memory addresses into vector
// loads/stores. Although there's nothing GPU-specific in here, this pass is
// motivated by the microarchitectural quirks of NVIDIA and AMD GPUs.
//
// (For simplicity below we talk about loads only, but everything also applies
// to stores.)
//
// This pass is intended to be run late in the pipeline, after other
// vectorization opportunities have been exploited. So the assumption here is
// that immediately following our new vector load we'll need to extract out the
// individual elements of the load, so we can operate on them individually.
//
// On CPUs this transformation is usually not beneficial, because extracting the
// elements of a vector register is expensive on most architectures. It's
// usually better just to load each element individually into its own scalar
// register.
//
// However, NVIDIA and AMD GPUs don't have proper vector registers. Instead, a
// "vector load" loads directly into a series of scalar registers. In effect,
// extracting the elements of the vector is free. It's therefore always
// beneficial to vectorize a sequence of loads on these architectures.
//
// Vectorizing (perhaps a better name might be "coalescing") loads can have
// large performance impacts on GPU kernels, and opportunities for vectorizing
// are common in GPU code. This pass tries very hard to find such
// opportunities; its runtime is quadratic in the number of loads in a BB.
//
// Some CPU architectures, such as ARM, have instructions that load into
// multiple scalar registers, similar to a GPU vectorized load. In theory ARM
// could use this pass (with some modifications), but currently it implements
// its own pass to do something similar to what we do here.
//
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Vectorize.h"
#include <algorithm>
#include <cassert>
#include <cstdlib>
#include <tuple>
#include <utility>

using namespace llvm;

#define DEBUG_TYPE "load-store-vectorizer"

STATISTIC(NumVectorInstructions, "Number of vector accesses generated");
STATISTIC(NumScalarsVectorized, "Number of scalar accesses vectorized");

// FIXME: Assuming stack alignment of 4 is always good enough
static const unsigned StackAdjustedAlignment = 4;

namespace {

/// ChainID is an arbitrary token that is allowed to be different only for the
/// accesses that are guaranteed to be considered non-consecutive by
/// Vectorizer::isConsecutiveAccess. It's used for grouping instructions
/// together and reducing the number of instructions the main search operates on
/// at a time, i.e. this is to reduce compile time and nothing else as the main
/// search has O(n^2) time complexity. The underlying type of ChainID should not
/// be relied upon.
using ChainID = const Value *;
using InstrList = SmallVector<Instruction *, 8>;
using InstrListMap = MapVector<ChainID, InstrList>;

class Vectorizer {
  Function &F;
  AliasAnalysis &AA;
  AssumptionCache &AC;
  DominatorTree &DT;
  ScalarEvolution &SE;
  TargetTransformInfo &TTI;
  const DataLayout &DL;
  IRBuilder<> Builder;

public:
  Vectorizer(Function &F, AliasAnalysis &AA, AssumptionCache &AC,
             DominatorTree &DT, ScalarEvolution &SE, TargetTransformInfo &TTI)
      : F(F), AA(AA), AC(AC), DT(DT), SE(SE), TTI(TTI),
        DL(F.getParent()->getDataLayout()), Builder(SE.getContext()) {}

  bool run();

private:
  unsigned getPointerAddressSpace(Value *I);

  static const unsigned MaxDepth = 3;

  bool isConsecutiveAccess(Value *A, Value *B);
  bool areConsecutivePointers(Value *PtrA, Value *PtrB, APInt PtrDelta,
                              unsigned Depth = 0) const;
  bool lookThroughComplexAddresses(Value *PtrA, Value *PtrB, APInt PtrDelta,
                                   unsigned Depth) const;
  bool lookThroughSelects(Value *PtrA, Value *PtrB, const APInt &PtrDelta,
                          unsigned Depth) const;

  /// After vectorization, reorder the instructions that I depends on
  /// (the instructions defining its operands), to ensure they dominate I.
  void reorder(Instruction *I);

  /// Returns the first and the last instructions in Chain.
  std::pair<BasicBlock::iterator, BasicBlock::iterator>
  getBoundaryInstrs(ArrayRef<Instruction *> Chain);

  /// Erases the original instructions after vectorizing.
  void eraseInstructions(ArrayRef<Instruction *> Chain);

  /// "Legalize" the vector type that would be produced by combining \p
  /// ElementSizeBits elements in \p Chain. Break into two pieces such that the
  /// total size of each piece is 1, 2 or a multiple of 4 bytes. \p Chain is
  /// expected to have more than 4 elements.
  std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>>
  splitOddVectorElts(ArrayRef<Instruction *> Chain, unsigned ElementSizeBits);

  /// Finds the largest prefix of Chain that's vectorizable, checking for
  /// intervening instructions which may affect the memory accessed by the
  /// instructions within Chain.
  ///
  /// The elements of \p Chain must be all loads or all stores and must be in
  /// address order.
  ArrayRef<Instruction *> getVectorizablePrefix(ArrayRef<Instruction *> Chain);

  /// Collects load and store instructions to vectorize.
  std::pair<InstrListMap, InstrListMap> collectInstructions(BasicBlock *BB);

  /// Processes the collected instructions, the \p Map. The values of \p Map
  /// should be all loads or all stores.
  bool vectorizeChains(InstrListMap &Map);

  /// Finds loads/stores to consecutive memory addresses and vectorizes them.
  bool vectorizeInstructions(ArrayRef<Instruction *> Instrs);

  /// Vectorizes the load instructions in Chain.
  bool
  vectorizeLoadChain(ArrayRef<Instruction *> Chain,
                     SmallPtrSet<Instruction *, 16> *InstructionsProcessed);

  /// Vectorizes the store instructions in Chain.
  bool
  vectorizeStoreChain(ArrayRef<Instruction *> Chain,
                      SmallPtrSet<Instruction *, 16> *InstructionsProcessed);

  /// Checks whether this load/store access is misaligned. Returns the
  /// \p RelativeSpeed of the operation if it is allowed, suitable for
  /// comparison against another result for the same \p AddressSpace and
  /// potentially different \p Alignment and \p SzInBytes.
  bool accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace,
                          Align Alignment, unsigned &RelativeSpeed);
};

class LoadStoreVectorizerLegacyPass : public FunctionPass {
public:
  static char ID;

  LoadStoreVectorizerLegacyPass() : FunctionPass(ID) {
    initializeLoadStoreVectorizerLegacyPassPass(*PassRegistry::getPassRegistry());
  }

  bool runOnFunction(Function &F) override;

  StringRef getPassName() const override {
    return "GPU Load and Store Vectorizer";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AAResultsWrapperPass>();
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<ScalarEvolutionWrapperPass>();
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addRequired<TargetTransformInfoWrapperPass>();
    AU.setPreservesCFG();
  }
};

} // end anonymous namespace

char LoadStoreVectorizerLegacyPass::ID = 0;

INITIALIZE_PASS_BEGIN(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
                      "Vectorize load and store instructions", false, false)
INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
                    "Vectorize load and store instructions", false, false)

Pass *llvm::createLoadStoreVectorizerPass() {
  return new LoadStoreVectorizerLegacyPass();
}

bool LoadStoreVectorizerLegacyPass::runOnFunction(Function &F) {
  // Don't vectorize when the attribute NoImplicitFloat is used.
  if (skipFunction(F) || F.hasFnAttribute(Attribute::NoImplicitFloat))
    return false;

  AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
  DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
  TargetTransformInfo &TTI =
      getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);

  AssumptionCache &AC =
      getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);

  Vectorizer V(F, AA, AC, DT, SE, TTI);
  return V.run();
}

PreservedAnalyses LoadStoreVectorizerPass::run(Function &F,
                                               FunctionAnalysisManager &AM) {
  // Don't vectorize when the attribute NoImplicitFloat is used.
  if (F.hasFnAttribute(Attribute::NoImplicitFloat))
    return PreservedAnalyses::all();

  AliasAnalysis &AA = AM.getResult<AAManager>(F);
  DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
  ScalarEvolution &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
  AssumptionCache &AC = AM.getResult<AssumptionAnalysis>(F);

  Vectorizer V(F, AA, AC, DT, SE, TTI);
  bool Changed = V.run();
  PreservedAnalyses PA;
  PA.preserveSet<CFGAnalyses>();
  return Changed ? PA : PreservedAnalyses::all();
}

// The real propagateMetadata expects a SmallVector<Value*>, but we deal in
// vectors of Instructions.
static void propagateMetadata(Instruction *I, ArrayRef<Instruction *> IL) {
  SmallVector<Value *, 8> VL(IL.begin(), IL.end());
  propagateMetadata(I, VL);
}

// Vectorizer Implementation
bool Vectorizer::run() {
  bool Changed = false;

  // Scan the blocks in the function in post order.
  for (BasicBlock *BB : post_order(&F)) {
    InstrListMap LoadRefs, StoreRefs;
    std::tie(LoadRefs, StoreRefs) = collectInstructions(BB);
    Changed |= vectorizeChains(LoadRefs);
    Changed |= vectorizeChains(StoreRefs);
  }

  return Changed;
}

unsigned Vectorizer::getPointerAddressSpace(Value *I) {
  if (LoadInst *L = dyn_cast<LoadInst>(I))
    return L->getPointerAddressSpace();
  if (StoreInst *S = dyn_cast<StoreInst>(I))
    return S->getPointerAddressSpace();
  return -1;
}

// FIXME: Merge with llvm::isConsecutiveAccess
bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) {
  Value *PtrA = getLoadStorePointerOperand(A);
  Value *PtrB = getLoadStorePointerOperand(B);
  unsigned ASA = getPointerAddressSpace(A);
  unsigned ASB = getPointerAddressSpace(B);

  // Check that the address spaces match and that the pointers are valid.
  if (!PtrA || !PtrB || (ASA != ASB))
    return false;

  // Make sure that A and B are different pointers of the same size type.
  Type *PtrATy = getLoadStoreType(A);
  Type *PtrBTy = getLoadStoreType(B);
  if (PtrA == PtrB ||
      PtrATy->isVectorTy() != PtrBTy->isVectorTy() ||
      DL.getTypeStoreSize(PtrATy) != DL.getTypeStoreSize(PtrBTy) ||
      DL.getTypeStoreSize(PtrATy->getScalarType()) !=
          DL.getTypeStoreSize(PtrBTy->getScalarType()))
    return false;

  unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA);
  APInt Size(PtrBitWidth, DL.getTypeStoreSize(PtrATy));

  return areConsecutivePointers(PtrA, PtrB, Size);
}

bool Vectorizer::areConsecutivePointers(Value *PtrA, Value *PtrB,
                                        APInt PtrDelta, unsigned Depth) const {
  unsigned PtrBitWidth = DL.getPointerTypeSizeInBits(PtrA->getType());
  APInt OffsetA(PtrBitWidth, 0);
  APInt OffsetB(PtrBitWidth, 0);
  PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
  PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);

  unsigned NewPtrBitWidth = DL.getTypeStoreSizeInBits(PtrA->getType());

  if (NewPtrBitWidth != DL.getTypeStoreSizeInBits(PtrB->getType()))
    return false;

  // In the case where we have to shrink the pointer,
  // stripAndAccumulateInBoundsConstantOffsets should properly handle a
  // possible overflow and the value should fit into the smallest data type
  // used in the cast/gep chain.
  assert(OffsetA.getMinSignedBits() <= NewPtrBitWidth &&
         OffsetB.getMinSignedBits() <= NewPtrBitWidth);

  OffsetA = OffsetA.sextOrTrunc(NewPtrBitWidth);
  OffsetB = OffsetB.sextOrTrunc(NewPtrBitWidth);
  PtrDelta = PtrDelta.sextOrTrunc(NewPtrBitWidth);

  APInt OffsetDelta = OffsetB - OffsetA;

  // Check if they are based on the same pointer. That makes the offsets
  // sufficient.
  if (PtrA == PtrB)
    return OffsetDelta == PtrDelta;

  // Compute the necessary base pointer delta to have the necessary final delta
  // equal to the pointer delta requested.
  APInt BaseDelta = PtrDelta - OffsetDelta;

  // Compute the distance with SCEV between the base pointers.
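  // If SCEV can prove that PtrA + BaseDelta folds to the same expression as
  // PtrB, the two base pointers are exactly BaseDelta bytes apart.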
  const SCEV *PtrSCEVA = SE.getSCEV(PtrA);
  const SCEV *PtrSCEVB = SE.getSCEV(PtrB);
  const SCEV *C = SE.getConstant(BaseDelta);
  const SCEV *X = SE.getAddExpr(PtrSCEVA, C);
  if (X == PtrSCEVB)
    return true;

  // The above check will not catch the cases where one of the pointers is
  // factorized but the other one is not, such as (C + (S * (A + B))) vs
  // (AS + BS). Get the minus scev. That will allow re-combining the expressions
  // and getting the simplified difference.
  const SCEV *Dist = SE.getMinusSCEV(PtrSCEVB, PtrSCEVA);
  if (C == Dist)
    return true;

  // Sometimes even this doesn't work, because SCEV can't always see through
  // patterns that look like (gep (ext (add (shl X, C1), C2))). Try checking
  // things the hard way.
  return lookThroughComplexAddresses(PtrA, PtrB, BaseDelta, Depth);
}

static bool checkNoWrapFlags(Instruction *I, bool Signed) {
  BinaryOperator *BinOpI = cast<BinaryOperator>(I);
  return (Signed && BinOpI->hasNoSignedWrap()) ||
         (!Signed && BinOpI->hasNoUnsignedWrap());
}

static bool checkIfSafeAddSequence(const APInt &IdxDiff, Instruction *AddOpA,
                                   unsigned MatchingOpIdxA, Instruction *AddOpB,
                                   unsigned MatchingOpIdxB, bool Signed) {
  // If both OpA and OpB are adds with NSW/NUW and one of the operands is the
  // same, we can guarantee that the transformation is safe if we can prove
  // that OpA won't overflow when IdxDiff is added to the other operand of OpA.
  // For example:
  //  %tmp7 = add nsw i32 %tmp2, %v0
  //  %tmp8 = sext i32 %tmp7 to i64
  //  ...
  //  %tmp11 = add nsw i32 %v0, 1
  //  %tmp12 = add nsw i32 %tmp2, %tmp11
  //  %tmp13 = sext i32 %tmp12 to i64
  //
  // Both %tmp7 and %tmp12 have the nsw flag and the first operand
  // is %tmp2. It's guaranteed that adding 1 to %tmp7 won't overflow
  // because %tmp11 adds 1 to %v0 and both %tmp11 and %tmp12 have the
  // nsw flag.
  assert(AddOpA->getOpcode() == Instruction::Add &&
         AddOpB->getOpcode() == Instruction::Add &&
         checkNoWrapFlags(AddOpA, Signed) && checkNoWrapFlags(AddOpB, Signed));
  if (AddOpA->getOperand(MatchingOpIdxA) ==
      AddOpB->getOperand(MatchingOpIdxB)) {
    Value *OtherOperandA = AddOpA->getOperand(MatchingOpIdxA == 1 ? 0 : 1);
    Value *OtherOperandB = AddOpB->getOperand(MatchingOpIdxB == 1 ? 0 : 1);
    Instruction *OtherInstrA = dyn_cast<Instruction>(OtherOperandA);
    Instruction *OtherInstrB = dyn_cast<Instruction>(OtherOperandB);
    // Match `x +nsw/nuw y` and `x +nsw/nuw (y +nsw/nuw IdxDiff)`.
    if (OtherInstrB && OtherInstrB->getOpcode() == Instruction::Add &&
        checkNoWrapFlags(OtherInstrB, Signed) &&
        isa<ConstantInt>(OtherInstrB->getOperand(1))) {
      int64_t CstVal =
          cast<ConstantInt>(OtherInstrB->getOperand(1))->getSExtValue();
      if (OtherInstrB->getOperand(0) == OtherOperandA &&
          IdxDiff.getSExtValue() == CstVal)
        return true;
    }
    // Match `x +nsw/nuw (y +nsw/nuw -IdxDiff)` and `x +nsw/nuw y`.
    if (OtherInstrA && OtherInstrA->getOpcode() == Instruction::Add &&
        checkNoWrapFlags(OtherInstrA, Signed) &&
        isa<ConstantInt>(OtherInstrA->getOperand(1))) {
      int64_t CstVal =
          cast<ConstantInt>(OtherInstrA->getOperand(1))->getSExtValue();
      if (OtherInstrA->getOperand(0) == OtherOperandB &&
          IdxDiff.getSExtValue() == -CstVal)
        return true;
    }
    // Match `x +nsw/nuw (y +nsw/nuw c)` and
    // `x +nsw/nuw (y +nsw/nuw (c + IdxDiff))`.
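    // Here both other operands are adds of the same base value with different
    // constants, so the accesses are IdxDiff apart exactly when the constants
    // differ by IdxDiff.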
    if (OtherInstrA && OtherInstrB &&
        OtherInstrA->getOpcode() == Instruction::Add &&
        OtherInstrB->getOpcode() == Instruction::Add &&
        checkNoWrapFlags(OtherInstrA, Signed) &&
        checkNoWrapFlags(OtherInstrB, Signed) &&
        isa<ConstantInt>(OtherInstrA->getOperand(1)) &&
        isa<ConstantInt>(OtherInstrB->getOperand(1))) {
      int64_t CstValA =
          cast<ConstantInt>(OtherInstrA->getOperand(1))->getSExtValue();
      int64_t CstValB =
          cast<ConstantInt>(OtherInstrB->getOperand(1))->getSExtValue();
      if (OtherInstrA->getOperand(0) == OtherInstrB->getOperand(0) &&
          IdxDiff.getSExtValue() == (CstValB - CstValA))
        return true;
    }
  }
  return false;
}

bool Vectorizer::lookThroughComplexAddresses(Value *PtrA, Value *PtrB,
                                             APInt PtrDelta,
                                             unsigned Depth) const {
  auto *GEPA = dyn_cast<GetElementPtrInst>(PtrA);
  auto *GEPB = dyn_cast<GetElementPtrInst>(PtrB);
  if (!GEPA || !GEPB)
    return lookThroughSelects(PtrA, PtrB, PtrDelta, Depth);

  // Look through GEPs after checking they're the same except for the last
  // index.
  if (GEPA->getNumOperands() != GEPB->getNumOperands() ||
      GEPA->getPointerOperand() != GEPB->getPointerOperand())
    return false;
  gep_type_iterator GTIA = gep_type_begin(GEPA);
  gep_type_iterator GTIB = gep_type_begin(GEPB);
  for (unsigned I = 0, E = GEPA->getNumIndices() - 1; I < E; ++I) {
    if (GTIA.getOperand() != GTIB.getOperand())
      return false;
    ++GTIA;
    ++GTIB;
  }

  Instruction *OpA = dyn_cast<Instruction>(GTIA.getOperand());
  Instruction *OpB = dyn_cast<Instruction>(GTIB.getOperand());
  if (!OpA || !OpB || OpA->getOpcode() != OpB->getOpcode() ||
      OpA->getType() != OpB->getType())
    return false;

  if (PtrDelta.isNegative()) {
    if (PtrDelta.isMinSignedValue())
      return false;
    PtrDelta.negate();
    std::swap(OpA, OpB);
  }
  uint64_t Stride = DL.getTypeAllocSize(GTIA.getIndexedType());
  if (PtrDelta.urem(Stride) != 0)
    return false;
  unsigned IdxBitWidth = OpA->getType()->getScalarSizeInBits();
  APInt IdxDiff = PtrDelta.udiv(Stride).zext(IdxBitWidth);

  // Only look through a ZExt/SExt.
  if (!isa<SExtInst>(OpA) && !isa<ZExtInst>(OpA))
    return false;

  bool Signed = isa<SExtInst>(OpA);

  // At this point A could be a function parameter, i.e. not an instruction.
  Value *ValA = OpA->getOperand(0);
  OpB = dyn_cast<Instruction>(OpB->getOperand(0));
  if (!OpB || ValA->getType() != OpB->getType())
    return false;

  // Now we need to prove that adding IdxDiff to ValA won't overflow.
  bool Safe = false;

  // First attempt: if OpB is an add with NSW/NUW, and OpB is IdxDiff added to
  // ValA, we're okay.
  if (OpB->getOpcode() == Instruction::Add &&
      isa<ConstantInt>(OpB->getOperand(1)) &&
      IdxDiff.sle(cast<ConstantInt>(OpB->getOperand(1))->getSExtValue()) &&
      checkNoWrapFlags(OpB, Signed))
    Safe = true;

  // Second attempt: check if we have eligible add NSW/NUW instruction
  // sequences.
  OpA = dyn_cast<Instruction>(ValA);
  if (!Safe && OpA && OpA->getOpcode() == Instruction::Add &&
      OpB->getOpcode() == Instruction::Add && checkNoWrapFlags(OpA, Signed) &&
      checkNoWrapFlags(OpB, Signed)) {
    // In the checks below a matching operand in OpA and OpB is
    // an operand which is the same in those two instructions.
    // Below we account for possible orders of the operands of
    // these add instructions.
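    // Try every combination of operand positions in OpA and OpB, keeping the
    // first one that checkIfSafeAddSequence accepts.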
    for (unsigned MatchingOpIdxA : {0, 1})
      for (unsigned MatchingOpIdxB : {0, 1})
        if (!Safe)
          Safe = checkIfSafeAddSequence(IdxDiff, OpA, MatchingOpIdxA, OpB,
                                        MatchingOpIdxB, Signed);
  }

  unsigned BitWidth = ValA->getType()->getScalarSizeInBits();

  // Third attempt:
  // If all set bits of IdxDiff or any higher order bit other than the sign bit
  // are known to be zero in ValA, we can add Diff to it while guaranteeing no
  // overflow of any sort.
  if (!Safe) {
    KnownBits Known(BitWidth);
    computeKnownBits(ValA, Known, DL, 0, &AC, OpB, &DT);
    APInt BitsAllowedToBeSet = Known.Zero.zext(IdxDiff.getBitWidth());
    if (Signed)
      BitsAllowedToBeSet.clearBit(BitWidth - 1);
    if (BitsAllowedToBeSet.ult(IdxDiff))
      return false;
  }

  const SCEV *OffsetSCEVA = SE.getSCEV(ValA);
  const SCEV *OffsetSCEVB = SE.getSCEV(OpB);
  const SCEV *C = SE.getConstant(IdxDiff.trunc(BitWidth));
  const SCEV *X = SE.getAddExpr(OffsetSCEVA, C);
  return X == OffsetSCEVB;
}

bool Vectorizer::lookThroughSelects(Value *PtrA, Value *PtrB,
                                    const APInt &PtrDelta,
                                    unsigned Depth) const {
  if (Depth++ == MaxDepth)
    return false;

  if (auto *SelectA = dyn_cast<SelectInst>(PtrA)) {
    if (auto *SelectB = dyn_cast<SelectInst>(PtrB)) {
      return SelectA->getCondition() == SelectB->getCondition() &&
             areConsecutivePointers(SelectA->getTrueValue(),
                                    SelectB->getTrueValue(), PtrDelta, Depth) &&
             areConsecutivePointers(SelectA->getFalseValue(),
                                    SelectB->getFalseValue(), PtrDelta, Depth);
    }
  }
  return false;
}

void Vectorizer::reorder(Instruction *I) {
  SmallPtrSet<Instruction *, 16> InstructionsToMove;
  SmallVector<Instruction *, 16> Worklist;

  Worklist.push_back(I);
  while (!Worklist.empty()) {
    Instruction *IW = Worklist.pop_back_val();
    int NumOperands = IW->getNumOperands();
    for (int i = 0; i < NumOperands; i++) {
      Instruction *IM = dyn_cast<Instruction>(IW->getOperand(i));
      if (!IM || IM->getOpcode() == Instruction::PHI)
        continue;

      // If IM is in another BB, no need to move it, because this pass only
      // vectorizes instructions within one BB.
      if (IM->getParent() != I->getParent())
        continue;

      if (!IM->comesBefore(I)) {
        InstructionsToMove.insert(IM);
        Worklist.push_back(IM);
      }
    }
  }

  // All instructions to move should follow I. Start from I, not from begin().
  for (auto BBI = I->getIterator(), E = I->getParent()->end(); BBI != E;
       ++BBI) {
    if (!InstructionsToMove.count(&*BBI))
      continue;
    Instruction *IM = &*BBI;
    --BBI;
    IM->removeFromParent();
    IM->insertBefore(I);
  }
}

std::pair<BasicBlock::iterator, BasicBlock::iterator>
Vectorizer::getBoundaryInstrs(ArrayRef<Instruction *> Chain) {
  Instruction *C0 = Chain[0];
  BasicBlock::iterator FirstInstr = C0->getIterator();
  BasicBlock::iterator LastInstr = C0->getIterator();

  BasicBlock *BB = C0->getParent();
  unsigned NumFound = 0;
  for (Instruction &I : *BB) {
    if (!is_contained(Chain, &I))
      continue;

    ++NumFound;
    if (NumFound == 1) {
      FirstInstr = I.getIterator();
    }
    if (NumFound == Chain.size()) {
      LastInstr = I.getIterator();
      break;
    }
  }

  // Range is [first, last).
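  // Incrementing LastInstr makes the pair usable directly with make_range.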
  return std::make_pair(FirstInstr, ++LastInstr);
}

void Vectorizer::eraseInstructions(ArrayRef<Instruction *> Chain) {
  SmallVector<Instruction *, 16> Instrs;
  for (Instruction *I : Chain) {
    Value *PtrOperand = getLoadStorePointerOperand(I);
    assert(PtrOperand && "Instruction must have a pointer operand.");
    Instrs.push_back(I);
    if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(PtrOperand))
      Instrs.push_back(GEP);
  }

  // Erase instructions.
  for (Instruction *I : Instrs)
    if (I->use_empty())
      I->eraseFromParent();
}

std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>>
Vectorizer::splitOddVectorElts(ArrayRef<Instruction *> Chain,
                               unsigned ElementSizeBits) {
  unsigned ElementSizeBytes = ElementSizeBits / 8;
  unsigned SizeBytes = ElementSizeBytes * Chain.size();
  unsigned NumLeft = (SizeBytes - (SizeBytes % 4)) / ElementSizeBytes;
  if (NumLeft == Chain.size()) {
    if ((NumLeft & 1) == 0)
      NumLeft /= 2; // Split even in half
    else
      --NumLeft;    // Split off last element
  } else if (NumLeft == 0)
    NumLeft = 1;
  return std::make_pair(Chain.slice(0, NumLeft), Chain.slice(NumLeft));
}

ArrayRef<Instruction *>
Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) {
  // These are in BB order, unlike Chain, which is in address order.
  SmallVector<Instruction *, 16> MemoryInstrs;
  SmallVector<Instruction *, 16> ChainInstrs;

  bool IsLoadChain = isa<LoadInst>(Chain[0]);
  LLVM_DEBUG({
    for (Instruction *I : Chain) {
      if (IsLoadChain)
        assert(isa<LoadInst>(I) &&
               "All elements of Chain must be loads, or all must be stores.");
      else
        assert(isa<StoreInst>(I) &&
               "All elements of Chain must be loads, or all must be stores.");
    }
  });

  for (Instruction &I : make_range(getBoundaryInstrs(Chain))) {
    if ((isa<LoadInst>(I) || isa<StoreInst>(I)) && is_contained(Chain, &I)) {
      ChainInstrs.push_back(&I);
      continue;
    }
    if (!isGuaranteedToTransferExecutionToSuccessor(&I)) {
      LLVM_DEBUG(dbgs() << "LSV: Found instruction may not transfer execution: "
                        << I << '\n');
      break;
    }
    if (I.mayReadOrWriteMemory())
      MemoryInstrs.push_back(&I);
  }

  // Loop until we find an instruction in ChainInstrs that we can't vectorize.
  unsigned ChainInstrIdx = 0;
  Instruction *BarrierMemoryInstr = nullptr;

  for (unsigned E = ChainInstrs.size(); ChainInstrIdx < E; ++ChainInstrIdx) {
    Instruction *ChainInstr = ChainInstrs[ChainInstrIdx];

    // If a barrier memory instruction was found, chain instructions that follow
    // will not be added to the valid prefix.
    if (BarrierMemoryInstr && BarrierMemoryInstr->comesBefore(ChainInstr))
      break;

    // Check (in BB order) if any instruction prevents ChainInstr from being
    // vectorized. Find and store the first such "conflicting" instruction.
    for (Instruction *MemInstr : MemoryInstrs) {
      // If a barrier memory instruction was found, do not check past it.
      if (BarrierMemoryInstr && BarrierMemoryInstr->comesBefore(MemInstr))
        break;

      auto *MemLoad = dyn_cast<LoadInst>(MemInstr);
      auto *ChainLoad = dyn_cast<LoadInst>(ChainInstr);
      if (MemLoad && ChainLoad)
        continue;

      // We can ignore the alias if we have a load/store pair and the load
      // is known to be invariant. The load cannot be clobbered by the store.
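      // (!invariant.load metadata guarantees the location holds the same
      // value at every point where it is dereferenceable, so no store in
      // this window can change it.)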
      auto IsInvariantLoad = [](const LoadInst *LI) -> bool {
        return LI->hasMetadata(LLVMContext::MD_invariant_load);
      };

      if (IsLoadChain) {
        // We can ignore the alias as long as the load comes before the store,
        // because that means we won't be moving the load past the store to
        // vectorize it (the vectorized load is inserted at the location of the
        // first load in the chain).
        if (ChainInstr->comesBefore(MemInstr) ||
            (ChainLoad && IsInvariantLoad(ChainLoad)))
          continue;
      } else {
        // Same case, but in reverse.
        if (MemInstr->comesBefore(ChainInstr) ||
            (MemLoad && IsInvariantLoad(MemLoad)))
          continue;
      }

      ModRefInfo MR =
          AA.getModRefInfo(MemInstr, MemoryLocation::get(ChainInstr));
      if (IsLoadChain ? isModSet(MR) : isModOrRefSet(MR)) {
        LLVM_DEBUG({
          dbgs() << "LSV: Found alias:\n"
                    "  Aliasing instruction:\n"
                 << "  " << *MemInstr << '\n'
                 << "  Aliased instruction and pointer:\n"
                 << "  " << *ChainInstr << '\n'
                 << "  " << *getLoadStorePointerOperand(ChainInstr) << '\n';
        });
        // Save this aliasing memory instruction as a barrier, but allow other
        // instructions that precede the barrier to be vectorized with this one.
        BarrierMemoryInstr = MemInstr;
        break;
      }
    }
    // Continue the search only for store chains, since vectorizing stores that
    // precede an aliasing load is valid. Conversely, vectorizing loads is valid
    // up to an aliasing store, but should not pull loads from further down in
    // the basic block.
    if (IsLoadChain && BarrierMemoryInstr) {
      // The BarrierMemoryInstr is a store that precedes ChainInstr.
      assert(BarrierMemoryInstr->comesBefore(ChainInstr));
      break;
    }
  }

  // Find the largest prefix of Chain whose elements are all in
  // ChainInstrs[0, ChainInstrIdx). This is the largest vectorizable prefix of
  // Chain. (Recall that Chain is in address order, but ChainInstrs is in BB
  // order.)
  SmallPtrSet<Instruction *, 8> VectorizableChainInstrs(
      ChainInstrs.begin(), ChainInstrs.begin() + ChainInstrIdx);
  unsigned ChainIdx = 0;
  for (unsigned ChainLen = Chain.size(); ChainIdx < ChainLen; ++ChainIdx) {
    if (!VectorizableChainInstrs.count(Chain[ChainIdx]))
      break;
  }
  return Chain.slice(0, ChainIdx);
}

static ChainID getChainID(const Value *Ptr) {
  const Value *ObjPtr = getUnderlyingObject(Ptr);
  if (const auto *Sel = dyn_cast<SelectInst>(ObjPtr)) {
    // The selects themselves are distinct instructions even if they share the
    // same condition and evaluate to consecutive pointers for the true and
    // false values of the condition. Therefore using the selects themselves
    // for grouping instructions would put consecutive accesses into different
    // lists; they wouldn't even be checked for being consecutive, and wouldn't
    // be vectorized.
    return Sel->getCondition();
  }
  return ObjPtr;
}

std::pair<InstrListMap, InstrListMap>
Vectorizer::collectInstructions(BasicBlock *BB) {
  InstrListMap LoadRefs;
  InstrListMap StoreRefs;

  for (Instruction &I : *BB) {
    if (!I.mayReadOrWriteMemory())
      continue;

    if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
      if (!LI->isSimple())
        continue;

      // Skip if it's not legal.
      if (!TTI.isLegalToVectorizeLoad(LI))
        continue;

      Type *Ty = LI->getType();
      if (!VectorType::isValidElementType(Ty->getScalarType()))
        continue;

      // Skip weird non-byte sizes. They probably aren't worth the effort of
      // handling correctly.
      unsigned TySize = DL.getTypeSizeInBits(Ty);
      if ((TySize % 8) != 0)
        continue;

      // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain
      // functions are currently using an integer type for the vectorized
      // load/store, and do not support casting between the integer type and a
      // vector of pointers (e.g. i64 to <2 x i16*>).
      if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())
        continue;

      Value *Ptr = LI->getPointerOperand();
      unsigned AS = Ptr->getType()->getPointerAddressSpace();
      unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);

      unsigned VF = VecRegSize / TySize;
      VectorType *VecTy = dyn_cast<VectorType>(Ty);

      // No point in looking at these if they're too big to vectorize.
      if (TySize > VecRegSize / 2 ||
          (VecTy && TTI.getLoadVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
        continue;

      // Save the load locations.
      const ChainID ID = getChainID(Ptr);
      LoadRefs[ID].push_back(LI);
    } else if (StoreInst *SI = dyn_cast<StoreInst>(&I)) {
      if (!SI->isSimple())
        continue;

      // Skip if it's not legal.
      if (!TTI.isLegalToVectorizeStore(SI))
        continue;

      Type *Ty = SI->getValueOperand()->getType();
      if (!VectorType::isValidElementType(Ty->getScalarType()))
        continue;

      // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain
      // functions are currently using an integer type for the vectorized
      // load/store, and do not support casting between the integer type and a
      // vector of pointers (e.g. i64 to <2 x i16*>).
      if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())
        continue;

      // Skip weird non-byte sizes. They probably aren't worth the effort of
      // handling correctly.
      unsigned TySize = DL.getTypeSizeInBits(Ty);
      if ((TySize % 8) != 0)
        continue;

      Value *Ptr = SI->getPointerOperand();
      unsigned AS = Ptr->getType()->getPointerAddressSpace();
      unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);

      unsigned VF = VecRegSize / TySize;
      VectorType *VecTy = dyn_cast<VectorType>(Ty);

      // No point in looking at these if they're too big to vectorize.
      if (TySize > VecRegSize / 2 ||
          (VecTy && TTI.getStoreVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
        continue;

      // Save store location.
      const ChainID ID = getChainID(Ptr);
      StoreRefs[ID].push_back(SI);
    }
  }

  return {LoadRefs, StoreRefs};
}

bool Vectorizer::vectorizeChains(InstrListMap &Map) {
  bool Changed = false;

  for (const std::pair<ChainID, InstrList> &Chain : Map) {
    unsigned Size = Chain.second.size();
    if (Size < 2)
      continue;

    LLVM_DEBUG(dbgs() << "LSV: Analyzing a chain of length " << Size << ".\n");

    // Process the chains in chunks of 64.
    for (unsigned CI = 0, CE = Size; CI < CE; CI += 64) {
      unsigned Len = std::min<unsigned>(CE - CI, 64);
      ArrayRef<Instruction *> Chunk(&Chain.second[CI], Len);
      Changed |= vectorizeInstructions(Chunk);
    }
  }

  return Changed;
}

bool Vectorizer::vectorizeInstructions(ArrayRef<Instruction *> Instrs) {
  LLVM_DEBUG(dbgs() << "LSV: Vectorizing " << Instrs.size()
                    << " instructions.\n");
  SmallVector<int, 16> Heads, Tails;
  int ConsecutiveChain[64];

  // Do a quadratic search on all of the given loads/stores and find all of the
  // pairs of loads/stores that follow each other.
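  // ConsecutiveChain[i] will hold the index of an access that immediately
  // follows Instrs[i] in memory, or -1 if no such access is found.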
  for (int i = 0, e = Instrs.size(); i < e; ++i) {
    ConsecutiveChain[i] = -1;
    for (int j = e - 1; j >= 0; --j) {
      if (i == j)
        continue;

      if (isConsecutiveAccess(Instrs[i], Instrs[j])) {
        if (ConsecutiveChain[i] != -1) {
          int CurDistance = std::abs(ConsecutiveChain[i] - i);
          int NewDistance = std::abs(ConsecutiveChain[i] - j);
          if (j < i || NewDistance > CurDistance)
            continue; // Should not insert.
        }

        Tails.push_back(j);
        Heads.push_back(i);
        ConsecutiveChain[i] = j;
      }
    }
  }

  bool Changed = false;
  SmallPtrSet<Instruction *, 16> InstructionsProcessed;

  for (int Head : Heads) {
    if (InstructionsProcessed.count(Instrs[Head]))
      continue;
    bool LongerChainExists = false;
    for (unsigned TIt = 0; TIt < Tails.size(); TIt++)
      if (Head == Tails[TIt] &&
          !InstructionsProcessed.count(Instrs[Heads[TIt]])) {
        LongerChainExists = true;
        break;
      }
    if (LongerChainExists)
      continue;

    // We found an instr that starts a chain. Now follow the chain and try to
    // vectorize it.
    SmallVector<Instruction *, 16> Operands;
    int I = Head;
    while (I != -1 && (is_contained(Tails, I) || is_contained(Heads, I))) {
      if (InstructionsProcessed.count(Instrs[I]))
        break;

      Operands.push_back(Instrs[I]);
      I = ConsecutiveChain[I];
    }

    bool Vectorized = false;
    if (isa<LoadInst>(*Operands.begin()))
      Vectorized = vectorizeLoadChain(Operands, &InstructionsProcessed);
    else
      Vectorized = vectorizeStoreChain(Operands, &InstructionsProcessed);

    Changed |= Vectorized;
  }

  return Changed;
}

bool Vectorizer::vectorizeStoreChain(
    ArrayRef<Instruction *> Chain,
    SmallPtrSet<Instruction *, 16> *InstructionsProcessed) {
  StoreInst *S0 = cast<StoreInst>(Chain[0]);

  // If the vector has an int element, default to int for the whole store.
  Type *StoreTy = nullptr;
  for (Instruction *I : Chain) {
    StoreTy = cast<StoreInst>(I)->getValueOperand()->getType();
    if (StoreTy->isIntOrIntVectorTy())
      break;

    if (StoreTy->isPtrOrPtrVectorTy()) {
      StoreTy = Type::getIntNTy(F.getParent()->getContext(),
                                DL.getTypeSizeInBits(StoreTy));
      break;
    }
  }
  assert(StoreTy && "Failed to find store type");

  unsigned Sz = DL.getTypeSizeInBits(StoreTy);
  unsigned AS = S0->getPointerAddressSpace();
  unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
  unsigned VF = VecRegSize / Sz;
  unsigned ChainSize = Chain.size();
  Align Alignment = S0->getAlign();

  if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) {
    InstructionsProcessed->insert(Chain.begin(), Chain.end());
    return false;
  }

  ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain);
  if (NewChain.empty()) {
    // No vectorization possible.
    InstructionsProcessed->insert(Chain.begin(), Chain.end());
    return false;
  }
  if (NewChain.size() == 1) {
    // Failed after the first instruction. Discard it and try the smaller chain.
    InstructionsProcessed->insert(NewChain.front());
    return false;
  }

  // Update Chain to the valid vectorizable subchain.
  Chain = NewChain;
  ChainSize = Chain.size();

  // Check if it's legal to vectorize this chain. If not, split the chain and
  // try again.
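  // Each split recurses on both halves, so an oversized or illegal chain is
  // reduced step by step until its pieces are legal or trivially small.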
  unsigned EltSzInBytes = Sz / 8;
  unsigned SzInBytes = EltSzInBytes * ChainSize;

  FixedVectorType *VecTy;
  auto *VecStoreTy = dyn_cast<FixedVectorType>(StoreTy);
  if (VecStoreTy)
    VecTy = FixedVectorType::get(StoreTy->getScalarType(),
                                 Chain.size() * VecStoreTy->getNumElements());
  else
    VecTy = FixedVectorType::get(StoreTy, Chain.size());

  // If it's more than the max vector size or the target has a better
  // vector factor, break it into two pieces.
  unsigned TargetVF = TTI.getStoreVectorFactor(VF, Sz, SzInBytes, VecTy);
  if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) {
    LLVM_DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."
                         " Creating two separate arrays.\n");
    bool Vectorized = false;
    Vectorized |=
        vectorizeStoreChain(Chain.slice(0, TargetVF), InstructionsProcessed);
    Vectorized |=
        vectorizeStoreChain(Chain.slice(TargetVF), InstructionsProcessed);
    return Vectorized;
  }

  LLVM_DEBUG({
    dbgs() << "LSV: Stores to vectorize:\n";
    for (Instruction *I : Chain)
      dbgs() << "  " << *I << "\n";
  });

  // We won't try again to vectorize the elements of the chain, regardless of
  // whether we succeed below.
  InstructionsProcessed->insert(Chain.begin(), Chain.end());

  // If the store is going to be misaligned, don't vectorize it.
  unsigned RelativeSpeed;
  if (accessIsMisaligned(SzInBytes, AS, Alignment, RelativeSpeed)) {
    if (S0->getPointerAddressSpace() != DL.getAllocaAddrSpace()) {
      unsigned SpeedBefore;
      accessIsMisaligned(EltSzInBytes, AS, Alignment, SpeedBefore);
      if (SpeedBefore > RelativeSpeed)
        return false;

      auto Chains = splitOddVectorElts(Chain, Sz);
      bool Vectorized = false;
      Vectorized |= vectorizeStoreChain(Chains.first, InstructionsProcessed);
      Vectorized |= vectorizeStoreChain(Chains.second, InstructionsProcessed);
      return Vectorized;
    }

    Align NewAlign = getOrEnforceKnownAlignment(S0->getPointerOperand(),
                                                Align(StackAdjustedAlignment),
                                                DL, S0, nullptr, &DT);
    if (NewAlign >= Alignment)
      Alignment = NewAlign;
    else
      return false;
  }

  if (!TTI.isLegalToVectorizeStoreChain(SzInBytes, Alignment, AS)) {
    auto Chains = splitOddVectorElts(Chain, Sz);
    bool Vectorized = false;
    Vectorized |= vectorizeStoreChain(Chains.first, InstructionsProcessed);
    Vectorized |= vectorizeStoreChain(Chains.second, InstructionsProcessed);
    return Vectorized;
  }

  BasicBlock::iterator First, Last;
  std::tie(First, Last) = getBoundaryInstrs(Chain);
  Builder.SetInsertPoint(&*Last);

  Value *Vec = PoisonValue::get(VecTy);

  if (VecStoreTy) {
    unsigned VecWidth = VecStoreTy->getNumElements();
    for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
      StoreInst *Store = cast<StoreInst>(Chain[I]);
      for (unsigned J = 0, NE = VecStoreTy->getNumElements(); J != NE; ++J) {
        unsigned NewIdx = J + I * VecWidth;
        Value *Extract = Builder.CreateExtractElement(Store->getValueOperand(),
                                                      Builder.getInt32(J));
        if (Extract->getType() != StoreTy->getScalarType())
          Extract = Builder.CreateBitCast(Extract, StoreTy->getScalarType());

        Value *Insert =
            Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(NewIdx));
        Vec = Insert;
      }
    }
  } else {
    for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
      StoreInst *Store = cast<StoreInst>(Chain[I]);
      Value *Extract = Store->getValueOperand();
      if (Extract->getType() != StoreTy->getScalarType())
        Extract =
            Builder.CreateBitOrPointerCast(Extract, StoreTy->getScalarType());

      Value *Insert =
          Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(I));
      Vec = Insert;
    }
  }

  StoreInst *SI = Builder.CreateAlignedStore(
      Vec,
      Builder.CreateBitCast(S0->getPointerOperand(), VecTy->getPointerTo(AS)),
      Alignment);
  propagateMetadata(SI, Chain);

  eraseInstructions(Chain);
  ++NumVectorInstructions;
  NumScalarsVectorized += Chain.size();
  return true;
}

bool Vectorizer::vectorizeLoadChain(
    ArrayRef<Instruction *> Chain,
    SmallPtrSet<Instruction *, 16> *InstructionsProcessed) {
  LoadInst *L0 = cast<LoadInst>(Chain[0]);

  // If the vector has an int element, default to int for the whole load.
  Type *LoadTy = nullptr;
  for (const auto &V : Chain) {
    LoadTy = cast<LoadInst>(V)->getType();
    if (LoadTy->isIntOrIntVectorTy())
      break;

    if (LoadTy->isPtrOrPtrVectorTy()) {
      LoadTy = Type::getIntNTy(F.getParent()->getContext(),
                               DL.getTypeSizeInBits(LoadTy));
      break;
    }
  }
  assert(LoadTy && "Can't determine LoadInst type from chain");

  unsigned Sz = DL.getTypeSizeInBits(LoadTy);
  unsigned AS = L0->getPointerAddressSpace();
  unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
  unsigned VF = VecRegSize / Sz;
  unsigned ChainSize = Chain.size();
  Align Alignment = L0->getAlign();

  if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) {
    InstructionsProcessed->insert(Chain.begin(), Chain.end());
    return false;
  }

  ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain);
  if (NewChain.empty()) {
    // No vectorization possible.
    InstructionsProcessed->insert(Chain.begin(), Chain.end());
    return false;
  }
  if (NewChain.size() == 1) {
    // Failed after the first instruction. Discard it and try the smaller chain.
    InstructionsProcessed->insert(NewChain.front());
    return false;
  }

  // Update Chain to the valid vectorizable subchain.
  Chain = NewChain;
  ChainSize = Chain.size();

  // Check if it's legal to vectorize this chain. If not, split the chain and
  // try again.
  unsigned EltSzInBytes = Sz / 8;
  unsigned SzInBytes = EltSzInBytes * ChainSize;
  VectorType *VecTy;
  auto *VecLoadTy = dyn_cast<FixedVectorType>(LoadTy);
  if (VecLoadTy)
    VecTy = FixedVectorType::get(LoadTy->getScalarType(),
                                 Chain.size() * VecLoadTy->getNumElements());
  else
    VecTy = FixedVectorType::get(LoadTy, Chain.size());

  // If it's more than the max vector size or the target has a better
  // vector factor, break it into two pieces.
  unsigned TargetVF = TTI.getLoadVectorFactor(VF, Sz, SzInBytes, VecTy);
  if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) {
    LLVM_DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."
                         " Creating two separate arrays.\n");
    bool Vectorized = false;
    Vectorized |=
        vectorizeLoadChain(Chain.slice(0, TargetVF), InstructionsProcessed);
    Vectorized |=
        vectorizeLoadChain(Chain.slice(TargetVF), InstructionsProcessed);
    return Vectorized;
  }

  // We won't try again to vectorize the elements of the chain, regardless of
  // whether we succeed below.
  InstructionsProcessed->insert(Chain.begin(), Chain.end());

  // If the load is going to be misaligned, don't vectorize it.
  unsigned RelativeSpeed;
  if (accessIsMisaligned(SzInBytes, AS, Alignment, RelativeSpeed)) {
    if (L0->getPointerAddressSpace() != DL.getAllocaAddrSpace()) {
      unsigned SpeedBefore;
      accessIsMisaligned(EltSzInBytes, AS, Alignment, SpeedBefore);
      if (SpeedBefore > RelativeSpeed)
        return false;

      auto Chains = splitOddVectorElts(Chain, Sz);
      bool Vectorized = false;
      Vectorized |= vectorizeLoadChain(Chains.first, InstructionsProcessed);
      Vectorized |= vectorizeLoadChain(Chains.second, InstructionsProcessed);
      return Vectorized;
    }

    Align NewAlign = getOrEnforceKnownAlignment(L0->getPointerOperand(),
                                                Align(StackAdjustedAlignment),
                                                DL, L0, nullptr, &DT);
    if (NewAlign >= Alignment)
      Alignment = NewAlign;
    else
      return false;
  }

  if (!TTI.isLegalToVectorizeLoadChain(SzInBytes, Alignment, AS)) {
    auto Chains = splitOddVectorElts(Chain, Sz);
    bool Vectorized = false;
    Vectorized |= vectorizeLoadChain(Chains.first, InstructionsProcessed);
    Vectorized |= vectorizeLoadChain(Chains.second, InstructionsProcessed);
    return Vectorized;
  }

  LLVM_DEBUG({
    dbgs() << "LSV: Loads to vectorize:\n";
    for (Instruction *I : Chain)
      I->dump();
  });

  // getVectorizablePrefix already computed getBoundaryInstrs. The value of
  // Last may have changed since then, but the value of First won't have. If it
  // matters, we could compute getBoundaryInstrs only once and reuse it here.
  BasicBlock::iterator First, Last;
  std::tie(First, Last) = getBoundaryInstrs(Chain);
  Builder.SetInsertPoint(&*First);

  Value *Bitcast =
      Builder.CreateBitCast(L0->getPointerOperand(), VecTy->getPointerTo(AS));
  LoadInst *LI =
      Builder.CreateAlignedLoad(VecTy, Bitcast, MaybeAlign(Alignment));
  propagateMetadata(LI, Chain);

  for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
    Value *CV = Chain[I];
    Value *V;
    if (VecLoadTy) {
      // Extract a subvector using shufflevector.
      unsigned VecWidth = VecLoadTy->getNumElements();
      auto Mask =
          llvm::to_vector<8>(llvm::seq<int>(I * VecWidth, (I + 1) * VecWidth));
      V = Builder.CreateShuffleVector(LI, Mask, CV->getName());
    } else {
      V = Builder.CreateExtractElement(LI, Builder.getInt32(I), CV->getName());
    }

    if (V->getType() != CV->getType()) {
      V = Builder.CreateBitOrPointerCast(V, CV->getType());
    }

    // Replace the old instruction.
    CV->replaceAllUsesWith(V);
  }

  // Since we might have opaque pointers, we might end up using the pointer
  // operand of the first load (w.r.t. the memory loaded) for the vector load.
  // Since this first load might not be the first in the block, we potentially
  // need to reorder the pointer operand (and its operands). If we have a
  // bitcast, though, it might be before the load and should be the reorder
  // start instruction. "Might" because for opaque pointers the "bitcast" is
  // just the first load's pointer operand, as opposed to something we inserted
  // at the right position ourselves.
  Instruction *BCInst = dyn_cast<Instruction>(Bitcast);
  reorder((BCInst && BCInst != L0->getPointerOperand()) ? BCInst : LI);
  eraseInstructions(Chain);

  ++NumVectorInstructions;
  NumScalarsVectorized += Chain.size();
  return true;
}

bool Vectorizer::accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace,
                                    Align Alignment, unsigned &RelativeSpeed) {
  RelativeSpeed = 0;
  if (Alignment.value() % SzInBytes == 0)
    return false;

  bool Allows = TTI.allowsMisalignedMemoryAccesses(F.getParent()->getContext(),
                                                   SzInBytes * 8, AddressSpace,
                                                   Alignment, &RelativeSpeed);
  LLVM_DEBUG(dbgs() << "LSV: Target said misaligned is allowed? " << Allows
                    << " with relative speed = " << RelativeSpeed << '\n';);
  return !Allows || !RelativeSpeed;
}