//===- LoadStoreVectorizer.cpp - GPU Load & Store Vectorizer --------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass merges loads/stores to/from sequential memory addresses into vector
// loads/stores.  Although there's nothing GPU-specific in here, this pass is
// motivated by the microarchitectural quirks of nVidia and AMD GPUs.
//
// (For simplicity below we talk about loads only, but everything also applies
// to stores.)
//
// This pass is intended to be run late in the pipeline, after other
// vectorization opportunities have been exploited.  So the assumption here is
// that immediately following our new vector load we'll need to extract out the
// individual elements of the load, so we can operate on them individually.
//
// On CPUs this transformation is usually not beneficial, because extracting the
// elements of a vector register is expensive on most architectures.  It's
// usually better just to load each element individually into its own scalar
// register.
//
// However, nVidia and AMD GPUs don't have proper vector registers.  Instead, a
// "vector load" loads directly into a series of scalar registers.  In effect,
// extracting the elements of the vector is free.  It's therefore always
// beneficial to vectorize a sequence of loads on these architectures.
//
// Vectorizing (perhaps a better name might be "coalescing") loads can have
// large performance impacts on GPU kernels, and opportunities for vectorizing
// are common in GPU code.  This pass tries very hard to find such
// opportunities; its runtime is quadratic in the number of loads in a BB.
//
// Some CPU architectures, such as ARM, have instructions that load into
// multiple scalar registers, similar to a GPU vectorized load.  In theory ARM
// could use this pass (with some modifications), but currently it implements
// its own pass to do something similar to what we do here.

#include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/OrderedBasicBlock.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Vectorize.h"
#include <algorithm>
#include <cassert>
#include <cstdlib>
#include <tuple>
#include <utility>

using namespace llvm;

#define DEBUG_TYPE "load-store-vectorizer"

STATISTIC(NumVectorInstructions, "Number of vector accesses generated");
STATISTIC(NumScalarsVectorized, "Number of scalar accesses vectorized");

// FIXME: Assuming stack alignment of 4 is always good enough
static const unsigned StackAdjustedAlignment = 4;

namespace {

/// ChainID is an arbitrary token that is allowed to be different only for the
/// accesses that are guaranteed to be considered non-consecutive by
/// Vectorizer::isConsecutiveAccess. It's used for grouping instructions
/// together and reducing the number of instructions the main search operates on
/// at a time, i.e. this is to reduce compile time and nothing else as the main
/// search has O(n^2) time complexity. The underlying type of ChainID should not
/// be relied upon.
using ChainID = const Value *;
using InstrList = SmallVector<Instruction *, 8>;
using InstrListMap = MapVector<ChainID, InstrList>;

class Vectorizer {
  Function &F;
  AliasAnalysis &AA;
  DominatorTree &DT;
  ScalarEvolution &SE;
  TargetTransformInfo &TTI;
  const DataLayout &DL;
  IRBuilder<> Builder;

public:
  Vectorizer(Function &F, AliasAnalysis &AA, DominatorTree &DT,
             ScalarEvolution &SE, TargetTransformInfo &TTI)
      : F(F), AA(AA), DT(DT), SE(SE), TTI(TTI),
        DL(F.getParent()->getDataLayout()), Builder(SE.getContext()) {}

  bool run();

private:
  unsigned getPointerAddressSpace(Value *I);

  unsigned getAlignment(LoadInst *LI) const {
    unsigned Align = LI->getAlignment();
    if (Align != 0)
      return Align;

    return DL.getABITypeAlignment(LI->getType());
  }

  unsigned getAlignment(StoreInst *SI) const {
    unsigned Align = SI->getAlignment();
    if (Align != 0)
      return Align;

    return DL.getABITypeAlignment(SI->getValueOperand()->getType());
  }

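  // Upper bound on the recursion depth used when looking through selects to
  // prove two pointers consecutive (see lookThroughSelects).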
  static const unsigned MaxDepth = 3;

  bool isConsecutiveAccess(Value *A, Value *B);
  bool areConsecutivePointers(Value *PtrA, Value *PtrB, APInt PtrDelta,
                              unsigned Depth = 0) const;
  bool lookThroughComplexAddresses(Value *PtrA, Value *PtrB, APInt PtrDelta,
                                   unsigned Depth) const;
  bool lookThroughSelects(Value *PtrA, Value *PtrB, const APInt &PtrDelta,
                          unsigned Depth) const;

  /// After vectorization, reorder the instructions that I depends on
  /// (the instructions defining its operands), to ensure they dominate I.
  void reorder(Instruction *I);

  /// Returns the first and the last instructions in Chain.
  std::pair<BasicBlock::iterator, BasicBlock::iterator>
  getBoundaryInstrs(ArrayRef<Instruction *> Chain);

  /// Erases the original instructions after vectorizing.
  void eraseInstructions(ArrayRef<Instruction *> Chain);

  /// "Legalize" the vector type that would be produced by combining \p
  /// ElementSizeBits elements in \p Chain. Break into two pieces such that the
  /// total size of each piece is 1, 2 or a multiple of 4 bytes. \p Chain is
  /// expected to have more than 4 elements.
  std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>>
  splitOddVectorElts(ArrayRef<Instruction *> Chain, unsigned ElementSizeBits);

  /// Finds the largest prefix of Chain that's vectorizable, checking for
  /// intervening instructions which may affect the memory accessed by the
  /// instructions within Chain.
  ///
  /// The elements of \p Chain must be all loads or all stores and must be in
  /// address order.
  ArrayRef<Instruction *> getVectorizablePrefix(ArrayRef<Instruction *> Chain);

  /// Collects load and store instructions to vectorize.
  std::pair<InstrListMap, InstrListMap> collectInstructions(BasicBlock *BB);

  /// Processes the collected instructions, the \p Map. The values of \p Map
  /// should be all loads or all stores.
  bool vectorizeChains(InstrListMap &Map);

  /// Finds the load/stores to consecutive memory addresses and vectorizes them.
  bool vectorizeInstructions(ArrayRef<Instruction *> Instrs);

  /// Vectorizes the load instructions in Chain.
  bool
  vectorizeLoadChain(ArrayRef<Instruction *> Chain,
                     SmallPtrSet<Instruction *, 16> *InstructionsProcessed);

  /// Vectorizes the store instructions in Chain.
  bool
  vectorizeStoreChain(ArrayRef<Instruction *> Chain,
                      SmallPtrSet<Instruction *, 16> *InstructionsProcessed);

  /// Check if this load/store access is misaligned.
  bool accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace,
                          unsigned Alignment);
};

class LoadStoreVectorizerLegacyPass : public FunctionPass {
public:
  static char ID;

  LoadStoreVectorizerLegacyPass() : FunctionPass(ID) {
    initializeLoadStoreVectorizerLegacyPassPass(*PassRegistry::getPassRegistry());
  }

  bool runOnFunction(Function &F) override;

  StringRef getPassName() const override {
    return "GPU Load and Store Vectorizer";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AAResultsWrapperPass>();
    AU.addRequired<ScalarEvolutionWrapperPass>();
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addRequired<TargetTransformInfoWrapperPass>();
    AU.setPreservesCFG();
  }
};

} // end anonymous namespace

char LoadStoreVectorizerLegacyPass::ID = 0;

INITIALIZE_PASS_BEGIN(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
                      "Vectorize load and store instructions", false, false)
INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
                    "Vectorize load and store instructions", false, false)

Pass *llvm::createLoadStoreVectorizerPass() {
  return new LoadStoreVectorizerLegacyPass();
}

bool LoadStoreVectorizerLegacyPass::runOnFunction(Function &F) {
  // Don't vectorize when the attribute NoImplicitFloat is used.
  if (skipFunction(F) || F.hasFnAttribute(Attribute::NoImplicitFloat))
    return false;

  AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
  DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
  TargetTransformInfo &TTI =
      getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);

  Vectorizer V(F, AA, DT, SE, TTI);
  return V.run();
}

PreservedAnalyses LoadStoreVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
  // Don't vectorize when the attribute NoImplicitFloat is used.
  if (F.hasFnAttribute(Attribute::NoImplicitFloat))
    return PreservedAnalyses::all();

  AliasAnalysis &AA = AM.getResult<AAManager>(F);
  DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
  ScalarEvolution &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);

  Vectorizer V(F, AA, DT, SE, TTI);
  bool Changed = V.run();
  PreservedAnalyses PA;
  PA.preserveSet<CFGAnalyses>();
  return Changed ? PA : PreservedAnalyses::all();
}

// The real propagateMetadata expects a SmallVector<Value*>, but we deal in
// vectors of Instructions.
static void propagateMetadata(Instruction *I, ArrayRef<Instruction *> IL) {
  SmallVector<Value *, 8> VL(IL.begin(), IL.end());
  propagateMetadata(I, VL);
}

// Vectorizer Implementation
bool Vectorizer::run() {
  bool Changed = false;

  // Scan the blocks in the function in post order.
  for (BasicBlock *BB : post_order(&F)) {
    InstrListMap LoadRefs, StoreRefs;
    std::tie(LoadRefs, StoreRefs) = collectInstructions(BB);
    Changed |= vectorizeChains(LoadRefs);
    Changed |= vectorizeChains(StoreRefs);
  }

  return Changed;
}

unsigned Vectorizer::getPointerAddressSpace(Value *I) {
  if (LoadInst *L = dyn_cast<LoadInst>(I))
    return L->getPointerAddressSpace();
  if (StoreInst *S = dyn_cast<StoreInst>(I))
    return S->getPointerAddressSpace();
  return -1;
}

// FIXME: Merge with llvm::isConsecutiveAccess
bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) {
  Value *PtrA = getLoadStorePointerOperand(A);
  Value *PtrB = getLoadStorePointerOperand(B);
  unsigned ASA = getPointerAddressSpace(A);
  unsigned ASB = getPointerAddressSpace(B);

  // Check that the address spaces match and that the pointers are valid.
  if (!PtrA || !PtrB || (ASA != ASB))
    return false;

  // Make sure that A and B are different pointers of the same size type.
  Type *PtrATy = PtrA->getType()->getPointerElementType();
  Type *PtrBTy = PtrB->getType()->getPointerElementType();
  if (PtrA == PtrB ||
      PtrATy->isVectorTy() != PtrBTy->isVectorTy() ||
      DL.getTypeStoreSize(PtrATy) != DL.getTypeStoreSize(PtrBTy) ||
      DL.getTypeStoreSize(PtrATy->getScalarType()) !=
          DL.getTypeStoreSize(PtrBTy->getScalarType()))
    return false;

  unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA);
  APInt Size(PtrBitWidth, DL.getTypeStoreSize(PtrATy));

  return areConsecutivePointers(PtrA, PtrB, Size);
}

bool Vectorizer::areConsecutivePointers(Value *PtrA, Value *PtrB,
                                        APInt PtrDelta, unsigned Depth) const {
  unsigned PtrBitWidth = DL.getPointerTypeSizeInBits(PtrA->getType());
  APInt OffsetA(PtrBitWidth, 0);
  APInt OffsetB(PtrBitWidth, 0);
  PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
  PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);

  unsigned NewPtrBitWidth = DL.getTypeStoreSizeInBits(PtrA->getType());

  if (NewPtrBitWidth != DL.getTypeStoreSizeInBits(PtrB->getType()))
    return false;

  // If the pointer had to be shrunk, stripAndAccumulateInBoundsConstantOffsets
  // should have handled any possible overflow, and the accumulated offsets
  // should fit into the smallest data type used in the cast/GEP chain.
  assert(OffsetA.getMinSignedBits() <= NewPtrBitWidth &&
         OffsetB.getMinSignedBits() <= NewPtrBitWidth);

  OffsetA = OffsetA.sextOrTrunc(NewPtrBitWidth);
  OffsetB = OffsetB.sextOrTrunc(NewPtrBitWidth);
  PtrDelta = PtrDelta.sextOrTrunc(NewPtrBitWidth);

  APInt OffsetDelta = OffsetB - OffsetA;

  // Check if they are based on the same pointer. That makes the offsets
  // sufficient.
  if (PtrA == PtrB)
    return OffsetDelta == PtrDelta;

  // Compute the base pointer delta needed to make the final delta equal to the
  // requested pointer delta.
  APInt BaseDelta = PtrDelta - OffsetDelta;

  // Compute the distance with SCEV between the base pointers.
  const SCEV *PtrSCEVA = SE.getSCEV(PtrA);
  const SCEV *PtrSCEVB = SE.getSCEV(PtrB);
  const SCEV *C = SE.getConstant(BaseDelta);
  const SCEV *X = SE.getAddExpr(PtrSCEVA, C);
  if (X == PtrSCEVB)
    return true;

  // The above check will not catch cases where one of the pointers is
  // factorized but the other one is not, such as (C + (S * (A + B))) vs
  // (AS + BS). Compute the difference with getMinusSCEV, which re-combines the
  // expressions and yields the simplified difference.
  const SCEV *Dist = SE.getMinusSCEV(PtrSCEVB, PtrSCEVA);
  if (C == Dist)
    return true;

  // Sometimes even this doesn't work, because SCEV can't always see through
  // patterns that look like (gep (ext (add (shl X, C1), C2))). Try checking
  // things the hard way.
  return lookThroughComplexAddresses(PtrA, PtrB, BaseDelta, Depth);
}

bool Vectorizer::lookThroughComplexAddresses(Value *PtrA, Value *PtrB,
                                             APInt PtrDelta,
                                             unsigned Depth) const {
  auto *GEPA = dyn_cast<GetElementPtrInst>(PtrA);
  auto *GEPB = dyn_cast<GetElementPtrInst>(PtrB);
  if (!GEPA || !GEPB)
    return lookThroughSelects(PtrA, PtrB, PtrDelta, Depth);

  // Look through GEPs after checking they're the same except for the last
  // index.
  if (GEPA->getNumOperands() != GEPB->getNumOperands() ||
      GEPA->getPointerOperand() != GEPB->getPointerOperand())
    return false;
  gep_type_iterator GTIA = gep_type_begin(GEPA);
  gep_type_iterator GTIB = gep_type_begin(GEPB);
  for (unsigned I = 0, E = GEPA->getNumIndices() - 1; I < E; ++I) {
    if (GTIA.getOperand() != GTIB.getOperand())
      return false;
    ++GTIA;
    ++GTIB;
  }

  Instruction *OpA = dyn_cast<Instruction>(GTIA.getOperand());
  Instruction *OpB = dyn_cast<Instruction>(GTIB.getOperand());
  if (!OpA || !OpB || OpA->getOpcode() != OpB->getOpcode() ||
      OpA->getType() != OpB->getType())
    return false;

  if (PtrDelta.isNegative()) {
    if (PtrDelta.isMinSignedValue())
      return false;
    PtrDelta.negate();
    std::swap(OpA, OpB);
  }
  uint64_t Stride = DL.getTypeAllocSize(GTIA.getIndexedType());
  if (PtrDelta.urem(Stride) != 0)
    return false;
  unsigned IdxBitWidth = OpA->getType()->getScalarSizeInBits();
  APInt IdxDiff = PtrDelta.udiv(Stride).zextOrSelf(IdxBitWidth);
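  // For the two GEPs to be exactly PtrDelta bytes apart, their final indices
  // must differ by exactly IdxDiff elements of the indexed type.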

  // Only look through a ZExt/SExt.
  if (!isa<SExtInst>(OpA) && !isa<ZExtInst>(OpA))
    return false;

  bool Signed = isa<SExtInst>(OpA);

  // At this point A could be a function parameter, i.e. not an instruction
  Value *ValA = OpA->getOperand(0);
  OpB = dyn_cast<Instruction>(OpB->getOperand(0));
  if (!OpB || ValA->getType() != OpB->getType())
    return false;

  // Now we need to prove that adding IdxDiff to ValA won't overflow.
  bool Safe = false;
  // First attempt: if OpB is an add with NSW/NUW, and OpB is IdxDiff added to
  // ValA, we're okay.
  if (OpB->getOpcode() == Instruction::Add &&
      isa<ConstantInt>(OpB->getOperand(1)) &&
      IdxDiff.sle(cast<ConstantInt>(OpB->getOperand(1))->getSExtValue())) {
    if (Signed)
      Safe = cast<BinaryOperator>(OpB)->hasNoSignedWrap();
    else
      Safe = cast<BinaryOperator>(OpB)->hasNoUnsignedWrap();
  }

  unsigned BitWidth = ValA->getType()->getScalarSizeInBits();

  // Second attempt:
  // If all set bits of IdxDiff or any higher order bit other than the sign bit
  // are known to be zero in ValA, we can add Diff to it while guaranteeing no
  // overflow of any sort.
  if (!Safe) {
    OpA = dyn_cast<Instruction>(ValA);
    if (!OpA)
      return false;
    KnownBits Known(BitWidth);
    computeKnownBits(OpA, Known, DL, 0, nullptr, OpA, &DT);
    APInt BitsAllowedToBeSet = Known.Zero.zext(IdxDiff.getBitWidth());
    if (Signed)
      BitsAllowedToBeSet.clearBit(BitWidth - 1);
    if (BitsAllowedToBeSet.ult(IdxDiff))
      return false;
  }

  const SCEV *OffsetSCEVA = SE.getSCEV(ValA);
  const SCEV *OffsetSCEVB = SE.getSCEV(OpB);
  const SCEV *C = SE.getConstant(IdxDiff.trunc(BitWidth));
  const SCEV *X = SE.getAddExpr(OffsetSCEVA, C);
  return X == OffsetSCEVB;
}

bool Vectorizer::lookThroughSelects(Value *PtrA, Value *PtrB,
                                    const APInt &PtrDelta,
                                    unsigned Depth) const {
  if (Depth++ == MaxDepth)
    return false;

  if (auto *SelectA = dyn_cast<SelectInst>(PtrA)) {
    if (auto *SelectB = dyn_cast<SelectInst>(PtrB)) {
      return SelectA->getCondition() == SelectB->getCondition() &&
             areConsecutivePointers(SelectA->getTrueValue(),
                                    SelectB->getTrueValue(), PtrDelta, Depth) &&
             areConsecutivePointers(SelectA->getFalseValue(),
                                    SelectB->getFalseValue(), PtrDelta, Depth);
    }
  }
  return false;
}

void Vectorizer::reorder(Instruction *I) {
  OrderedBasicBlock OBB(I->getParent());
  SmallPtrSet<Instruction *, 16> InstructionsToMove;
  SmallVector<Instruction *, 16> Worklist;

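  // Transitively collect the operand instructions of I that live in this block
  // but do not already dominate I; they will be moved to just before I below.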
  Worklist.push_back(I);
  while (!Worklist.empty()) {
    Instruction *IW = Worklist.pop_back_val();
    int NumOperands = IW->getNumOperands();
    for (int i = 0; i < NumOperands; i++) {
      Instruction *IM = dyn_cast<Instruction>(IW->getOperand(i));
      if (!IM || IM->getOpcode() == Instruction::PHI)
        continue;

      // If IM is in another BB, no need to move it, because this pass only
      // vectorizes instructions within one BB.
      if (IM->getParent() != I->getParent())
        continue;

      if (!OBB.dominates(IM, I)) {
        InstructionsToMove.insert(IM);
        Worklist.push_back(IM);
      }
    }
  }

  // All instructions to move should follow I. Start from I, not from begin().
  for (auto BBI = I->getIterator(), E = I->getParent()->end(); BBI != E;
       ++BBI) {
    if (!InstructionsToMove.count(&*BBI))
      continue;
    Instruction *IM = &*BBI;
    --BBI;
    IM->removeFromParent();
    IM->insertBefore(I);
  }
}

std::pair<BasicBlock::iterator, BasicBlock::iterator>
Vectorizer::getBoundaryInstrs(ArrayRef<Instruction *> Chain) {
  Instruction *C0 = Chain[0];
  BasicBlock::iterator FirstInstr = C0->getIterator();
  BasicBlock::iterator LastInstr = C0->getIterator();

  BasicBlock *BB = C0->getParent();
  unsigned NumFound = 0;
  for (Instruction &I : *BB) {
    if (!is_contained(Chain, &I))
      continue;

    ++NumFound;
    if (NumFound == 1) {
      FirstInstr = I.getIterator();
    }
    if (NumFound == Chain.size()) {
      LastInstr = I.getIterator();
      break;
    }
  }

  // Range is [first, last).
  return std::make_pair(FirstInstr, ++LastInstr);
}

void Vectorizer::eraseInstructions(ArrayRef<Instruction *> Chain) {
  SmallVector<Instruction *, 16> Instrs;
  for (Instruction *I : Chain) {
    Value *PtrOperand = getLoadStorePointerOperand(I);
    assert(PtrOperand && "Instruction must have a pointer operand.");
    Instrs.push_back(I);
    if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(PtrOperand))
      Instrs.push_back(GEP);
  }

  // Erase instructions.
  for (Instruction *I : Instrs)
    if (I->use_empty())
      I->eraseFromParent();
}

std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>>
Vectorizer::splitOddVectorElts(ArrayRef<Instruction *> Chain,
                               unsigned ElementSizeBits) {
  unsigned ElementSizeBytes = ElementSizeBits / 8;
  unsigned SizeBytes = ElementSizeBytes * Chain.size();
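  // NumLeft is the number of whole elements that fit once the total size is
  // rounded down to a multiple of 4 bytes; the adjustments below make sure
  // both resulting pieces are strictly smaller than the original chain.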
  unsigned NumLeft = (SizeBytes - (SizeBytes % 4)) / ElementSizeBytes;
  if (NumLeft == Chain.size()) {
    if ((NumLeft & 1) == 0)
      NumLeft /= 2; // Split even in half
    else
      --NumLeft;    // Split off last element
  } else if (NumLeft == 0)
    NumLeft = 1;
  return std::make_pair(Chain.slice(0, NumLeft), Chain.slice(NumLeft));
}

ArrayRef<Instruction *>
Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) {
  // These are in BB order, unlike Chain, which is in address order.
  SmallVector<Instruction *, 16> MemoryInstrs;
  SmallVector<Instruction *, 16> ChainInstrs;

  bool IsLoadChain = isa<LoadInst>(Chain[0]);
  LLVM_DEBUG({
    for (Instruction *I : Chain) {
      if (IsLoadChain)
        assert(isa<LoadInst>(I) &&
               "All elements of Chain must be loads, or all must be stores.");
      else
        assert(isa<StoreInst>(I) &&
               "All elements of Chain must be loads, or all must be stores.");
    }
  });

  for (Instruction &I : make_range(getBoundaryInstrs(Chain))) {
    if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
      if (!is_contained(Chain, &I))
        MemoryInstrs.push_back(&I);
      else
        ChainInstrs.push_back(&I);
    } else if (isa<IntrinsicInst>(&I) &&
               cast<IntrinsicInst>(&I)->getIntrinsicID() ==
                   Intrinsic::sideeffect) {
      // Ignore llvm.sideeffect calls.
    } else if (IsLoadChain && (I.mayWriteToMemory() || I.mayThrow())) {
      LLVM_DEBUG(dbgs() << "LSV: Found may-write/throw operation: " << I
                        << '\n');
      break;
    } else if (!IsLoadChain && (I.mayReadOrWriteMemory() || I.mayThrow())) {
      LLVM_DEBUG(dbgs() << "LSV: Found may-read/write/throw operation: " << I
                        << '\n');
      break;
    }
  }

  OrderedBasicBlock OBB(Chain[0]->getParent());

  // Loop until we find an instruction in ChainInstrs that we can't vectorize.
  unsigned ChainInstrIdx = 0;
  Instruction *BarrierMemoryInstr = nullptr;

  for (unsigned E = ChainInstrs.size(); ChainInstrIdx < E; ++ChainInstrIdx) {
    Instruction *ChainInstr = ChainInstrs[ChainInstrIdx];

    // If a barrier memory instruction was found, chain instructions that follow
    // will not be added to the valid prefix.
    if (BarrierMemoryInstr && OBB.dominates(BarrierMemoryInstr, ChainInstr))
      break;

    // Check (in BB order) if any instruction prevents ChainInstr from being
    // vectorized. Find and store the first such "conflicting" instruction.
    for (Instruction *MemInstr : MemoryInstrs) {
      // If a barrier memory instruction was found, do not check past it.
      if (BarrierMemoryInstr && OBB.dominates(BarrierMemoryInstr, MemInstr))
        break;

      auto *MemLoad = dyn_cast<LoadInst>(MemInstr);
      auto *ChainLoad = dyn_cast<LoadInst>(ChainInstr);
      if (MemLoad && ChainLoad)
        continue;

      // We can ignore the alias if we have a load/store pair and the load is
      // known to be invariant. The load cannot be clobbered by the store.
      auto IsInvariantLoad = [](const LoadInst *LI) -> bool {
        return LI->hasMetadata(LLVMContext::MD_invariant_load);
      };

      // We can ignore the alias as long as the load comes before the store,
      // because that means we won't be moving the load past the store to
      // vectorize it (the vectorized load is inserted at the location of the
      // first load in the chain).
      if (isa<StoreInst>(MemInstr) && ChainLoad &&
          (IsInvariantLoad(ChainLoad) || OBB.dominates(ChainLoad, MemInstr)))
        continue;

      // Same case, but in reverse.
      if (MemLoad && isa<StoreInst>(ChainInstr) &&
          (IsInvariantLoad(MemLoad) || OBB.dominates(MemLoad, ChainInstr)))
        continue;

      if (!AA.isNoAlias(MemoryLocation::get(MemInstr),
                        MemoryLocation::get(ChainInstr))) {
        LLVM_DEBUG({
          dbgs() << "LSV: Found alias:\n"
                    "  Aliasing instruction and pointer:\n"
                 << "  " << *MemInstr << '\n'
                 << "  " << *getLoadStorePointerOperand(MemInstr) << '\n'
                 << "  Aliased instruction and pointer:\n"
                 << "  " << *ChainInstr << '\n'
                 << "  " << *getLoadStorePointerOperand(ChainInstr) << '\n';
        });
        // Save this aliasing memory instruction as a barrier, but allow other
        // instructions that precede the barrier to be vectorized with this one.
        BarrierMemoryInstr = MemInstr;
        break;
      }
    }
    // Continue the search only for store chains, since vectorizing stores that
    // precede an aliasing load is valid. Conversely, vectorizing loads is valid
    // up to an aliasing store, but should not pull loads from further down in
    // the basic block.
    if (IsLoadChain && BarrierMemoryInstr) {
      // The BarrierMemoryInstr is a store that precedes ChainInstr.
      assert(OBB.dominates(BarrierMemoryInstr, ChainInstr));
      break;
    }
  }

  // Find the largest prefix of Chain whose elements are all in
  // ChainInstrs[0, ChainInstrIdx).  This is the largest vectorizable prefix of
  // Chain.  (Recall that Chain is in address order, but ChainInstrs is in BB
  // order.)
  SmallPtrSet<Instruction *, 8> VectorizableChainInstrs(
      ChainInstrs.begin(), ChainInstrs.begin() + ChainInstrIdx);
  unsigned ChainIdx = 0;
  for (unsigned ChainLen = Chain.size(); ChainIdx < ChainLen; ++ChainIdx) {
    if (!VectorizableChainInstrs.count(Chain[ChainIdx]))
      break;
  }
  return Chain.slice(0, ChainIdx);
}

static ChainID getChainID(const Value *Ptr, const DataLayout &DL) {
  const Value *ObjPtr = GetUnderlyingObject(Ptr, DL);
  if (const auto *Sel = dyn_cast<SelectInst>(ObjPtr)) {
    // The selects themselves are distinct instructions even if they share the
    // same condition and evaluate to consecutive pointers for the true and
    // false values of the condition. Therefore, using the selects themselves
    // for grouping would put consecutive accesses into different lists; they
    // would never even be checked for being consecutive, and would not be
    // vectorized.
    return Sel->getCondition();
  }
  return ObjPtr;
}

std::pair<InstrListMap, InstrListMap>
Vectorizer::collectInstructions(BasicBlock *BB) {
  InstrListMap LoadRefs;
  InstrListMap StoreRefs;

  for (Instruction &I : *BB) {
    if (!I.mayReadOrWriteMemory())
      continue;

    if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
      if (!LI->isSimple())
        continue;

      // Skip if it's not legal.
      if (!TTI.isLegalToVectorizeLoad(LI))
        continue;

      Type *Ty = LI->getType();
      if (!VectorType::isValidElementType(Ty->getScalarType()))
        continue;

      // Skip weird non-byte sizes. They probably aren't worth the effort of
      // handling correctly.
      unsigned TySize = DL.getTypeSizeInBits(Ty);
      if ((TySize % 8) != 0)
        continue;

      // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain
      // functions are currently using an integer type for the vectorized
      // load/store, and do not support casting between the integer type and a
      // vector of pointers (e.g. i64 to <2 x i16*>).
      if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())
        continue;

      Value *Ptr = LI->getPointerOperand();
      unsigned AS = Ptr->getType()->getPointerAddressSpace();
      unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);

      unsigned VF = VecRegSize / TySize;
      VectorType *VecTy = dyn_cast<VectorType>(Ty);

      // No point in looking at these if they're too big to vectorize.
      if (TySize > VecRegSize / 2 ||
          (VecTy && TTI.getLoadVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
        continue;

      // Make sure all the users of a vector are constant-index extracts.
      if (isa<VectorType>(Ty) && !llvm::all_of(LI->users(), [](const User *U) {
            const ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(U);
            return EEI && isa<ConstantInt>(EEI->getOperand(1));
          }))
        continue;

      // Save the load locations.
      const ChainID ID = getChainID(Ptr, DL);
      LoadRefs[ID].push_back(LI);
    } else if (StoreInst *SI = dyn_cast<StoreInst>(&I)) {
      if (!SI->isSimple())
        continue;

      // Skip if it's not legal.
      if (!TTI.isLegalToVectorizeStore(SI))
        continue;

      Type *Ty = SI->getValueOperand()->getType();
      if (!VectorType::isValidElementType(Ty->getScalarType()))
        continue;

      // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain
      // functions are currently using an integer type for the vectorized
      // load/store, and do not support casting between the integer type and a
      // vector of pointers (e.g. i64 to <2 x i16*>).
      if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())
        continue;

      // Skip weird non-byte sizes. They probably aren't worth the effort of
      // handling correctly.
      unsigned TySize = DL.getTypeSizeInBits(Ty);
      if ((TySize % 8) != 0)
        continue;

      Value *Ptr = SI->getPointerOperand();
      unsigned AS = Ptr->getType()->getPointerAddressSpace();
      unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);

      unsigned VF = VecRegSize / TySize;
      VectorType *VecTy = dyn_cast<VectorType>(Ty);

      // No point in looking at these if they're too big to vectorize.
      if (TySize > VecRegSize / 2 ||
          (VecTy && TTI.getStoreVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
        continue;

      if (isa<VectorType>(Ty) && !llvm::all_of(SI->users(), [](const User *U) {
            const ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(U);
            return EEI && isa<ConstantInt>(EEI->getOperand(1));
          }))
        continue;

      // Save store location.
      const ChainID ID = getChainID(Ptr, DL);
      StoreRefs[ID].push_back(SI);
    }
  }

  return {LoadRefs, StoreRefs};
}

bool Vectorizer::vectorizeChains(InstrListMap &Map) {
  bool Changed = false;

  for (const std::pair<ChainID, InstrList> &Chain : Map) {
    unsigned Size = Chain.second.size();
    if (Size < 2)
      continue;

    LLVM_DEBUG(dbgs() << "LSV: Analyzing a chain of length " << Size << ".\n");

    // Process the instructions in chunks of 64.
    for (unsigned CI = 0, CE = Size; CI < CE; CI += 64) {
      unsigned Len = std::min<unsigned>(CE - CI, 64);
      ArrayRef<Instruction *> Chunk(&Chain.second[CI], Len);
      Changed |= vectorizeInstructions(Chunk);
    }
  }

  return Changed;
}

bool Vectorizer::vectorizeInstructions(ArrayRef<Instruction *> Instrs) {
  LLVM_DEBUG(dbgs() << "LSV: Vectorizing " << Instrs.size()
                    << " instructions.\n");
  SmallVector<int, 16> Heads, Tails;
  int ConsecutiveChain[64];
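  // ConsecutiveChain[i] is the index of the instruction that immediately
  // follows Instrs[i] in memory (address order), or -1 if none was found.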

  // Do a quadratic search on all of the given loads/stores and find all of the
  // pairs of loads/stores that follow each other.
  for (int i = 0, e = Instrs.size(); i < e; ++i) {
    ConsecutiveChain[i] = -1;
    for (int j = e - 1; j >= 0; --j) {
      if (i == j)
        continue;

      if (isConsecutiveAccess(Instrs[i], Instrs[j])) {
        if (ConsecutiveChain[i] != -1) {
          int CurDistance = std::abs(ConsecutiveChain[i] - i);
          int NewDistance = std::abs(ConsecutiveChain[i] - j);
          if (j < i || NewDistance > CurDistance)
            continue; // Should not insert.
        }

        Tails.push_back(j);
        Heads.push_back(i);
        ConsecutiveChain[i] = j;
      }
    }
  }

  bool Changed = false;
  SmallPtrSet<Instruction *, 16> InstructionsProcessed;

  for (int Head : Heads) {
    if (InstructionsProcessed.count(Instrs[Head]))
      continue;
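    // Skip this head if some other, not-yet-processed instruction chains into
    // it (i.e. it also appears as a tail); the longer chain will cover it.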
    bool LongerChainExists = false;
    for (unsigned TIt = 0; TIt < Tails.size(); TIt++)
      if (Head == Tails[TIt] &&
          !InstructionsProcessed.count(Instrs[Heads[TIt]])) {
        LongerChainExists = true;
        break;
      }
    if (LongerChainExists)
      continue;

    // We found an instr that starts a chain. Now follow the chain and try to
    // vectorize it.
    SmallVector<Instruction *, 16> Operands;
    int I = Head;
    while (I != -1 && (is_contained(Tails, I) || is_contained(Heads, I))) {
      if (InstructionsProcessed.count(Instrs[I]))
        break;

      Operands.push_back(Instrs[I]);
      I = ConsecutiveChain[I];
    }

    bool Vectorized = false;
    if (isa<LoadInst>(*Operands.begin()))
      Vectorized = vectorizeLoadChain(Operands, &InstructionsProcessed);
    else
      Vectorized = vectorizeStoreChain(Operands, &InstructionsProcessed);

    Changed |= Vectorized;
  }

  return Changed;
}

bool Vectorizer::vectorizeStoreChain(
    ArrayRef<Instruction *> Chain,
    SmallPtrSet<Instruction *, 16> *InstructionsProcessed) {
  StoreInst *S0 = cast<StoreInst>(Chain[0]);

  // If the vector has an int element, default to int for the whole store.
  Type *StoreTy = nullptr;
  for (Instruction *I : Chain) {
    StoreTy = cast<StoreInst>(I)->getValueOperand()->getType();
    if (StoreTy->isIntOrIntVectorTy())
      break;

    if (StoreTy->isPtrOrPtrVectorTy()) {
      StoreTy = Type::getIntNTy(F.getParent()->getContext(),
                                DL.getTypeSizeInBits(StoreTy));
      break;
    }
  }
  assert(StoreTy && "Failed to find store type");

  unsigned Sz = DL.getTypeSizeInBits(StoreTy);
  unsigned AS = S0->getPointerAddressSpace();
  unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
  unsigned VF = VecRegSize / Sz;
  unsigned ChainSize = Chain.size();
  unsigned Alignment = getAlignment(S0);

  if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) {
    InstructionsProcessed->insert(Chain.begin(), Chain.end());
    return false;
  }

  ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain);
  if (NewChain.empty()) {
    // No vectorization possible.
    InstructionsProcessed->insert(Chain.begin(), Chain.end());
    return false;
  }
  if (NewChain.size() == 1) {
    // Failed after the first instruction. Discard it and try the smaller chain.
    InstructionsProcessed->insert(NewChain.front());
    return false;
  }

  // Update Chain to the valid vectorizable subchain.
  Chain = NewChain;
  ChainSize = Chain.size();

  // Check if it's legal to vectorize this chain. If not, split the chain and
  // try again.
  unsigned EltSzInBytes = Sz / 8;
  unsigned SzInBytes = EltSzInBytes * ChainSize;

  VectorType *VecTy;
  VectorType *VecStoreTy = dyn_cast<VectorType>(StoreTy);
  if (VecStoreTy)
    VecTy = VectorType::get(StoreTy->getScalarType(),
                            Chain.size() * VecStoreTy->getNumElements());
  else
    VecTy = VectorType::get(StoreTy, Chain.size());

  // If it's more than the max vector size or the target has a better
  // vector factor, break it into two pieces.
  unsigned TargetVF = TTI.getStoreVectorFactor(VF, Sz, SzInBytes, VecTy);
  if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) {
    LLVM_DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."
                         " Creating two separate arrays.\n");
    return vectorizeStoreChain(Chain.slice(0, TargetVF),
                               InstructionsProcessed) |
           vectorizeStoreChain(Chain.slice(TargetVF), InstructionsProcessed);
  }

  LLVM_DEBUG({
    dbgs() << "LSV: Stores to vectorize:\n";
    for (Instruction *I : Chain)
      dbgs() << "  " << *I << "\n";
  });

  // We won't try again to vectorize the elements of the chain, regardless of
  // whether we succeed below.
  InstructionsProcessed->insert(Chain.begin(), Chain.end());

  // If the store is going to be misaligned, don't vectorize it.
  if (accessIsMisaligned(SzInBytes, AS, Alignment)) {
    if (S0->getPointerAddressSpace() != DL.getAllocaAddrSpace()) {
      auto Chains = splitOddVectorElts(Chain, Sz);
      return vectorizeStoreChain(Chains.first, InstructionsProcessed) |
             vectorizeStoreChain(Chains.second, InstructionsProcessed);
    }

    unsigned NewAlign = getOrEnforceKnownAlignment(S0->getPointerOperand(),
                                                   StackAdjustedAlignment,
                                                   DL, S0, nullptr, &DT);
    if (NewAlign != 0)
      Alignment = NewAlign;
  }

  if (!TTI.isLegalToVectorizeStoreChain(SzInBytes, Alignment, AS)) {
    auto Chains = splitOddVectorElts(Chain, Sz);
    return vectorizeStoreChain(Chains.first, InstructionsProcessed) |
           vectorizeStoreChain(Chains.second, InstructionsProcessed);
  }

  BasicBlock::iterator First, Last;
  std::tie(First, Last) = getBoundaryInstrs(Chain);
  Builder.SetInsertPoint(&*Last);

  Value *Vec = UndefValue::get(VecTy);
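  // Build the wide value to store by inserting each chain element (or each of
  // its lanes, if the stored values are themselves vectors) into Vec.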

  if (VecStoreTy) {
    unsigned VecWidth = VecStoreTy->getNumElements();
    for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
      StoreInst *Store = cast<StoreInst>(Chain[I]);
      for (unsigned J = 0, NE = VecStoreTy->getNumElements(); J != NE; ++J) {
        unsigned NewIdx = J + I * VecWidth;
        Value *Extract = Builder.CreateExtractElement(Store->getValueOperand(),
                                                      Builder.getInt32(J));
        if (Extract->getType() != StoreTy->getScalarType())
          Extract = Builder.CreateBitCast(Extract, StoreTy->getScalarType());

        Value *Insert =
            Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(NewIdx));
        Vec = Insert;
      }
    }
  } else {
    for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
      StoreInst *Store = cast<StoreInst>(Chain[I]);
      Value *Extract = Store->getValueOperand();
      if (Extract->getType() != StoreTy->getScalarType())
        Extract =
            Builder.CreateBitOrPointerCast(Extract, StoreTy->getScalarType());

      Value *Insert =
          Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(I));
      Vec = Insert;
    }
  }

  StoreInst *SI = Builder.CreateAlignedStore(
    Vec,
    Builder.CreateBitCast(S0->getPointerOperand(), VecTy->getPointerTo(AS)),
    Alignment);
  propagateMetadata(SI, Chain);

  eraseInstructions(Chain);
  ++NumVectorInstructions;
  NumScalarsVectorized += Chain.size();
  return true;
}

bool Vectorizer::vectorizeLoadChain(
    ArrayRef<Instruction *> Chain,
    SmallPtrSet<Instruction *, 16> *InstructionsProcessed) {
  LoadInst *L0 = cast<LoadInst>(Chain[0]);

  // If the vector has an int element, default to int for the whole load.
  Type *LoadTy = nullptr;
  for (const auto &V : Chain) {
    LoadTy = cast<LoadInst>(V)->getType();
    if (LoadTy->isIntOrIntVectorTy())
      break;

    if (LoadTy->isPtrOrPtrVectorTy()) {
      LoadTy = Type::getIntNTy(F.getParent()->getContext(),
                               DL.getTypeSizeInBits(LoadTy));
      break;
    }
  }
  assert(LoadTy && "Can't determine LoadInst type from chain");

  unsigned Sz = DL.getTypeSizeInBits(LoadTy);
  unsigned AS = L0->getPointerAddressSpace();
  unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
  unsigned VF = VecRegSize / Sz;
  unsigned ChainSize = Chain.size();
  unsigned Alignment = getAlignment(L0);

  if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) {
    InstructionsProcessed->insert(Chain.begin(), Chain.end());
    return false;
  }

  ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain);
  if (NewChain.empty()) {
    // No vectorization possible.
    InstructionsProcessed->insert(Chain.begin(), Chain.end());
    return false;
  }
  if (NewChain.size() == 1) {
    // Failed after the first instruction. Discard it and try the smaller chain.
    InstructionsProcessed->insert(NewChain.front());
    return false;
  }

  // Update Chain to the valid vectorizable subchain.
  Chain = NewChain;
  ChainSize = Chain.size();

  // Check if it's legal to vectorize this chain. If not, split the chain and
  // try again.
  unsigned EltSzInBytes = Sz / 8;
  unsigned SzInBytes = EltSzInBytes * ChainSize;
  VectorType *VecTy;
  VectorType *VecLoadTy = dyn_cast<VectorType>(LoadTy);
  if (VecLoadTy)
    VecTy = VectorType::get(LoadTy->getScalarType(),
                            Chain.size() * VecLoadTy->getNumElements());
  else
    VecTy = VectorType::get(LoadTy, Chain.size());

  // If it's more than the max vector size or the target has a better
  // vector factor, break it into two pieces.
  unsigned TargetVF = TTI.getLoadVectorFactor(VF, Sz, SzInBytes, VecTy);
  if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) {
    LLVM_DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."
                         " Creating two separate arrays.\n");
    return vectorizeLoadChain(Chain.slice(0, TargetVF), InstructionsProcessed) |
           vectorizeLoadChain(Chain.slice(TargetVF), InstructionsProcessed);
  }

  // We won't try again to vectorize the elements of the chain, regardless of
  // whether we succeed below.
  InstructionsProcessed->insert(Chain.begin(), Chain.end());

  // If the load is going to be misaligned, don't vectorize it.
  if (accessIsMisaligned(SzInBytes, AS, Alignment)) {
    if (L0->getPointerAddressSpace() != DL.getAllocaAddrSpace()) {
      auto Chains = splitOddVectorElts(Chain, Sz);
      return vectorizeLoadChain(Chains.first, InstructionsProcessed) |
             vectorizeLoadChain(Chains.second, InstructionsProcessed);
    }

    Alignment = getOrEnforceKnownAlignment(
        L0->getPointerOperand(), StackAdjustedAlignment, DL, L0, nullptr, &DT);
  }

  if (!TTI.isLegalToVectorizeLoadChain(SzInBytes, Alignment, AS)) {
    auto Chains = splitOddVectorElts(Chain, Sz);
    return vectorizeLoadChain(Chains.first, InstructionsProcessed) |
           vectorizeLoadChain(Chains.second, InstructionsProcessed);
  }

  LLVM_DEBUG({
    dbgs() << "LSV: Loads to vectorize:\n";
    for (Instruction *I : Chain)
      I->dump();
  });

  // getVectorizablePrefix already computed getBoundaryInstrs.  The value of
  // Last may have changed since then, but the value of First won't have.  If it
  // matters, we could compute getBoundaryInstrs only once and reuse it here.
  BasicBlock::iterator First, Last;
  std::tie(First, Last) = getBoundaryInstrs(Chain);
  Builder.SetInsertPoint(&*First);

  Value *Bitcast =
      Builder.CreateBitCast(L0->getPointerOperand(), VecTy->getPointerTo(AS));
  LoadInst *LI = Builder.CreateAlignedLoad(VecTy, Bitcast, Alignment);
  propagateMetadata(LI, Chain);

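  // Rewrite every use of the original loads as an extract from the wide load.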
  if (VecLoadTy) {
    SmallVector<Instruction *, 16> InstrsToErase;

    unsigned VecWidth = VecLoadTy->getNumElements();
    for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
      for (auto Use : Chain[I]->users()) {
        // All users of vector loads are ExtractElement instructions with
        // constant indices, otherwise we would have bailed before now.
        Instruction *UI = cast<Instruction>(Use);
        unsigned Idx = cast<ConstantInt>(UI->getOperand(1))->getZExtValue();
        unsigned NewIdx = Idx + I * VecWidth;
        Value *V = Builder.CreateExtractElement(LI, Builder.getInt32(NewIdx),
                                                UI->getName());
        if (V->getType() != UI->getType())
          V = Builder.CreateBitCast(V, UI->getType());

        // Replace the old instruction.
        UI->replaceAllUsesWith(V);
        InstrsToErase.push_back(UI);
      }
    }

    // Bitcast might not be an Instruction if the pointer operand is a
    // constant. In that case, no need to reorder anything.
    if (Instruction *BitcastInst = dyn_cast<Instruction>(Bitcast))
      reorder(BitcastInst);

    for (auto I : InstrsToErase)
      I->eraseFromParent();
  } else {
    for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
      Value *CV = Chain[I];
      Value *V =
          Builder.CreateExtractElement(LI, Builder.getInt32(I), CV->getName());
      if (V->getType() != CV->getType()) {
        V = Builder.CreateBitOrPointerCast(V, CV->getType());
      }

      // Replace the old instruction.
      CV->replaceAllUsesWith(V);
    }

    if (Instruction *BitcastInst = dyn_cast<Instruction>(Bitcast))
      reorder(BitcastInst);
  }

  eraseInstructions(Chain);

  ++NumVectorInstructions;
  NumScalarsVectorized += Chain.size();
  return true;
}

bool Vectorizer::accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace,
                                    unsigned Alignment) {
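  // An access whose alignment is a multiple of its size is never considered
  // misaligned.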
  if (Alignment % SzInBytes == 0)
    return false;

  bool Fast = false;
  bool Allows = TTI.allowsMisalignedMemoryAccesses(F.getParent()->getContext(),
                                                   SzInBytes * 8, AddressSpace,
                                                   Alignment, &Fast);
  LLVM_DEBUG(dbgs() << "LSV: Target said misaligned is allowed? " << Allows
                    << " and fast? " << Fast << "\n";);
  return !Allows || !Fast;
}