//===- LoadStoreVectorizer.cpp - GPU Load & Store Vectorizer --------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass merges loads/stores to/from sequential memory addresses into vector
// loads/stores.  Although there's nothing GPU-specific in here, this pass is
// motivated by the microarchitectural quirks of nVidia and AMD GPUs.
//
// (For simplicity below we talk about loads only, but everything also applies
// to stores.)
//
// This pass is intended to be run late in the pipeline, after other
// vectorization opportunities have been exploited.  So the assumption here is
// that immediately following our new vector load we'll need to extract out the
// individual elements of the load, so we can operate on them individually.
//
// On CPUs this transformation is usually not beneficial, because extracting the
// elements of a vector register is expensive on most architectures.  It's
// usually better just to load each element individually into its own scalar
// register.
//
// However, nVidia and AMD GPUs don't have proper vector registers.  Instead, a
// "vector load" loads directly into a series of scalar registers.  In effect,
// extracting the elements of the vector is free.  It's therefore always
// beneficial to vectorize a sequence of loads on these architectures.
//
// Vectorizing (perhaps a better name might be "coalescing") loads can have
// large performance impacts on GPU kernels, and opportunities for vectorizing
// are common in GPU code.  This pass tries very hard to find such
// opportunities; its runtime is quadratic in the number of loads in a BB.
//
// Some CPU architectures, such as ARM, have instructions that load into
// multiple scalar registers, similar to a GPU vectorized load.  In theory ARM
// could use this pass (with some modifications), but currently it implements
// its own pass to do something similar to what we do here.

#include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/OrderedBasicBlock.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Vectorize.h"
#include <algorithm>
#include <cassert>
#include <cstdlib>
#include <tuple>
#include <utility>

using namespace llvm;

#define DEBUG_TYPE "load-store-vectorizer"

STATISTIC(NumVectorInstructions, "Number of vector accesses generated");
STATISTIC(NumScalarsVectorized, "Number of scalar accesses vectorized");

// FIXME: Assuming stack alignment of 4 is always good enough
static const unsigned StackAdjustedAlignment = 4;

namespace {

/// ChainID is an arbitrary token that is allowed to be different only for the
/// accesses that are guaranteed to be considered non-consecutive by
/// Vectorizer::isConsecutiveAccess. It's used for grouping instructions
/// together and reducing the number of instructions the main search operates on
/// at a time, i.e. this is to reduce compile time and nothing else as the main
/// search has O(n^2) time complexity. The underlying type of ChainID should not
/// be relied upon.
using ChainID = const Value *;
using InstrList = SmallVector<Instruction *, 8>;
using InstrListMap = MapVector<ChainID, InstrList>;

class Vectorizer {
  Function &F;
  AliasAnalysis &AA;
  DominatorTree &DT;
  ScalarEvolution &SE;
  TargetTransformInfo &TTI;
  const DataLayout &DL;
  IRBuilder<> Builder;

public:
  Vectorizer(Function &F, AliasAnalysis &AA, DominatorTree &DT,
             ScalarEvolution &SE, TargetTransformInfo &TTI)
      : F(F), AA(AA), DT(DT), SE(SE), TTI(TTI),
        DL(F.getParent()->getDataLayout()), Builder(SE.getContext()) {}

  bool run();

private:
  unsigned getPointerAddressSpace(Value *I);

  unsigned getAlignment(LoadInst *LI) const {
    unsigned Align = LI->getAlignment();
    if (Align != 0)
      return Align;

    return DL.getABITypeAlignment(LI->getType());
  }

  unsigned getAlignment(StoreInst *SI) const {
    unsigned Align = SI->getAlignment();
    if (Align != 0)
      return Align;

    return DL.getABITypeAlignment(SI->getValueOperand()->getType());
  }

  static const unsigned MaxDepth = 3;

  bool isConsecutiveAccess(Value *A, Value *B);
  bool areConsecutivePointers(Value *PtrA, Value *PtrB, APInt PtrDelta,
                              unsigned Depth = 0) const;
  bool lookThroughComplexAddresses(Value *PtrA, Value *PtrB, APInt PtrDelta,
                                   unsigned Depth) const;
  bool lookThroughSelects(Value *PtrA, Value *PtrB, const APInt &PtrDelta,
                          unsigned Depth) const;

  /// After vectorization, reorder the instructions that I depends on
  /// (the instructions defining its operands), to ensure they dominate I.
  void reorder(Instruction *I);

  /// Returns the first and the last instructions in Chain.
  std::pair<BasicBlock::iterator, BasicBlock::iterator>
  getBoundaryInstrs(ArrayRef<Instruction *> Chain);

  /// Erases the original instructions after vectorizing.
  void eraseInstructions(ArrayRef<Instruction *> Chain);

  /// "Legalize" the vector type that would be produced by combining \p
  /// ElementSizeBits elements in \p Chain. Break into two pieces such that the
  /// total size of each piece is 1, 2 or a multiple of 4 bytes. \p Chain is
  /// expected to have more than 4 elements.
  std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>>
  splitOddVectorElts(ArrayRef<Instruction *> Chain, unsigned ElementSizeBits);

  /// Finds the largest prefix of Chain that's vectorizable, checking for
  /// intervening instructions which may affect the memory accessed by the
  /// instructions within Chain.
  ///
  /// The elements of \p Chain must be all loads or all stores and must be in
  /// address order.
  ArrayRef<Instruction *> getVectorizablePrefix(ArrayRef<Instruction *> Chain);

  /// Collects load and store instructions to vectorize.
  std::pair<InstrListMap, InstrListMap> collectInstructions(BasicBlock *BB);

  /// Processes the collected instructions, the \p Map. The values of \p Map
  /// should be all loads or all stores.
  bool vectorizeChains(InstrListMap &Map);

  /// Finds the load/stores to consecutive memory addresses and vectorizes them.
  bool vectorizeInstructions(ArrayRef<Instruction *> Instrs);

  /// Vectorizes the load instructions in Chain.
  bool
  vectorizeLoadChain(ArrayRef<Instruction *> Chain,
                     SmallPtrSet<Instruction *, 16> *InstructionsProcessed);

  /// Vectorizes the store instructions in Chain.
  bool
  vectorizeStoreChain(ArrayRef<Instruction *> Chain,
                      SmallPtrSet<Instruction *, 16> *InstructionsProcessed);

  /// Check if this load/store access is misaligned.
  bool accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace,
                          unsigned Alignment);
};

class LoadStoreVectorizerLegacyPass : public FunctionPass {
public:
  static char ID;

  LoadStoreVectorizerLegacyPass() : FunctionPass(ID) {
    initializeLoadStoreVectorizerLegacyPassPass(*PassRegistry::getPassRegistry());
  }

  bool runOnFunction(Function &F) override;

  StringRef getPassName() const override {
    return "GPU Load and Store Vectorizer";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AAResultsWrapperPass>();
    AU.addRequired<ScalarEvolutionWrapperPass>();
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addRequired<TargetTransformInfoWrapperPass>();
    AU.setPreservesCFG();
  }
};

} // end anonymous namespace

char LoadStoreVectorizerLegacyPass::ID = 0;

INITIALIZE_PASS_BEGIN(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
                      "Vectorize load and Store instructions", false, false)
INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
                    "Vectorize load and store instructions", false, false)

Pass *llvm::createLoadStoreVectorizerPass() {
  return new LoadStoreVectorizerLegacyPass();
}

bool LoadStoreVectorizerLegacyPass::runOnFunction(Function &F) {
  // Don't vectorize when the attribute NoImplicitFloat is used.
  if (skipFunction(F) || F.hasFnAttribute(Attribute::NoImplicitFloat))
    return false;

  AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
  DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
  TargetTransformInfo &TTI =
      getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);

  Vectorizer V(F, AA, DT, SE, TTI);
  return V.run();
}

PreservedAnalyses LoadStoreVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
  // Don't vectorize when the attribute NoImplicitFloat is used.
  if (F.hasFnAttribute(Attribute::NoImplicitFloat))
    return PreservedAnalyses::all();

  AliasAnalysis &AA = AM.getResult<AAManager>(F);
  DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
  ScalarEvolution &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);

  Vectorizer V(F, AA, DT, SE, TTI);
  bool Changed = V.run();
  PreservedAnalyses PA;
  PA.preserveSet<CFGAnalyses>();
  return Changed ? PA : PreservedAnalyses::all();
}

// The real propagateMetadata expects a SmallVector<Value*>, but we deal in
// vectors of Instructions.
static void propagateMetadata(Instruction *I, ArrayRef<Instruction *> IL) {
  SmallVector<Value *, 8> VL(IL.begin(), IL.end());
  propagateMetadata(I, VL);
}

// Vectorizer Implementation
bool Vectorizer::run() {
  bool Changed = false;

  // Scan the blocks in the function in post order.
  for (BasicBlock *BB : post_order(&F)) {
    InstrListMap LoadRefs, StoreRefs;
    std::tie(LoadRefs, StoreRefs) = collectInstructions(BB);
    Changed |= vectorizeChains(LoadRefs);
    Changed |= vectorizeChains(StoreRefs);
  }

  return Changed;
}

unsigned Vectorizer::getPointerAddressSpace(Value *I) {
  if (LoadInst *L = dyn_cast<LoadInst>(I))
    return L->getPointerAddressSpace();
  if (StoreInst *S = dyn_cast<StoreInst>(I))
    return S->getPointerAddressSpace();
  return -1;
}

// FIXME: Merge with llvm::isConsecutiveAccess
bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) {
  Value *PtrA = getLoadStorePointerOperand(A);
  Value *PtrB = getLoadStorePointerOperand(B);
  unsigned ASA = getPointerAddressSpace(A);
  unsigned ASB = getPointerAddressSpace(B);

  // Check that the address spaces match and that the pointers are valid.
  if (!PtrA || !PtrB || (ASA != ASB))
    return false;

  // Make sure that A and B are different pointers of the same size type.
  Type *PtrATy = PtrA->getType()->getPointerElementType();
  Type *PtrBTy = PtrB->getType()->getPointerElementType();
  if (PtrA == PtrB ||
      PtrATy->isVectorTy() != PtrBTy->isVectorTy() ||
      DL.getTypeStoreSize(PtrATy) != DL.getTypeStoreSize(PtrBTy) ||
      DL.getTypeStoreSize(PtrATy->getScalarType()) !=
          DL.getTypeStoreSize(PtrBTy->getScalarType()))
    return false;

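  // Two accesses are consecutive when the second pointer lies exactly one
  // element (the store size of A's accessed type) past the first one.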
  unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA);
  APInt Size(PtrBitWidth, DL.getTypeStoreSize(PtrATy));

  return areConsecutivePointers(PtrA, PtrB, Size);
}

bool Vectorizer::areConsecutivePointers(Value *PtrA, Value *PtrB,
                                        APInt PtrDelta, unsigned Depth) const {
  unsigned PtrBitWidth = DL.getPointerTypeSizeInBits(PtrA->getType());
  APInt OffsetA(PtrBitWidth, 0);
  APInt OffsetB(PtrBitWidth, 0);
  PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
  PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);

  unsigned NewPtrBitWidth = DL.getTypeStoreSizeInBits(PtrA->getType());

  if (NewPtrBitWidth != DL.getTypeStoreSizeInBits(PtrB->getType()))
    return false;

  // If we have to shrink the pointer, stripAndAccumulateInBoundsConstantOffsets
  // should properly handle a possible overflow and the value should fit into
  // the smallest data type used in the cast/gep chain.
  assert(OffsetA.getMinSignedBits() <= NewPtrBitWidth &&
         OffsetB.getMinSignedBits() <= NewPtrBitWidth);

  OffsetA = OffsetA.sextOrTrunc(NewPtrBitWidth);
  OffsetB = OffsetB.sextOrTrunc(NewPtrBitWidth);
  PtrDelta = PtrDelta.sextOrTrunc(NewPtrBitWidth);

  APInt OffsetDelta = OffsetB - OffsetA;

  // Check if they are based on the same pointer. That makes the offsets
  // sufficient.
  if (PtrA == PtrB)
    return OffsetDelta == PtrDelta;

  // Compute the necessary base pointer delta to have the necessary final delta
  // equal to the pointer delta requested.
  APInt BaseDelta = PtrDelta - OffsetDelta;

  // Compute the distance with SCEV between the base pointers.
  const SCEV *PtrSCEVA = SE.getSCEV(PtrA);
  const SCEV *PtrSCEVB = SE.getSCEV(PtrB);
  const SCEV *C = SE.getConstant(BaseDelta);
  const SCEV *X = SE.getAddExpr(PtrSCEVA, C);
  if (X == PtrSCEVB)
    return true;

  // The above check will not catch the cases where one of the pointers is
  // factorized but the other one is not, such as (C + (S * (A + B))) vs
  // (AS + BS). Get the minus scev. That will allow re-combining the expressions
  // and getting the simplified difference.
  const SCEV *Dist = SE.getMinusSCEV(PtrSCEVB, PtrSCEVA);
  if (C == Dist)
    return true;

  // Sometimes even this doesn't work, because SCEV can't always see through
  // patterns that look like (gep (ext (add (shl X, C1), C2))). Try checking
  // things the hard way.
  return lookThroughComplexAddresses(PtrA, PtrB, BaseDelta, Depth);
}

bool Vectorizer::lookThroughComplexAddresses(Value *PtrA, Value *PtrB,
                                             APInt PtrDelta,
                                             unsigned Depth) const {
  auto *GEPA = dyn_cast<GetElementPtrInst>(PtrA);
  auto *GEPB = dyn_cast<GetElementPtrInst>(PtrB);
  if (!GEPA || !GEPB)
    return lookThroughSelects(PtrA, PtrB, PtrDelta, Depth);

  // Look through GEPs after checking they're the same except for the last
  // index.
  if (GEPA->getNumOperands() != GEPB->getNumOperands() ||
      GEPA->getPointerOperand() != GEPB->getPointerOperand())
    return false;
  gep_type_iterator GTIA = gep_type_begin(GEPA);
  gep_type_iterator GTIB = gep_type_begin(GEPB);
  for (unsigned I = 0, E = GEPA->getNumIndices() - 1; I < E; ++I) {
    if (GTIA.getOperand() != GTIB.getOperand())
      return false;
    ++GTIA;
    ++GTIB;
  }

  Instruction *OpA = dyn_cast<Instruction>(GTIA.getOperand());
  Instruction *OpB = dyn_cast<Instruction>(GTIB.getOperand());
  if (!OpA || !OpB || OpA->getOpcode() != OpB->getOpcode() ||
      OpA->getType() != OpB->getType())
    return false;

  if (PtrDelta.isNegative()) {
    if (PtrDelta.isMinSignedValue())
      return false;
    PtrDelta.negate();
    std::swap(OpA, OpB);
  }
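  // The last GEP index has to account for the whole remaining pointer delta,
  // i.e. the two indices must differ by exactly PtrDelta / Stride elements.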
  uint64_t Stride = DL.getTypeAllocSize(GTIA.getIndexedType());
  if (PtrDelta.urem(Stride) != 0)
    return false;
  unsigned IdxBitWidth = OpA->getType()->getScalarSizeInBits();
  APInt IdxDiff = PtrDelta.udiv(Stride).zextOrSelf(IdxBitWidth);

  // Only look through a ZExt/SExt.
  if (!isa<SExtInst>(OpA) && !isa<ZExtInst>(OpA))
    return false;

  bool Signed = isa<SExtInst>(OpA);

  // At this point A could be a function parameter, i.e. not an instruction
  Value *ValA = OpA->getOperand(0);
  OpB = dyn_cast<Instruction>(OpB->getOperand(0));
  if (!OpB || ValA->getType() != OpB->getType())
    return false;

  // Now we need to prove that adding IdxDiff to ValA won't overflow.
  bool Safe = false;
  // First attempt: if OpB is an add with NSW/NUW, and OpB is IdxDiff added to
  // ValA, we're okay.
  if (OpB->getOpcode() == Instruction::Add &&
      isa<ConstantInt>(OpB->getOperand(1)) &&
      IdxDiff.sle(cast<ConstantInt>(OpB->getOperand(1))->getSExtValue())) {
    if (Signed)
      Safe = cast<BinaryOperator>(OpB)->hasNoSignedWrap();
    else
      Safe = cast<BinaryOperator>(OpB)->hasNoUnsignedWrap();
  }

  unsigned BitWidth = ValA->getType()->getScalarSizeInBits();

  // Second attempt:
  // If all set bits of IdxDiff or any higher order bit other than the sign bit
  // are known to be zero in ValA, we can add Diff to it while guaranteeing no
  // overflow of any sort.
  if (!Safe) {
    OpA = dyn_cast<Instruction>(ValA);
    if (!OpA)
      return false;
    KnownBits Known(BitWidth);
    computeKnownBits(OpA, Known, DL, 0, nullptr, OpA, &DT);
    APInt BitsAllowedToBeSet = Known.Zero.zext(IdxDiff.getBitWidth());
    if (Signed)
      BitsAllowedToBeSet.clearBit(BitWidth - 1);
    if (BitsAllowedToBeSet.ult(IdxDiff))
      return false;
  }

  const SCEV *OffsetSCEVA = SE.getSCEV(ValA);
  const SCEV *OffsetSCEVB = SE.getSCEV(OpB);
  const SCEV *C = SE.getConstant(IdxDiff.trunc(BitWidth));
  const SCEV *X = SE.getAddExpr(OffsetSCEVA, C);
  return X == OffsetSCEVB;
}

bool Vectorizer::lookThroughSelects(Value *PtrA, Value *PtrB,
                                    const APInt &PtrDelta,
                                    unsigned Depth) const {
  if (Depth++ == MaxDepth)
    return false;

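  // Two selects can only yield consecutive pointers if they share the same
  // condition and both their true and their false arms are pairwise
  // consecutive.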
  if (auto *SelectA = dyn_cast<SelectInst>(PtrA)) {
    if (auto *SelectB = dyn_cast<SelectInst>(PtrB)) {
      return SelectA->getCondition() == SelectB->getCondition() &&
             areConsecutivePointers(SelectA->getTrueValue(),
                                    SelectB->getTrueValue(), PtrDelta, Depth) &&
             areConsecutivePointers(SelectA->getFalseValue(),
                                    SelectB->getFalseValue(), PtrDelta, Depth);
    }
  }
  return false;
}

void Vectorizer::reorder(Instruction *I) {
  OrderedBasicBlock OBB(I->getParent());
  SmallPtrSet<Instruction *, 16> InstructionsToMove;
  SmallVector<Instruction *, 16> Worklist;

  Worklist.push_back(I);
  while (!Worklist.empty()) {
    Instruction *IW = Worklist.pop_back_val();
    int NumOperands = IW->getNumOperands();
    for (int i = 0; i < NumOperands; i++) {
      Instruction *IM = dyn_cast<Instruction>(IW->getOperand(i));
      if (!IM || IM->getOpcode() == Instruction::PHI)
        continue;

      // If IM is in another BB, no need to move it, because this pass only
      // vectorizes instructions within one BB.
      if (IM->getParent() != I->getParent())
        continue;

      if (!OBB.dominates(IM, I)) {
        InstructionsToMove.insert(IM);
        Worklist.push_back(IM);
      }
    }
  }

  // All instructions to move should follow I. Start from I, not from begin().
  for (auto BBI = I->getIterator(), E = I->getParent()->end(); BBI != E;
       ++BBI) {
    if (!InstructionsToMove.count(&*BBI))
      continue;
    Instruction *IM = &*BBI;
    --BBI;
    IM->removeFromParent();
    IM->insertBefore(I);
  }
}

std::pair<BasicBlock::iterator, BasicBlock::iterator>
Vectorizer::getBoundaryInstrs(ArrayRef<Instruction *> Chain) {
  Instruction *C0 = Chain[0];
  BasicBlock::iterator FirstInstr = C0->getIterator();
  BasicBlock::iterator LastInstr = C0->getIterator();

  BasicBlock *BB = C0->getParent();
  unsigned NumFound = 0;
  for (Instruction &I : *BB) {
    if (!is_contained(Chain, &I))
      continue;

    ++NumFound;
    if (NumFound == 1) {
      FirstInstr = I.getIterator();
    }
    if (NumFound == Chain.size()) {
      LastInstr = I.getIterator();
      break;
    }
  }

  // Range is [first, last).
  return std::make_pair(FirstInstr, ++LastInstr);
}

void Vectorizer::eraseInstructions(ArrayRef<Instruction *> Chain) {
  SmallVector<Instruction *, 16> Instrs;
  for (Instruction *I : Chain) {
    Value *PtrOperand = getLoadStorePointerOperand(I);
    assert(PtrOperand && "Instruction must have a pointer operand.");
    Instrs.push_back(I);
    if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(PtrOperand))
      Instrs.push_back(GEP);
  }

  // Erase instructions.
  for (Instruction *I : Instrs)
    if (I->use_empty())
      I->eraseFromParent();
}

std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>>
Vectorizer::splitOddVectorElts(ArrayRef<Instruction *> Chain,
                               unsigned ElementSizeBits) {
  unsigned ElementSizeBytes = ElementSizeBits / 8;
  unsigned SizeBytes = ElementSizeBytes * Chain.size();
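  // If the entire chain already has an acceptable size, still return a
  // strictly smaller first piece (half the chain, or all but the last
  // element) so that callers recursing on the two pieces keep making
  // progress; if not even one element fits, keep a single element anyway.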
  unsigned NumLeft = (SizeBytes - (SizeBytes % 4)) / ElementSizeBytes;
  if (NumLeft == Chain.size()) {
    if ((NumLeft & 1) == 0)
      NumLeft /= 2; // Split even in half
    else
      --NumLeft;    // Split off last element
  } else if (NumLeft == 0)
    NumLeft = 1;
  return std::make_pair(Chain.slice(0, NumLeft), Chain.slice(NumLeft));
}

ArrayRef<Instruction *>
Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) {
  // These are in BB order, unlike Chain, which is in address order.
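  // MemoryInstrs holds the loads/stores between the chain's boundary
  // instructions that are not themselves part of Chain; ChainInstrs holds the
  // members of Chain, in BB order.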
  SmallVector<Instruction *, 16> MemoryInstrs;
  SmallVector<Instruction *, 16> ChainInstrs;

  bool IsLoadChain = isa<LoadInst>(Chain[0]);
  LLVM_DEBUG({
    for (Instruction *I : Chain) {
      if (IsLoadChain)
        assert(isa<LoadInst>(I) &&
               "All elements of Chain must be loads, or all must be stores.");
      else
        assert(isa<StoreInst>(I) &&
               "All elements of Chain must be loads, or all must be stores.");
    }
  });

  for (Instruction &I : make_range(getBoundaryInstrs(Chain))) {
    if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
      if (!is_contained(Chain, &I))
        MemoryInstrs.push_back(&I);
      else
        ChainInstrs.push_back(&I);
    } else if (isa<IntrinsicInst>(&I) &&
               cast<IntrinsicInst>(&I)->getIntrinsicID() ==
                   Intrinsic::sideeffect) {
      // Ignore llvm.sideeffect calls.
    } else if (IsLoadChain && (I.mayWriteToMemory() || I.mayThrow())) {
      LLVM_DEBUG(dbgs() << "LSV: Found may-write/throw operation: " << I
                        << '\n');
      break;
    } else if (!IsLoadChain && (I.mayReadOrWriteMemory() || I.mayThrow())) {
      LLVM_DEBUG(dbgs() << "LSV: Found may-read/write/throw operation: " << I
                        << '\n');
      break;
    }
  }

  OrderedBasicBlock OBB(Chain[0]->getParent());

  // Loop until we find an instruction in ChainInstrs that we can't vectorize.
  unsigned ChainInstrIdx = 0;
  Instruction *BarrierMemoryInstr = nullptr;

  for (unsigned E = ChainInstrs.size(); ChainInstrIdx < E; ++ChainInstrIdx) {
    Instruction *ChainInstr = ChainInstrs[ChainInstrIdx];

    // If a barrier memory instruction was found, chain instructions that follow
    // will not be added to the valid prefix.
    if (BarrierMemoryInstr && OBB.dominates(BarrierMemoryInstr, ChainInstr))
      break;

    // Check (in BB order) if any instruction prevents ChainInstr from being
    // vectorized. Find and store the first such "conflicting" instruction.
    for (Instruction *MemInstr : MemoryInstrs) {
      // If a barrier memory instruction was found, do not check past it.
      if (BarrierMemoryInstr && OBB.dominates(BarrierMemoryInstr, MemInstr))
        break;

      auto *MemLoad = dyn_cast<LoadInst>(MemInstr);
      auto *ChainLoad = dyn_cast<LoadInst>(ChainInstr);
      if (MemLoad && ChainLoad)
        continue;

      // We can ignore the alias if we have a load/store pair and the load
      // is known to be invariant. The load cannot be clobbered by the store.
      auto IsInvariantLoad = [](const LoadInst *LI) -> bool {
        return LI->hasMetadata(LLVMContext::MD_invariant_load);
      };

      // We can ignore the alias as long as the load comes before the store,
      // because that means we won't be moving the load past the store to
      // vectorize it (the vectorized load is inserted at the location of the
      // first load in the chain).
      if (isa<StoreInst>(MemInstr) && ChainLoad &&
          (IsInvariantLoad(ChainLoad) || OBB.dominates(ChainLoad, MemInstr)))
        continue;

      // Same case, but in reverse.
      if (MemLoad && isa<StoreInst>(ChainInstr) &&
          (IsInvariantLoad(MemLoad) || OBB.dominates(MemLoad, ChainInstr)))
        continue;

      if (!AA.isNoAlias(MemoryLocation::get(MemInstr),
                        MemoryLocation::get(ChainInstr))) {
        LLVM_DEBUG({
          dbgs() << "LSV: Found alias:\n"
                    "  Aliasing instruction and pointer:\n"
                 << "  " << *MemInstr << '\n'
                 << "  " << *getLoadStorePointerOperand(MemInstr) << '\n'
                 << "  Aliased instruction and pointer:\n"
                 << "  " << *ChainInstr << '\n'
                 << "  " << *getLoadStorePointerOperand(ChainInstr) << '\n';
        });
        // Save this aliasing memory instruction as a barrier, but allow other
        // instructions that precede the barrier to be vectorized with this one.
        BarrierMemoryInstr = MemInstr;
        break;
      }
    }
    // Continue the search only for store chains, since vectorizing stores that
    // precede an aliasing load is valid. Conversely, vectorizing loads is valid
    // up to an aliasing store, but should not pull loads from further down in
    // the basic block.
    if (IsLoadChain && BarrierMemoryInstr) {
      // The BarrierMemoryInstr is a store that precedes ChainInstr.
      assert(OBB.dominates(BarrierMemoryInstr, ChainInstr));
      break;
    }
  }

  // Find the largest prefix of Chain whose elements are all in
  // ChainInstrs[0, ChainInstrIdx).  This is the largest vectorizable prefix of
  // Chain.  (Recall that Chain is in address order, but ChainInstrs is in BB
  // order.)
  SmallPtrSet<Instruction *, 8> VectorizableChainInstrs(
      ChainInstrs.begin(), ChainInstrs.begin() + ChainInstrIdx);
  unsigned ChainIdx = 0;
  for (unsigned ChainLen = Chain.size(); ChainIdx < ChainLen; ++ChainIdx) {
    if (!VectorizableChainInstrs.count(Chain[ChainIdx]))
      break;
  }
  return Chain.slice(0, ChainIdx);
}

static ChainID getChainID(const Value *Ptr, const DataLayout &DL) {
  const Value *ObjPtr = GetUnderlyingObject(Ptr, DL);
  if (const auto *Sel = dyn_cast<SelectInst>(ObjPtr)) {
73051048Sbrian    // The select's themselves are distinct instructions even if they share the
73181634Sbrian    // same condition and evaluate to consecutive pointers for true and false
73281634Sbrian    // values of the condition. Therefore using the select's themselves for
73381634Sbrian    // grouping instructions would put consecutive accesses into different lists
73481634Sbrian    // and they won't be even checked for being consecutive, and won't be
73581634Sbrian    // vectorized.
    return Sel->getCondition();
  }
  return ObjPtr;
}

std::pair<InstrListMap, InstrListMap>
Vectorizer::collectInstructions(BasicBlock *BB) {
  InstrListMap LoadRefs;
  InstrListMap StoreRefs;

  for (Instruction &I : *BB) {
    if (!I.mayReadOrWriteMemory())
      continue;

    if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
      if (!LI->isSimple())
        continue;

      // Skip if it's not legal.
      if (!TTI.isLegalToVectorizeLoad(LI))
        continue;

      Type *Ty = LI->getType();
      if (!VectorType::isValidElementType(Ty->getScalarType()))
        continue;

      // Skip weird non-byte sizes. They probably aren't worth the effort of
      // handling correctly.
      unsigned TySize = DL.getTypeSizeInBits(Ty);
      if ((TySize % 8) != 0)
        continue;

      // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain
      // functions are currently using an integer type for the vectorized
      // load/store, and do not support casting between the integer type and a
      // vector of pointers (e.g. i64 to <2 x i16*>)
      if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())
        continue;

      Value *Ptr = LI->getPointerOperand();
      unsigned AS = Ptr->getType()->getPointerAddressSpace();
      unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);

      unsigned VF = VecRegSize / TySize;
      VectorType *VecTy = dyn_cast<VectorType>(Ty);

      // No point in looking at these if they're too big to vectorize.
      if (TySize > VecRegSize / 2 ||
          (VecTy && TTI.getLoadVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
        continue;

      // Make sure all the users of a vector are constant-index extracts.
      if (isa<VectorType>(Ty) && !llvm::all_of(LI->users(), [](const User *U) {
            const ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(U);
            return EEI && isa<ConstantInt>(EEI->getOperand(1));
          }))
        continue;

      // Save the load locations.
      const ChainID ID = getChainID(Ptr, DL);
      LoadRefs[ID].push_back(LI);
    } else if (StoreInst *SI = dyn_cast<StoreInst>(&I)) {
      if (!SI->isSimple())
        continue;

      // Skip if it's not legal.
      if (!TTI.isLegalToVectorizeStore(SI))
        continue;

      Type *Ty = SI->getValueOperand()->getType();
      if (!VectorType::isValidElementType(Ty->getScalarType()))
        continue;

      // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain
      // functions are currently using an integer type for the vectorized
      // load/store, and do not support casting between the integer type and a
      // vector of pointers (e.g. i64 to <2 x i16*>)
      if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())
        continue;

      // Skip weird non-byte sizes. They probably aren't worth the effort of
      // handling correctly.
      unsigned TySize = DL.getTypeSizeInBits(Ty);
      if ((TySize % 8) != 0)
        continue;

      Value *Ptr = SI->getPointerOperand();
      unsigned AS = Ptr->getType()->getPointerAddressSpace();
      unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);

      unsigned VF = VecRegSize / TySize;
      VectorType *VecTy = dyn_cast<VectorType>(Ty);

      // No point in looking at these if they're too big to vectorize.
      if (TySize > VecRegSize / 2 ||
          (VecTy && TTI.getStoreVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
        continue;

      if (isa<VectorType>(Ty) && !llvm::all_of(SI->users(), [](const User *U) {
            const ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(U);
            return EEI && isa<ConstantInt>(EEI->getOperand(1));
          }))
        continue;

      // Save store location.
      const ChainID ID = getChainID(Ptr, DL);
      StoreRefs[ID].push_back(SI);
    }
  }

  return {LoadRefs, StoreRefs};
}

bool Vectorizer::vectorizeChains(InstrListMap &Map) {
  bool Changed = false;

  for (const std::pair<ChainID, InstrList> &Chain : Map) {
    unsigned Size = Chain.second.size();
    if (Size < 2)
      continue;

    LLVM_DEBUG(dbgs() << "LSV: Analyzing a chain of length " << Size << ".\n");

    // Process the stores in chunks of 64.
    for (unsigned CI = 0, CE = Size; CI < CE; CI += 64) {
      unsigned Len = std::min<unsigned>(CE - CI, 64);
      ArrayRef<Instruction *> Chunk(&Chain.second[CI], Len);
      Changed |= vectorizeInstructions(Chunk);
    }
  }

  return Changed;
}

bool Vectorizer::vectorizeInstructions(ArrayRef<Instruction *> Instrs) {
  LLVM_DEBUG(dbgs() << "LSV: Vectorizing " << Instrs.size()
                    << " instructions.\n");
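  // Instrs holds at most 64 accesses (vectorizeChains hands us chunks of that
  // size), so a fixed-size array suffices. ConsecutiveChain[i] is the index of
  // an access that immediately follows Instrs[i] in memory, or -1 if none was
  // found; each discovered pair is also recorded in Heads/Tails.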
  SmallVector<int, 16> Heads, Tails;
  int ConsecutiveChain[64];

  // Do a quadratic search on all of the given loads/stores and find all of the
  // pairs of loads/stores that follow each other.
  for (int i = 0, e = Instrs.size(); i < e; ++i) {
    ConsecutiveChain[i] = -1;
    for (int j = e - 1; j >= 0; --j) {
      if (i == j)
        continue;

      if (isConsecutiveAccess(Instrs[i], Instrs[j])) {
        if (ConsecutiveChain[i] != -1) {
          int CurDistance = std::abs(ConsecutiveChain[i] - i);
          int NewDistance = std::abs(ConsecutiveChain[i] - j);
          if (j < i || NewDistance > CurDistance)
            continue; // Should not insert.
        }

        Tails.push_back(j);
        Heads.push_back(i);
        ConsecutiveChain[i] = j;
      }
    }
  }

  bool Changed = false;
  SmallPtrSet<Instruction *, 16> InstructionsProcessed;

  for (int Head : Heads) {
    if (InstructionsProcessed.count(Instrs[Head]))
      continue;
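    // Skip this head if it is also the tail of another, not yet processed
    // chain; that longer chain will pick these accesses up instead.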
    bool LongerChainExists = false;
    for (unsigned TIt = 0; TIt < Tails.size(); TIt++)
      if (Head == Tails[TIt] &&
          !InstructionsProcessed.count(Instrs[Heads[TIt]])) {
        LongerChainExists = true;
        break;
      }
    if (LongerChainExists)
      continue;

    // We found an instr that starts a chain. Now follow the chain and try to
    // vectorize it.
    SmallVector<Instruction *, 16> Operands;
    int I = Head;
    while (I != -1 && (is_contained(Tails, I) || is_contained(Heads, I))) {
      if (InstructionsProcessed.count(Instrs[I]))
        break;

      Operands.push_back(Instrs[I]);
      I = ConsecutiveChain[I];
    }

    bool Vectorized = false;
    if (isa<LoadInst>(*Operands.begin()))
      Vectorized = vectorizeLoadChain(Operands, &InstructionsProcessed);
    else
      Vectorized = vectorizeStoreChain(Operands, &InstructionsProcessed);

    Changed |= Vectorized;
  }

  return Changed;
}

bool Vectorizer::vectorizeStoreChain(
    ArrayRef<Instruction *> Chain,
    SmallPtrSet<Instruction *, 16> *InstructionsProcessed) {
  StoreInst *S0 = cast<StoreInst>(Chain[0]);

  // If the vector has an int element, default to int for the whole store.
  Type *StoreTy = nullptr;
  for (Instruction *I : Chain) {
    StoreTy = cast<StoreInst>(I)->getValueOperand()->getType();
    if (StoreTy->isIntOrIntVectorTy())
      break;

    if (StoreTy->isPtrOrPtrVectorTy()) {
      StoreTy = Type::getIntNTy(F.getParent()->getContext(),
                                DL.getTypeSizeInBits(StoreTy));
      break;
    }
  }
  assert(StoreTy && "Failed to find store type");

  unsigned Sz = DL.getTypeSizeInBits(StoreTy);
  unsigned AS = S0->getPointerAddressSpace();
  unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
  unsigned VF = VecRegSize / Sz;
  unsigned ChainSize = Chain.size();
  unsigned Alignment = getAlignment(S0);

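  // Bail out early for a single access, a non-power-of-two element size, or a
  // vector register too small to hold two elements; mark everything as
  // processed so this chain is not revisited.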
  if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) {
    InstructionsProcessed->insert(Chain.begin(), Chain.end());
    return false;
  }

  ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain);
  if (NewChain.empty()) {
    // No vectorization possible.
    InstructionsProcessed->insert(Chain.begin(), Chain.end());
    return false;
  }
  if (NewChain.size() == 1) {
    // Failed after the first instruction. Discard it and try the smaller chain.
    InstructionsProcessed->insert(NewChain.front());
    return false;
  }

  // Update Chain to the valid vectorizable subchain.
  Chain = NewChain;
  ChainSize = Chain.size();

  // Check if it's legal to vectorize this chain. If not, split the chain and
  // try again.
  unsigned EltSzInBytes = Sz / 8;
  unsigned SzInBytes = EltSzInBytes * ChainSize;

  VectorType *VecTy;
  VectorType *VecStoreTy = dyn_cast<VectorType>(StoreTy);
  if (VecStoreTy)
    VecTy = VectorType::get(StoreTy->getScalarType(),
                            Chain.size() * VecStoreTy->getNumElements());
  else
    VecTy = VectorType::get(StoreTy, Chain.size());

  // If it's more than the max vector size or the target has a better
  // vector factor, break it into two pieces.
  unsigned TargetVF = TTI.getStoreVectorFactor(VF, Sz, SzInBytes, VecTy);
  if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) {
    LLVM_DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."
                         " Creating two separate arrays.\n");
    return vectorizeStoreChain(Chain.slice(0, TargetVF),
                               InstructionsProcessed) |
           vectorizeStoreChain(Chain.slice(TargetVF), InstructionsProcessed);
  }

  LLVM_DEBUG({
    dbgs() << "LSV: Stores to vectorize:\n";
    for (Instruction *I : Chain)
      dbgs() << "  " << *I << "\n";
  });

  // We won't try again to vectorize the elements of the chain, regardless of
  // whether we succeed below.
  InstructionsProcessed->insert(Chain.begin(), Chain.end());

  // If the store is going to be misaligned, don't vectorize it.
  if (accessIsMisaligned(SzInBytes, AS, Alignment)) {
    if (S0->getPointerAddressSpace() != DL.getAllocaAddrSpace()) {
      auto Chains = splitOddVectorElts(Chain, Sz);
      return vectorizeStoreChain(Chains.first, InstructionsProcessed) |
             vectorizeStoreChain(Chains.second, InstructionsProcessed);
    }

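    // The store is to the stack (alloca address space), so instead of
    // splitting the chain we may be able to raise the alignment of the
    // underlying object to StackAdjustedAlignment.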
    unsigned NewAlign = getOrEnforceKnownAlignment(S0->getPointerOperand(),
                                                   StackAdjustedAlignment,
                                                   DL, S0, nullptr, &DT);
    if (NewAlign != 0)
      Alignment = NewAlign;
  }

  if (!TTI.isLegalToVectorizeStoreChain(SzInBytes, Alignment, AS)) {
    auto Chains = splitOddVectorElts(Chain, Sz);
    return vectorizeStoreChain(Chains.first, InstructionsProcessed) |
           vectorizeStoreChain(Chains.second, InstructionsProcessed);
  }

  BasicBlock::iterator First, Last;
  std::tie(First, Last) = getBoundaryInstrs(Chain);
  Builder.SetInsertPoint(&*Last);

  Value *Vec = UndefValue::get(VecTy);

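  // Build the wide value to store by inserting each scalar operand (or each
  // element of each vector operand) into consecutive lanes of Vec.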
  if (VecStoreTy) {
    unsigned VecWidth = VecStoreTy->getNumElements();
    for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
      StoreInst *Store = cast<StoreInst>(Chain[I]);
      for (unsigned J = 0, NE = VecStoreTy->getNumElements(); J != NE; ++J) {
        unsigned NewIdx = J + I * VecWidth;
        Value *Extract = Builder.CreateExtractElement(Store->getValueOperand(),
                                                      Builder.getInt32(J));
        if (Extract->getType() != StoreTy->getScalarType())
          Extract = Builder.CreateBitCast(Extract, StoreTy->getScalarType());

        Value *Insert =
            Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(NewIdx));
        Vec = Insert;
      }
    }
  } else {
    for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
      StoreInst *Store = cast<StoreInst>(Chain[I]);
      Value *Extract = Store->getValueOperand();
      if (Extract->getType() != StoreTy->getScalarType())
        Extract =
            Builder.CreateBitOrPointerCast(Extract, StoreTy->getScalarType());

      Value *Insert =
          Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(I));
      Vec = Insert;
    }
  }

  StoreInst *SI = Builder.CreateAlignedStore(
    Vec,
    Builder.CreateBitCast(S0->getPointerOperand(), VecTy->getPointerTo(AS)),
    Alignment);
  propagateMetadata(SI, Chain);

  eraseInstructions(Chain);
  ++NumVectorInstructions;
  NumScalarsVectorized += Chain.size();
  return true;
}

bool Vectorizer::vectorizeLoadChain(
    ArrayRef<Instruction *> Chain,
    SmallPtrSet<Instruction *, 16> *InstructionsProcessed) {
  LoadInst *L0 = cast<LoadInst>(Chain[0]);

  // If the vector has an int element, default to int for the whole load.
  Type *LoadTy = nullptr;
  for (const auto &V : Chain) {
    LoadTy = cast<LoadInst>(V)->getType();
    if (LoadTy->isIntOrIntVectorTy())
      break;

    if (LoadTy->isPtrOrPtrVectorTy()) {
      LoadTy = Type::getIntNTy(F.getParent()->getContext(),
                               DL.getTypeSizeInBits(LoadTy));
      break;
    }
  }
  assert(LoadTy && "Can't determine LoadInst type from chain");

  unsigned Sz = DL.getTypeSizeInBits(LoadTy);
  unsigned AS = L0->getPointerAddressSpace();
  unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
  unsigned VF = VecRegSize / Sz;
  unsigned ChainSize = Chain.size();
  unsigned Alignment = getAlignment(L0);

  if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) {
    InstructionsProcessed->insert(Chain.begin(), Chain.end());
    return false;
  }

  ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain);
  if (NewChain.empty()) {
    // No vectorization possible.
    InstructionsProcessed->insert(Chain.begin(), Chain.end());
    return false;
  }
  if (NewChain.size() == 1) {
    // Failed after the first instruction. Discard it and try the smaller chain.
    InstructionsProcessed->insert(NewChain.front());
    return false;
  }

  // Update Chain to the valid vectorizable subchain.
  Chain = NewChain;
  ChainSize = Chain.size();

  // Check if it's legal to vectorize this chain. If not, split the chain and
  // try again.
  unsigned EltSzInBytes = Sz / 8;
  unsigned SzInBytes = EltSzInBytes * ChainSize;
  VectorType *VecTy;
  VectorType *VecLoadTy = dyn_cast<VectorType>(LoadTy);
  if (VecLoadTy)
    VecTy = VectorType::get(LoadTy->getScalarType(),
                            Chain.size() * VecLoadTy->getNumElements());
  else
    VecTy = VectorType::get(LoadTy, Chain.size());

  // If it's more than the max vector size or the target has a better
  // vector factor, break it into two pieces.
  unsigned TargetVF = TTI.getLoadVectorFactor(VF, Sz, SzInBytes, VecTy);
  if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) {
    LLVM_DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."
                         " Creating two separate arrays.\n");
    return vectorizeLoadChain(Chain.slice(0, TargetVF), InstructionsProcessed) |
           vectorizeLoadChain(Chain.slice(TargetVF), InstructionsProcessed);
  }

  // We won't try again to vectorize the elements of the chain, regardless of
  // whether we succeed below.
  InstructionsProcessed->insert(Chain.begin(), Chain.end());

  // If the load is going to be misaligned, don't vectorize it.
  if (accessIsMisaligned(SzInBytes, AS, Alignment)) {
    if (L0->getPointerAddressSpace() != DL.getAllocaAddrSpace()) {
      auto Chains = splitOddVectorElts(Chain, Sz);
      return vectorizeLoadChain(Chains.first, InstructionsProcessed) |
             vectorizeLoadChain(Chains.second, InstructionsProcessed);
    }

    Alignment = getOrEnforceKnownAlignment(
        L0->getPointerOperand(), StackAdjustedAlignment, DL, L0, nullptr, &DT);
  }

  if (!TTI.isLegalToVectorizeLoadChain(SzInBytes, Alignment, AS)) {
    auto Chains = splitOddVectorElts(Chain, Sz);
    return vectorizeLoadChain(Chains.first, InstructionsProcessed) |
           vectorizeLoadChain(Chains.second, InstructionsProcessed);
  }

  LLVM_DEBUG({
    dbgs() << "LSV: Loads to vectorize:\n";
    for (Instruction *I : Chain)
      I->dump();
  });

  // getVectorizablePrefix already computed getBoundaryInstrs.  The value of
  // Last may have changed since then, but the value of First won't have.  If it
  // matters, we could compute getBoundaryInstrs only once and reuse it here.
  BasicBlock::iterator First, Last;
  std::tie(First, Last) = getBoundaryInstrs(Chain);
  Builder.SetInsertPoint(&*First);

  Value *Bitcast =
      Builder.CreateBitCast(L0->getPointerOperand(), VecTy->getPointerTo(AS));
  LoadInst *LI = Builder.CreateAlignedLoad(VecTy, Bitcast, Alignment);
  propagateMetadata(LI, Chain);

  if (VecLoadTy) {
    SmallVector<Instruction *, 16> InstrsToErase;

    unsigned VecWidth = VecLoadTy->getNumElements();
    for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
      for (auto Use : Chain[I]->users()) {
        // All users of vector loads are ExtractElement instructions with
        // constant indices, otherwise we would have bailed before now.
        Instruction *UI = cast<Instruction>(Use);
        unsigned Idx = cast<ConstantInt>(UI->getOperand(1))->getZExtValue();
        unsigned NewIdx = Idx + I * VecWidth;
        Value *V = Builder.CreateExtractElement(LI, Builder.getInt32(NewIdx),
                                                UI->getName());
        if (V->getType() != UI->getType())
          V = Builder.CreateBitCast(V, UI->getType());

        // Replace the old instruction.
        UI->replaceAllUsesWith(V);
        InstrsToErase.push_back(UI);
      }
    }

    // Bitcast might not be an Instruction, if the value being loaded is a
    // constant.  In that case, no need to reorder anything.
    if (Instruction *BitcastInst = dyn_cast<Instruction>(Bitcast))
      reorder(BitcastInst);

    for (auto I : InstrsToErase)
      I->eraseFromParent();
  } else {
    for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
      Value *CV = Chain[I];
      Value *V =
          Builder.CreateExtractElement(LI, Builder.getInt32(I), CV->getName());
      if (V->getType() != CV->getType()) {
        V = Builder.CreateBitOrPointerCast(V, CV->getType());
      }

      // Replace the old instruction.
      CV->replaceAllUsesWith(V);
    }

    if (Instruction *BitcastInst = dyn_cast<Instruction>(Bitcast))
      reorder(BitcastInst);
  }

  eraseInstructions(Chain);

  ++NumVectorInstructions;
  NumScalarsVectorized += Chain.size();
  return true;
}

bool Vectorizer::accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace,
                                    unsigned Alignment) {
  if (Alignment % SzInBytes == 0)
    return false;

  bool Fast = false;
  bool Allows = TTI.allowsMisalignedMemoryAccesses(F.getParent()->getContext(),
                                                   SzInBytes * 8, AddressSpace,
                                                   Alignment, &Fast);
  LLVM_DEBUG(dbgs() << "LSV: Target said misaligned is allowed? " << Allows
                    << " and fast? " << Fast << "\n";);
  return !Allows || !Fast;
}