1234285Sdim//===- BBVectorize.cpp - A Basic-Block Vectorizer -------------------------===//
2234285Sdim//
3234285Sdim//                     The LLVM Compiler Infrastructure
4234285Sdim//
5234285Sdim// This file is distributed under the University of Illinois Open Source
6234285Sdim// License. See LICENSE.TXT for details.
7234285Sdim//
8234285Sdim//===----------------------------------------------------------------------===//
9234285Sdim//
10234285Sdim// This file implements a basic-block vectorization pass. The algorithm was
11234285Sdim// inspired by that used by the Vienna MAP Vectorizor by Franchetti and Kral,
12234285Sdim// et al. It works by looking for chains of pairable operations and then
13234285Sdim// pairing them.
14234285Sdim//
15234285Sdim//===----------------------------------------------------------------------===//
16234285Sdim
17234285Sdim#define BBV_NAME "bb-vectorize"
18234285Sdim#define DEBUG_TYPE BBV_NAME
19249423Sdim#include "llvm/Transforms/Vectorize.h"
20234285Sdim#include "llvm/ADT/DenseMap.h"
21234285Sdim#include "llvm/ADT/DenseSet.h"
22249423Sdim#include "llvm/ADT/STLExtras.h"
23243830Sdim#include "llvm/ADT/SmallSet.h"
24234285Sdim#include "llvm/ADT/SmallVector.h"
25234285Sdim#include "llvm/ADT/Statistic.h"
26234285Sdim#include "llvm/ADT/StringExtras.h"
27234285Sdim#include "llvm/Analysis/AliasAnalysis.h"
28234285Sdim#include "llvm/Analysis/AliasSetTracker.h"
29243830Sdim#include "llvm/Analysis/Dominators.h"
30234285Sdim#include "llvm/Analysis/ScalarEvolution.h"
31234285Sdim#include "llvm/Analysis/ScalarEvolutionExpressions.h"
32249423Sdim#include "llvm/Analysis/TargetTransformInfo.h"
33234285Sdim#include "llvm/Analysis/ValueTracking.h"
34249423Sdim#include "llvm/IR/Constants.h"
35249423Sdim#include "llvm/IR/DataLayout.h"
36249423Sdim#include "llvm/IR/DerivedTypes.h"
37249423Sdim#include "llvm/IR/Function.h"
38249423Sdim#include "llvm/IR/Instructions.h"
39249423Sdim#include "llvm/IR/IntrinsicInst.h"
40249423Sdim#include "llvm/IR/Intrinsics.h"
41249423Sdim#include "llvm/IR/LLVMContext.h"
42249423Sdim#include "llvm/IR/Metadata.h"
43249423Sdim#include "llvm/IR/Type.h"
44249423Sdim#include "llvm/Pass.h"
45234285Sdim#include "llvm/Support/CommandLine.h"
46234285Sdim#include "llvm/Support/Debug.h"
47249423Sdim#include "llvm/Support/ValueHandle.h"
48234285Sdim#include "llvm/Support/raw_ostream.h"
49239462Sdim#include "llvm/Transforms/Utils/Local.h"
50234285Sdim#include <algorithm>
51234285Sdimusing namespace llvm;
52234285Sdim
53243830Sdimstatic cl::opt<bool>
54243830SdimIgnoreTargetInfo("bb-vectorize-ignore-target-info",  cl::init(false),
55243830Sdim  cl::Hidden, cl::desc("Ignore target information"));
56243830Sdim
57234285Sdimstatic cl::opt<unsigned>
58234285SdimReqChainDepth("bb-vectorize-req-chain-depth", cl::init(6), cl::Hidden,
59234285Sdim  cl::desc("The required chain depth for vectorization"));
60234285Sdim
61243830Sdimstatic cl::opt<bool>
62243830SdimUseChainDepthWithTI("bb-vectorize-use-chain-depth",  cl::init(false),
63243830Sdim  cl::Hidden, cl::desc("Use the chain depth requirement with"
64243830Sdim                       " target information"));
65243830Sdim
66234285Sdimstatic cl::opt<unsigned>
67234285SdimSearchLimit("bb-vectorize-search-limit", cl::init(400), cl::Hidden,
68234285Sdim  cl::desc("The maximum search distance for instruction pairs"));
69234285Sdim
70234285Sdimstatic cl::opt<bool>
71234285SdimSplatBreaksChain("bb-vectorize-splat-breaks-chain", cl::init(false), cl::Hidden,
72234285Sdim  cl::desc("Replicating one element to a pair breaks the chain"));
73234285Sdim
74234285Sdimstatic cl::opt<unsigned>
75234285SdimVectorBits("bb-vectorize-vector-bits", cl::init(128), cl::Hidden,
76234285Sdim  cl::desc("The size of the native vector registers"));
77234285Sdim
78234285Sdimstatic cl::opt<unsigned>
79234285SdimMaxIter("bb-vectorize-max-iter", cl::init(0), cl::Hidden,
80234285Sdim  cl::desc("The maximum number of pairing iterations"));
81234285Sdim
82239462Sdimstatic cl::opt<bool>
83239462SdimPow2LenOnly("bb-vectorize-pow2-len-only", cl::init(false), cl::Hidden,
84239462Sdim  cl::desc("Don't try to form non-2^n-length vectors"));
85239462Sdim
86234285Sdimstatic cl::opt<unsigned>
87234285SdimMaxInsts("bb-vectorize-max-instr-per-group", cl::init(500), cl::Hidden,
88234285Sdim  cl::desc("The maximum number of pairable instructions per group"));
89234285Sdim
90234285Sdimstatic cl::opt<unsigned>
91249423SdimMaxPairs("bb-vectorize-max-pairs-per-group", cl::init(3000), cl::Hidden,
92249423Sdim  cl::desc("The maximum number of candidate instruction pairs per group"));
93249423Sdim
94249423Sdimstatic cl::opt<unsigned>
95234285SdimMaxCandPairsForCycleCheck("bb-vectorize-max-cycle-check-pairs", cl::init(200),
96234285Sdim  cl::Hidden, cl::desc("The maximum number of candidate pairs with which to use"
97234285Sdim                       " a full cycle check"));
98234285Sdim
99234285Sdimstatic cl::opt<bool>
100239462SdimNoBools("bb-vectorize-no-bools", cl::init(false), cl::Hidden,
101239462Sdim  cl::desc("Don't try to vectorize boolean (i1) values"));
102239462Sdim
103239462Sdimstatic cl::opt<bool>
104234285SdimNoInts("bb-vectorize-no-ints", cl::init(false), cl::Hidden,
105234285Sdim  cl::desc("Don't try to vectorize integer values"));
106234285Sdim
107234285Sdimstatic cl::opt<bool>
108234285SdimNoFloats("bb-vectorize-no-floats", cl::init(false), cl::Hidden,
109234285Sdim  cl::desc("Don't try to vectorize floating-point values"));
110234285Sdim
111243830Sdim// FIXME: This should default to false once pointer vector support works.
112234285Sdimstatic cl::opt<bool>
113243830SdimNoPointers("bb-vectorize-no-pointers", cl::init(/*false*/ true), cl::Hidden,
114234982Sdim  cl::desc("Don't try to vectorize pointer values"));
115234982Sdim
116234982Sdimstatic cl::opt<bool>
117234285SdimNoCasts("bb-vectorize-no-casts", cl::init(false), cl::Hidden,
118234285Sdim  cl::desc("Don't try to vectorize casting (conversion) operations"));
119234285Sdim
120234285Sdimstatic cl::opt<bool>
121234285SdimNoMath("bb-vectorize-no-math", cl::init(false), cl::Hidden,
122234285Sdim  cl::desc("Don't try to vectorize floating-point math intrinsics"));
123234285Sdim
124234285Sdimstatic cl::opt<bool>
125234285SdimNoFMA("bb-vectorize-no-fma", cl::init(false), cl::Hidden,
126234285Sdim  cl::desc("Don't try to vectorize the fused-multiply-add intrinsic"));
127234285Sdim
128234285Sdimstatic cl::opt<bool>
129234982SdimNoSelect("bb-vectorize-no-select", cl::init(false), cl::Hidden,
130234982Sdim  cl::desc("Don't try to vectorize select instructions"));
131234982Sdim
132234982Sdimstatic cl::opt<bool>
133239462SdimNoCmp("bb-vectorize-no-cmp", cl::init(false), cl::Hidden,
134239462Sdim  cl::desc("Don't try to vectorize comparison instructions"));
135239462Sdim
136239462Sdimstatic cl::opt<bool>
137234982SdimNoGEP("bb-vectorize-no-gep", cl::init(false), cl::Hidden,
138234982Sdim  cl::desc("Don't try to vectorize getelementptr instructions"));
139234982Sdim
140234982Sdimstatic cl::opt<bool>
141234285SdimNoMemOps("bb-vectorize-no-mem-ops", cl::init(false), cl::Hidden,
142234285Sdim  cl::desc("Don't try to vectorize loads and stores"));
143234285Sdim
144234285Sdimstatic cl::opt<bool>
145234285SdimAlignedOnly("bb-vectorize-aligned-only", cl::init(false), cl::Hidden,
146234285Sdim  cl::desc("Only generate aligned loads and stores"));
147234285Sdim
148234285Sdimstatic cl::opt<bool>
149234285SdimNoMemOpBoost("bb-vectorize-no-mem-op-boost",
150234285Sdim  cl::init(false), cl::Hidden,
151234285Sdim  cl::desc("Don't boost the chain-depth contribution of loads and stores"));
152234285Sdim
153234285Sdimstatic cl::opt<bool>
154234285SdimFastDep("bb-vectorize-fast-dep", cl::init(false), cl::Hidden,
155234285Sdim  cl::desc("Use a fast instruction dependency analysis"));
156234285Sdim
157234285Sdim#ifndef NDEBUG
158234285Sdimstatic cl::opt<bool>
159234285SdimDebugInstructionExamination("bb-vectorize-debug-instruction-examination",
160234285Sdim  cl::init(false), cl::Hidden,
161234285Sdim  cl::desc("When debugging is enabled, output information on the"
162234285Sdim           " instruction-examination process"));
163234285Sdimstatic cl::opt<bool>
164234285SdimDebugCandidateSelection("bb-vectorize-debug-candidate-selection",
165234285Sdim  cl::init(false), cl::Hidden,
166234285Sdim  cl::desc("When debugging is enabled, output information on the"
167234285Sdim           " candidate-selection process"));
168234285Sdimstatic cl::opt<bool>
169234285SdimDebugPairSelection("bb-vectorize-debug-pair-selection",
170234285Sdim  cl::init(false), cl::Hidden,
171234285Sdim  cl::desc("When debugging is enabled, output information on the"
172234285Sdim           " pair-selection process"));
173234285Sdimstatic cl::opt<bool>
174234285SdimDebugCycleCheck("bb-vectorize-debug-cycle-check",
175234285Sdim  cl::init(false), cl::Hidden,
176234285Sdim  cl::desc("When debugging is enabled, output information on the"
177234285Sdim           " cycle-checking process"));
178243830Sdim
179243830Sdimstatic cl::opt<bool>
180243830SdimPrintAfterEveryPair("bb-vectorize-debug-print-after-every-pair",
181243830Sdim  cl::init(false), cl::Hidden,
182243830Sdim  cl::desc("When debugging is enabled, dump the basic block after"
183243830Sdim           " every pair is fused"));
184234285Sdim#endif
185234285Sdim
186234285SdimSTATISTIC(NumFusedOps, "Number of operations fused by bb-vectorize");
187234285Sdim
188234285Sdimnamespace {
189234285Sdim  struct BBVectorize : public BasicBlockPass {
190234285Sdim    static char ID; // Pass identification, replacement for typeid
191234285Sdim
192234285Sdim    const VectorizeConfig Config;
193234285Sdim
194234285Sdim    BBVectorize(const VectorizeConfig &C = VectorizeConfig())
195234285Sdim      : BasicBlockPass(ID), Config(C) {
196234285Sdim      initializeBBVectorizePass(*PassRegistry::getPassRegistry());
197234285Sdim    }
198234285Sdim
199234285Sdim    BBVectorize(Pass *P, const VectorizeConfig &C)
200234285Sdim      : BasicBlockPass(ID), Config(C) {
201234285Sdim      AA = &P->getAnalysis<AliasAnalysis>();
202243830Sdim      DT = &P->getAnalysis<DominatorTree>();
203234285Sdim      SE = &P->getAnalysis<ScalarEvolution>();
204243830Sdim      TD = P->getAnalysisIfAvailable<DataLayout>();
205249423Sdim      TTI = IgnoreTargetInfo ? 0 : &P->getAnalysis<TargetTransformInfo>();
206234285Sdim    }
207234285Sdim
208234285Sdim    typedef std::pair<Value *, Value *> ValuePair;
209243830Sdim    typedef std::pair<ValuePair, int> ValuePairWithCost;
210234285Sdim    typedef std::pair<ValuePair, size_t> ValuePairWithDepth;
211234285Sdim    typedef std::pair<ValuePair, ValuePair> VPPair; // A ValuePair pair
212243830Sdim    typedef std::pair<VPPair, unsigned> VPPairWithType;
213234285Sdim
214234285Sdim    AliasAnalysis *AA;
215243830Sdim    DominatorTree *DT;
216234285Sdim    ScalarEvolution *SE;
217243830Sdim    DataLayout *TD;
218249423Sdim    const TargetTransformInfo *TTI;
219234285Sdim
220234285Sdim    // FIXME: const correct?
221234285Sdim
222239462Sdim    bool vectorizePairs(BasicBlock &BB, bool NonPow2Len = false);
223234285Sdim
224234285Sdim    bool getCandidatePairs(BasicBlock &BB,
225234285Sdim                       BasicBlock::iterator &Start,
226249423Sdim                       DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
227243830Sdim                       DenseSet<ValuePair> &FixedOrderPairs,
228243830Sdim                       DenseMap<ValuePair, int> &CandidatePairCostSavings,
229239462Sdim                       std::vector<Value *> &PairableInsts, bool NonPow2Len);
230234285Sdim
231243830Sdim    // FIXME: The current implementation does not account for pairs that
232243830Sdim    // are connected in multiple ways. For example:
233243830Sdim    //   C1 = A1 / A2; C2 = A2 / A1 (which may be both direct and a swap)
234243830Sdim    enum PairConnectionType {
235243830Sdim      PairConnectionDirect,
236243830Sdim      PairConnectionSwap,
237243830Sdim      PairConnectionSplat
238243830Sdim    };
239243830Sdim
240249423Sdim    void computeConnectedPairs(
241249423Sdim             DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
242249423Sdim             DenseSet<ValuePair> &CandidatePairsSet,
243249423Sdim             std::vector<Value *> &PairableInsts,
244249423Sdim             DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
245249423Sdim             DenseMap<VPPair, unsigned> &PairConnectionTypes);
246234285Sdim
247234285Sdim    void buildDepMap(BasicBlock &BB,
248249423Sdim             DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
249249423Sdim             std::vector<Value *> &PairableInsts,
250249423Sdim             DenseSet<ValuePair> &PairableInstUsers);
251234285Sdim
252249423Sdim    void choosePairs(DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
253249423Sdim             DenseSet<ValuePair> &CandidatePairsSet,
254249423Sdim             DenseMap<ValuePair, int> &CandidatePairCostSavings,
255249423Sdim             std::vector<Value *> &PairableInsts,
256249423Sdim             DenseSet<ValuePair> &FixedOrderPairs,
257249423Sdim             DenseMap<VPPair, unsigned> &PairConnectionTypes,
258249423Sdim             DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
259249423Sdim             DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps,
260249423Sdim             DenseSet<ValuePair> &PairableInstUsers,
261249423Sdim             DenseMap<Value *, Value *>& ChosenPairs);
262234285Sdim
263234285Sdim    void fuseChosenPairs(BasicBlock &BB,
264249423Sdim             std::vector<Value *> &PairableInsts,
265249423Sdim             DenseMap<Value *, Value *>& ChosenPairs,
266249423Sdim             DenseSet<ValuePair> &FixedOrderPairs,
267249423Sdim             DenseMap<VPPair, unsigned> &PairConnectionTypes,
268249423Sdim             DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
269249423Sdim             DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps);
270234285Sdim
271243830Sdim
272234285Sdim    bool isInstVectorizable(Instruction *I, bool &IsSimpleLoadStore);
273234285Sdim
274234285Sdim    bool areInstsCompatible(Instruction *I, Instruction *J,
275243830Sdim                       bool IsSimpleLoadStore, bool NonPow2Len,
276243830Sdim                       int &CostSavings, int &FixedOrder);
277234285Sdim
278234285Sdim    bool trackUsesOfI(DenseSet<Value *> &Users,
279234285Sdim                      AliasSetTracker &WriteSet, Instruction *I,
280234285Sdim                      Instruction *J, bool UpdateUsers = true,
281249423Sdim                      DenseSet<ValuePair> *LoadMoveSetPairs = 0);
282234285Sdim
283249423Sdim  void computePairsConnectedTo(
284249423Sdim             DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
285249423Sdim             DenseSet<ValuePair> &CandidatePairsSet,
286249423Sdim             std::vector<Value *> &PairableInsts,
287249423Sdim             DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
288249423Sdim             DenseMap<VPPair, unsigned> &PairConnectionTypes,
289249423Sdim             ValuePair P);
290234285Sdim
291234285Sdim    bool pairsConflict(ValuePair P, ValuePair Q,
292249423Sdim             DenseSet<ValuePair> &PairableInstUsers,
293249423Sdim             DenseMap<ValuePair, std::vector<ValuePair> >
294249423Sdim               *PairableInstUserMap = 0,
295249423Sdim             DenseSet<VPPair> *PairableInstUserPairSet = 0);
296234285Sdim
297234285Sdim    bool pairWillFormCycle(ValuePair P,
298249423Sdim             DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUsers,
299249423Sdim             DenseSet<ValuePair> &CurrentPairs);
300234285Sdim
301249423Sdim    void pruneDAGFor(
302249423Sdim             DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
303249423Sdim             std::vector<Value *> &PairableInsts,
304249423Sdim             DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
305249423Sdim             DenseSet<ValuePair> &PairableInstUsers,
306249423Sdim             DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap,
307249423Sdim             DenseSet<VPPair> &PairableInstUserPairSet,
308249423Sdim             DenseMap<Value *, Value *> &ChosenPairs,
309249423Sdim             DenseMap<ValuePair, size_t> &DAG,
310249423Sdim             DenseSet<ValuePair> &PrunedDAG, ValuePair J,
311249423Sdim             bool UseCycleCheck);
312234285Sdim
313249423Sdim    void buildInitialDAGFor(
314249423Sdim             DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
315249423Sdim             DenseSet<ValuePair> &CandidatePairsSet,
316249423Sdim             std::vector<Value *> &PairableInsts,
317249423Sdim             DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
318249423Sdim             DenseSet<ValuePair> &PairableInstUsers,
319249423Sdim             DenseMap<Value *, Value *> &ChosenPairs,
320249423Sdim             DenseMap<ValuePair, size_t> &DAG, ValuePair J);
321234285Sdim
322249423Sdim    void findBestDAGFor(
323249423Sdim             DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
324249423Sdim             DenseSet<ValuePair> &CandidatePairsSet,
325249423Sdim             DenseMap<ValuePair, int> &CandidatePairCostSavings,
326249423Sdim             std::vector<Value *> &PairableInsts,
327249423Sdim             DenseSet<ValuePair> &FixedOrderPairs,
328249423Sdim             DenseMap<VPPair, unsigned> &PairConnectionTypes,
329249423Sdim             DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
330249423Sdim             DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps,
331249423Sdim             DenseSet<ValuePair> &PairableInstUsers,
332249423Sdim             DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap,
333249423Sdim             DenseSet<VPPair> &PairableInstUserPairSet,
334249423Sdim             DenseMap<Value *, Value *> &ChosenPairs,
335249423Sdim             DenseSet<ValuePair> &BestDAG, size_t &BestMaxDepth,
336249423Sdim             int &BestEffSize, Value *II, std::vector<Value *>&JJ,
337249423Sdim             bool UseCycleCheck);
338234285Sdim
339234285Sdim    Value *getReplacementPointerInput(LLVMContext& Context, Instruction *I,
340243830Sdim                     Instruction *J, unsigned o);
341234285Sdim
342234285Sdim    void fillNewShuffleMask(LLVMContext& Context, Instruction *J,
343239462Sdim                     unsigned MaskOffset, unsigned NumInElem,
344239462Sdim                     unsigned NumInElem1, unsigned IdxOffset,
345239462Sdim                     std::vector<Constant*> &Mask);
346234285Sdim
347234285Sdim    Value *getReplacementShuffleMask(LLVMContext& Context, Instruction *I,
348234285Sdim                     Instruction *J);
349234285Sdim
350239462Sdim    bool expandIEChain(LLVMContext& Context, Instruction *I, Instruction *J,
351239462Sdim                       unsigned o, Value *&LOp, unsigned numElemL,
352243830Sdim                       Type *ArgTypeL, Type *ArgTypeR, bool IBeforeJ,
353239462Sdim                       unsigned IdxOff = 0);
354239462Sdim
355234285Sdim    Value *getReplacementInput(LLVMContext& Context, Instruction *I,
356243830Sdim                     Instruction *J, unsigned o, bool IBeforeJ);
357234285Sdim
358234285Sdim    void getReplacementInputsForPair(LLVMContext& Context, Instruction *I,
359263508Sdim                     Instruction *J, SmallVectorImpl<Value *> &ReplacedOperands,
360243830Sdim                     bool IBeforeJ);
361234285Sdim
362234285Sdim    void replaceOutputsOfPair(LLVMContext& Context, Instruction *I,
363234285Sdim                     Instruction *J, Instruction *K,
364234285Sdim                     Instruction *&InsertionPt, Instruction *&K1,
365243830Sdim                     Instruction *&K2);
366234285Sdim
367234285Sdim    void collectPairLoadMoveSet(BasicBlock &BB,
368234285Sdim                     DenseMap<Value *, Value *> &ChosenPairs,
369249423Sdim                     DenseMap<Value *, std::vector<Value *> > &LoadMoveSet,
370249423Sdim                     DenseSet<ValuePair> &LoadMoveSetPairs,
371234285Sdim                     Instruction *I);
372234285Sdim
373234285Sdim    void collectLoadMoveSet(BasicBlock &BB,
374234285Sdim                     std::vector<Value *> &PairableInsts,
375234285Sdim                     DenseMap<Value *, Value *> &ChosenPairs,
376249423Sdim                     DenseMap<Value *, std::vector<Value *> > &LoadMoveSet,
377249423Sdim                     DenseSet<ValuePair> &LoadMoveSetPairs);
378234285Sdim
379234285Sdim    bool canMoveUsesOfIAfterJ(BasicBlock &BB,
380249423Sdim                     DenseSet<ValuePair> &LoadMoveSetPairs,
381234285Sdim                     Instruction *I, Instruction *J);
382234285Sdim
383234285Sdim    void moveUsesOfIAfterJ(BasicBlock &BB,
384249423Sdim                     DenseSet<ValuePair> &LoadMoveSetPairs,
385234285Sdim                     Instruction *&InsertionPt,
386234285Sdim                     Instruction *I, Instruction *J);
387234285Sdim
388239462Sdim    void combineMetadata(Instruction *K, const Instruction *J);
389239462Sdim
390234285Sdim    bool vectorizeBB(BasicBlock &BB) {
391243830Sdim      if (!DT->isReachableFromEntry(&BB)) {
392243830Sdim        DEBUG(dbgs() << "BBV: skipping unreachable " << BB.getName() <<
393243830Sdim              " in " << BB.getParent()->getName() << "\n");
394243830Sdim        return false;
395243830Sdim      }
396243830Sdim
397249423Sdim      DEBUG(if (TTI) dbgs() << "BBV: using target information\n");
398243830Sdim
399234285Sdim      bool changed = false;
400234285Sdim      // Iterate a sufficient number of times to merge types of size 1 bit,
401234285Sdim      // then 2 bits, then 4, etc. up to half of the target vector width of the
402234285Sdim      // target vector register.
403239462Sdim      unsigned n = 1;
404239462Sdim      for (unsigned v = 2;
405249423Sdim           (TTI || v <= Config.VectorBits) &&
406243830Sdim           (!Config.MaxIter || n <= Config.MaxIter);
407234285Sdim           v *= 2, ++n) {
408234285Sdim        DEBUG(dbgs() << "BBV: fusing loop #" << n <<
409234285Sdim              " for " << BB.getName() << " in " <<
410234285Sdim              BB.getParent()->getName() << "...\n");
411234285Sdim        if (vectorizePairs(BB))
412234285Sdim          changed = true;
413234285Sdim        else
414234285Sdim          break;
415234285Sdim      }
416234285Sdim
417239462Sdim      if (changed && !Pow2LenOnly) {
418239462Sdim        ++n;
419239462Sdim        for (; !Config.MaxIter || n <= Config.MaxIter; ++n) {
420239462Sdim          DEBUG(dbgs() << "BBV: fusing for non-2^n-length vectors loop #: " <<
421239462Sdim                n << " for " << BB.getName() << " in " <<
422239462Sdim                BB.getParent()->getName() << "...\n");
423239462Sdim          if (!vectorizePairs(BB, true)) break;
424239462Sdim        }
425239462Sdim      }
426239462Sdim
427234285Sdim      DEBUG(dbgs() << "BBV: done!\n");
428234285Sdim      return changed;
429234285Sdim    }
430234285Sdim
431234285Sdim    virtual bool runOnBasicBlock(BasicBlock &BB) {
432234285Sdim      AA = &getAnalysis<AliasAnalysis>();
433243830Sdim      DT = &getAnalysis<DominatorTree>();
434234285Sdim      SE = &getAnalysis<ScalarEvolution>();
435243830Sdim      TD = getAnalysisIfAvailable<DataLayout>();
436249423Sdim      TTI = IgnoreTargetInfo ? 0 : &getAnalysis<TargetTransformInfo>();
437234285Sdim
438234285Sdim      return vectorizeBB(BB);
439234285Sdim    }
440234285Sdim
441234285Sdim    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
442234285Sdim      BasicBlockPass::getAnalysisUsage(AU);
443234285Sdim      AU.addRequired<AliasAnalysis>();
444243830Sdim      AU.addRequired<DominatorTree>();
445234285Sdim      AU.addRequired<ScalarEvolution>();
446249423Sdim      AU.addRequired<TargetTransformInfo>();
447234285Sdim      AU.addPreserved<AliasAnalysis>();
448243830Sdim      AU.addPreserved<DominatorTree>();
449234285Sdim      AU.addPreserved<ScalarEvolution>();
450234285Sdim      AU.setPreservesCFG();
451234285Sdim    }
452234285Sdim
453239462Sdim    static inline VectorType *getVecTypeForPair(Type *ElemTy, Type *Elem2Ty) {
454239462Sdim      assert(ElemTy->getScalarType() == Elem2Ty->getScalarType() &&
455239462Sdim             "Cannot form vector from incompatible scalar types");
456239462Sdim      Type *STy = ElemTy->getScalarType();
457239462Sdim
458239462Sdim      unsigned numElem;
459234285Sdim      if (VectorType *VTy = dyn_cast<VectorType>(ElemTy)) {
460239462Sdim        numElem = VTy->getNumElements();
461239462Sdim      } else {
462239462Sdim        numElem = 1;
463234285Sdim      }
464234285Sdim
465239462Sdim      if (VectorType *VTy = dyn_cast<VectorType>(Elem2Ty)) {
466239462Sdim        numElem += VTy->getNumElements();
467239462Sdim      } else {
468239462Sdim        numElem += 1;
469239462Sdim      }
470239462Sdim
471239462Sdim      return VectorType::get(STy, numElem);
472234285Sdim    }
473234285Sdim
474239462Sdim    static inline void getInstructionTypes(Instruction *I,
475239462Sdim                                           Type *&T1, Type *&T2) {
476249423Sdim      if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
477239462Sdim        // For stores, it is the value type, not the pointer type that matters
478239462Sdim        // because the value is what will come from a vector register.
479239462Sdim
480249423Sdim        Value *IVal = SI->getValueOperand();
481239462Sdim        T1 = IVal->getType();
482239462Sdim      } else {
483239462Sdim        T1 = I->getType();
484239462Sdim      }
485239462Sdim
486249423Sdim      if (CastInst *CI = dyn_cast<CastInst>(I))
487249423Sdim        T2 = CI->getSrcTy();
488239462Sdim      else
489239462Sdim        T2 = T1;
490243830Sdim
491243830Sdim      if (SelectInst *SI = dyn_cast<SelectInst>(I)) {
492243830Sdim        T2 = SI->getCondition()->getType();
493243830Sdim      } else if (ShuffleVectorInst *SI = dyn_cast<ShuffleVectorInst>(I)) {
494243830Sdim        T2 = SI->getOperand(0)->getType();
495243830Sdim      } else if (CmpInst *CI = dyn_cast<CmpInst>(I)) {
496243830Sdim        T2 = CI->getOperand(0)->getType();
497243830Sdim      }
498239462Sdim    }
499239462Sdim
500234285Sdim    // Returns the weight associated with the provided value. A chain of
501234285Sdim    // candidate pairs has a length given by the sum of the weights of its
502234285Sdim    // members (one weight per pair; the weight of each member of the pair
503234285Sdim    // is assumed to be the same). This length is then compared to the
504234285Sdim    // chain-length threshold to determine if a given chain is significant
505234285Sdim    // enough to be vectorized. The length is also used in comparing
506234285Sdim    // candidate chains where longer chains are considered to be better.
507234285Sdim    // Note: when this function returns 0, the resulting instructions are
508234285Sdim    // not actually fused.
509234285Sdim    inline size_t getDepthFactor(Value *V) {
510234285Sdim      // InsertElement and ExtractElement have a depth factor of zero. This is
511234285Sdim      // for two reasons: First, they cannot be usefully fused. Second, because
512234285Sdim      // the pass generates a lot of these, they can confuse the simple metric
513249423Sdim      // used to compare the dags in the next iteration. Thus, giving them a
514234285Sdim      // weight of zero allows the pass to essentially ignore them in
515234285Sdim      // subsequent iterations when looking for vectorization opportunities
516234285Sdim      // while still tracking dependency chains that flow through those
517234285Sdim      // instructions.
518234285Sdim      if (isa<InsertElementInst>(V) || isa<ExtractElementInst>(V))
519234285Sdim        return 0;
520234285Sdim
521234285Sdim      // Give a load or store half of the required depth so that load/store
522234285Sdim      // pairs will vectorize.
523234285Sdim      if (!Config.NoMemOpBoost && (isa<LoadInst>(V) || isa<StoreInst>(V)))
524234285Sdim        return Config.ReqChainDepth/2;
525234285Sdim
526234285Sdim      return 1;
527234285Sdim    }
528234285Sdim
529249423Sdim    // Returns the cost of the provided instruction using TTI.
530243830Sdim    // This does not handle loads and stores.
531243830Sdim    unsigned getInstrCost(unsigned Opcode, Type *T1, Type *T2) {
532243830Sdim      switch (Opcode) {
533243830Sdim      default: break;
534243830Sdim      case Instruction::GetElementPtr:
535243830Sdim        // We mark this instruction as zero-cost because scalar GEPs are usually
536263508Sdim        // lowered to the instruction addressing mode. At the moment we don't
537243830Sdim        // generate vector GEPs.
538243830Sdim        return 0;
539243830Sdim      case Instruction::Br:
540249423Sdim        return TTI->getCFInstrCost(Opcode);
541243830Sdim      case Instruction::PHI:
542243830Sdim        return 0;
543243830Sdim      case Instruction::Add:
544243830Sdim      case Instruction::FAdd:
545243830Sdim      case Instruction::Sub:
546243830Sdim      case Instruction::FSub:
547243830Sdim      case Instruction::Mul:
548243830Sdim      case Instruction::FMul:
549243830Sdim      case Instruction::UDiv:
550243830Sdim      case Instruction::SDiv:
551243830Sdim      case Instruction::FDiv:
552243830Sdim      case Instruction::URem:
553243830Sdim      case Instruction::SRem:
554243830Sdim      case Instruction::FRem:
555243830Sdim      case Instruction::Shl:
556243830Sdim      case Instruction::LShr:
557243830Sdim      case Instruction::AShr:
558243830Sdim      case Instruction::And:
559243830Sdim      case Instruction::Or:
560243830Sdim      case Instruction::Xor:
561249423Sdim        return TTI->getArithmeticInstrCost(Opcode, T1);
562243830Sdim      case Instruction::Select:
563243830Sdim      case Instruction::ICmp:
564243830Sdim      case Instruction::FCmp:
565249423Sdim        return TTI->getCmpSelInstrCost(Opcode, T1, T2);
566243830Sdim      case Instruction::ZExt:
567243830Sdim      case Instruction::SExt:
568243830Sdim      case Instruction::FPToUI:
569243830Sdim      case Instruction::FPToSI:
570243830Sdim      case Instruction::FPExt:
571243830Sdim      case Instruction::PtrToInt:
572243830Sdim      case Instruction::IntToPtr:
573243830Sdim      case Instruction::SIToFP:
574243830Sdim      case Instruction::UIToFP:
575243830Sdim      case Instruction::Trunc:
576243830Sdim      case Instruction::FPTrunc:
577243830Sdim      case Instruction::BitCast:
578243830Sdim      case Instruction::ShuffleVector:
579249423Sdim        return TTI->getCastInstrCost(Opcode, T1, T2);
580243830Sdim      }
581243830Sdim
582243830Sdim      return 1;
583243830Sdim    }
584243830Sdim
585234285Sdim    // This determines the relative offset of two loads or stores, returning
586234285Sdim    // true if the offset could be determined to be some constant value.
587234285Sdim    // For example, if OffsetInElmts == 1, then J accesses the memory directly
588234285Sdim    // after I; if OffsetInElmts == -1 then I accesses the memory
589239462Sdim    // directly after J.
590234285Sdim    bool getPairPtrInfo(Instruction *I, Instruction *J,
591234285Sdim        Value *&IPtr, Value *&JPtr, unsigned &IAlignment, unsigned &JAlignment,
592243830Sdim        unsigned &IAddressSpace, unsigned &JAddressSpace,
593243830Sdim        int64_t &OffsetInElmts, bool ComputeOffset = true) {
594234285Sdim      OffsetInElmts = 0;
595243830Sdim      if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
596243830Sdim        LoadInst *LJ = cast<LoadInst>(J);
597243830Sdim        IPtr = LI->getPointerOperand();
598243830Sdim        JPtr = LJ->getPointerOperand();
599243830Sdim        IAlignment = LI->getAlignment();
600243830Sdim        JAlignment = LJ->getAlignment();
601243830Sdim        IAddressSpace = LI->getPointerAddressSpace();
602243830Sdim        JAddressSpace = LJ->getPointerAddressSpace();
603234285Sdim      } else {
604243830Sdim        StoreInst *SI = cast<StoreInst>(I), *SJ = cast<StoreInst>(J);
605243830Sdim        IPtr = SI->getPointerOperand();
606243830Sdim        JPtr = SJ->getPointerOperand();
607243830Sdim        IAlignment = SI->getAlignment();
608243830Sdim        JAlignment = SJ->getAlignment();
609243830Sdim        IAddressSpace = SI->getPointerAddressSpace();
610243830Sdim        JAddressSpace = SJ->getPointerAddressSpace();
611234285Sdim      }
612234285Sdim
613243830Sdim      if (!ComputeOffset)
614243830Sdim        return true;
615243830Sdim
616234285Sdim      const SCEV *IPtrSCEV = SE->getSCEV(IPtr);
617234285Sdim      const SCEV *JPtrSCEV = SE->getSCEV(JPtr);
618234285Sdim
619234285Sdim      // If this is a trivial offset, then we'll get something like
620234285Sdim      // 1*sizeof(type). With target data, which we need anyway, this will get
621234285Sdim      // constant folded into a number.
622234285Sdim      const SCEV *OffsetSCEV = SE->getMinusSCEV(JPtrSCEV, IPtrSCEV);
623234285Sdim      if (const SCEVConstant *ConstOffSCEV =
624234285Sdim            dyn_cast<SCEVConstant>(OffsetSCEV)) {
625234285Sdim        ConstantInt *IntOff = ConstOffSCEV->getValue();
626234285Sdim        int64_t Offset = IntOff->getSExtValue();
627234285Sdim
628263508Sdim        Type *VTy = IPtr->getType()->getPointerElementType();
629234285Sdim        int64_t VTyTSS = (int64_t) TD->getTypeStoreSize(VTy);
630234285Sdim
631263508Sdim        Type *VTy2 = JPtr->getType()->getPointerElementType();
632239462Sdim        if (VTy != VTy2 && Offset < 0) {
633239462Sdim          int64_t VTy2TSS = (int64_t) TD->getTypeStoreSize(VTy2);
634239462Sdim          OffsetInElmts = Offset/VTy2TSS;
635239462Sdim          return (abs64(Offset) % VTy2TSS) == 0;
636239462Sdim        }
637234285Sdim
638234285Sdim        OffsetInElmts = Offset/VTyTSS;
639234285Sdim        return (abs64(Offset) % VTyTSS) == 0;
640234285Sdim      }
641234285Sdim
642234285Sdim      return false;
643234285Sdim    }
644234285Sdim
645234285Sdim    // Returns true if the provided CallInst represents an intrinsic that can
646234285Sdim    // be vectorized.
647234285Sdim    bool isVectorizableIntrinsic(CallInst* I) {
648234285Sdim      Function *F = I->getCalledFunction();
649234285Sdim      if (!F) return false;
650234285Sdim
651249423Sdim      Intrinsic::ID IID = (Intrinsic::ID) F->getIntrinsicID();
652234285Sdim      if (!IID) return false;
653234285Sdim
654234285Sdim      switch(IID) {
655234285Sdim      default:
656234285Sdim        return false;
657234285Sdim      case Intrinsic::sqrt:
658234285Sdim      case Intrinsic::powi:
659234285Sdim      case Intrinsic::sin:
660234285Sdim      case Intrinsic::cos:
661234285Sdim      case Intrinsic::log:
662234285Sdim      case Intrinsic::log2:
663234285Sdim      case Intrinsic::log10:
664234285Sdim      case Intrinsic::exp:
665234285Sdim      case Intrinsic::exp2:
666234285Sdim      case Intrinsic::pow:
667234285Sdim        return Config.VectorizeMath;
668234285Sdim      case Intrinsic::fma:
669249423Sdim      case Intrinsic::fmuladd:
670234285Sdim        return Config.VectorizeFMA;
671234285Sdim      }
672234285Sdim    }
673234285Sdim
674243830Sdim    bool isPureIEChain(InsertElementInst *IE) {
675243830Sdim      InsertElementInst *IENext = IE;
676243830Sdim      do {
677243830Sdim        if (!isa<UndefValue>(IENext->getOperand(0)) &&
678243830Sdim            !isa<InsertElementInst>(IENext->getOperand(0))) {
679243830Sdim          return false;
680243830Sdim        }
681243830Sdim      } while ((IENext =
682243830Sdim                 dyn_cast<InsertElementInst>(IENext->getOperand(0))));
683243830Sdim
684243830Sdim      return true;
685243830Sdim    }
686234285Sdim  };
687234285Sdim
688234285Sdim  // This function implements one vectorization iteration on the provided
689234285Sdim  // basic block. It returns true if the block is changed.
690239462Sdim  bool BBVectorize::vectorizePairs(BasicBlock &BB, bool NonPow2Len) {
691234285Sdim    bool ShouldContinue;
692234285Sdim    BasicBlock::iterator Start = BB.getFirstInsertionPt();
693234285Sdim
694234285Sdim    std::vector<Value *> AllPairableInsts;
695234285Sdim    DenseMap<Value *, Value *> AllChosenPairs;
696243830Sdim    DenseSet<ValuePair> AllFixedOrderPairs;
697243830Sdim    DenseMap<VPPair, unsigned> AllPairConnectionTypes;
698249423Sdim    DenseMap<ValuePair, std::vector<ValuePair> > AllConnectedPairs,
699249423Sdim                                                 AllConnectedPairDeps;
700234285Sdim
701234285Sdim    do {
702234285Sdim      std::vector<Value *> PairableInsts;
703249423Sdim      DenseMap<Value *, std::vector<Value *> > CandidatePairs;
704243830Sdim      DenseSet<ValuePair> FixedOrderPairs;
705243830Sdim      DenseMap<ValuePair, int> CandidatePairCostSavings;
706234285Sdim      ShouldContinue = getCandidatePairs(BB, Start, CandidatePairs,
707243830Sdim                                         FixedOrderPairs,
708243830Sdim                                         CandidatePairCostSavings,
709239462Sdim                                         PairableInsts, NonPow2Len);
710234285Sdim      if (PairableInsts.empty()) continue;
711234285Sdim
712249423Sdim      // Build the candidate pair set for faster lookups.
713249423Sdim      DenseSet<ValuePair> CandidatePairsSet;
714249423Sdim      for (DenseMap<Value *, std::vector<Value *> >::iterator I =
715249423Sdim           CandidatePairs.begin(), E = CandidatePairs.end(); I != E; ++I)
716249423Sdim        for (std::vector<Value *>::iterator J = I->second.begin(),
717249423Sdim             JE = I->second.end(); J != JE; ++J)
718249423Sdim          CandidatePairsSet.insert(ValuePair(I->first, *J));
719249423Sdim
720234285Sdim      // Now we have a map of all of the pairable instructions and we need to
721234285Sdim      // select the best possible pairing. A good pairing is one such that the
722234285Sdim      // users of the pair are also paired. This defines a (directed) forest
723234285Sdim      // over the pairs such that two pairs are connected iff the second pair
724234285Sdim      // uses the first.
725234285Sdim
726234285Sdim      // Note that it only matters that both members of the second pair use some
727234285Sdim      // element of the first pair (to allow for splatting).
728234285Sdim
729249423Sdim      DenseMap<ValuePair, std::vector<ValuePair> > ConnectedPairs,
730249423Sdim                                                   ConnectedPairDeps;
731243830Sdim      DenseMap<VPPair, unsigned> PairConnectionTypes;
732249423Sdim      computeConnectedPairs(CandidatePairs, CandidatePairsSet,
733249423Sdim                            PairableInsts, ConnectedPairs, PairConnectionTypes);
734234285Sdim      if (ConnectedPairs.empty()) continue;
735234285Sdim
736249423Sdim      for (DenseMap<ValuePair, std::vector<ValuePair> >::iterator
737243830Sdim           I = ConnectedPairs.begin(), IE = ConnectedPairs.end();
738249423Sdim           I != IE; ++I)
739249423Sdim        for (std::vector<ValuePair>::iterator J = I->second.begin(),
740249423Sdim             JE = I->second.end(); J != JE; ++J)
741249423Sdim          ConnectedPairDeps[*J].push_back(I->first);
742243830Sdim
743234285Sdim      // Build the pairable-instruction dependency map
744234285Sdim      DenseSet<ValuePair> PairableInstUsers;
745234285Sdim      buildDepMap(BB, CandidatePairs, PairableInsts, PairableInstUsers);
746234285Sdim
747234285Sdim      // There is now a graph of the connected pairs. For each variable, pick
748249423Sdim      // the pairing with the largest dag meeting the depth requirement on at
749249423Sdim      // least one branch. Then select all pairings that are part of that dag
750234285Sdim      // and remove them from the list of available pairings and pairable
751234285Sdim      // variables.
752234285Sdim
753234285Sdim      DenseMap<Value *, Value *> ChosenPairs;
754249423Sdim      choosePairs(CandidatePairs, CandidatePairsSet,
755249423Sdim        CandidatePairCostSavings,
756243830Sdim        PairableInsts, FixedOrderPairs, PairConnectionTypes,
757243830Sdim        ConnectedPairs, ConnectedPairDeps,
758234285Sdim        PairableInstUsers, ChosenPairs);
759234285Sdim
760234285Sdim      if (ChosenPairs.empty()) continue;
761234285Sdim      AllPairableInsts.insert(AllPairableInsts.end(), PairableInsts.begin(),
762234285Sdim                              PairableInsts.end());
763234285Sdim      AllChosenPairs.insert(ChosenPairs.begin(), ChosenPairs.end());
764243830Sdim
765243830Sdim      // Only for the chosen pairs, propagate information on fixed-order pairs,
766243830Sdim      // pair connections, and their types to the data structures used by the
767243830Sdim      // pair fusion procedures.
768243830Sdim      for (DenseMap<Value *, Value *>::iterator I = ChosenPairs.begin(),
769243830Sdim           IE = ChosenPairs.end(); I != IE; ++I) {
770243830Sdim        if (FixedOrderPairs.count(*I))
771243830Sdim          AllFixedOrderPairs.insert(*I);
772243830Sdim        else if (FixedOrderPairs.count(ValuePair(I->second, I->first)))
773243830Sdim          AllFixedOrderPairs.insert(ValuePair(I->second, I->first));
774243830Sdim
775243830Sdim        for (DenseMap<Value *, Value *>::iterator J = ChosenPairs.begin();
776243830Sdim             J != IE; ++J) {
777243830Sdim          DenseMap<VPPair, unsigned>::iterator K =
778243830Sdim            PairConnectionTypes.find(VPPair(*I, *J));
779243830Sdim          if (K != PairConnectionTypes.end()) {
780243830Sdim            AllPairConnectionTypes.insert(*K);
781243830Sdim          } else {
782243830Sdim            K = PairConnectionTypes.find(VPPair(*J, *I));
783243830Sdim            if (K != PairConnectionTypes.end())
784243830Sdim              AllPairConnectionTypes.insert(*K);
785243830Sdim          }
786243830Sdim        }
787243830Sdim      }
788243830Sdim
789249423Sdim      for (DenseMap<ValuePair, std::vector<ValuePair> >::iterator
790243830Sdim           I = ConnectedPairs.begin(), IE = ConnectedPairs.end();
791249423Sdim           I != IE; ++I)
792249423Sdim        for (std::vector<ValuePair>::iterator J = I->second.begin(),
793249423Sdim          JE = I->second.end(); J != JE; ++J)
794249423Sdim          if (AllPairConnectionTypes.count(VPPair(I->first, *J))) {
795249423Sdim            AllConnectedPairs[I->first].push_back(*J);
796249423Sdim            AllConnectedPairDeps[*J].push_back(I->first);
797249423Sdim          }
798234285Sdim    } while (ShouldContinue);
799234285Sdim
800234285Sdim    if (AllChosenPairs.empty()) return false;
801234285Sdim    NumFusedOps += AllChosenPairs.size();
802234285Sdim
803234285Sdim    // A set of pairs has now been selected. It is now necessary to replace the
804234285Sdim    // paired instructions with vector instructions. For this procedure each
805234285Sdim    // operand must be replaced with a vector operand. This vector is formed
806234285Sdim    // by using build_vector on the old operands. The replaced values are then
807234285Sdim    // replaced with a vector_extract on the result.  Subsequent optimization
808234285Sdim    // passes should coalesce the build/extract combinations.
809234285Sdim
810243830Sdim    fuseChosenPairs(BB, AllPairableInsts, AllChosenPairs, AllFixedOrderPairs,
811243830Sdim                    AllPairConnectionTypes,
812243830Sdim                    AllConnectedPairs, AllConnectedPairDeps);
813239462Sdim
814239462Sdim    // It is important to cleanup here so that future iterations of this
815239462Sdim    // function have less work to do.
816243830Sdim    (void) SimplifyInstructionsInBlock(&BB, TD, AA->getTargetLibraryInfo());
817234285Sdim    return true;
818234285Sdim  }
819234285Sdim
820234285Sdim  // This function returns true if the provided instruction is capable of being
821234285Sdim  // fused into a vector instruction. This determination is based only on the
822234285Sdim  // type and other attributes of the instruction.
823234285Sdim  bool BBVectorize::isInstVectorizable(Instruction *I,
824234285Sdim                                         bool &IsSimpleLoadStore) {
825234285Sdim    IsSimpleLoadStore = false;
826234285Sdim
827234285Sdim    if (CallInst *C = dyn_cast<CallInst>(I)) {
828234285Sdim      if (!isVectorizableIntrinsic(C))
829234285Sdim        return false;
830234285Sdim    } else if (LoadInst *L = dyn_cast<LoadInst>(I)) {
831234285Sdim      // Vectorize simple loads if possbile:
832234285Sdim      IsSimpleLoadStore = L->isSimple();
833234285Sdim      if (!IsSimpleLoadStore || !Config.VectorizeMemOps)
834234285Sdim        return false;
835234285Sdim    } else if (StoreInst *S = dyn_cast<StoreInst>(I)) {
836234285Sdim      // Vectorize simple stores if possbile:
837234285Sdim      IsSimpleLoadStore = S->isSimple();
838234285Sdim      if (!IsSimpleLoadStore || !Config.VectorizeMemOps)
839234285Sdim        return false;
840234285Sdim    } else if (CastInst *C = dyn_cast<CastInst>(I)) {
841234285Sdim      // We can vectorize casts, but not casts of pointer types, etc.
842234285Sdim      if (!Config.VectorizeCasts)
843234285Sdim        return false;
844234285Sdim
845234285Sdim      Type *SrcTy = C->getSrcTy();
846234982Sdim      if (!SrcTy->isSingleValueType())
847234285Sdim        return false;
848234285Sdim
849234285Sdim      Type *DestTy = C->getDestTy();
850234982Sdim      if (!DestTy->isSingleValueType())
851234285Sdim        return false;
852234982Sdim    } else if (isa<SelectInst>(I)) {
853234982Sdim      if (!Config.VectorizeSelect)
854234982Sdim        return false;
855239462Sdim    } else if (isa<CmpInst>(I)) {
856239462Sdim      if (!Config.VectorizeCmp)
857239462Sdim        return false;
858234982Sdim    } else if (GetElementPtrInst *G = dyn_cast<GetElementPtrInst>(I)) {
859234982Sdim      if (!Config.VectorizeGEP)
860234982Sdim        return false;
861234982Sdim
862234982Sdim      // Currently, vector GEPs exist only with one index.
863234982Sdim      if (G->getNumIndices() != 1)
864234982Sdim        return false;
865234285Sdim    } else if (!(I->isBinaryOp() || isa<ShuffleVectorInst>(I) ||
866234285Sdim        isa<ExtractElementInst>(I) || isa<InsertElementInst>(I))) {
867234285Sdim      return false;
868234285Sdim    }
869234285Sdim
870234285Sdim    // We can't vectorize memory operations without target data
871234285Sdim    if (TD == 0 && IsSimpleLoadStore)
872234285Sdim      return false;
873234285Sdim
874234285Sdim    Type *T1, *T2;
875239462Sdim    getInstructionTypes(I, T1, T2);
876234285Sdim
877234285Sdim    // Not every type can be vectorized...
878234285Sdim    if (!(VectorType::isValidElementType(T1) || T1->isVectorTy()) ||
879234285Sdim        !(VectorType::isValidElementType(T2) || T2->isVectorTy()))
880234285Sdim      return false;
881234285Sdim
882243830Sdim    if (T1->getScalarSizeInBits() == 1) {
883239462Sdim      if (!Config.VectorizeBools)
884239462Sdim        return false;
885239462Sdim    } else {
886243830Sdim      if (!Config.VectorizeInts && T1->isIntOrIntVectorTy())
887239462Sdim        return false;
888239462Sdim    }
889243830Sdim
890243830Sdim    if (T2->getScalarSizeInBits() == 1) {
891243830Sdim      if (!Config.VectorizeBools)
892243830Sdim        return false;
893243830Sdim    } else {
894243830Sdim      if (!Config.VectorizeInts && T2->isIntOrIntVectorTy())
895243830Sdim        return false;
896243830Sdim    }
897243830Sdim
898234285Sdim    if (!Config.VectorizeFloats
899234285Sdim        && (T1->isFPOrFPVectorTy() || T2->isFPOrFPVectorTy()))
900234285Sdim      return false;
901234285Sdim
902239462Sdim    // Don't vectorize target-specific types.
903239462Sdim    if (T1->isX86_FP80Ty() || T1->isPPC_FP128Ty() || T1->isX86_MMXTy())
904239462Sdim      return false;
905239462Sdim    if (T2->isX86_FP80Ty() || T2->isPPC_FP128Ty() || T2->isX86_MMXTy())
906239462Sdim      return false;
907239462Sdim
908234982Sdim    if ((!Config.VectorizePointers || TD == 0) &&
909234982Sdim        (T1->getScalarType()->isPointerTy() ||
910234982Sdim         T2->getScalarType()->isPointerTy()))
911234982Sdim      return false;
912234982Sdim
913249423Sdim    if (!TTI && (T1->getPrimitiveSizeInBits() >= Config.VectorBits ||
914249423Sdim                 T2->getPrimitiveSizeInBits() >= Config.VectorBits))
915234285Sdim      return false;
916234285Sdim
917234285Sdim    return true;
918234285Sdim  }
919234285Sdim
920234285Sdim  // This function returns true if the two provided instructions are compatible
921234285Sdim  // (meaning that they can be fused into a vector instruction). This assumes
922234285Sdim  // that I has already been determined to be vectorizable and that J is not
923249423Sdim  // in the use dag of I.
924234285Sdim  bool BBVectorize::areInstsCompatible(Instruction *I, Instruction *J,
925243830Sdim                       bool IsSimpleLoadStore, bool NonPow2Len,
926243830Sdim                       int &CostSavings, int &FixedOrder) {
927234285Sdim    DEBUG(if (DebugInstructionExamination) dbgs() << "BBV: looking at " << *I <<
928234285Sdim                     " <-> " << *J << "\n");
929234285Sdim
930243830Sdim    CostSavings = 0;
931243830Sdim    FixedOrder = 0;
932243830Sdim
933234285Sdim    // Loads and stores can be merged if they have different alignments,
934234285Sdim    // but are otherwise the same.
935239462Sdim    if (!J->isSameOperationAs(I, Instruction::CompareIgnoringAlignment |
936239462Sdim                      (NonPow2Len ? Instruction::CompareUsingScalarTypes : 0)))
937239462Sdim      return false;
938234285Sdim
939239462Sdim    Type *IT1, *IT2, *JT1, *JT2;
940239462Sdim    getInstructionTypes(I, IT1, IT2);
941239462Sdim    getInstructionTypes(J, JT1, JT2);
942239462Sdim    unsigned MaxTypeBits = std::max(
943239462Sdim      IT1->getPrimitiveSizeInBits() + JT1->getPrimitiveSizeInBits(),
944239462Sdim      IT2->getPrimitiveSizeInBits() + JT2->getPrimitiveSizeInBits());
945249423Sdim    if (!TTI && MaxTypeBits > Config.VectorBits)
946234285Sdim      return false;
947239462Sdim
948234285Sdim    // FIXME: handle addsub-type operations!
949234285Sdim
950234285Sdim    if (IsSimpleLoadStore) {
951234285Sdim      Value *IPtr, *JPtr;
952243830Sdim      unsigned IAlignment, JAlignment, IAddressSpace, JAddressSpace;
953234285Sdim      int64_t OffsetInElmts = 0;
954234285Sdim      if (getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment,
955243830Sdim            IAddressSpace, JAddressSpace,
956234285Sdim            OffsetInElmts) && abs64(OffsetInElmts) == 1) {
957243830Sdim        FixedOrder = (int) OffsetInElmts;
958243830Sdim        unsigned BottomAlignment = IAlignment;
959243830Sdim        if (OffsetInElmts < 0) BottomAlignment = JAlignment;
960243830Sdim
961243830Sdim        Type *aTypeI = isa<StoreInst>(I) ?
962243830Sdim          cast<StoreInst>(I)->getValueOperand()->getType() : I->getType();
963243830Sdim        Type *aTypeJ = isa<StoreInst>(J) ?
964243830Sdim          cast<StoreInst>(J)->getValueOperand()->getType() : J->getType();
965243830Sdim        Type *VType = getVecTypeForPair(aTypeI, aTypeJ);
966243830Sdim
967234285Sdim        if (Config.AlignedOnly) {
968234285Sdim          // An aligned load or store is possible only if the instruction
969234285Sdim          // with the lower offset has an alignment suitable for the
970234285Sdim          // vector type.
971234285Sdim
972234285Sdim          unsigned VecAlignment = TD->getPrefTypeAlignment(VType);
973234285Sdim          if (BottomAlignment < VecAlignment)
974234285Sdim            return false;
975234285Sdim        }
976243830Sdim
977249423Sdim        if (TTI) {
978249423Sdim          unsigned ICost = TTI->getMemoryOpCost(I->getOpcode(), aTypeI,
979249423Sdim                                                IAlignment, IAddressSpace);
980249423Sdim          unsigned JCost = TTI->getMemoryOpCost(J->getOpcode(), aTypeJ,
981249423Sdim                                                JAlignment, JAddressSpace);
982249423Sdim          unsigned VCost = TTI->getMemoryOpCost(I->getOpcode(), VType,
983249423Sdim                                                BottomAlignment,
984249423Sdim                                                IAddressSpace);
985249423Sdim
986249423Sdim          ICost += TTI->getAddressComputationCost(aTypeI);
987249423Sdim          JCost += TTI->getAddressComputationCost(aTypeJ);
988249423Sdim          VCost += TTI->getAddressComputationCost(VType);
989249423Sdim
990243830Sdim          if (VCost > ICost + JCost)
991243830Sdim            return false;
992243830Sdim
993243830Sdim          // We don't want to fuse to a type that will be split, even
994243830Sdim          // if the two input types will also be split and there is no other
995243830Sdim          // associated cost.
996249423Sdim          unsigned VParts = TTI->getNumberOfParts(VType);
997243830Sdim          if (VParts > 1)
998243830Sdim            return false;
999243830Sdim          else if (!VParts && VCost == ICost + JCost)
1000243830Sdim            return false;
1001243830Sdim
1002243830Sdim          CostSavings = ICost + JCost - VCost;
1003243830Sdim        }
1004234285Sdim      } else {
1005234285Sdim        return false;
1006234285Sdim      }
1007249423Sdim    } else if (TTI) {
1008243830Sdim      unsigned ICost = getInstrCost(I->getOpcode(), IT1, IT2);
1009243830Sdim      unsigned JCost = getInstrCost(J->getOpcode(), JT1, JT2);
1010243830Sdim      Type *VT1 = getVecTypeForPair(IT1, JT1),
1011243830Sdim           *VT2 = getVecTypeForPair(IT2, JT2);
1012249423Sdim
1013249423Sdim      // Note that this procedure is incorrect for insert and extract element
1014249423Sdim      // instructions (because combining these often results in a shuffle),
1015249423Sdim      // but this cost is ignored (because insert and extract element
1016249423Sdim      // instructions are assigned a zero depth factor and are not really
1017249423Sdim      // fused in general).
1018243830Sdim      unsigned VCost = getInstrCost(I->getOpcode(), VT1, VT2);
1019243830Sdim
1020243830Sdim      if (VCost > ICost + JCost)
1021243830Sdim        return false;
1022243830Sdim
1023243830Sdim      // We don't want to fuse to a type that will be split, even
1024243830Sdim      // if the two input types will also be split and there is no other
1025243830Sdim      // associated cost.
1026249423Sdim      unsigned VParts1 = TTI->getNumberOfParts(VT1),
1027249423Sdim               VParts2 = TTI->getNumberOfParts(VT2);
1028243830Sdim      if (VParts1 > 1 || VParts2 > 1)
1029243830Sdim        return false;
1030243830Sdim      else if ((!VParts1 || !VParts2) && VCost == ICost + JCost)
1031243830Sdim        return false;
1032243830Sdim
1033243830Sdim      CostSavings = ICost + JCost - VCost;
1034234285Sdim    }
1035234285Sdim
1036234285Sdim    // The powi intrinsic is special because only the first argument is
1037234285Sdim    // vectorized, the second arguments must be equal.
1038234285Sdim    CallInst *CI = dyn_cast<CallInst>(I);
1039234285Sdim    Function *FI;
1040249423Sdim    if (CI && (FI = CI->getCalledFunction())) {
1041249423Sdim      Intrinsic::ID IID = (Intrinsic::ID) FI->getIntrinsicID();
1042249423Sdim      if (IID == Intrinsic::powi) {
1043249423Sdim        Value *A1I = CI->getArgOperand(1),
1044249423Sdim              *A1J = cast<CallInst>(J)->getArgOperand(1);
1045249423Sdim        const SCEV *A1ISCEV = SE->getSCEV(A1I),
1046249423Sdim                   *A1JSCEV = SE->getSCEV(A1J);
1047249423Sdim        return (A1ISCEV == A1JSCEV);
1048249423Sdim      }
1049234285Sdim
1050249423Sdim      if (IID && TTI) {
1051249423Sdim        SmallVector<Type*, 4> Tys;
1052249423Sdim        for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i)
1053249423Sdim          Tys.push_back(CI->getArgOperand(i)->getType());
1054249423Sdim        unsigned ICost = TTI->getIntrinsicInstrCost(IID, IT1, Tys);
1055249423Sdim
1056249423Sdim        Tys.clear();
1057249423Sdim        CallInst *CJ = cast<CallInst>(J);
1058249423Sdim        for (unsigned i = 0, ie = CJ->getNumArgOperands(); i != ie; ++i)
1059249423Sdim          Tys.push_back(CJ->getArgOperand(i)->getType());
1060249423Sdim        unsigned JCost = TTI->getIntrinsicInstrCost(IID, JT1, Tys);
1061249423Sdim
1062249423Sdim        Tys.clear();
1063249423Sdim        assert(CI->getNumArgOperands() == CJ->getNumArgOperands() &&
1064249423Sdim               "Intrinsic argument counts differ");
1065249423Sdim        for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
1066249423Sdim          if (IID == Intrinsic::powi && i == 1)
1067249423Sdim            Tys.push_back(CI->getArgOperand(i)->getType());
1068249423Sdim          else
1069249423Sdim            Tys.push_back(getVecTypeForPair(CI->getArgOperand(i)->getType(),
1070249423Sdim                                            CJ->getArgOperand(i)->getType()));
1071249423Sdim        }
1072249423Sdim
1073249423Sdim        Type *RetTy = getVecTypeForPair(IT1, JT1);
1074249423Sdim        unsigned VCost = TTI->getIntrinsicInstrCost(IID, RetTy, Tys);
1075249423Sdim
1076249423Sdim        if (VCost > ICost + JCost)
1077249423Sdim          return false;
1078249423Sdim
1079249423Sdim        // We don't want to fuse to a type that will be split, even
1080249423Sdim        // if the two input types will also be split and there is no other
1081249423Sdim        // associated cost.
1082249423Sdim        unsigned RetParts = TTI->getNumberOfParts(RetTy);
1083249423Sdim        if (RetParts > 1)
1084249423Sdim          return false;
1085249423Sdim        else if (!RetParts && VCost == ICost + JCost)
1086249423Sdim          return false;
1087249423Sdim
1088249423Sdim        for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
1089249423Sdim          if (!Tys[i]->isVectorTy())
1090249423Sdim            continue;
1091249423Sdim
1092249423Sdim          unsigned NumParts = TTI->getNumberOfParts(Tys[i]);
1093249423Sdim          if (NumParts > 1)
1094249423Sdim            return false;
1095249423Sdim          else if (!NumParts && VCost == ICost + JCost)
1096249423Sdim            return false;
1097249423Sdim        }
1098249423Sdim
1099249423Sdim        CostSavings = ICost + JCost - VCost;
1100249423Sdim      }
1101234285Sdim    }
1102234285Sdim
1103234285Sdim    return true;
1104234285Sdim  }
1105234285Sdim
1106234285Sdim  // Figure out whether or not J uses I and update the users and write-set
1107234285Sdim  // structures associated with I. Specifically, Users represents the set of
1108234285Sdim  // instructions that depend on I. WriteSet represents the set
1109234285Sdim  // of memory locations that are dependent on I. If UpdateUsers is true,
1110234285Sdim  // and J uses I, then Users is updated to contain J and WriteSet is updated
1111234285Sdim  // to contain any memory locations to which J writes. The function returns
1112234285Sdim  // true if J uses I. By default, alias analysis is used to determine
1113234285Sdim  // whether J reads from memory that overlaps with a location in WriteSet.
1114249423Sdim  // If LoadMoveSet is not null, then it is a previously-computed map
1115234285Sdim  // where the key is the memory-based user instruction and the value is
1116234285Sdim  // the instruction to be compared with I. So, if LoadMoveSet is provided,
1117234285Sdim  // then the alias analysis is not used. This is necessary because this
1118234285Sdim  // function is called during the process of moving instructions during
1119234285Sdim  // vectorization and the results of the alias analysis are not stable during
1120234285Sdim  // that process.
1121234285Sdim  bool BBVectorize::trackUsesOfI(DenseSet<Value *> &Users,
1122234285Sdim                       AliasSetTracker &WriteSet, Instruction *I,
1123234285Sdim                       Instruction *J, bool UpdateUsers,
1124249423Sdim                       DenseSet<ValuePair> *LoadMoveSetPairs) {
1125234285Sdim    bool UsesI = false;
1126234285Sdim
1127234285Sdim    // This instruction may already be marked as a user due, for example, to
1128234285Sdim    // being a member of a selected pair.
1129234285Sdim    if (Users.count(J))
1130234285Sdim      UsesI = true;
1131234285Sdim
1132234285Sdim    if (!UsesI)
1133234285Sdim      for (User::op_iterator JU = J->op_begin(), JE = J->op_end();
1134234285Sdim           JU != JE; ++JU) {
1135234285Sdim        Value *V = *JU;
1136234285Sdim        if (I == V || Users.count(V)) {
1137234285Sdim          UsesI = true;
1138234285Sdim          break;
1139234285Sdim        }
1140234285Sdim      }
1141234285Sdim    if (!UsesI && J->mayReadFromMemory()) {
1142249423Sdim      if (LoadMoveSetPairs) {
1143249423Sdim        UsesI = LoadMoveSetPairs->count(ValuePair(J, I));
1144234285Sdim      } else {
1145234285Sdim        for (AliasSetTracker::iterator W = WriteSet.begin(),
1146234285Sdim             WE = WriteSet.end(); W != WE; ++W) {
1147234285Sdim          if (W->aliasesUnknownInst(J, *AA)) {
1148234285Sdim            UsesI = true;
1149234285Sdim            break;
1150234285Sdim          }
1151234285Sdim        }
1152234285Sdim      }
1153234285Sdim    }
1154234285Sdim
1155234285Sdim    if (UsesI && UpdateUsers) {
1156234285Sdim      if (J->mayWriteToMemory()) WriteSet.add(J);
1157234285Sdim      Users.insert(J);
1158234285Sdim    }
1159234285Sdim
1160234285Sdim    return UsesI;
1161234285Sdim  }
1162234285Sdim
1163234285Sdim  // This function iterates over all instruction pairs in the provided
1164234285Sdim  // basic block and collects all candidate pairs for vectorization.
1165234285Sdim  bool BBVectorize::getCandidatePairs(BasicBlock &BB,
1166234285Sdim                       BasicBlock::iterator &Start,
1167249423Sdim                       DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
1168243830Sdim                       DenseSet<ValuePair> &FixedOrderPairs,
1169243830Sdim                       DenseMap<ValuePair, int> &CandidatePairCostSavings,
1170239462Sdim                       std::vector<Value *> &PairableInsts, bool NonPow2Len) {
1171249423Sdim    size_t TotalPairs = 0;
1172234285Sdim    BasicBlock::iterator E = BB.end();
1173234285Sdim    if (Start == E) return false;
1174234285Sdim
1175234285Sdim    bool ShouldContinue = false, IAfterStart = false;
1176234285Sdim    for (BasicBlock::iterator I = Start++; I != E; ++I) {
1177234285Sdim      if (I == Start) IAfterStart = true;
1178234285Sdim
1179234285Sdim      bool IsSimpleLoadStore;
1180234285Sdim      if (!isInstVectorizable(I, IsSimpleLoadStore)) continue;
1181234285Sdim
1182234285Sdim      // Look for an instruction with which to pair instruction *I...
1183234285Sdim      DenseSet<Value *> Users;
1184234285Sdim      AliasSetTracker WriteSet(*AA);
1185263508Sdim      if (I->mayWriteToMemory()) WriteSet.add(I);
1186263508Sdim
1187234285Sdim      bool JAfterStart = IAfterStart;
1188234285Sdim      BasicBlock::iterator J = llvm::next(I);
1189234285Sdim      for (unsigned ss = 0; J != E && ss <= Config.SearchLimit; ++J, ++ss) {
1190234285Sdim        if (J == Start) JAfterStart = true;
1191234285Sdim
1192234285Sdim        // Determine if J uses I, if so, exit the loop.
1193234285Sdim        bool UsesI = trackUsesOfI(Users, WriteSet, I, J, !Config.FastDep);
1194234285Sdim        if (Config.FastDep) {
1195234285Sdim          // Note: For this heuristic to be effective, independent operations
1196234285Sdim          // must tend to be intermixed. This is likely to be true from some
1197234285Sdim          // kinds of grouped loop unrolling (but not the generic LLVM pass),
1198234285Sdim          // but otherwise may require some kind of reordering pass.
1199234285Sdim
1200234285Sdim          // When using fast dependency analysis,
1201234285Sdim          // stop searching after first use:
1202234285Sdim          if (UsesI) break;
1203234285Sdim        } else {
1204234285Sdim          if (UsesI) continue;
1205234285Sdim        }
1206234285Sdim
1207234285Sdim        // J does not use I, and comes before the first use of I, so it can be
1208234285Sdim        // merged with I if the instructions are compatible.
1209243830Sdim        int CostSavings, FixedOrder;
1210243830Sdim        if (!areInstsCompatible(I, J, IsSimpleLoadStore, NonPow2Len,
1211243830Sdim            CostSavings, FixedOrder)) continue;
1212234285Sdim
1213234285Sdim        // J is a candidate for merging with I.
1214234285Sdim        if (!PairableInsts.size() ||
1215234285Sdim             PairableInsts[PairableInsts.size()-1] != I) {
1216234285Sdim          PairableInsts.push_back(I);
1217234285Sdim        }
1218234285Sdim
1219249423Sdim        CandidatePairs[I].push_back(J);
1220249423Sdim        ++TotalPairs;
1221249423Sdim        if (TTI)
1222243830Sdim          CandidatePairCostSavings.insert(ValuePairWithCost(ValuePair(I, J),
1223243830Sdim                                                            CostSavings));
1224234285Sdim
1225243830Sdim        if (FixedOrder == 1)
1226243830Sdim          FixedOrderPairs.insert(ValuePair(I, J));
1227243830Sdim        else if (FixedOrder == -1)
1228243830Sdim          FixedOrderPairs.insert(ValuePair(J, I));
1229243830Sdim
1230234285Sdim        // The next call to this function must start after the last instruction
1231234285Sdim        // selected during this invocation.
1232234285Sdim        if (JAfterStart) {
1233234285Sdim          Start = llvm::next(J);
1234234285Sdim          IAfterStart = JAfterStart = false;
1235234285Sdim        }
1236234285Sdim
1237234285Sdim        DEBUG(if (DebugCandidateSelection) dbgs() << "BBV: candidate pair "
1238243830Sdim                     << *I << " <-> " << *J << " (cost savings: " <<
1239243830Sdim                     CostSavings << ")\n");
1240234285Sdim
1241234285Sdim        // If we have already found too many pairs, break here and this function
1242234285Sdim        // will be called again starting after the last instruction selected
1243234285Sdim        // during this invocation.
1244249423Sdim        if (PairableInsts.size() >= Config.MaxInsts ||
1245249423Sdim            TotalPairs >= Config.MaxPairs) {
1246234285Sdim          ShouldContinue = true;
1247234285Sdim          break;
1248234285Sdim        }
1249234285Sdim      }
1250234285Sdim
1251234285Sdim      if (ShouldContinue)
1252234285Sdim        break;
1253234285Sdim    }
1254234285Sdim
1255234285Sdim    DEBUG(dbgs() << "BBV: found " << PairableInsts.size()
1256234285Sdim           << " instructions with candidate pairs\n");
1257234285Sdim
1258234285Sdim    return ShouldContinue;
1259234285Sdim  }
1260234285Sdim
1261234285Sdim  // Finds candidate pairs connected to the pair P = <PI, PJ>. This means that
1262234285Sdim  // it looks for pairs such that both members have an input which is an
1263234285Sdim  // output of PI or PJ.
1264234285Sdim  void BBVectorize::computePairsConnectedTo(
1265249423Sdim                  DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
1266249423Sdim                  DenseSet<ValuePair> &CandidatePairsSet,
1267249423Sdim                  std::vector<Value *> &PairableInsts,
1268249423Sdim                  DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
1269249423Sdim                  DenseMap<VPPair, unsigned> &PairConnectionTypes,
1270249423Sdim                  ValuePair P) {
1271234982Sdim    StoreInst *SI, *SJ;
1272234982Sdim
1273234285Sdim    // For each possible pairing for this variable, look at the uses of
1274234285Sdim    // the first value...
1275234285Sdim    for (Value::use_iterator I = P.first->use_begin(),
1276234285Sdim         E = P.first->use_end(); I != E; ++I) {
1277234982Sdim      if (isa<LoadInst>(*I)) {
1278234982Sdim        // A pair cannot be connected to a load because the load only takes one
1279234982Sdim        // operand (the address) and it is a scalar even after vectorization.
1280234982Sdim        continue;
1281234982Sdim      } else if ((SI = dyn_cast<StoreInst>(*I)) &&
1282234982Sdim                 P.first == SI->getPointerOperand()) {
1283234982Sdim        // Similarly, a pair cannot be connected to a store through its
1284234982Sdim        // pointer operand.
1285234982Sdim        continue;
1286234982Sdim      }
1287234982Sdim
1288234285Sdim      // For each use of the first variable, look for uses of the second
1289234285Sdim      // variable...
1290234285Sdim      for (Value::use_iterator J = P.second->use_begin(),
1291234285Sdim           E2 = P.second->use_end(); J != E2; ++J) {
1292234982Sdim        if ((SJ = dyn_cast<StoreInst>(*J)) &&
1293234982Sdim            P.second == SJ->getPointerOperand())
1294234982Sdim          continue;
1295234982Sdim
1296234285Sdim        // Look for <I, J>:
1297249423Sdim        if (CandidatePairsSet.count(ValuePair(*I, *J))) {
1298243830Sdim          VPPair VP(P, ValuePair(*I, *J));
1299249423Sdim          ConnectedPairs[VP.first].push_back(VP.second);
1300243830Sdim          PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionDirect));
1301243830Sdim        }
1302234285Sdim
1303234285Sdim        // Look for <J, I>:
1304249423Sdim        if (CandidatePairsSet.count(ValuePair(*J, *I))) {
1305243830Sdim          VPPair VP(P, ValuePair(*J, *I));
1306249423Sdim          ConnectedPairs[VP.first].push_back(VP.second);
1307243830Sdim          PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSwap));
1308243830Sdim        }
1309234285Sdim      }
1310234285Sdim
1311234285Sdim      if (Config.SplatBreaksChain) continue;
1312234285Sdim      // Look for cases where just the first value in the pair is used by
1313234285Sdim      // both members of another pair (splatting).
1314234285Sdim      for (Value::use_iterator J = P.first->use_begin(); J != E; ++J) {
1315234982Sdim        if ((SJ = dyn_cast<StoreInst>(*J)) &&
1316234982Sdim            P.first == SJ->getPointerOperand())
1317234982Sdim          continue;
1318234982Sdim
1319249423Sdim        if (CandidatePairsSet.count(ValuePair(*I, *J))) {
1320243830Sdim          VPPair VP(P, ValuePair(*I, *J));
1321249423Sdim          ConnectedPairs[VP.first].push_back(VP.second);
1322243830Sdim          PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSplat));
1323243830Sdim        }
1324234285Sdim      }
1325234285Sdim    }
1326234285Sdim
1327234285Sdim    if (Config.SplatBreaksChain) return;
1328234285Sdim    // Look for cases where just the second value in the pair is used by
1329234285Sdim    // both members of another pair (splatting).
1330234285Sdim    for (Value::use_iterator I = P.second->use_begin(),
1331234285Sdim         E = P.second->use_end(); I != E; ++I) {
1332234982Sdim      if (isa<LoadInst>(*I))
1333234982Sdim        continue;
1334234982Sdim      else if ((SI = dyn_cast<StoreInst>(*I)) &&
1335234982Sdim               P.second == SI->getPointerOperand())
1336234982Sdim        continue;
1337234982Sdim
1338234285Sdim      for (Value::use_iterator J = P.second->use_begin(); J != E; ++J) {
1339234982Sdim        if ((SJ = dyn_cast<StoreInst>(*J)) &&
1340234982Sdim            P.second == SJ->getPointerOperand())
1341234982Sdim          continue;
1342234982Sdim
1343249423Sdim        if (CandidatePairsSet.count(ValuePair(*I, *J))) {
1344243830Sdim          VPPair VP(P, ValuePair(*I, *J));
1345249423Sdim          ConnectedPairs[VP.first].push_back(VP.second);
1346243830Sdim          PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSplat));
1347243830Sdim        }
1348234285Sdim      }
1349234285Sdim    }
1350234285Sdim  }
1351234285Sdim
1352234285Sdim  // This function figures out which pairs are connected.  Two pairs are
1353234285Sdim  // connected if some output of the first pair forms an input to both members
1354234285Sdim  // of the second pair.
1355234285Sdim  void BBVectorize::computeConnectedPairs(
1356249423Sdim                  DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
1357249423Sdim                  DenseSet<ValuePair> &CandidatePairsSet,
1358249423Sdim                  std::vector<Value *> &PairableInsts,
1359249423Sdim                  DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
1360249423Sdim                  DenseMap<VPPair, unsigned> &PairConnectionTypes) {
1361234285Sdim    for (std::vector<Value *>::iterator PI = PairableInsts.begin(),
1362234285Sdim         PE = PairableInsts.end(); PI != PE; ++PI) {
1363249423Sdim      DenseMap<Value *, std::vector<Value *> >::iterator PP =
1364249423Sdim        CandidatePairs.find(*PI);
1365249423Sdim      if (PP == CandidatePairs.end())
1366249423Sdim        continue;
1367234285Sdim
1368249423Sdim      for (std::vector<Value *>::iterator P = PP->second.begin(),
1369249423Sdim           E = PP->second.end(); P != E; ++P)
1370249423Sdim        computePairsConnectedTo(CandidatePairs, CandidatePairsSet,
1371249423Sdim                                PairableInsts, ConnectedPairs,
1372249423Sdim                                PairConnectionTypes, ValuePair(*PI, *P));
1373234285Sdim    }
1374234285Sdim
1375249423Sdim    DEBUG(size_t TotalPairs = 0;
1376249423Sdim          for (DenseMap<ValuePair, std::vector<ValuePair> >::iterator I =
1377249423Sdim               ConnectedPairs.begin(), IE = ConnectedPairs.end(); I != IE; ++I)
1378249423Sdim            TotalPairs += I->second.size();
1379249423Sdim          dbgs() << "BBV: found " << TotalPairs
1380234285Sdim                 << " pair connections.\n");
1381234285Sdim  }
1382234285Sdim
1383234285Sdim  // This function builds a set of use tuples such that <A, B> is in the set
1384249423Sdim  // if B is in the use dag of A. If B is in the use dag of A, then B
1385234285Sdim  // depends on the output of A.
1386234285Sdim  void BBVectorize::buildDepMap(
1387234285Sdim                      BasicBlock &BB,
1388249423Sdim                      DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
1389234285Sdim                      std::vector<Value *> &PairableInsts,
1390234285Sdim                      DenseSet<ValuePair> &PairableInstUsers) {
1391234285Sdim    DenseSet<Value *> IsInPair;
1392249423Sdim    for (DenseMap<Value *, std::vector<Value *> >::iterator C =
1393249423Sdim         CandidatePairs.begin(), E = CandidatePairs.end(); C != E; ++C) {
1394234285Sdim      IsInPair.insert(C->first);
1395249423Sdim      IsInPair.insert(C->second.begin(), C->second.end());
1396234285Sdim    }
1397234285Sdim
1398249423Sdim    // Iterate through the basic block, recording all users of each
1399234285Sdim    // pairable instruction.
1400234285Sdim
1401249423Sdim    BasicBlock::iterator E = BB.end(), EL =
1402249423Sdim      BasicBlock::iterator(cast<Instruction>(PairableInsts.back()));
1403234285Sdim    for (BasicBlock::iterator I = BB.getFirstInsertionPt(); I != E; ++I) {
1404234285Sdim      if (IsInPair.find(I) == IsInPair.end()) continue;
1405234285Sdim
1406234285Sdim      DenseSet<Value *> Users;
1407234285Sdim      AliasSetTracker WriteSet(*AA);
1408263508Sdim      if (I->mayWriteToMemory()) WriteSet.add(I);
1409263508Sdim
1410249423Sdim      for (BasicBlock::iterator J = llvm::next(I); J != E; ++J) {
1411234285Sdim        (void) trackUsesOfI(Users, WriteSet, I, J);
1412234285Sdim
1413249423Sdim        if (J == EL)
1414249423Sdim          break;
1415249423Sdim      }
1416249423Sdim
1417234285Sdim      for (DenseSet<Value *>::iterator U = Users.begin(), E = Users.end();
1418249423Sdim           U != E; ++U) {
1419249423Sdim        if (IsInPair.find(*U) == IsInPair.end()) continue;
1420234285Sdim        PairableInstUsers.insert(ValuePair(I, *U));
1421249423Sdim      }
1422249423Sdim
1423249423Sdim      if (I == EL)
1424249423Sdim        break;
1425234285Sdim    }
1426234285Sdim  }
1427234285Sdim
1428234285Sdim  // Returns true if an input to pair P is an output of pair Q and also an
1429234285Sdim  // input of pair Q is an output of pair P. If this is the case, then these
1430234285Sdim  // two pairs cannot be simultaneously fused.
1431234285Sdim  bool BBVectorize::pairsConflict(ValuePair P, ValuePair Q,
1432249423Sdim             DenseSet<ValuePair> &PairableInstUsers,
1433249423Sdim             DenseMap<ValuePair, std::vector<ValuePair> > *PairableInstUserMap,
1434249423Sdim             DenseSet<VPPair> *PairableInstUserPairSet) {
1435234285Sdim    // Two pairs are in conflict if they are mutual Users of eachother.
1436234285Sdim    bool QUsesP = PairableInstUsers.count(ValuePair(P.first,  Q.first))  ||
1437234285Sdim                  PairableInstUsers.count(ValuePair(P.first,  Q.second)) ||
1438234285Sdim                  PairableInstUsers.count(ValuePair(P.second, Q.first))  ||
1439234285Sdim                  PairableInstUsers.count(ValuePair(P.second, Q.second));
1440234285Sdim    bool PUsesQ = PairableInstUsers.count(ValuePair(Q.first,  P.first))  ||
1441234285Sdim                  PairableInstUsers.count(ValuePair(Q.first,  P.second)) ||
1442234285Sdim                  PairableInstUsers.count(ValuePair(Q.second, P.first))  ||
1443234285Sdim                  PairableInstUsers.count(ValuePair(Q.second, P.second));
1444234285Sdim    if (PairableInstUserMap) {
1445234285Sdim      // FIXME: The expensive part of the cycle check is not so much the cycle
1446234285Sdim      // check itself but this edge insertion procedure. This needs some
1447249423Sdim      // profiling and probably a different data structure.
1448234285Sdim      if (PUsesQ) {
1449249423Sdim        if (PairableInstUserPairSet->insert(VPPair(Q, P)).second)
1450249423Sdim          (*PairableInstUserMap)[Q].push_back(P);
1451234285Sdim      }
1452234285Sdim      if (QUsesP) {
1453249423Sdim        if (PairableInstUserPairSet->insert(VPPair(P, Q)).second)
1454249423Sdim          (*PairableInstUserMap)[P].push_back(Q);
1455234285Sdim      }
1456234285Sdim    }
1457234285Sdim
1458234285Sdim    return (QUsesP && PUsesQ);
1459234285Sdim  }
1460234285Sdim
1461234285Sdim  // This function walks the use graph of current pairs to see if, starting
1462234285Sdim  // from P, the walk returns to P.
1463234285Sdim  bool BBVectorize::pairWillFormCycle(ValuePair P,
1464249423Sdim             DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap,
1465249423Sdim             DenseSet<ValuePair> &CurrentPairs) {
1466234285Sdim    DEBUG(if (DebugCycleCheck)
1467234285Sdim            dbgs() << "BBV: starting cycle check for : " << *P.first << " <-> "
1468234285Sdim                   << *P.second << "\n");
1469234285Sdim    // A lookup table of visisted pairs is kept because the PairableInstUserMap
1470234285Sdim    // contains non-direct associations.
1471234285Sdim    DenseSet<ValuePair> Visited;
1472234285Sdim    SmallVector<ValuePair, 32> Q;
1473234285Sdim    // General depth-first post-order traversal:
1474234285Sdim    Q.push_back(P);
1475234285Sdim    do {
1476234285Sdim      ValuePair QTop = Q.pop_back_val();
1477234285Sdim      Visited.insert(QTop);
1478234285Sdim
1479234285Sdim      DEBUG(if (DebugCycleCheck)
1480234285Sdim              dbgs() << "BBV: cycle check visiting: " << *QTop.first << " <-> "
1481234285Sdim                     << *QTop.second << "\n");
1482249423Sdim      DenseMap<ValuePair, std::vector<ValuePair> >::iterator QQ =
1483249423Sdim        PairableInstUserMap.find(QTop);
1484249423Sdim      if (QQ == PairableInstUserMap.end())
1485249423Sdim        continue;
1486249423Sdim
1487249423Sdim      for (std::vector<ValuePair>::iterator C = QQ->second.begin(),
1488249423Sdim           CE = QQ->second.end(); C != CE; ++C) {
1489249423Sdim        if (*C == P) {
1490234285Sdim          DEBUG(dbgs()
1491234285Sdim                 << "BBV: rejected to prevent non-trivial cycle formation: "
1492249423Sdim                 << QTop.first << " <-> " << C->second << "\n");
1493234285Sdim          return true;
1494234285Sdim        }
1495234285Sdim
1496249423Sdim        if (CurrentPairs.count(*C) && !Visited.count(*C))
1497249423Sdim          Q.push_back(*C);
1498234285Sdim      }
1499234285Sdim    } while (!Q.empty());
1500234285Sdim
1501234285Sdim    return false;
1502234285Sdim  }
1503234285Sdim
1504249423Sdim  // This function builds the initial dag of connected pairs with the
1505234285Sdim  // pair J at the root.
1506249423Sdim  void BBVectorize::buildInitialDAGFor(
1507249423Sdim                  DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
1508249423Sdim                  DenseSet<ValuePair> &CandidatePairsSet,
1509249423Sdim                  std::vector<Value *> &PairableInsts,
1510249423Sdim                  DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
1511249423Sdim                  DenseSet<ValuePair> &PairableInstUsers,
1512249423Sdim                  DenseMap<Value *, Value *> &ChosenPairs,
1513249423Sdim                  DenseMap<ValuePair, size_t> &DAG, ValuePair J) {
1514249423Sdim    // Each of these pairs is viewed as the root node of a DAG. The DAG
1515234285Sdim    // is then walked (depth-first). As this happens, we keep track of
1516249423Sdim    // the pairs that compose the DAG and the maximum depth of the DAG.
1517234285Sdim    SmallVector<ValuePairWithDepth, 32> Q;
1518234285Sdim    // General depth-first post-order traversal:
1519234285Sdim    Q.push_back(ValuePairWithDepth(J, getDepthFactor(J.first)));
1520234285Sdim    do {
1521234285Sdim      ValuePairWithDepth QTop = Q.back();
1522234285Sdim
1523234285Sdim      // Push each child onto the queue:
1524234285Sdim      bool MoreChildren = false;
1525234285Sdim      size_t MaxChildDepth = QTop.second;
1526249423Sdim      DenseMap<ValuePair, std::vector<ValuePair> >::iterator QQ =
1527249423Sdim        ConnectedPairs.find(QTop.first);
1528249423Sdim      if (QQ != ConnectedPairs.end())
1529249423Sdim        for (std::vector<ValuePair>::iterator k = QQ->second.begin(),
1530249423Sdim             ke = QQ->second.end(); k != ke; ++k) {
1531249423Sdim          // Make sure that this child pair is still a candidate:
1532249423Sdim          if (CandidatePairsSet.count(*k)) {
1533249423Sdim            DenseMap<ValuePair, size_t>::iterator C = DAG.find(*k);
1534249423Sdim            if (C == DAG.end()) {
1535249423Sdim              size_t d = getDepthFactor(k->first);
1536249423Sdim              Q.push_back(ValuePairWithDepth(*k, QTop.second+d));
1537249423Sdim              MoreChildren = true;
1538249423Sdim            } else {
1539249423Sdim              MaxChildDepth = std::max(MaxChildDepth, C->second);
1540249423Sdim            }
1541234285Sdim          }
1542234285Sdim        }
1543234285Sdim
1544234285Sdim      if (!MoreChildren) {
1545249423Sdim        // Record the current pair as part of the DAG:
1546249423Sdim        DAG.insert(ValuePairWithDepth(QTop.first, MaxChildDepth));
1547234285Sdim        Q.pop_back();
1548234285Sdim      }
1549234285Sdim    } while (!Q.empty());
1550234285Sdim  }
1551234285Sdim
1552249423Sdim  // Given some initial dag, prune it by removing conflicting pairs (pairs
1553234285Sdim  // that cannot be simultaneously chosen for vectorization).
1554249423Sdim  void BBVectorize::pruneDAGFor(
1555249423Sdim              DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
1556249423Sdim              std::vector<Value *> &PairableInsts,
1557249423Sdim              DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
1558249423Sdim              DenseSet<ValuePair> &PairableInstUsers,
1559249423Sdim              DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap,
1560249423Sdim              DenseSet<VPPair> &PairableInstUserPairSet,
1561249423Sdim              DenseMap<Value *, Value *> &ChosenPairs,
1562249423Sdim              DenseMap<ValuePair, size_t> &DAG,
1563249423Sdim              DenseSet<ValuePair> &PrunedDAG, ValuePair J,
1564249423Sdim              bool UseCycleCheck) {
1565234285Sdim    SmallVector<ValuePairWithDepth, 32> Q;
1566234285Sdim    // General depth-first post-order traversal:
1567234285Sdim    Q.push_back(ValuePairWithDepth(J, getDepthFactor(J.first)));
1568234285Sdim    do {
1569234285Sdim      ValuePairWithDepth QTop = Q.pop_back_val();
1570249423Sdim      PrunedDAG.insert(QTop.first);
1571234285Sdim
1572234285Sdim      // Visit each child, pruning as necessary...
1573243830Sdim      SmallVector<ValuePairWithDepth, 8> BestChildren;
1574249423Sdim      DenseMap<ValuePair, std::vector<ValuePair> >::iterator QQ =
1575249423Sdim        ConnectedPairs.find(QTop.first);
1576249423Sdim      if (QQ == ConnectedPairs.end())
1577249423Sdim        continue;
1578234285Sdim
1579249423Sdim      for (std::vector<ValuePair>::iterator K = QQ->second.begin(),
1580249423Sdim           KE = QQ->second.end(); K != KE; ++K) {
1581249423Sdim        DenseMap<ValuePair, size_t>::iterator C = DAG.find(*K);
1582249423Sdim        if (C == DAG.end()) continue;
1583249423Sdim
1584249423Sdim        // This child is in the DAG, now we need to make sure it is the
1585234285Sdim        // best of any conflicting children. There could be multiple
1586234285Sdim        // conflicting children, so first, determine if we're keeping
1587234285Sdim        // this child, then delete conflicting children as necessary.
1588234285Sdim
1589234285Sdim        // It is also necessary to guard against pairing-induced
1590234285Sdim        // dependencies. Consider instructions a .. x .. y .. b
1591234285Sdim        // such that (a,b) are to be fused and (x,y) are to be fused
1592234285Sdim        // but a is an input to x and b is an output from y. This
1593234285Sdim        // means that y cannot be moved after b but x must be moved
1594234285Sdim        // after b for (a,b) to be fused. In other words, after
1595234285Sdim        // fusing (a,b) we have y .. a/b .. x where y is an input
1596234285Sdim        // to a/b and x is an output to a/b: x and y can no longer
1597234285Sdim        // be legally fused. To prevent this condition, we must
1598249423Sdim        // make sure that a child pair added to the DAG is not
1599234285Sdim        // both an input and output of an already-selected pair.
1600234285Sdim
1601234285Sdim        // Pairing-induced dependencies can also form from more complicated
1602234285Sdim        // cycles. The pair vs. pair conflicts are easy to check, and so
1603234285Sdim        // that is done explicitly for "fast rejection", and because for
1604234285Sdim        // child vs. child conflicts, we may prefer to keep the current
1605234285Sdim        // pair in preference to the already-selected child.
1606234285Sdim        DenseSet<ValuePair> CurrentPairs;
1607234285Sdim
1608234285Sdim        bool CanAdd = true;
1609263508Sdim        for (SmallVectorImpl<ValuePairWithDepth>::iterator C2
1610234285Sdim              = BestChildren.begin(), E2 = BestChildren.end();
1611234285Sdim             C2 != E2; ++C2) {
1612234285Sdim          if (C2->first.first == C->first.first ||
1613234285Sdim              C2->first.first == C->first.second ||
1614234285Sdim              C2->first.second == C->first.first ||
1615234285Sdim              C2->first.second == C->first.second ||
1616234285Sdim              pairsConflict(C2->first, C->first, PairableInstUsers,
1617249423Sdim                            UseCycleCheck ? &PairableInstUserMap : 0,
1618249423Sdim                            UseCycleCheck ? &PairableInstUserPairSet : 0)) {
1619234285Sdim            if (C2->second >= C->second) {
1620234285Sdim              CanAdd = false;
1621234285Sdim              break;
1622234285Sdim            }
1623234285Sdim
1624234285Sdim            CurrentPairs.insert(C2->first);
1625234285Sdim          }
1626234285Sdim        }
1627234285Sdim        if (!CanAdd) continue;
1628234285Sdim
1629234285Sdim        // Even worse, this child could conflict with another node already
1630249423Sdim        // selected for the DAG. If that is the case, ignore this child.
1631249423Sdim        for (DenseSet<ValuePair>::iterator T = PrunedDAG.begin(),
1632249423Sdim             E2 = PrunedDAG.end(); T != E2; ++T) {
1633234285Sdim          if (T->first == C->first.first ||
1634234285Sdim              T->first == C->first.second ||
1635234285Sdim              T->second == C->first.first ||
1636234285Sdim              T->second == C->first.second ||
1637234285Sdim              pairsConflict(*T, C->first, PairableInstUsers,
1638249423Sdim                            UseCycleCheck ? &PairableInstUserMap : 0,
1639249423Sdim                            UseCycleCheck ? &PairableInstUserPairSet : 0)) {
1640234285Sdim            CanAdd = false;
1641234285Sdim            break;
1642234285Sdim          }
1643234285Sdim
1644234285Sdim          CurrentPairs.insert(*T);
1645234285Sdim        }
1646234285Sdim        if (!CanAdd) continue;
1647234285Sdim
1648234285Sdim        // And check the queue too...
1649263508Sdim        for (SmallVectorImpl<ValuePairWithDepth>::iterator C2 = Q.begin(),
1650234285Sdim             E2 = Q.end(); C2 != E2; ++C2) {
1651234285Sdim          if (C2->first.first == C->first.first ||
1652234285Sdim              C2->first.first == C->first.second ||
1653234285Sdim              C2->first.second == C->first.first ||
1654234285Sdim              C2->first.second == C->first.second ||
1655234285Sdim              pairsConflict(C2->first, C->first, PairableInstUsers,
1656249423Sdim                            UseCycleCheck ? &PairableInstUserMap : 0,
1657249423Sdim                            UseCycleCheck ? &PairableInstUserPairSet : 0)) {
1658234285Sdim            CanAdd = false;
1659234285Sdim            break;
1660234285Sdim          }
1661234285Sdim
1662234285Sdim          CurrentPairs.insert(C2->first);
1663234285Sdim        }
1664234285Sdim        if (!CanAdd) continue;
1665234285Sdim
1666234285Sdim        // Last but not least, check for a conflict with any of the
1667234285Sdim        // already-chosen pairs.
1668234285Sdim        for (DenseMap<Value *, Value *>::iterator C2 =
1669234285Sdim              ChosenPairs.begin(), E2 = ChosenPairs.end();
1670234285Sdim             C2 != E2; ++C2) {
1671234285Sdim          if (pairsConflict(*C2, C->first, PairableInstUsers,
1672249423Sdim                            UseCycleCheck ? &PairableInstUserMap : 0,
1673249423Sdim                            UseCycleCheck ? &PairableInstUserPairSet : 0)) {
1674234285Sdim            CanAdd = false;
1675234285Sdim            break;
1676234285Sdim          }
1677234285Sdim
1678234285Sdim          CurrentPairs.insert(*C2);
1679234285Sdim        }
1680234285Sdim        if (!CanAdd) continue;
1681234285Sdim
1682234285Sdim        // To check for non-trivial cycles formed by the addition of the
1683234285Sdim        // current pair we've formed a list of all relevant pairs, now use a
1684234285Sdim        // graph walk to check for a cycle. We start from the current pair and
1685249423Sdim        // walk the use dag to see if we again reach the current pair. If we
1686234285Sdim        // do, then the current pair is rejected.
1687234285Sdim
1688234285Sdim        // FIXME: It may be more efficient to use a topological-ordering
1689234285Sdim        // algorithm to improve the cycle check. This should be investigated.
1690234285Sdim        if (UseCycleCheck &&
1691234285Sdim            pairWillFormCycle(C->first, PairableInstUserMap, CurrentPairs))
1692234285Sdim          continue;
1693234285Sdim
1694234285Sdim        // This child can be added, but we may have chosen it in preference
1695234285Sdim        // to an already-selected child. Check for this here, and if a
1696234285Sdim        // conflict is found, then remove the previously-selected child
1697234285Sdim        // before adding this one in its place.
1698263508Sdim        for (SmallVectorImpl<ValuePairWithDepth>::iterator C2
1699234285Sdim              = BestChildren.begin(); C2 != BestChildren.end();) {
1700234285Sdim          if (C2->first.first == C->first.first ||
1701234285Sdim              C2->first.first == C->first.second ||
1702234285Sdim              C2->first.second == C->first.first ||
1703234285Sdim              C2->first.second == C->first.second ||
1704234285Sdim              pairsConflict(C2->first, C->first, PairableInstUsers))
1705243830Sdim            C2 = BestChildren.erase(C2);
1706234285Sdim          else
1707234285Sdim            ++C2;
1708234285Sdim        }
1709234285Sdim
1710243830Sdim        BestChildren.push_back(ValuePairWithDepth(C->first, C->second));
1711234285Sdim      }
1712234285Sdim
1713263508Sdim      for (SmallVectorImpl<ValuePairWithDepth>::iterator C
1714234285Sdim            = BestChildren.begin(), E2 = BestChildren.end();
1715234285Sdim           C != E2; ++C) {
1716234285Sdim        size_t DepthF = getDepthFactor(C->first.first);
1717234285Sdim        Q.push_back(ValuePairWithDepth(C->first, QTop.second+DepthF));
1718234285Sdim      }
1719234285Sdim    } while (!Q.empty());
1720234285Sdim  }
1721234285Sdim
1722249423Sdim  // This function finds the best dag of mututally-compatible connected
1723234285Sdim  // pairs, given the choice of root pairs as an iterator range.
1724249423Sdim  void BBVectorize::findBestDAGFor(
1725249423Sdim              DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
1726249423Sdim              DenseSet<ValuePair> &CandidatePairsSet,
1727249423Sdim              DenseMap<ValuePair, int> &CandidatePairCostSavings,
1728249423Sdim              std::vector<Value *> &PairableInsts,
1729249423Sdim              DenseSet<ValuePair> &FixedOrderPairs,
1730249423Sdim              DenseMap<VPPair, unsigned> &PairConnectionTypes,
1731249423Sdim              DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
1732249423Sdim              DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps,
1733249423Sdim              DenseSet<ValuePair> &PairableInstUsers,
1734249423Sdim              DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap,
1735249423Sdim              DenseSet<VPPair> &PairableInstUserPairSet,
1736249423Sdim              DenseMap<Value *, Value *> &ChosenPairs,
1737249423Sdim              DenseSet<ValuePair> &BestDAG, size_t &BestMaxDepth,
1738249423Sdim              int &BestEffSize, Value *II, std::vector<Value *>&JJ,
1739249423Sdim              bool UseCycleCheck) {
1740249423Sdim    for (std::vector<Value *>::iterator J = JJ.begin(), JE = JJ.end();
1741249423Sdim         J != JE; ++J) {
1742249423Sdim      ValuePair IJ(II, *J);
1743249423Sdim      if (!CandidatePairsSet.count(IJ))
1744249423Sdim        continue;
1745234285Sdim
1746234285Sdim      // Before going any further, make sure that this pair does not
1747234285Sdim      // conflict with any already-selected pairs (see comment below
1748249423Sdim      // near the DAG pruning for more details).
1749234285Sdim      DenseSet<ValuePair> ChosenPairSet;
1750234285Sdim      bool DoesConflict = false;
1751234285Sdim      for (DenseMap<Value *, Value *>::iterator C = ChosenPairs.begin(),
1752234285Sdim           E = ChosenPairs.end(); C != E; ++C) {
1753249423Sdim        if (pairsConflict(*C, IJ, PairableInstUsers,
1754249423Sdim                          UseCycleCheck ? &PairableInstUserMap : 0,
1755249423Sdim                          UseCycleCheck ? &PairableInstUserPairSet : 0)) {
1756234285Sdim          DoesConflict = true;
1757234285Sdim          break;
1758234285Sdim        }
1759234285Sdim
1760234285Sdim        ChosenPairSet.insert(*C);
1761234285Sdim      }
1762234285Sdim      if (DoesConflict) continue;
1763234285Sdim
1764234285Sdim      if (UseCycleCheck &&
1765249423Sdim          pairWillFormCycle(IJ, PairableInstUserMap, ChosenPairSet))
1766234285Sdim        continue;
1767234285Sdim
1768249423Sdim      DenseMap<ValuePair, size_t> DAG;
1769249423Sdim      buildInitialDAGFor(CandidatePairs, CandidatePairsSet,
1770249423Sdim                          PairableInsts, ConnectedPairs,
1771249423Sdim                          PairableInstUsers, ChosenPairs, DAG, IJ);
1772234285Sdim
1773234285Sdim      // Because we'll keep the child with the largest depth, the largest
1774249423Sdim      // depth is still the same in the unpruned DAG.
1775249423Sdim      size_t MaxDepth = DAG.lookup(IJ);
1776234285Sdim
1777249423Sdim      DEBUG(if (DebugPairSelection) dbgs() << "BBV: found DAG for pair {"
1778249423Sdim                   << *IJ.first << " <-> " << *IJ.second << "} of depth " <<
1779249423Sdim                   MaxDepth << " and size " << DAG.size() << "\n");
1780234285Sdim
1781249423Sdim      // At this point the DAG has been constructed, but, may contain
1782234285Sdim      // contradictory children (meaning that different children of
1783249423Sdim      // some dag node may be attempting to fuse the same instruction).
1784249423Sdim      // So now we walk the dag again, in the case of a conflict,
1785234285Sdim      // keep only the child with the largest depth. To break a tie,
1786234285Sdim      // favor the first child.
1787234285Sdim
1788249423Sdim      DenseSet<ValuePair> PrunedDAG;
1789249423Sdim      pruneDAGFor(CandidatePairs, PairableInsts, ConnectedPairs,
1790249423Sdim                   PairableInstUsers, PairableInstUserMap,
1791249423Sdim                   PairableInstUserPairSet,
1792249423Sdim                   ChosenPairs, DAG, PrunedDAG, IJ, UseCycleCheck);
1793234285Sdim
1794243830Sdim      int EffSize = 0;
1795249423Sdim      if (TTI) {
1796249423Sdim        DenseSet<Value *> PrunedDAGInstrs;
1797249423Sdim        for (DenseSet<ValuePair>::iterator S = PrunedDAG.begin(),
1798249423Sdim             E = PrunedDAG.end(); S != E; ++S) {
1799249423Sdim          PrunedDAGInstrs.insert(S->first);
1800249423Sdim          PrunedDAGInstrs.insert(S->second);
1801243830Sdim        }
1802234285Sdim
1803243830Sdim        // The set of pairs that have already contributed to the total cost.
1804243830Sdim        DenseSet<ValuePair> IncomingPairs;
1805243830Sdim
1806243830Sdim        // If the cost model were perfect, this might not be necessary; but we
1807243830Sdim        // need to make sure that we don't get stuck vectorizing our own
1808243830Sdim        // shuffle chains.
1809243830Sdim        bool HasNontrivialInsts = false;
1810243830Sdim
1811243830Sdim        // The node weights represent the cost savings associated with
1812243830Sdim        // fusing the pair of instructions.
1813249423Sdim        for (DenseSet<ValuePair>::iterator S = PrunedDAG.begin(),
1814249423Sdim             E = PrunedDAG.end(); S != E; ++S) {
1815243830Sdim          if (!isa<ShuffleVectorInst>(S->first) &&
1816243830Sdim              !isa<InsertElementInst>(S->first) &&
1817243830Sdim              !isa<ExtractElementInst>(S->first))
1818243830Sdim            HasNontrivialInsts = true;
1819243830Sdim
1820243830Sdim          bool FlipOrder = false;
1821243830Sdim
1822243830Sdim          if (getDepthFactor(S->first)) {
1823243830Sdim            int ESContrib = CandidatePairCostSavings.find(*S)->second;
1824243830Sdim            DEBUG(if (DebugPairSelection) dbgs() << "\tweight {"
1825243830Sdim                   << *S->first << " <-> " << *S->second << "} = " <<
1826243830Sdim                   ESContrib << "\n");
1827243830Sdim            EffSize += ESContrib;
1828243830Sdim          }
1829243830Sdim
1830243830Sdim          // The edge weights contribute in a negative sense: they represent
1831243830Sdim          // the cost of shuffles.
1832249423Sdim          DenseMap<ValuePair, std::vector<ValuePair> >::iterator SS =
1833249423Sdim            ConnectedPairDeps.find(*S);
1834249423Sdim          if (SS != ConnectedPairDeps.end()) {
1835243830Sdim            unsigned NumDepsDirect = 0, NumDepsSwap = 0;
1836249423Sdim            for (std::vector<ValuePair>::iterator T = SS->second.begin(),
1837249423Sdim                 TE = SS->second.end(); T != TE; ++T) {
1838249423Sdim              VPPair Q(*S, *T);
1839249423Sdim              if (!PrunedDAG.count(Q.second))
1840243830Sdim                continue;
1841243830Sdim              DenseMap<VPPair, unsigned>::iterator R =
1842249423Sdim                PairConnectionTypes.find(VPPair(Q.second, Q.first));
1843243830Sdim              assert(R != PairConnectionTypes.end() &&
1844243830Sdim                     "Cannot find pair connection type");
1845243830Sdim              if (R->second == PairConnectionDirect)
1846243830Sdim                ++NumDepsDirect;
1847243830Sdim              else if (R->second == PairConnectionSwap)
1848243830Sdim                ++NumDepsSwap;
1849243830Sdim            }
1850243830Sdim
1851243830Sdim            // If there are more swaps than direct connections, then
1852243830Sdim            // the pair order will be flipped during fusion. So the real
1853243830Sdim            // number of swaps is the minimum number.
1854243830Sdim            FlipOrder = !FixedOrderPairs.count(*S) &&
1855243830Sdim              ((NumDepsSwap > NumDepsDirect) ||
1856243830Sdim                FixedOrderPairs.count(ValuePair(S->second, S->first)));
1857243830Sdim
1858249423Sdim            for (std::vector<ValuePair>::iterator T = SS->second.begin(),
1859249423Sdim                 TE = SS->second.end(); T != TE; ++T) {
1860249423Sdim              VPPair Q(*S, *T);
1861249423Sdim              if (!PrunedDAG.count(Q.second))
1862243830Sdim                continue;
1863243830Sdim              DenseMap<VPPair, unsigned>::iterator R =
1864249423Sdim                PairConnectionTypes.find(VPPair(Q.second, Q.first));
1865243830Sdim              assert(R != PairConnectionTypes.end() &&
1866243830Sdim                     "Cannot find pair connection type");
1867249423Sdim              Type *Ty1 = Q.second.first->getType(),
1868249423Sdim                   *Ty2 = Q.second.second->getType();
1869243830Sdim              Type *VTy = getVecTypeForPair(Ty1, Ty2);
1870243830Sdim              if ((R->second == PairConnectionDirect && FlipOrder) ||
1871243830Sdim                  (R->second == PairConnectionSwap && !FlipOrder)  ||
1872243830Sdim                  R->second == PairConnectionSplat) {
1873243830Sdim                int ESContrib = (int) getInstrCost(Instruction::ShuffleVector,
1874243830Sdim                                                   VTy, VTy);
1875249423Sdim
1876249423Sdim                if (VTy->getVectorNumElements() == 2) {
1877249423Sdim                  if (R->second == PairConnectionSplat)
1878249423Sdim                    ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost(
1879249423Sdim                      TargetTransformInfo::SK_Broadcast, VTy));
1880249423Sdim                  else
1881249423Sdim                    ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost(
1882249423Sdim                      TargetTransformInfo::SK_Reverse, VTy));
1883249423Sdim                }
1884249423Sdim
1885243830Sdim                DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" <<
1886249423Sdim                  *Q.second.first << " <-> " << *Q.second.second <<
1887243830Sdim                    "} -> {" <<
1888243830Sdim                  *S->first << " <-> " << *S->second << "} = " <<
1889243830Sdim                   ESContrib << "\n");
1890243830Sdim                EffSize -= ESContrib;
1891243830Sdim              }
1892243830Sdim            }
1893243830Sdim          }
1894243830Sdim
1895243830Sdim          // Compute the cost of outgoing edges. We assume that edges outgoing
1896243830Sdim          // to shuffles, inserts or extracts can be merged, and so contribute
1897243830Sdim          // no additional cost.
1898243830Sdim          if (!S->first->getType()->isVoidTy()) {
1899243830Sdim            Type *Ty1 = S->first->getType(),
1900243830Sdim                 *Ty2 = S->second->getType();
1901243830Sdim            Type *VTy = getVecTypeForPair(Ty1, Ty2);
1902243830Sdim
1903243830Sdim            bool NeedsExtraction = false;
1904243830Sdim            for (Value::use_iterator I = S->first->use_begin(),
1905243830Sdim                 IE = S->first->use_end(); I != IE; ++I) {
1906243830Sdim              if (ShuffleVectorInst *SI = dyn_cast<ShuffleVectorInst>(*I)) {
1907243830Sdim                // Shuffle can be folded if it has no other input
1908243830Sdim                if (isa<UndefValue>(SI->getOperand(1)))
1909243830Sdim                  continue;
1910243830Sdim              }
1911243830Sdim              if (isa<ExtractElementInst>(*I))
1912243830Sdim                continue;
1913249423Sdim              if (PrunedDAGInstrs.count(*I))
1914243830Sdim                continue;
1915243830Sdim              NeedsExtraction = true;
1916243830Sdim              break;
1917243830Sdim            }
1918243830Sdim
1919243830Sdim            if (NeedsExtraction) {
1920243830Sdim              int ESContrib;
1921249423Sdim              if (Ty1->isVectorTy()) {
1922243830Sdim                ESContrib = (int) getInstrCost(Instruction::ShuffleVector,
1923243830Sdim                                               Ty1, VTy);
1924249423Sdim                ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost(
1925249423Sdim                  TargetTransformInfo::SK_ExtractSubvector, VTy, 0, Ty1));
1926249423Sdim              } else
1927249423Sdim                ESContrib = (int) TTI->getVectorInstrCost(
1928243830Sdim                                    Instruction::ExtractElement, VTy, 0);
1929243830Sdim
1930243830Sdim              DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" <<
1931243830Sdim                *S->first << "} = " << ESContrib << "\n");
1932243830Sdim              EffSize -= ESContrib;
1933243830Sdim            }
1934243830Sdim
1935243830Sdim            NeedsExtraction = false;
1936243830Sdim            for (Value::use_iterator I = S->second->use_begin(),
1937243830Sdim                 IE = S->second->use_end(); I != IE; ++I) {
1938243830Sdim              if (ShuffleVectorInst *SI = dyn_cast<ShuffleVectorInst>(*I)) {
1939243830Sdim                // Shuffle can be folded if it has no other input
1940243830Sdim                if (isa<UndefValue>(SI->getOperand(1)))
1941243830Sdim                  continue;
1942243830Sdim              }
1943243830Sdim              if (isa<ExtractElementInst>(*I))
1944243830Sdim                continue;
1945249423Sdim              if (PrunedDAGInstrs.count(*I))
1946243830Sdim                continue;
1947243830Sdim              NeedsExtraction = true;
1948243830Sdim              break;
1949243830Sdim            }
1950243830Sdim
1951243830Sdim            if (NeedsExtraction) {
1952243830Sdim              int ESContrib;
1953249423Sdim              if (Ty2->isVectorTy()) {
1954243830Sdim                ESContrib = (int) getInstrCost(Instruction::ShuffleVector,
1955243830Sdim                                               Ty2, VTy);
1956249423Sdim                ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost(
1957249423Sdim                  TargetTransformInfo::SK_ExtractSubvector, VTy,
1958249423Sdim                  Ty1->isVectorTy() ? Ty1->getVectorNumElements() : 1, Ty2));
1959249423Sdim              } else
1960249423Sdim                ESContrib = (int) TTI->getVectorInstrCost(
1961243830Sdim                                    Instruction::ExtractElement, VTy, 1);
1962243830Sdim              DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" <<
1963243830Sdim                *S->second << "} = " << ESContrib << "\n");
1964243830Sdim              EffSize -= ESContrib;
1965243830Sdim            }
1966243830Sdim          }
1967243830Sdim
1968243830Sdim          // Compute the cost of incoming edges.
1969243830Sdim          if (!isa<LoadInst>(S->first) && !isa<StoreInst>(S->first)) {
1970243830Sdim            Instruction *S1 = cast<Instruction>(S->first),
1971243830Sdim                        *S2 = cast<Instruction>(S->second);
1972243830Sdim            for (unsigned o = 0; o < S1->getNumOperands(); ++o) {
1973243830Sdim              Value *O1 = S1->getOperand(o), *O2 = S2->getOperand(o);
1974243830Sdim
1975243830Sdim              // Combining constants into vector constants (or small vector
1976243830Sdim              // constants into larger ones are assumed free).
1977243830Sdim              if (isa<Constant>(O1) && isa<Constant>(O2))
1978243830Sdim                continue;
1979243830Sdim
1980243830Sdim              if (FlipOrder)
1981243830Sdim                std::swap(O1, O2);
1982243830Sdim
1983243830Sdim              ValuePair VP  = ValuePair(O1, O2);
1984243830Sdim              ValuePair VPR = ValuePair(O2, O1);
1985243830Sdim
1986243830Sdim              // Internal edges are not handled here.
1987249423Sdim              if (PrunedDAG.count(VP) || PrunedDAG.count(VPR))
1988243830Sdim                continue;
1989243830Sdim
1990243830Sdim              Type *Ty1 = O1->getType(),
1991243830Sdim                   *Ty2 = O2->getType();
1992243830Sdim              Type *VTy = getVecTypeForPair(Ty1, Ty2);
1993243830Sdim
1994243830Sdim              // Combining vector operations of the same type is also assumed
1995243830Sdim              // folded with other operations.
1996243830Sdim              if (Ty1 == Ty2) {
1997243830Sdim                // If both are insert elements, then both can be widened.
1998243830Sdim                InsertElementInst *IEO1 = dyn_cast<InsertElementInst>(O1),
1999243830Sdim                                  *IEO2 = dyn_cast<InsertElementInst>(O2);
2000243830Sdim                if (IEO1 && IEO2 && isPureIEChain(IEO1) && isPureIEChain(IEO2))
2001243830Sdim                  continue;
2002243830Sdim                // If both are extract elements, and both have the same input
2003243830Sdim                // type, then they can be replaced with a shuffle
2004243830Sdim                ExtractElementInst *EIO1 = dyn_cast<ExtractElementInst>(O1),
2005243830Sdim                                   *EIO2 = dyn_cast<ExtractElementInst>(O2);
2006243830Sdim                if (EIO1 && EIO2 &&
2007243830Sdim                    EIO1->getOperand(0)->getType() ==
2008243830Sdim                      EIO2->getOperand(0)->getType())
2009243830Sdim                  continue;
2010243830Sdim                // If both are a shuffle with equal operand types and only two
2011243830Sdim                // unqiue operands, then they can be replaced with a single
2012243830Sdim                // shuffle
2013243830Sdim                ShuffleVectorInst *SIO1 = dyn_cast<ShuffleVectorInst>(O1),
2014243830Sdim                                  *SIO2 = dyn_cast<ShuffleVectorInst>(O2);
2015243830Sdim                if (SIO1 && SIO2 &&
2016243830Sdim                    SIO1->getOperand(0)->getType() ==
2017243830Sdim                      SIO2->getOperand(0)->getType()) {
2018243830Sdim                  SmallSet<Value *, 4> SIOps;
2019243830Sdim                  SIOps.insert(SIO1->getOperand(0));
2020243830Sdim                  SIOps.insert(SIO1->getOperand(1));
2021243830Sdim                  SIOps.insert(SIO2->getOperand(0));
2022243830Sdim                  SIOps.insert(SIO2->getOperand(1));
2023243830Sdim                  if (SIOps.size() <= 2)
2024243830Sdim                    continue;
2025243830Sdim                }
2026243830Sdim              }
2027243830Sdim
2028243830Sdim              int ESContrib;
2029243830Sdim              // This pair has already been formed.
2030243830Sdim              if (IncomingPairs.count(VP)) {
2031243830Sdim                continue;
2032243830Sdim              } else if (IncomingPairs.count(VPR)) {
2033243830Sdim                ESContrib = (int) getInstrCost(Instruction::ShuffleVector,
2034243830Sdim                                               VTy, VTy);
2035249423Sdim
2036249423Sdim                if (VTy->getVectorNumElements() == 2)
2037249423Sdim                  ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost(
2038249423Sdim                    TargetTransformInfo::SK_Reverse, VTy));
2039243830Sdim              } else if (!Ty1->isVectorTy() && !Ty2->isVectorTy()) {
2040249423Sdim                ESContrib = (int) TTI->getVectorInstrCost(
2041243830Sdim                                    Instruction::InsertElement, VTy, 0);
2042249423Sdim                ESContrib += (int) TTI->getVectorInstrCost(
2043243830Sdim                                     Instruction::InsertElement, VTy, 1);
2044243830Sdim              } else if (!Ty1->isVectorTy()) {
2045243830Sdim                // O1 needs to be inserted into a vector of size O2, and then
2046243830Sdim                // both need to be shuffled together.
2047249423Sdim                ESContrib = (int) TTI->getVectorInstrCost(
2048243830Sdim                                    Instruction::InsertElement, Ty2, 0);
2049243830Sdim                ESContrib += (int) getInstrCost(Instruction::ShuffleVector,
2050243830Sdim                                                VTy, Ty2);
2051243830Sdim              } else if (!Ty2->isVectorTy()) {
2052243830Sdim                // O2 needs to be inserted into a vector of size O1, and then
2053243830Sdim                // both need to be shuffled together.
2054249423Sdim                ESContrib = (int) TTI->getVectorInstrCost(
2055243830Sdim                                    Instruction::InsertElement, Ty1, 0);
2056243830Sdim                ESContrib += (int) getInstrCost(Instruction::ShuffleVector,
2057243830Sdim                                                VTy, Ty1);
2058243830Sdim              } else {
2059243830Sdim                Type *TyBig = Ty1, *TySmall = Ty2;
2060243830Sdim                if (Ty2->getVectorNumElements() > Ty1->getVectorNumElements())
2061243830Sdim                  std::swap(TyBig, TySmall);
2062243830Sdim
2063243830Sdim                ESContrib = (int) getInstrCost(Instruction::ShuffleVector,
2064243830Sdim                                               VTy, TyBig);
2065243830Sdim                if (TyBig != TySmall)
2066243830Sdim                  ESContrib += (int) getInstrCost(Instruction::ShuffleVector,
2067243830Sdim                                                  TyBig, TySmall);
2068243830Sdim              }
2069243830Sdim
2070243830Sdim              DEBUG(if (DebugPairSelection) dbgs() << "\tcost {"
2071243830Sdim                     << *O1 << " <-> " << *O2 << "} = " <<
2072243830Sdim                     ESContrib << "\n");
2073243830Sdim              EffSize -= ESContrib;
2074243830Sdim              IncomingPairs.insert(VP);
2075243830Sdim            }
2076243830Sdim          }
2077243830Sdim        }
2078243830Sdim
2079243830Sdim        if (!HasNontrivialInsts) {
2080243830Sdim          DEBUG(if (DebugPairSelection) dbgs() <<
2081249423Sdim                "\tNo non-trivial instructions in DAG;"
2082243830Sdim                " override to zero effective size\n");
2083243830Sdim          EffSize = 0;
2084243830Sdim        }
2085243830Sdim      } else {
2086249423Sdim        for (DenseSet<ValuePair>::iterator S = PrunedDAG.begin(),
2087249423Sdim             E = PrunedDAG.end(); S != E; ++S)
2088243830Sdim          EffSize += (int) getDepthFactor(S->first);
2089243830Sdim      }
2090243830Sdim
2091234285Sdim      DEBUG(if (DebugPairSelection)
2092249423Sdim             dbgs() << "BBV: found pruned DAG for pair {"
2093249423Sdim             << *IJ.first << " <-> " << *IJ.second << "} of depth " <<
2094249423Sdim             MaxDepth << " and size " << PrunedDAG.size() <<
2095234285Sdim            " (effective size: " << EffSize << ")\n");
2096249423Sdim      if (((TTI && !UseChainDepthWithTI) ||
2097243830Sdim            MaxDepth >= Config.ReqChainDepth) &&
2098243830Sdim          EffSize > 0 && EffSize > BestEffSize) {
2099234285Sdim        BestMaxDepth = MaxDepth;
2100234285Sdim        BestEffSize = EffSize;
2101249423Sdim        BestDAG = PrunedDAG;
2102234285Sdim      }
2103234285Sdim    }
2104234285Sdim  }
2105234285Sdim
2106234285Sdim  // Given the list of candidate pairs, this function selects those
2107234285Sdim  // that will be fused into vector instructions.
2108234285Sdim  void BBVectorize::choosePairs(
2109249423Sdim                DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
2110249423Sdim                DenseSet<ValuePair> &CandidatePairsSet,
2111249423Sdim                DenseMap<ValuePair, int> &CandidatePairCostSavings,
2112249423Sdim                std::vector<Value *> &PairableInsts,
2113249423Sdim                DenseSet<ValuePair> &FixedOrderPairs,
2114249423Sdim                DenseMap<VPPair, unsigned> &PairConnectionTypes,
2115249423Sdim                DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
2116249423Sdim                DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps,
2117249423Sdim                DenseSet<ValuePair> &PairableInstUsers,
2118249423Sdim                DenseMap<Value *, Value *>& ChosenPairs) {
2119234285Sdim    bool UseCycleCheck =
2120249423Sdim     CandidatePairsSet.size() <= Config.MaxCandPairsForCycleCheck;
2121249423Sdim
2122249423Sdim    DenseMap<Value *, std::vector<Value *> > CandidatePairs2;
2123249423Sdim    for (DenseSet<ValuePair>::iterator I = CandidatePairsSet.begin(),
2124249423Sdim         E = CandidatePairsSet.end(); I != E; ++I) {
2125249423Sdim      std::vector<Value *> &JJ = CandidatePairs2[I->second];
2126249423Sdim      if (JJ.empty()) JJ.reserve(32);
2127249423Sdim      JJ.push_back(I->first);
2128249423Sdim    }
2129249423Sdim
2130249423Sdim    DenseMap<ValuePair, std::vector<ValuePair> > PairableInstUserMap;
2131249423Sdim    DenseSet<VPPair> PairableInstUserPairSet;
2132234285Sdim    for (std::vector<Value *>::iterator I = PairableInsts.begin(),
2133234285Sdim         E = PairableInsts.end(); I != E; ++I) {
2134234285Sdim      // The number of possible pairings for this variable:
2135249423Sdim      size_t NumChoices = CandidatePairs.lookup(*I).size();
2136234285Sdim      if (!NumChoices) continue;
2137234285Sdim
2138249423Sdim      std::vector<Value *> &JJ = CandidatePairs[*I];
2139234285Sdim
2140249423Sdim      // The best pair to choose and its dag:
2141243830Sdim      size_t BestMaxDepth = 0;
2142243830Sdim      int BestEffSize = 0;
2143249423Sdim      DenseSet<ValuePair> BestDAG;
2144249423Sdim      findBestDAGFor(CandidatePairs, CandidatePairsSet,
2145249423Sdim                      CandidatePairCostSavings,
2146243830Sdim                      PairableInsts, FixedOrderPairs, PairConnectionTypes,
2147243830Sdim                      ConnectedPairs, ConnectedPairDeps,
2148249423Sdim                      PairableInstUsers, PairableInstUserMap,
2149249423Sdim                      PairableInstUserPairSet, ChosenPairs,
2150249423Sdim                      BestDAG, BestMaxDepth, BestEffSize, *I, JJ,
2151234285Sdim                      UseCycleCheck);
2152234285Sdim
2153249423Sdim      if (BestDAG.empty())
2154249423Sdim        continue;
2155249423Sdim
2156249423Sdim      // A dag has been chosen (or not) at this point. If no dag was
2157234285Sdim      // chosen, then this instruction, I, cannot be paired (and is no longer
2158234285Sdim      // considered).
2159234285Sdim
2160249423Sdim      DEBUG(dbgs() << "BBV: selected pairs in the best DAG for: "
2161249423Sdim                   << *cast<Instruction>(*I) << "\n");
2162234285Sdim
2163249423Sdim      for (DenseSet<ValuePair>::iterator S = BestDAG.begin(),
2164249423Sdim           SE2 = BestDAG.end(); S != SE2; ++S) {
2165249423Sdim        // Insert the members of this dag into the list of chosen pairs.
2166234285Sdim        ChosenPairs.insert(ValuePair(S->first, S->second));
2167234285Sdim        DEBUG(dbgs() << "BBV: selected pair: " << *S->first << " <-> " <<
2168234285Sdim               *S->second << "\n");
2169234285Sdim
2170249423Sdim        // Remove all candidate pairs that have values in the chosen dag.
2171249423Sdim        std::vector<Value *> &KK = CandidatePairs[S->first];
2172249423Sdim        for (std::vector<Value *>::iterator K = KK.begin(), KE = KK.end();
2173249423Sdim             K != KE; ++K) {
2174249423Sdim          if (*K == S->second)
2175249423Sdim            continue;
2176249423Sdim
2177249423Sdim          CandidatePairsSet.erase(ValuePair(S->first, *K));
2178234285Sdim        }
2179249423Sdim
2180249423Sdim        std::vector<Value *> &LL = CandidatePairs2[S->second];
2181249423Sdim        for (std::vector<Value *>::iterator L = LL.begin(), LE = LL.end();
2182249423Sdim             L != LE; ++L) {
2183249423Sdim          if (*L == S->first)
2184249423Sdim            continue;
2185249423Sdim
2186249423Sdim          CandidatePairsSet.erase(ValuePair(*L, S->second));
2187249423Sdim        }
2188249423Sdim
2189249423Sdim        std::vector<Value *> &MM = CandidatePairs[S->second];
2190249423Sdim        for (std::vector<Value *>::iterator M = MM.begin(), ME = MM.end();
2191249423Sdim             M != ME; ++M) {
2192249423Sdim          assert(*M != S->first && "Flipped pair in candidate list?");
2193249423Sdim          CandidatePairsSet.erase(ValuePair(S->second, *M));
2194249423Sdim        }
2195249423Sdim
2196249423Sdim        std::vector<Value *> &NN = CandidatePairs2[S->first];
2197249423Sdim        for (std::vector<Value *>::iterator N = NN.begin(), NE = NN.end();
2198249423Sdim             N != NE; ++N) {
2199249423Sdim          assert(*N != S->second && "Flipped pair in candidate list?");
2200249423Sdim          CandidatePairsSet.erase(ValuePair(*N, S->first));
2201249423Sdim        }
2202234285Sdim      }
2203234285Sdim    }
2204234285Sdim
2205234285Sdim    DEBUG(dbgs() << "BBV: selected " << ChosenPairs.size() << " pairs.\n");
2206234285Sdim  }
2207234285Sdim
2208234285Sdim  std::string getReplacementName(Instruction *I, bool IsInput, unsigned o,
2209234285Sdim                     unsigned n = 0) {
2210234285Sdim    if (!I->hasName())
2211234285Sdim      return "";
2212234285Sdim
2213234285Sdim    return (I->getName() + (IsInput ? ".v.i" : ".v.r") + utostr(o) +
2214234285Sdim             (n > 0 ? "." + utostr(n) : "")).str();
2215234285Sdim  }
2216234285Sdim
2217234285Sdim  // Returns the value that is to be used as the pointer input to the vector
2218234285Sdim  // instruction that fuses I with J.
2219234285Sdim  Value *BBVectorize::getReplacementPointerInput(LLVMContext& Context,
2220243830Sdim                     Instruction *I, Instruction *J, unsigned o) {
2221234285Sdim    Value *IPtr, *JPtr;
2222243830Sdim    unsigned IAlignment, JAlignment, IAddressSpace, JAddressSpace;
2223234285Sdim    int64_t OffsetInElmts;
2224239462Sdim
2225243830Sdim    // Note: the analysis might fail here, that is why the pair order has
2226239462Sdim    // been precomputed (OffsetInElmts must be unused here).
2227234285Sdim    (void) getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment,
2228243830Sdim                          IAddressSpace, JAddressSpace,
2229243830Sdim                          OffsetInElmts, false);
2230234285Sdim
2231234285Sdim    // The pointer value is taken to be the one with the lowest offset.
2232243830Sdim    Value *VPtr = IPtr;
2233234285Sdim
2234263508Sdim    Type *ArgTypeI = IPtr->getType()->getPointerElementType();
2235263508Sdim    Type *ArgTypeJ = JPtr->getType()->getPointerElementType();
2236239462Sdim    Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ);
2237263508Sdim    Type *VArgPtrType
2238263508Sdim      = PointerType::get(VArgType,
2239263508Sdim                         IPtr->getType()->getPointerAddressSpace());
2240234285Sdim    return new BitCastInst(VPtr, VArgPtrType, getReplacementName(I, true, o),
2241243830Sdim                        /* insert before */ I);
2242234285Sdim  }
2243234285Sdim
2244234285Sdim  void BBVectorize::fillNewShuffleMask(LLVMContext& Context, Instruction *J,
2245239462Sdim                     unsigned MaskOffset, unsigned NumInElem,
2246239462Sdim                     unsigned NumInElem1, unsigned IdxOffset,
2247239462Sdim                     std::vector<Constant*> &Mask) {
2248263508Sdim    unsigned NumElem1 = J->getType()->getVectorNumElements();
2249239462Sdim    for (unsigned v = 0; v < NumElem1; ++v) {
2250234285Sdim      int m = cast<ShuffleVectorInst>(J)->getMaskValue(v);
2251234285Sdim      if (m < 0) {
2252234285Sdim        Mask[v+MaskOffset] = UndefValue::get(Type::getInt32Ty(Context));
2253234285Sdim      } else {
2254234285Sdim        unsigned mm = m + (int) IdxOffset;
2255239462Sdim        if (m >= (int) NumInElem1)
2256234285Sdim          mm += (int) NumInElem;
2257234285Sdim
2258234285Sdim        Mask[v+MaskOffset] =
2259234285Sdim          ConstantInt::get(Type::getInt32Ty(Context), mm);
2260234285Sdim      }
2261234285Sdim    }
2262234285Sdim  }
2263234285Sdim
2264234285Sdim  // Returns the value that is to be used as the vector-shuffle mask to the
2265234285Sdim  // vector instruction that fuses I with J.
2266234285Sdim  Value *BBVectorize::getReplacementShuffleMask(LLVMContext& Context,
2267234285Sdim                     Instruction *I, Instruction *J) {
2268234285Sdim    // This is the shuffle mask. We need to append the second
2269234285Sdim    // mask to the first, and the numbers need to be adjusted.
2270234285Sdim
2271239462Sdim    Type *ArgTypeI = I->getType();
2272239462Sdim    Type *ArgTypeJ = J->getType();
2273239462Sdim    Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ);
2274234285Sdim
2275263508Sdim    unsigned NumElemI = ArgTypeI->getVectorNumElements();
2276239462Sdim
2277234285Sdim    // Get the total number of elements in the fused vector type.
2278234285Sdim    // By definition, this must equal the number of elements in
2279234285Sdim    // the final mask.
2280263508Sdim    unsigned NumElem = VArgType->getVectorNumElements();
2281234285Sdim    std::vector<Constant*> Mask(NumElem);
2282234285Sdim
2283239462Sdim    Type *OpTypeI = I->getOperand(0)->getType();
2284263508Sdim    unsigned NumInElemI = OpTypeI->getVectorNumElements();
2285239462Sdim    Type *OpTypeJ = J->getOperand(0)->getType();
2286263508Sdim    unsigned NumInElemJ = OpTypeJ->getVectorNumElements();
2287234285Sdim
2288239462Sdim    // The fused vector will be:
2289239462Sdim    // -----------------------------------------------------
2290239462Sdim    // | NumInElemI | NumInElemJ | NumInElemI | NumInElemJ |
2291239462Sdim    // -----------------------------------------------------
2292239462Sdim    // from which we'll extract NumElem total elements (where the first NumElemI
2293239462Sdim    // of them come from the mask in I and the remainder come from the mask
2294239462Sdim    // in J.
2295239462Sdim
2296234285Sdim    // For the mask from the first pair...
2297239462Sdim    fillNewShuffleMask(Context, I, 0,        NumInElemJ, NumInElemI,
2298239462Sdim                       0,          Mask);
2299234285Sdim
2300234285Sdim    // For the mask from the second pair...
2301239462Sdim    fillNewShuffleMask(Context, J, NumElemI, NumInElemI, NumInElemJ,
2302239462Sdim                       NumInElemI, Mask);
2303234285Sdim
2304234285Sdim    return ConstantVector::get(Mask);
2305234285Sdim  }
2306234285Sdim
2307239462Sdim  bool BBVectorize::expandIEChain(LLVMContext& Context, Instruction *I,
2308239462Sdim                                  Instruction *J, unsigned o, Value *&LOp,
2309239462Sdim                                  unsigned numElemL,
2310239462Sdim                                  Type *ArgTypeL, Type *ArgTypeH,
2311243830Sdim                                  bool IBeforeJ, unsigned IdxOff) {
2312239462Sdim    bool ExpandedIEChain = false;
2313239462Sdim    if (InsertElementInst *LIE = dyn_cast<InsertElementInst>(LOp)) {
2314239462Sdim      // If we have a pure insertelement chain, then this can be rewritten
2315239462Sdim      // into a chain that directly builds the larger type.
2316243830Sdim      if (isPureIEChain(LIE)) {
2317239462Sdim        SmallVector<Value *, 8> VectElemts(numElemL,
2318239462Sdim          UndefValue::get(ArgTypeL->getScalarType()));
2319239462Sdim        InsertElementInst *LIENext = LIE;
2320239462Sdim        do {
2321239462Sdim          unsigned Idx =
2322239462Sdim            cast<ConstantInt>(LIENext->getOperand(2))->getSExtValue();
2323239462Sdim          VectElemts[Idx] = LIENext->getOperand(1);
2324239462Sdim        } while ((LIENext =
2325239462Sdim                   dyn_cast<InsertElementInst>(LIENext->getOperand(0))));
2326239462Sdim
2327239462Sdim        LIENext = 0;
2328239462Sdim        Value *LIEPrev = UndefValue::get(ArgTypeH);
2329239462Sdim        for (unsigned i = 0; i < numElemL; ++i) {
2330239462Sdim          if (isa<UndefValue>(VectElemts[i])) continue;
2331239462Sdim          LIENext = InsertElementInst::Create(LIEPrev, VectElemts[i],
2332239462Sdim                             ConstantInt::get(Type::getInt32Ty(Context),
2333239462Sdim                                              i + IdxOff),
2334243830Sdim                             getReplacementName(IBeforeJ ? I : J,
2335243830Sdim                                                true, o, i+1));
2336243830Sdim          LIENext->insertBefore(IBeforeJ ? J : I);
2337239462Sdim          LIEPrev = LIENext;
2338239462Sdim        }
2339239462Sdim
2340239462Sdim        LOp = LIENext ? (Value*) LIENext : UndefValue::get(ArgTypeH);
2341239462Sdim        ExpandedIEChain = true;
2342239462Sdim      }
2343239462Sdim    }
2344239462Sdim
2345239462Sdim    return ExpandedIEChain;
2346239462Sdim  }
2347239462Sdim
2348263508Sdim  static unsigned getNumScalarElements(Type *Ty) {
2349263508Sdim    if (VectorType *VecTy = dyn_cast<VectorType>(Ty))
2350263508Sdim      return VecTy->getNumElements();
2351263508Sdim    return 1;
2352263508Sdim  }
2353263508Sdim
2354234285Sdim  // Returns the value to be used as the specified operand of the vector
2355234285Sdim  // instruction that fuses I with J.
2356234285Sdim  Value *BBVectorize::getReplacementInput(LLVMContext& Context, Instruction *I,
2357243830Sdim                     Instruction *J, unsigned o, bool IBeforeJ) {
2358234285Sdim    Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0);
2359234285Sdim    Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), 1);
2360234285Sdim
2361239462Sdim    // Compute the fused vector type for this operand
2362239462Sdim    Type *ArgTypeI = I->getOperand(o)->getType();
2363239462Sdim    Type *ArgTypeJ = J->getOperand(o)->getType();
2364239462Sdim    VectorType *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ);
2365234285Sdim
2366234285Sdim    Instruction *L = I, *H = J;
2367239462Sdim    Type *ArgTypeL = ArgTypeI, *ArgTypeH = ArgTypeJ;
2368234285Sdim
2369263508Sdim    unsigned numElemL = getNumScalarElements(ArgTypeL);
2370263508Sdim    unsigned numElemH = getNumScalarElements(ArgTypeH);
2371234285Sdim
2372239462Sdim    Value *LOp = L->getOperand(o);
2373239462Sdim    Value *HOp = H->getOperand(o);
2374239462Sdim    unsigned numElem = VArgType->getNumElements();
2375239462Sdim
2376239462Sdim    // First, we check if we can reuse the "original" vector outputs (if these
2377239462Sdim    // exist). We might need a shuffle.
2378239462Sdim    ExtractElementInst *LEE = dyn_cast<ExtractElementInst>(LOp);
2379239462Sdim    ExtractElementInst *HEE = dyn_cast<ExtractElementInst>(HOp);
2380239462Sdim    ShuffleVectorInst *LSV = dyn_cast<ShuffleVectorInst>(LOp);
2381239462Sdim    ShuffleVectorInst *HSV = dyn_cast<ShuffleVectorInst>(HOp);
2382239462Sdim
2383239462Sdim    // FIXME: If we're fusing shuffle instructions, then we can't apply this
2384239462Sdim    // optimization. The input vectors to the shuffle might be a different
2385239462Sdim    // length from the shuffle outputs. Unfortunately, the replacement
2386239462Sdim    // shuffle mask has already been formed, and the mask entries are sensitive
2387239462Sdim    // to the sizes of the inputs.
2388239462Sdim    bool IsSizeChangeShuffle =
2389239462Sdim      isa<ShuffleVectorInst>(L) &&
2390239462Sdim        (LOp->getType() != L->getType() || HOp->getType() != H->getType());
2391239462Sdim
2392239462Sdim    if ((LEE || LSV) && (HEE || HSV) && !IsSizeChangeShuffle) {
2393239462Sdim      // We can have at most two unique vector inputs.
2394239462Sdim      bool CanUseInputs = true;
2395239462Sdim      Value *I1, *I2 = 0;
2396239462Sdim      if (LEE) {
2397239462Sdim        I1 = LEE->getOperand(0);
2398239462Sdim      } else {
2399239462Sdim        I1 = LSV->getOperand(0);
2400239462Sdim        I2 = LSV->getOperand(1);
2401239462Sdim        if (I2 == I1 || isa<UndefValue>(I2))
2402239462Sdim          I2 = 0;
2403239462Sdim      }
2404239462Sdim
2405239462Sdim      if (HEE) {
2406239462Sdim        Value *I3 = HEE->getOperand(0);
2407239462Sdim        if (!I2 && I3 != I1)
2408239462Sdim          I2 = I3;
2409239462Sdim        else if (I3 != I1 && I3 != I2)
2410239462Sdim          CanUseInputs = false;
2411239462Sdim      } else {
2412239462Sdim        Value *I3 = HSV->getOperand(0);
2413239462Sdim        if (!I2 && I3 != I1)
2414239462Sdim          I2 = I3;
2415239462Sdim        else if (I3 != I1 && I3 != I2)
2416239462Sdim          CanUseInputs = false;
2417239462Sdim
2418239462Sdim        if (CanUseInputs) {
2419239462Sdim          Value *I4 = HSV->getOperand(1);
2420239462Sdim          if (!isa<UndefValue>(I4)) {
2421239462Sdim            if (!I2 && I4 != I1)
2422239462Sdim              I2 = I4;
2423239462Sdim            else if (I4 != I1 && I4 != I2)
2424239462Sdim              CanUseInputs = false;
2425239462Sdim          }
2426239462Sdim        }
2427239462Sdim      }
2428239462Sdim
2429239462Sdim      if (CanUseInputs) {
2430239462Sdim        unsigned LOpElem =
2431263508Sdim          cast<Instruction>(LOp)->getOperand(0)->getType()
2432263508Sdim            ->getVectorNumElements();
2433263508Sdim
2434239462Sdim        unsigned HOpElem =
2435263508Sdim          cast<Instruction>(HOp)->getOperand(0)->getType()
2436263508Sdim            ->getVectorNumElements();
2437239462Sdim
2438239462Sdim        // We have one or two input vectors. We need to map each index of the
2439239462Sdim        // operands to the index of the original vector.
2440239462Sdim        SmallVector<std::pair<int, int>, 8>  II(numElem);
2441239462Sdim        for (unsigned i = 0; i < numElemL; ++i) {
2442239462Sdim          int Idx, INum;
2443239462Sdim          if (LEE) {
2444239462Sdim            Idx =
2445239462Sdim              cast<ConstantInt>(LEE->getOperand(1))->getSExtValue();
2446239462Sdim            INum = LEE->getOperand(0) == I1 ? 0 : 1;
2447239462Sdim          } else {
2448239462Sdim            Idx = LSV->getMaskValue(i);
2449239462Sdim            if (Idx < (int) LOpElem) {
2450239462Sdim              INum = LSV->getOperand(0) == I1 ? 0 : 1;
2451239462Sdim            } else {
2452239462Sdim              Idx -= LOpElem;
2453239462Sdim              INum = LSV->getOperand(1) == I1 ? 0 : 1;
2454239462Sdim            }
2455239462Sdim          }
2456239462Sdim
2457239462Sdim          II[i] = std::pair<int, int>(Idx, INum);
2458239462Sdim        }
2459239462Sdim        for (unsigned i = 0; i < numElemH; ++i) {
2460239462Sdim          int Idx, INum;
2461239462Sdim          if (HEE) {
2462239462Sdim            Idx =
2463239462Sdim              cast<ConstantInt>(HEE->getOperand(1))->getSExtValue();
2464239462Sdim            INum = HEE->getOperand(0) == I1 ? 0 : 1;
2465239462Sdim          } else {
2466239462Sdim            Idx = HSV->getMaskValue(i);
2467239462Sdim            if (Idx < (int) HOpElem) {
2468239462Sdim              INum = HSV->getOperand(0) == I1 ? 0 : 1;
2469239462Sdim            } else {
2470239462Sdim              Idx -= HOpElem;
2471239462Sdim              INum = HSV->getOperand(1) == I1 ? 0 : 1;
2472239462Sdim            }
2473239462Sdim          }
2474239462Sdim
2475239462Sdim          II[i + numElemL] = std::pair<int, int>(Idx, INum);
2476239462Sdim        }
2477239462Sdim
2478239462Sdim        // We now have an array which tells us from which index of which
2479239462Sdim        // input vector each element of the operand comes.
2480239462Sdim        VectorType *I1T = cast<VectorType>(I1->getType());
2481239462Sdim        unsigned I1Elem = I1T->getNumElements();
2482239462Sdim
2483239462Sdim        if (!I2) {
2484239462Sdim          // In this case there is only one underlying vector input. Check for
2485239462Sdim          // the trivial case where we can use the input directly.
2486239462Sdim          if (I1Elem == numElem) {
2487239462Sdim            bool ElemInOrder = true;
2488239462Sdim            for (unsigned i = 0; i < numElem; ++i) {
2489239462Sdim              if (II[i].first != (int) i && II[i].first != -1) {
2490239462Sdim                ElemInOrder = false;
2491239462Sdim                break;
2492239462Sdim              }
2493239462Sdim            }
2494239462Sdim
2495239462Sdim            if (ElemInOrder)
2496239462Sdim              return I1;
2497239462Sdim          }
2498239462Sdim
2499239462Sdim          // A shuffle is needed.
2500239462Sdim          std::vector<Constant *> Mask(numElem);
2501239462Sdim          for (unsigned i = 0; i < numElem; ++i) {
2502239462Sdim            int Idx = II[i].first;
2503239462Sdim            if (Idx == -1)
2504239462Sdim              Mask[i] = UndefValue::get(Type::getInt32Ty(Context));
2505239462Sdim            else
2506239462Sdim              Mask[i] = ConstantInt::get(Type::getInt32Ty(Context), Idx);
2507239462Sdim          }
2508239462Sdim
2509239462Sdim          Instruction *S =
2510239462Sdim            new ShuffleVectorInst(I1, UndefValue::get(I1T),
2511239462Sdim                                  ConstantVector::get(Mask),
2512243830Sdim                                  getReplacementName(IBeforeJ ? I : J,
2513243830Sdim                                                     true, o));
2514243830Sdim          S->insertBefore(IBeforeJ ? J : I);
2515239462Sdim          return S;
2516239462Sdim        }
2517239462Sdim
2518239462Sdim        VectorType *I2T = cast<VectorType>(I2->getType());
2519239462Sdim        unsigned I2Elem = I2T->getNumElements();
2520239462Sdim
2521239462Sdim        // This input comes from two distinct vectors. The first step is to
2522239462Sdim        // make sure that both vectors are the same length. If not, the
2523239462Sdim        // smaller one will need to grow before they can be shuffled together.
2524239462Sdim        if (I1Elem < I2Elem) {
2525239462Sdim          std::vector<Constant *> Mask(I2Elem);
2526239462Sdim          unsigned v = 0;
2527239462Sdim          for (; v < I1Elem; ++v)
2528239462Sdim            Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
2529239462Sdim          for (; v < I2Elem; ++v)
2530239462Sdim            Mask[v] = UndefValue::get(Type::getInt32Ty(Context));
2531239462Sdim
2532239462Sdim          Instruction *NewI1 =
2533239462Sdim            new ShuffleVectorInst(I1, UndefValue::get(I1T),
2534239462Sdim                                  ConstantVector::get(Mask),
2535243830Sdim                                  getReplacementName(IBeforeJ ? I : J,
2536243830Sdim                                                     true, o, 1));
2537243830Sdim          NewI1->insertBefore(IBeforeJ ? J : I);
2538239462Sdim          I1 = NewI1;
2539239462Sdim          I1T = I2T;
2540239462Sdim          I1Elem = I2Elem;
2541239462Sdim        } else if (I1Elem > I2Elem) {
2542239462Sdim          std::vector<Constant *> Mask(I1Elem);
2543239462Sdim          unsigned v = 0;
2544239462Sdim          for (; v < I2Elem; ++v)
2545239462Sdim            Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
2546239462Sdim          for (; v < I1Elem; ++v)
2547239462Sdim            Mask[v] = UndefValue::get(Type::getInt32Ty(Context));
2548239462Sdim
2549239462Sdim          Instruction *NewI2 =
2550239462Sdim            new ShuffleVectorInst(I2, UndefValue::get(I2T),
2551239462Sdim                                  ConstantVector::get(Mask),
2552243830Sdim                                  getReplacementName(IBeforeJ ? I : J,
2553243830Sdim                                                     true, o, 1));
2554243830Sdim          NewI2->insertBefore(IBeforeJ ? J : I);
2555239462Sdim          I2 = NewI2;
2556239462Sdim          I2T = I1T;
2557239462Sdim          I2Elem = I1Elem;
2558239462Sdim        }
2559239462Sdim
2560239462Sdim        // Now that both I1 and I2 are the same length we can shuffle them
2561239462Sdim        // together (and use the result).
2562239462Sdim        std::vector<Constant *> Mask(numElem);
2563239462Sdim        for (unsigned v = 0; v < numElem; ++v) {
2564239462Sdim          if (II[v].first == -1) {
2565239462Sdim            Mask[v] = UndefValue::get(Type::getInt32Ty(Context));
2566239462Sdim          } else {
2567239462Sdim            int Idx = II[v].first + II[v].second * I1Elem;
2568239462Sdim            Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), Idx);
2569239462Sdim          }
2570239462Sdim        }
2571239462Sdim
2572239462Sdim        Instruction *NewOp =
2573239462Sdim          new ShuffleVectorInst(I1, I2, ConstantVector::get(Mask),
2574243830Sdim                                getReplacementName(IBeforeJ ? I : J, true, o));
2575243830Sdim        NewOp->insertBefore(IBeforeJ ? J : I);
2576239462Sdim        return NewOp;
2577239462Sdim      }
2578234285Sdim    }
2579234285Sdim
2580239462Sdim    Type *ArgType = ArgTypeL;
2581239462Sdim    if (numElemL < numElemH) {
2582239462Sdim      if (numElemL == 1 && expandIEChain(Context, I, J, o, HOp, numElemH,
2583243830Sdim                                         ArgTypeL, VArgType, IBeforeJ, 1)) {
2584239462Sdim        // This is another short-circuit case: we're combining a scalar into
2585239462Sdim        // a vector that is formed by an IE chain. We've just expanded the IE
2586239462Sdim        // chain, now insert the scalar and we're done.
2587234285Sdim
2588239462Sdim        Instruction *S = InsertElementInst::Create(HOp, LOp, CV0,
2589243830Sdim                           getReplacementName(IBeforeJ ? I : J, true, o));
2590243830Sdim        S->insertBefore(IBeforeJ ? J : I);
2591239462Sdim        return S;
2592239462Sdim      } else if (!expandIEChain(Context, I, J, o, LOp, numElemL, ArgTypeL,
2593243830Sdim                                ArgTypeH, IBeforeJ)) {
2594239462Sdim        // The two vector inputs to the shuffle must be the same length,
2595239462Sdim        // so extend the smaller vector to be the same length as the larger one.
2596239462Sdim        Instruction *NLOp;
2597239462Sdim        if (numElemL > 1) {
2598239462Sdim
2599239462Sdim          std::vector<Constant *> Mask(numElemH);
2600239462Sdim          unsigned v = 0;
2601239462Sdim          for (; v < numElemL; ++v)
2602239462Sdim            Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
2603239462Sdim          for (; v < numElemH; ++v)
2604239462Sdim            Mask[v] = UndefValue::get(Type::getInt32Ty(Context));
2605239462Sdim
2606239462Sdim          NLOp = new ShuffleVectorInst(LOp, UndefValue::get(ArgTypeL),
2607239462Sdim                                       ConstantVector::get(Mask),
2608243830Sdim                                       getReplacementName(IBeforeJ ? I : J,
2609243830Sdim                                                          true, o, 1));
2610239462Sdim        } else {
2611239462Sdim          NLOp = InsertElementInst::Create(UndefValue::get(ArgTypeH), LOp, CV0,
2612243830Sdim                                           getReplacementName(IBeforeJ ? I : J,
2613243830Sdim                                                              true, o, 1));
2614239462Sdim        }
2615239462Sdim
2616243830Sdim        NLOp->insertBefore(IBeforeJ ? J : I);
2617239462Sdim        LOp = NLOp;
2618239462Sdim      }
2619234285Sdim
2620239462Sdim      ArgType = ArgTypeH;
2621239462Sdim    } else if (numElemL > numElemH) {
2622239462Sdim      if (numElemH == 1 && expandIEChain(Context, I, J, o, LOp, numElemL,
2623243830Sdim                                         ArgTypeH, VArgType, IBeforeJ)) {
2624239462Sdim        Instruction *S =
2625239462Sdim          InsertElementInst::Create(LOp, HOp,
2626239462Sdim                                    ConstantInt::get(Type::getInt32Ty(Context),
2627239462Sdim                                                     numElemL),
2628243830Sdim                                    getReplacementName(IBeforeJ ? I : J,
2629243830Sdim                                                       true, o));
2630243830Sdim        S->insertBefore(IBeforeJ ? J : I);
2631239462Sdim        return S;
2632239462Sdim      } else if (!expandIEChain(Context, I, J, o, HOp, numElemH, ArgTypeH,
2633243830Sdim                                ArgTypeL, IBeforeJ)) {
2634239462Sdim        Instruction *NHOp;
2635239462Sdim        if (numElemH > 1) {
2636239462Sdim          std::vector<Constant *> Mask(numElemL);
2637239462Sdim          unsigned v = 0;
2638239462Sdim          for (; v < numElemH; ++v)
2639239462Sdim            Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
2640239462Sdim          for (; v < numElemL; ++v)
2641239462Sdim            Mask[v] = UndefValue::get(Type::getInt32Ty(Context));
2642239462Sdim
2643239462Sdim          NHOp = new ShuffleVectorInst(HOp, UndefValue::get(ArgTypeH),
2644239462Sdim                                       ConstantVector::get(Mask),
2645243830Sdim                                       getReplacementName(IBeforeJ ? I : J,
2646243830Sdim                                                          true, o, 1));
2647239462Sdim        } else {
2648239462Sdim          NHOp = InsertElementInst::Create(UndefValue::get(ArgTypeL), HOp, CV0,
2649243830Sdim                                           getReplacementName(IBeforeJ ? I : J,
2650243830Sdim                                                              true, o, 1));
2651239462Sdim        }
2652263508Sdim
2653243830Sdim        NHOp->insertBefore(IBeforeJ ? J : I);
2654239462Sdim        HOp = NHOp;
2655239462Sdim      }
2656239462Sdim    }
2657234285Sdim
2658239462Sdim    if (ArgType->isVectorTy()) {
2659263508Sdim      unsigned numElem = VArgType->getVectorNumElements();
2660239462Sdim      std::vector<Constant*> Mask(numElem);
2661239462Sdim      for (unsigned v = 0; v < numElem; ++v) {
2662239462Sdim        unsigned Idx = v;
2663239462Sdim        // If the low vector was expanded, we need to skip the extra
2664239462Sdim        // undefined entries.
2665239462Sdim        if (v >= numElemL && numElemH > numElemL)
2666239462Sdim          Idx += (numElemH - numElemL);
2667239462Sdim        Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), Idx);
2668234285Sdim      }
2669234285Sdim
2670239462Sdim      Instruction *BV = new ShuffleVectorInst(LOp, HOp,
2671243830Sdim                          ConstantVector::get(Mask),
2672243830Sdim                          getReplacementName(IBeforeJ ? I : J, true, o));
2673243830Sdim      BV->insertBefore(IBeforeJ ? J : I);
2674234285Sdim      return BV;
2675234285Sdim    }
2676234285Sdim
2677234285Sdim    Instruction *BV1 = InsertElementInst::Create(
2678239462Sdim                                          UndefValue::get(VArgType), LOp, CV0,
2679243830Sdim                                          getReplacementName(IBeforeJ ? I : J,
2680243830Sdim                                                             true, o, 1));
2681243830Sdim    BV1->insertBefore(IBeforeJ ? J : I);
2682239462Sdim    Instruction *BV2 = InsertElementInst::Create(BV1, HOp, CV1,
2683243830Sdim                                          getReplacementName(IBeforeJ ? I : J,
2684243830Sdim                                                             true, o, 2));
2685243830Sdim    BV2->insertBefore(IBeforeJ ? J : I);
2686234285Sdim    return BV2;
2687234285Sdim  }
2688234285Sdim
2689234285Sdim  // This function creates an array of values that will be used as the inputs
2690234285Sdim  // to the vector instruction that fuses I with J.
2691234285Sdim  void BBVectorize::getReplacementInputsForPair(LLVMContext& Context,
2692234285Sdim                     Instruction *I, Instruction *J,
2693263508Sdim                     SmallVectorImpl<Value *> &ReplacedOperands,
2694243830Sdim                     bool IBeforeJ) {
2695234285Sdim    unsigned NumOperands = I->getNumOperands();
2696234285Sdim
2697234285Sdim    for (unsigned p = 0, o = NumOperands-1; p < NumOperands; ++p, --o) {
2698234285Sdim      // Iterate backward so that we look at the store pointer
2699234285Sdim      // first and know whether or not we need to flip the inputs.
2700234285Sdim
2701234285Sdim      if (isa<LoadInst>(I) || (o == 1 && isa<StoreInst>(I))) {
2702234285Sdim        // This is the pointer for a load/store instruction.
2703243830Sdim        ReplacedOperands[o] = getReplacementPointerInput(Context, I, J, o);
2704234285Sdim        continue;
2705234285Sdim      } else if (isa<CallInst>(I)) {
2706234285Sdim        Function *F = cast<CallInst>(I)->getCalledFunction();
2707249423Sdim        Intrinsic::ID IID = (Intrinsic::ID) F->getIntrinsicID();
2708234285Sdim        if (o == NumOperands-1) {
2709234285Sdim          BasicBlock &BB = *I->getParent();
2710234285Sdim
2711234285Sdim          Module *M = BB.getParent()->getParent();
2712239462Sdim          Type *ArgTypeI = I->getType();
2713239462Sdim          Type *ArgTypeJ = J->getType();
2714239462Sdim          Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ);
2715234285Sdim
2716249423Sdim          ReplacedOperands[o] = Intrinsic::getDeclaration(M, IID, VArgType);
2717234285Sdim          continue;
2718234285Sdim        } else if (IID == Intrinsic::powi && o == 1) {
2719234285Sdim          // The second argument of powi is a single integer and we've already
2720234285Sdim          // checked that both arguments are equal. As a result, we just keep
2721234285Sdim          // I's second argument.
2722234285Sdim          ReplacedOperands[o] = I->getOperand(o);
2723234285Sdim          continue;
2724234285Sdim        }
2725234285Sdim      } else if (isa<ShuffleVectorInst>(I) && o == NumOperands-1) {
2726234285Sdim        ReplacedOperands[o] = getReplacementShuffleMask(Context, I, J);
2727234285Sdim        continue;
2728234285Sdim      }
2729234285Sdim
2730243830Sdim      ReplacedOperands[o] = getReplacementInput(Context, I, J, o, IBeforeJ);
2731234285Sdim    }
2732234285Sdim  }
2733234285Sdim
2734234285Sdim  // This function creates two values that represent the outputs of the
2735234285Sdim  // original I and J instructions. These are generally vector shuffles
2736234285Sdim  // or extracts. In many cases, these will end up being unused and, thus,
2737234285Sdim  // eliminated by later passes.
2738234285Sdim  void BBVectorize::replaceOutputsOfPair(LLVMContext& Context, Instruction *I,
2739234285Sdim                     Instruction *J, Instruction *K,
2740234285Sdim                     Instruction *&InsertionPt,
2741243830Sdim                     Instruction *&K1, Instruction *&K2) {
2742234285Sdim    if (isa<StoreInst>(I)) {
2743234285Sdim      AA->replaceWithNewValue(I, K);
2744234285Sdim      AA->replaceWithNewValue(J, K);
2745234285Sdim    } else {
2746234285Sdim      Type *IType = I->getType();
2747239462Sdim      Type *JType = J->getType();
2748234285Sdim
2749239462Sdim      VectorType *VType = getVecTypeForPair(IType, JType);
2750239462Sdim      unsigned numElem = VType->getNumElements();
2751239462Sdim
2752263508Sdim      unsigned numElemI = getNumScalarElements(IType);
2753263508Sdim      unsigned numElemJ = getNumScalarElements(JType);
2754239462Sdim
2755234285Sdim      if (IType->isVectorTy()) {
2756239462Sdim        std::vector<Constant*> Mask1(numElemI), Mask2(numElemI);
2757239462Sdim        for (unsigned v = 0; v < numElemI; ++v) {
2758239462Sdim          Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
2759239462Sdim          Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElemJ+v);
2760239462Sdim        }
2761234285Sdim
2762239462Sdim        K1 = new ShuffleVectorInst(K, UndefValue::get(VType),
2763243830Sdim                                   ConstantVector::get( Mask1),
2764239462Sdim                                   getReplacementName(K, false, 1));
2765234285Sdim      } else {
2766239462Sdim        Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0);
2767243830Sdim        K1 = ExtractElementInst::Create(K, CV0,
2768234285Sdim                                          getReplacementName(K, false, 1));
2769239462Sdim      }
2770239462Sdim
2771239462Sdim      if (JType->isVectorTy()) {
2772239462Sdim        std::vector<Constant*> Mask1(numElemJ), Mask2(numElemJ);
2773239462Sdim        for (unsigned v = 0; v < numElemJ; ++v) {
2774239462Sdim          Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
2775239462Sdim          Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElemI+v);
2776239462Sdim        }
2777239462Sdim
2778239462Sdim        K2 = new ShuffleVectorInst(K, UndefValue::get(VType),
2779243830Sdim                                   ConstantVector::get( Mask2),
2780239462Sdim                                   getReplacementName(K, false, 2));
2781239462Sdim      } else {
2782239462Sdim        Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), numElem-1);
2783243830Sdim        K2 = ExtractElementInst::Create(K, CV1,
2784234285Sdim                                          getReplacementName(K, false, 2));
2785234285Sdim      }
2786234285Sdim
2787234285Sdim      K1->insertAfter(K);
2788234285Sdim      K2->insertAfter(K1);
2789234285Sdim      InsertionPt = K2;
2790234285Sdim    }
2791234285Sdim  }
2792234285Sdim
2793234285Sdim  // Move all uses of the function I (including pairing-induced uses) after J.
2794234285Sdim  bool BBVectorize::canMoveUsesOfIAfterJ(BasicBlock &BB,
2795249423Sdim                     DenseSet<ValuePair> &LoadMoveSetPairs,
2796234285Sdim                     Instruction *I, Instruction *J) {
2797234285Sdim    // Skip to the first instruction past I.
2798234285Sdim    BasicBlock::iterator L = llvm::next(BasicBlock::iterator(I));
2799234285Sdim
2800234285Sdim    DenseSet<Value *> Users;
2801234285Sdim    AliasSetTracker WriteSet(*AA);
2802263508Sdim    if (I->mayWriteToMemory()) WriteSet.add(I);
2803263508Sdim
2804234285Sdim    for (; cast<Instruction>(L) != J; ++L)
2805249423Sdim      (void) trackUsesOfI(Users, WriteSet, I, L, true, &LoadMoveSetPairs);
2806234285Sdim
2807234285Sdim    assert(cast<Instruction>(L) == J &&
2808234285Sdim      "Tracking has not proceeded far enough to check for dependencies");
2809234285Sdim    // If J is now in the use set of I, then trackUsesOfI will return true
2810234285Sdim    // and we have a dependency cycle (and the fusing operation must abort).
2811249423Sdim    return !trackUsesOfI(Users, WriteSet, I, J, true, &LoadMoveSetPairs);
2812234285Sdim  }
2813234285Sdim
2814234285Sdim  // Move all uses of the function I (including pairing-induced uses) after J.
2815234285Sdim  void BBVectorize::moveUsesOfIAfterJ(BasicBlock &BB,
2816249423Sdim                     DenseSet<ValuePair> &LoadMoveSetPairs,
2817234285Sdim                     Instruction *&InsertionPt,
2818234285Sdim                     Instruction *I, Instruction *J) {
2819234285Sdim    // Skip to the first instruction past I.
2820234285Sdim    BasicBlock::iterator L = llvm::next(BasicBlock::iterator(I));
2821234285Sdim
2822234285Sdim    DenseSet<Value *> Users;
2823234285Sdim    AliasSetTracker WriteSet(*AA);
2824263508Sdim    if (I->mayWriteToMemory()) WriteSet.add(I);
2825263508Sdim
2826234285Sdim    for (; cast<Instruction>(L) != J;) {
2827249423Sdim      if (trackUsesOfI(Users, WriteSet, I, L, true, &LoadMoveSetPairs)) {
2828234285Sdim        // Move this instruction
2829234285Sdim        Instruction *InstToMove = L; ++L;
2830234285Sdim
2831234285Sdim        DEBUG(dbgs() << "BBV: moving: " << *InstToMove <<
2832234285Sdim                        " to after " << *InsertionPt << "\n");
2833234285Sdim        InstToMove->removeFromParent();
2834234285Sdim        InstToMove->insertAfter(InsertionPt);
2835234285Sdim        InsertionPt = InstToMove;
2836234285Sdim      } else {
2837234285Sdim        ++L;
2838234285Sdim      }
2839234285Sdim    }
2840234285Sdim  }
2841234285Sdim
2842234285Sdim  // Collect all load instruction that are in the move set of a given first
2843234285Sdim  // pair member.  These loads depend on the first instruction, I, and so need
2844234285Sdim  // to be moved after J (the second instruction) when the pair is fused.
2845234285Sdim  void BBVectorize::collectPairLoadMoveSet(BasicBlock &BB,
2846234285Sdim                     DenseMap<Value *, Value *> &ChosenPairs,
2847249423Sdim                     DenseMap<Value *, std::vector<Value *> > &LoadMoveSet,
2848249423Sdim                     DenseSet<ValuePair> &LoadMoveSetPairs,
2849234285Sdim                     Instruction *I) {
2850234285Sdim    // Skip to the first instruction past I.
2851234285Sdim    BasicBlock::iterator L = llvm::next(BasicBlock::iterator(I));
2852234285Sdim
2853234285Sdim    DenseSet<Value *> Users;
2854234285Sdim    AliasSetTracker WriteSet(*AA);
2855263508Sdim    if (I->mayWriteToMemory()) WriteSet.add(I);
2856234285Sdim
2857234285Sdim    // Note: We cannot end the loop when we reach J because J could be moved
2858234285Sdim    // farther down the use chain by another instruction pairing. Also, J
2859234285Sdim    // could be before I if this is an inverted input.
2860234285Sdim    for (BasicBlock::iterator E = BB.end(); cast<Instruction>(L) != E; ++L) {
2861234285Sdim      if (trackUsesOfI(Users, WriteSet, I, L)) {
2862249423Sdim        if (L->mayReadFromMemory()) {
2863249423Sdim          LoadMoveSet[L].push_back(I);
2864249423Sdim          LoadMoveSetPairs.insert(ValuePair(L, I));
2865249423Sdim        }
2866234285Sdim      }
2867234285Sdim    }
2868234285Sdim  }
2869234285Sdim
2870234285Sdim  // In cases where both load/stores and the computation of their pointers
2871234285Sdim  // are chosen for vectorization, we can end up in a situation where the
2872234285Sdim  // aliasing analysis starts returning different query results as the
2873234285Sdim  // process of fusing instruction pairs continues. Because the algorithm
2874249423Sdim  // relies on finding the same use dags here as were found earlier, we'll
2875234285Sdim  // need to precompute the necessary aliasing information here and then
2876234285Sdim  // manually update it during the fusion process.
2877234285Sdim  void BBVectorize::collectLoadMoveSet(BasicBlock &BB,
2878234285Sdim                     std::vector<Value *> &PairableInsts,
2879234285Sdim                     DenseMap<Value *, Value *> &ChosenPairs,
2880249423Sdim                     DenseMap<Value *, std::vector<Value *> > &LoadMoveSet,
2881249423Sdim                     DenseSet<ValuePair> &LoadMoveSetPairs) {
2882234285Sdim    for (std::vector<Value *>::iterator PI = PairableInsts.begin(),
2883234285Sdim         PIE = PairableInsts.end(); PI != PIE; ++PI) {
2884234285Sdim      DenseMap<Value *, Value *>::iterator P = ChosenPairs.find(*PI);
2885234285Sdim      if (P == ChosenPairs.end()) continue;
2886234285Sdim
2887234285Sdim      Instruction *I = cast<Instruction>(P->first);
2888249423Sdim      collectPairLoadMoveSet(BB, ChosenPairs, LoadMoveSet,
2889249423Sdim                             LoadMoveSetPairs, I);
2890234285Sdim    }
2891234285Sdim  }
2892234285Sdim
2893239462Sdim  // When the first instruction in each pair is cloned, it will inherit its
2894239462Sdim  // parent's metadata. This metadata must be combined with that of the other
2895239462Sdim  // instruction in a safe way.
2896239462Sdim  void BBVectorize::combineMetadata(Instruction *K, const Instruction *J) {
2897239462Sdim    SmallVector<std::pair<unsigned, MDNode*>, 4> Metadata;
2898239462Sdim    K->getAllMetadataOtherThanDebugLoc(Metadata);
2899239462Sdim    for (unsigned i = 0, n = Metadata.size(); i < n; ++i) {
2900239462Sdim      unsigned Kind = Metadata[i].first;
2901239462Sdim      MDNode *JMD = J->getMetadata(Kind);
2902239462Sdim      MDNode *KMD = Metadata[i].second;
2903239462Sdim
2904239462Sdim      switch (Kind) {
2905239462Sdim      default:
2906239462Sdim        K->setMetadata(Kind, 0); // Remove unknown metadata
2907239462Sdim        break;
2908239462Sdim      case LLVMContext::MD_tbaa:
2909239462Sdim        K->setMetadata(Kind, MDNode::getMostGenericTBAA(JMD, KMD));
2910239462Sdim        break;
2911239462Sdim      case LLVMContext::MD_fpmath:
2912239462Sdim        K->setMetadata(Kind, MDNode::getMostGenericFPMath(JMD, KMD));
2913239462Sdim        break;
2914239462Sdim      }
2915239462Sdim    }
2916239462Sdim  }
2917239462Sdim
2918234285Sdim  // This function fuses the chosen instruction pairs into vector instructions,
2919234285Sdim  // taking care preserve any needed scalar outputs and, then, it reorders the
2920234285Sdim  // remaining instructions as needed (users of the first member of the pair
2921234285Sdim  // need to be moved to after the location of the second member of the pair
2922234285Sdim  // because the vector instruction is inserted in the location of the pair's
2923234285Sdim  // second member).
2924234285Sdim  void BBVectorize::fuseChosenPairs(BasicBlock &BB,
2925249423Sdim             std::vector<Value *> &PairableInsts,
2926249423Sdim             DenseMap<Value *, Value *> &ChosenPairs,
2927249423Sdim             DenseSet<ValuePair> &FixedOrderPairs,
2928249423Sdim             DenseMap<VPPair, unsigned> &PairConnectionTypes,
2929249423Sdim             DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
2930249423Sdim             DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps) {
2931234285Sdim    LLVMContext& Context = BB.getContext();
2932234285Sdim
2933234285Sdim    // During the vectorization process, the order of the pairs to be fused
2934234285Sdim    // could be flipped. So we'll add each pair, flipped, into the ChosenPairs
2935234285Sdim    // list. After a pair is fused, the flipped pair is removed from the list.
2936243830Sdim    DenseSet<ValuePair> FlippedPairs;
2937234285Sdim    for (DenseMap<Value *, Value *>::iterator P = ChosenPairs.begin(),
2938234285Sdim         E = ChosenPairs.end(); P != E; ++P)
2939243830Sdim      FlippedPairs.insert(ValuePair(P->second, P->first));
2940243830Sdim    for (DenseSet<ValuePair>::iterator P = FlippedPairs.begin(),
2941234285Sdim         E = FlippedPairs.end(); P != E; ++P)
2942234285Sdim      ChosenPairs.insert(*P);
2943234285Sdim
2944249423Sdim    DenseMap<Value *, std::vector<Value *> > LoadMoveSet;
2945249423Sdim    DenseSet<ValuePair> LoadMoveSetPairs;
2946249423Sdim    collectLoadMoveSet(BB, PairableInsts, ChosenPairs,
2947249423Sdim                       LoadMoveSet, LoadMoveSetPairs);
2948234285Sdim
2949234285Sdim    DEBUG(dbgs() << "BBV: initial: \n" << BB << "\n");
2950234285Sdim
2951234285Sdim    for (BasicBlock::iterator PI = BB.getFirstInsertionPt(); PI != BB.end();) {
2952234285Sdim      DenseMap<Value *, Value *>::iterator P = ChosenPairs.find(PI);
2953234285Sdim      if (P == ChosenPairs.end()) {
2954234285Sdim        ++PI;
2955234285Sdim        continue;
2956234285Sdim      }
2957234285Sdim
2958234285Sdim      if (getDepthFactor(P->first) == 0) {
2959234285Sdim        // These instructions are not really fused, but are tracked as though
2960234285Sdim        // they are. Any case in which it would be interesting to fuse them
2961234285Sdim        // will be taken care of by InstCombine.
2962234285Sdim        --NumFusedOps;
2963234285Sdim        ++PI;
2964234285Sdim        continue;
2965234285Sdim      }
2966234285Sdim
2967234285Sdim      Instruction *I = cast<Instruction>(P->first),
2968234285Sdim        *J = cast<Instruction>(P->second);
2969234285Sdim
2970234285Sdim      DEBUG(dbgs() << "BBV: fusing: " << *I <<
2971234285Sdim             " <-> " << *J << "\n");
2972234285Sdim
2973234285Sdim      // Remove the pair and flipped pair from the list.
2974234285Sdim      DenseMap<Value *, Value *>::iterator FP = ChosenPairs.find(P->second);
2975234285Sdim      assert(FP != ChosenPairs.end() && "Flipped pair not found in list");
2976234285Sdim      ChosenPairs.erase(FP);
2977234285Sdim      ChosenPairs.erase(P);
2978234285Sdim
2979249423Sdim      if (!canMoveUsesOfIAfterJ(BB, LoadMoveSetPairs, I, J)) {
2980234285Sdim        DEBUG(dbgs() << "BBV: fusion of: " << *I <<
2981234285Sdim               " <-> " << *J <<
2982234285Sdim               " aborted because of non-trivial dependency cycle\n");
2983234285Sdim        --NumFusedOps;
2984234285Sdim        ++PI;
2985234285Sdim        continue;
2986234285Sdim      }
2987234285Sdim
2988243830Sdim      // If the pair must have the other order, then flip it.
2989243830Sdim      bool FlipPairOrder = FixedOrderPairs.count(ValuePair(J, I));
2990243830Sdim      if (!FlipPairOrder && !FixedOrderPairs.count(ValuePair(I, J))) {
2991243830Sdim        // This pair does not have a fixed order, and so we might want to
2992243830Sdim        // flip it if that will yield fewer shuffles. We count the number
2993243830Sdim        // of dependencies connected via swaps, and those directly connected,
2994243830Sdim        // and flip the order if the number of swaps is greater.
2995243830Sdim        bool OrigOrder = true;
2996249423Sdim        DenseMap<ValuePair, std::vector<ValuePair> >::iterator IJ =
2997249423Sdim          ConnectedPairDeps.find(ValuePair(I, J));
2998249423Sdim        if (IJ == ConnectedPairDeps.end()) {
2999249423Sdim          IJ = ConnectedPairDeps.find(ValuePair(J, I));
3000243830Sdim          OrigOrder = false;
3001243830Sdim        }
3002239462Sdim
3003249423Sdim        if (IJ != ConnectedPairDeps.end()) {
3004243830Sdim          unsigned NumDepsDirect = 0, NumDepsSwap = 0;
3005249423Sdim          for (std::vector<ValuePair>::iterator T = IJ->second.begin(),
3006249423Sdim               TE = IJ->second.end(); T != TE; ++T) {
3007249423Sdim            VPPair Q(IJ->first, *T);
3008243830Sdim            DenseMap<VPPair, unsigned>::iterator R =
3009249423Sdim              PairConnectionTypes.find(VPPair(Q.second, Q.first));
3010243830Sdim            assert(R != PairConnectionTypes.end() &&
3011243830Sdim                   "Cannot find pair connection type");
3012243830Sdim            if (R->second == PairConnectionDirect)
3013243830Sdim              ++NumDepsDirect;
3014243830Sdim            else if (R->second == PairConnectionSwap)
3015243830Sdim              ++NumDepsSwap;
3016243830Sdim          }
3017243830Sdim
3018243830Sdim          if (!OrigOrder)
3019243830Sdim            std::swap(NumDepsDirect, NumDepsSwap);
3020243830Sdim
3021243830Sdim          if (NumDepsSwap > NumDepsDirect) {
3022243830Sdim            FlipPairOrder = true;
3023243830Sdim            DEBUG(dbgs() << "BBV: reordering pair: " << *I <<
3024243830Sdim                            " <-> " << *J << "\n");
3025243830Sdim          }
3026243830Sdim        }
3027243830Sdim      }
3028243830Sdim
3029243830Sdim      Instruction *L = I, *H = J;
3030243830Sdim      if (FlipPairOrder)
3031243830Sdim        std::swap(H, L);
3032243830Sdim
3033243830Sdim      // If the pair being fused uses the opposite order from that in the pair
3034243830Sdim      // connection map, then we need to flip the types.
3035249423Sdim      DenseMap<ValuePair, std::vector<ValuePair> >::iterator HL =
3036249423Sdim        ConnectedPairs.find(ValuePair(H, L));
3037249423Sdim      if (HL != ConnectedPairs.end())
3038249423Sdim        for (std::vector<ValuePair>::iterator T = HL->second.begin(),
3039249423Sdim             TE = HL->second.end(); T != TE; ++T) {
3040249423Sdim          VPPair Q(HL->first, *T);
3041249423Sdim          DenseMap<VPPair, unsigned>::iterator R = PairConnectionTypes.find(Q);
3042249423Sdim          assert(R != PairConnectionTypes.end() &&
3043249423Sdim                 "Cannot find pair connection type");
3044249423Sdim          if (R->second == PairConnectionDirect)
3045249423Sdim            R->second = PairConnectionSwap;
3046249423Sdim          else if (R->second == PairConnectionSwap)
3047249423Sdim            R->second = PairConnectionDirect;
3048249423Sdim        }
3049243830Sdim
3050243830Sdim      bool LBeforeH = !FlipPairOrder;
3051234285Sdim      unsigned NumOperands = I->getNumOperands();
3052234285Sdim      SmallVector<Value *, 3> ReplacedOperands(NumOperands);
3053243830Sdim      getReplacementInputsForPair(Context, L, H, ReplacedOperands,
3054243830Sdim                                  LBeforeH);
3055234285Sdim
3056234285Sdim      // Make a copy of the original operation, change its type to the vector
3057234285Sdim      // type and replace its operands with the vector operands.
3058243830Sdim      Instruction *K = L->clone();
3059243830Sdim      if (L->hasName())
3060243830Sdim        K->takeName(L);
3061243830Sdim      else if (H->hasName())
3062243830Sdim        K->takeName(H);
3063234285Sdim
3064234285Sdim      if (!isa<StoreInst>(K))
3065243830Sdim        K->mutateType(getVecTypeForPair(L->getType(), H->getType()));
3066234285Sdim
3067243830Sdim      combineMetadata(K, H);
3068243830Sdim      K->intersectOptionalDataWith(H);
3069239462Sdim
3070234285Sdim      for (unsigned o = 0; o < NumOperands; ++o)
3071234285Sdim        K->setOperand(o, ReplacedOperands[o]);
3072234285Sdim
3073234285Sdim      K->insertAfter(J);
3074234285Sdim
3075234285Sdim      // Instruction insertion point:
3076234285Sdim      Instruction *InsertionPt = K;
3077234285Sdim      Instruction *K1 = 0, *K2 = 0;
3078243830Sdim      replaceOutputsOfPair(Context, L, H, K, InsertionPt, K1, K2);
3079234285Sdim
3080249423Sdim      // The use dag of the first original instruction must be moved to after
3081249423Sdim      // the location of the second instruction. The entire use dag of the
3082249423Sdim      // first instruction is disjoint from the input dag of the second
3083234285Sdim      // (by definition), and so commutes with it.
3084234285Sdim
3085249423Sdim      moveUsesOfIAfterJ(BB, LoadMoveSetPairs, InsertionPt, I, J);
3086234285Sdim
3087234285Sdim      if (!isa<StoreInst>(I)) {
3088243830Sdim        L->replaceAllUsesWith(K1);
3089243830Sdim        H->replaceAllUsesWith(K2);
3090243830Sdim        AA->replaceWithNewValue(L, K1);
3091243830Sdim        AA->replaceWithNewValue(H, K2);
3092234285Sdim      }
3093234285Sdim
3094234285Sdim      // Instructions that may read from memory may be in the load move set.
3095234285Sdim      // Once an instruction is fused, we no longer need its move set, and so
3096234285Sdim      // the values of the map never need to be updated. However, when a load
3097234285Sdim      // is fused, we need to merge the entries from both instructions in the
3098234285Sdim      // pair in case those instructions were in the move set of some other
3099234285Sdim      // yet-to-be-fused pair. The loads in question are the keys of the map.
3100234285Sdim      if (I->mayReadFromMemory()) {
3101234285Sdim        std::vector<ValuePair> NewSetMembers;
3102249423Sdim        DenseMap<Value *, std::vector<Value *> >::iterator II =
3103249423Sdim          LoadMoveSet.find(I);
3104249423Sdim        if (II != LoadMoveSet.end())
3105249423Sdim          for (std::vector<Value *>::iterator N = II->second.begin(),
3106249423Sdim               NE = II->second.end(); N != NE; ++N)
3107249423Sdim            NewSetMembers.push_back(ValuePair(K, *N));
3108249423Sdim        DenseMap<Value *, std::vector<Value *> >::iterator JJ =
3109249423Sdim          LoadMoveSet.find(J);
3110249423Sdim        if (JJ != LoadMoveSet.end())
3111249423Sdim          for (std::vector<Value *>::iterator N = JJ->second.begin(),
3112249423Sdim               NE = JJ->second.end(); N != NE; ++N)
3113249423Sdim            NewSetMembers.push_back(ValuePair(K, *N));
3114234285Sdim        for (std::vector<ValuePair>::iterator A = NewSetMembers.begin(),
3115249423Sdim             AE = NewSetMembers.end(); A != AE; ++A) {
3116249423Sdim          LoadMoveSet[A->first].push_back(A->second);
3117249423Sdim          LoadMoveSetPairs.insert(*A);
3118249423Sdim        }
3119234285Sdim      }
3120234285Sdim
3121234285Sdim      // Before removing I, set the iterator to the next instruction.
3122234285Sdim      PI = llvm::next(BasicBlock::iterator(I));
3123234285Sdim      if (cast<Instruction>(PI) == J)
3124234285Sdim        ++PI;
3125234285Sdim
3126234285Sdim      SE->forgetValue(I);
3127234285Sdim      SE->forgetValue(J);
3128234285Sdim      I->eraseFromParent();
3129234285Sdim      J->eraseFromParent();
3130243830Sdim
3131243830Sdim      DEBUG(if (PrintAfterEveryPair) dbgs() << "BBV: block is now: \n" <<
3132243830Sdim                                               BB << "\n");
3133234285Sdim    }
3134234285Sdim
3135234285Sdim    DEBUG(dbgs() << "BBV: final: \n" << BB << "\n");
3136234285Sdim  }
3137234285Sdim}
3138234285Sdim
3139234285Sdimchar BBVectorize::ID = 0;
3140234285Sdimstatic const char bb_vectorize_name[] = "Basic-Block Vectorization";
3141234285SdimINITIALIZE_PASS_BEGIN(BBVectorize, BBV_NAME, bb_vectorize_name, false, false)
3142234285SdimINITIALIZE_AG_DEPENDENCY(AliasAnalysis)
3143249423SdimINITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
3144243830SdimINITIALIZE_PASS_DEPENDENCY(DominatorTree)
3145234285SdimINITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
3146234285SdimINITIALIZE_PASS_END(BBVectorize, BBV_NAME, bb_vectorize_name, false, false)
3147234285Sdim
3148234285SdimBasicBlockPass *llvm::createBBVectorizePass(const VectorizeConfig &C) {
3149234285Sdim  return new BBVectorize(C);
3150234285Sdim}
3151234285Sdim
3152234285Sdimbool
3153234285Sdimllvm::vectorizeBasicBlock(Pass *P, BasicBlock &BB, const VectorizeConfig &C) {
3154234285Sdim  BBVectorize BBVectorizer(P, C);
3155234285Sdim  return BBVectorizer.vectorizeBB(BB);
3156234285Sdim}
3157234285Sdim
3158234285Sdim//===----------------------------------------------------------------------===//
3159234285SdimVectorizeConfig::VectorizeConfig() {
3160234285Sdim  VectorBits = ::VectorBits;
3161239462Sdim  VectorizeBools = !::NoBools;
3162234285Sdim  VectorizeInts = !::NoInts;
3163234285Sdim  VectorizeFloats = !::NoFloats;
3164234982Sdim  VectorizePointers = !::NoPointers;
3165234285Sdim  VectorizeCasts = !::NoCasts;
3166234285Sdim  VectorizeMath = !::NoMath;
3167234285Sdim  VectorizeFMA = !::NoFMA;
3168234982Sdim  VectorizeSelect = !::NoSelect;
3169239462Sdim  VectorizeCmp = !::NoCmp;
3170234982Sdim  VectorizeGEP = !::NoGEP;
3171234285Sdim  VectorizeMemOps = !::NoMemOps;
3172234285Sdim  AlignedOnly = ::AlignedOnly;
3173234285Sdim  ReqChainDepth= ::ReqChainDepth;
3174234285Sdim  SearchLimit = ::SearchLimit;
3175234285Sdim  MaxCandPairsForCycleCheck = ::MaxCandPairsForCycleCheck;
3176234285Sdim  SplatBreaksChain = ::SplatBreaksChain;
3177234285Sdim  MaxInsts = ::MaxInsts;
3178249423Sdim  MaxPairs = ::MaxPairs;
3179234285Sdim  MaxIter = ::MaxIter;
3180239462Sdim  Pow2LenOnly = ::Pow2LenOnly;
3181234285Sdim  NoMemOpBoost = ::NoMemOpBoost;
3182234285Sdim  FastDep = ::FastDep;
3183234285Sdim}
3184