1234285Sdim//===- BBVectorize.cpp - A Basic-Block Vectorizer -------------------------===// 2234285Sdim// 3234285Sdim// The LLVM Compiler Infrastructure 4234285Sdim// 5234285Sdim// This file is distributed under the University of Illinois Open Source 6234285Sdim// License. See LICENSE.TXT for details. 7234285Sdim// 8234285Sdim//===----------------------------------------------------------------------===// 9234285Sdim// 10234285Sdim// This file implements a basic-block vectorization pass. The algorithm was 11234285Sdim// inspired by that used by the Vienna MAP Vectorizor by Franchetti and Kral, 12234285Sdim// et al. It works by looking for chains of pairable operations and then 13234285Sdim// pairing them. 14234285Sdim// 15234285Sdim//===----------------------------------------------------------------------===// 16234285Sdim 17234285Sdim#define BBV_NAME "bb-vectorize" 18234285Sdim#define DEBUG_TYPE BBV_NAME 19249423Sdim#include "llvm/Transforms/Vectorize.h" 20234285Sdim#include "llvm/ADT/DenseMap.h" 21234285Sdim#include "llvm/ADT/DenseSet.h" 22249423Sdim#include "llvm/ADT/STLExtras.h" 23243830Sdim#include "llvm/ADT/SmallSet.h" 24234285Sdim#include "llvm/ADT/SmallVector.h" 25234285Sdim#include "llvm/ADT/Statistic.h" 26234285Sdim#include "llvm/ADT/StringExtras.h" 27234285Sdim#include "llvm/Analysis/AliasAnalysis.h" 28234285Sdim#include "llvm/Analysis/AliasSetTracker.h" 29243830Sdim#include "llvm/Analysis/Dominators.h" 30234285Sdim#include "llvm/Analysis/ScalarEvolution.h" 31234285Sdim#include "llvm/Analysis/ScalarEvolutionExpressions.h" 32249423Sdim#include "llvm/Analysis/TargetTransformInfo.h" 33234285Sdim#include "llvm/Analysis/ValueTracking.h" 34249423Sdim#include "llvm/IR/Constants.h" 35249423Sdim#include "llvm/IR/DataLayout.h" 36249423Sdim#include "llvm/IR/DerivedTypes.h" 37249423Sdim#include "llvm/IR/Function.h" 38249423Sdim#include "llvm/IR/Instructions.h" 39249423Sdim#include "llvm/IR/IntrinsicInst.h" 40249423Sdim#include "llvm/IR/Intrinsics.h" 41249423Sdim#include "llvm/IR/LLVMContext.h" 42249423Sdim#include "llvm/IR/Metadata.h" 43249423Sdim#include "llvm/IR/Type.h" 44249423Sdim#include "llvm/Pass.h" 45234285Sdim#include "llvm/Support/CommandLine.h" 46234285Sdim#include "llvm/Support/Debug.h" 47249423Sdim#include "llvm/Support/ValueHandle.h" 48234285Sdim#include "llvm/Support/raw_ostream.h" 49239462Sdim#include "llvm/Transforms/Utils/Local.h" 50234285Sdim#include <algorithm> 51234285Sdimusing namespace llvm; 52234285Sdim 53243830Sdimstatic cl::opt<bool> 54243830SdimIgnoreTargetInfo("bb-vectorize-ignore-target-info", cl::init(false), 55243830Sdim cl::Hidden, cl::desc("Ignore target information")); 56243830Sdim 57234285Sdimstatic cl::opt<unsigned> 58234285SdimReqChainDepth("bb-vectorize-req-chain-depth", cl::init(6), cl::Hidden, 59234285Sdim cl::desc("The required chain depth for vectorization")); 60234285Sdim 61243830Sdimstatic cl::opt<bool> 62243830SdimUseChainDepthWithTI("bb-vectorize-use-chain-depth", cl::init(false), 63243830Sdim cl::Hidden, cl::desc("Use the chain depth requirement with" 64243830Sdim " target information")); 65243830Sdim 66234285Sdimstatic cl::opt<unsigned> 67234285SdimSearchLimit("bb-vectorize-search-limit", cl::init(400), cl::Hidden, 68234285Sdim cl::desc("The maximum search distance for instruction pairs")); 69234285Sdim 70234285Sdimstatic cl::opt<bool> 71234285SdimSplatBreaksChain("bb-vectorize-splat-breaks-chain", cl::init(false), cl::Hidden, 72234285Sdim cl::desc("Replicating one element to a pair breaks the chain")); 73234285Sdim 74234285Sdimstatic cl::opt<unsigned> 75234285SdimVectorBits("bb-vectorize-vector-bits", cl::init(128), cl::Hidden, 76234285Sdim cl::desc("The size of the native vector registers")); 77234285Sdim 78234285Sdimstatic cl::opt<unsigned> 79234285SdimMaxIter("bb-vectorize-max-iter", cl::init(0), cl::Hidden, 80234285Sdim cl::desc("The maximum number of pairing iterations")); 81234285Sdim 82239462Sdimstatic cl::opt<bool> 83239462SdimPow2LenOnly("bb-vectorize-pow2-len-only", cl::init(false), cl::Hidden, 84239462Sdim cl::desc("Don't try to form non-2^n-length vectors")); 85239462Sdim 86234285Sdimstatic cl::opt<unsigned> 87234285SdimMaxInsts("bb-vectorize-max-instr-per-group", cl::init(500), cl::Hidden, 88234285Sdim cl::desc("The maximum number of pairable instructions per group")); 89234285Sdim 90234285Sdimstatic cl::opt<unsigned> 91249423SdimMaxPairs("bb-vectorize-max-pairs-per-group", cl::init(3000), cl::Hidden, 92249423Sdim cl::desc("The maximum number of candidate instruction pairs per group")); 93249423Sdim 94249423Sdimstatic cl::opt<unsigned> 95234285SdimMaxCandPairsForCycleCheck("bb-vectorize-max-cycle-check-pairs", cl::init(200), 96234285Sdim cl::Hidden, cl::desc("The maximum number of candidate pairs with which to use" 97234285Sdim " a full cycle check")); 98234285Sdim 99234285Sdimstatic cl::opt<bool> 100239462SdimNoBools("bb-vectorize-no-bools", cl::init(false), cl::Hidden, 101239462Sdim cl::desc("Don't try to vectorize boolean (i1) values")); 102239462Sdim 103239462Sdimstatic cl::opt<bool> 104234285SdimNoInts("bb-vectorize-no-ints", cl::init(false), cl::Hidden, 105234285Sdim cl::desc("Don't try to vectorize integer values")); 106234285Sdim 107234285Sdimstatic cl::opt<bool> 108234285SdimNoFloats("bb-vectorize-no-floats", cl::init(false), cl::Hidden, 109234285Sdim cl::desc("Don't try to vectorize floating-point values")); 110234285Sdim 111243830Sdim// FIXME: This should default to false once pointer vector support works. 112234285Sdimstatic cl::opt<bool> 113243830SdimNoPointers("bb-vectorize-no-pointers", cl::init(/*false*/ true), cl::Hidden, 114234982Sdim cl::desc("Don't try to vectorize pointer values")); 115234982Sdim 116234982Sdimstatic cl::opt<bool> 117234285SdimNoCasts("bb-vectorize-no-casts", cl::init(false), cl::Hidden, 118234285Sdim cl::desc("Don't try to vectorize casting (conversion) operations")); 119234285Sdim 120234285Sdimstatic cl::opt<bool> 121234285SdimNoMath("bb-vectorize-no-math", cl::init(false), cl::Hidden, 122234285Sdim cl::desc("Don't try to vectorize floating-point math intrinsics")); 123234285Sdim 124234285Sdimstatic cl::opt<bool> 125234285SdimNoFMA("bb-vectorize-no-fma", cl::init(false), cl::Hidden, 126234285Sdim cl::desc("Don't try to vectorize the fused-multiply-add intrinsic")); 127234285Sdim 128234285Sdimstatic cl::opt<bool> 129234982SdimNoSelect("bb-vectorize-no-select", cl::init(false), cl::Hidden, 130234982Sdim cl::desc("Don't try to vectorize select instructions")); 131234982Sdim 132234982Sdimstatic cl::opt<bool> 133239462SdimNoCmp("bb-vectorize-no-cmp", cl::init(false), cl::Hidden, 134239462Sdim cl::desc("Don't try to vectorize comparison instructions")); 135239462Sdim 136239462Sdimstatic cl::opt<bool> 137234982SdimNoGEP("bb-vectorize-no-gep", cl::init(false), cl::Hidden, 138234982Sdim cl::desc("Don't try to vectorize getelementptr instructions")); 139234982Sdim 140234982Sdimstatic cl::opt<bool> 141234285SdimNoMemOps("bb-vectorize-no-mem-ops", cl::init(false), cl::Hidden, 142234285Sdim cl::desc("Don't try to vectorize loads and stores")); 143234285Sdim 144234285Sdimstatic cl::opt<bool> 145234285SdimAlignedOnly("bb-vectorize-aligned-only", cl::init(false), cl::Hidden, 146234285Sdim cl::desc("Only generate aligned loads and stores")); 147234285Sdim 148234285Sdimstatic cl::opt<bool> 149234285SdimNoMemOpBoost("bb-vectorize-no-mem-op-boost", 150234285Sdim cl::init(false), cl::Hidden, 151234285Sdim cl::desc("Don't boost the chain-depth contribution of loads and stores")); 152234285Sdim 153234285Sdimstatic cl::opt<bool> 154234285SdimFastDep("bb-vectorize-fast-dep", cl::init(false), cl::Hidden, 155234285Sdim cl::desc("Use a fast instruction dependency analysis")); 156234285Sdim 157234285Sdim#ifndef NDEBUG 158234285Sdimstatic cl::opt<bool> 159234285SdimDebugInstructionExamination("bb-vectorize-debug-instruction-examination", 160234285Sdim cl::init(false), cl::Hidden, 161234285Sdim cl::desc("When debugging is enabled, output information on the" 162234285Sdim " instruction-examination process")); 163234285Sdimstatic cl::opt<bool> 164234285SdimDebugCandidateSelection("bb-vectorize-debug-candidate-selection", 165234285Sdim cl::init(false), cl::Hidden, 166234285Sdim cl::desc("When debugging is enabled, output information on the" 167234285Sdim " candidate-selection process")); 168234285Sdimstatic cl::opt<bool> 169234285SdimDebugPairSelection("bb-vectorize-debug-pair-selection", 170234285Sdim cl::init(false), cl::Hidden, 171234285Sdim cl::desc("When debugging is enabled, output information on the" 172234285Sdim " pair-selection process")); 173234285Sdimstatic cl::opt<bool> 174234285SdimDebugCycleCheck("bb-vectorize-debug-cycle-check", 175234285Sdim cl::init(false), cl::Hidden, 176234285Sdim cl::desc("When debugging is enabled, output information on the" 177234285Sdim " cycle-checking process")); 178243830Sdim 179243830Sdimstatic cl::opt<bool> 180243830SdimPrintAfterEveryPair("bb-vectorize-debug-print-after-every-pair", 181243830Sdim cl::init(false), cl::Hidden, 182243830Sdim cl::desc("When debugging is enabled, dump the basic block after" 183243830Sdim " every pair is fused")); 184234285Sdim#endif 185234285Sdim 186234285SdimSTATISTIC(NumFusedOps, "Number of operations fused by bb-vectorize"); 187234285Sdim 188234285Sdimnamespace { 189234285Sdim struct BBVectorize : public BasicBlockPass { 190234285Sdim static char ID; // Pass identification, replacement for typeid 191234285Sdim 192234285Sdim const VectorizeConfig Config; 193234285Sdim 194234285Sdim BBVectorize(const VectorizeConfig &C = VectorizeConfig()) 195234285Sdim : BasicBlockPass(ID), Config(C) { 196234285Sdim initializeBBVectorizePass(*PassRegistry::getPassRegistry()); 197234285Sdim } 198234285Sdim 199234285Sdim BBVectorize(Pass *P, const VectorizeConfig &C) 200234285Sdim : BasicBlockPass(ID), Config(C) { 201234285Sdim AA = &P->getAnalysis<AliasAnalysis>(); 202243830Sdim DT = &P->getAnalysis<DominatorTree>(); 203234285Sdim SE = &P->getAnalysis<ScalarEvolution>(); 204243830Sdim TD = P->getAnalysisIfAvailable<DataLayout>(); 205249423Sdim TTI = IgnoreTargetInfo ? 0 : &P->getAnalysis<TargetTransformInfo>(); 206234285Sdim } 207234285Sdim 208234285Sdim typedef std::pair<Value *, Value *> ValuePair; 209243830Sdim typedef std::pair<ValuePair, int> ValuePairWithCost; 210234285Sdim typedef std::pair<ValuePair, size_t> ValuePairWithDepth; 211234285Sdim typedef std::pair<ValuePair, ValuePair> VPPair; // A ValuePair pair 212243830Sdim typedef std::pair<VPPair, unsigned> VPPairWithType; 213234285Sdim 214234285Sdim AliasAnalysis *AA; 215243830Sdim DominatorTree *DT; 216234285Sdim ScalarEvolution *SE; 217243830Sdim DataLayout *TD; 218249423Sdim const TargetTransformInfo *TTI; 219234285Sdim 220234285Sdim // FIXME: const correct? 221234285Sdim 222239462Sdim bool vectorizePairs(BasicBlock &BB, bool NonPow2Len = false); 223234285Sdim 224234285Sdim bool getCandidatePairs(BasicBlock &BB, 225234285Sdim BasicBlock::iterator &Start, 226249423Sdim DenseMap<Value *, std::vector<Value *> > &CandidatePairs, 227243830Sdim DenseSet<ValuePair> &FixedOrderPairs, 228243830Sdim DenseMap<ValuePair, int> &CandidatePairCostSavings, 229239462Sdim std::vector<Value *> &PairableInsts, bool NonPow2Len); 230234285Sdim 231243830Sdim // FIXME: The current implementation does not account for pairs that 232243830Sdim // are connected in multiple ways. For example: 233243830Sdim // C1 = A1 / A2; C2 = A2 / A1 (which may be both direct and a swap) 234243830Sdim enum PairConnectionType { 235243830Sdim PairConnectionDirect, 236243830Sdim PairConnectionSwap, 237243830Sdim PairConnectionSplat 238243830Sdim }; 239243830Sdim 240249423Sdim void computeConnectedPairs( 241249423Sdim DenseMap<Value *, std::vector<Value *> > &CandidatePairs, 242249423Sdim DenseSet<ValuePair> &CandidatePairsSet, 243249423Sdim std::vector<Value *> &PairableInsts, 244249423Sdim DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, 245249423Sdim DenseMap<VPPair, unsigned> &PairConnectionTypes); 246234285Sdim 247234285Sdim void buildDepMap(BasicBlock &BB, 248249423Sdim DenseMap<Value *, std::vector<Value *> > &CandidatePairs, 249249423Sdim std::vector<Value *> &PairableInsts, 250249423Sdim DenseSet<ValuePair> &PairableInstUsers); 251234285Sdim 252249423Sdim void choosePairs(DenseMap<Value *, std::vector<Value *> > &CandidatePairs, 253249423Sdim DenseSet<ValuePair> &CandidatePairsSet, 254249423Sdim DenseMap<ValuePair, int> &CandidatePairCostSavings, 255249423Sdim std::vector<Value *> &PairableInsts, 256249423Sdim DenseSet<ValuePair> &FixedOrderPairs, 257249423Sdim DenseMap<VPPair, unsigned> &PairConnectionTypes, 258249423Sdim DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, 259249423Sdim DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps, 260249423Sdim DenseSet<ValuePair> &PairableInstUsers, 261249423Sdim DenseMap<Value *, Value *>& ChosenPairs); 262234285Sdim 263234285Sdim void fuseChosenPairs(BasicBlock &BB, 264249423Sdim std::vector<Value *> &PairableInsts, 265249423Sdim DenseMap<Value *, Value *>& ChosenPairs, 266249423Sdim DenseSet<ValuePair> &FixedOrderPairs, 267249423Sdim DenseMap<VPPair, unsigned> &PairConnectionTypes, 268249423Sdim DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, 269249423Sdim DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps); 270234285Sdim 271243830Sdim 272234285Sdim bool isInstVectorizable(Instruction *I, bool &IsSimpleLoadStore); 273234285Sdim 274234285Sdim bool areInstsCompatible(Instruction *I, Instruction *J, 275243830Sdim bool IsSimpleLoadStore, bool NonPow2Len, 276243830Sdim int &CostSavings, int &FixedOrder); 277234285Sdim 278234285Sdim bool trackUsesOfI(DenseSet<Value *> &Users, 279234285Sdim AliasSetTracker &WriteSet, Instruction *I, 280234285Sdim Instruction *J, bool UpdateUsers = true, 281249423Sdim DenseSet<ValuePair> *LoadMoveSetPairs = 0); 282234285Sdim 283249423Sdim void computePairsConnectedTo( 284249423Sdim DenseMap<Value *, std::vector<Value *> > &CandidatePairs, 285249423Sdim DenseSet<ValuePair> &CandidatePairsSet, 286249423Sdim std::vector<Value *> &PairableInsts, 287249423Sdim DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, 288249423Sdim DenseMap<VPPair, unsigned> &PairConnectionTypes, 289249423Sdim ValuePair P); 290234285Sdim 291234285Sdim bool pairsConflict(ValuePair P, ValuePair Q, 292249423Sdim DenseSet<ValuePair> &PairableInstUsers, 293249423Sdim DenseMap<ValuePair, std::vector<ValuePair> > 294249423Sdim *PairableInstUserMap = 0, 295249423Sdim DenseSet<VPPair> *PairableInstUserPairSet = 0); 296234285Sdim 297234285Sdim bool pairWillFormCycle(ValuePair P, 298249423Sdim DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUsers, 299249423Sdim DenseSet<ValuePair> &CurrentPairs); 300234285Sdim 301249423Sdim void pruneDAGFor( 302249423Sdim DenseMap<Value *, std::vector<Value *> > &CandidatePairs, 303249423Sdim std::vector<Value *> &PairableInsts, 304249423Sdim DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, 305249423Sdim DenseSet<ValuePair> &PairableInstUsers, 306249423Sdim DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap, 307249423Sdim DenseSet<VPPair> &PairableInstUserPairSet, 308249423Sdim DenseMap<Value *, Value *> &ChosenPairs, 309249423Sdim DenseMap<ValuePair, size_t> &DAG, 310249423Sdim DenseSet<ValuePair> &PrunedDAG, ValuePair J, 311249423Sdim bool UseCycleCheck); 312234285Sdim 313249423Sdim void buildInitialDAGFor( 314249423Sdim DenseMap<Value *, std::vector<Value *> > &CandidatePairs, 315249423Sdim DenseSet<ValuePair> &CandidatePairsSet, 316249423Sdim std::vector<Value *> &PairableInsts, 317249423Sdim DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, 318249423Sdim DenseSet<ValuePair> &PairableInstUsers, 319249423Sdim DenseMap<Value *, Value *> &ChosenPairs, 320249423Sdim DenseMap<ValuePair, size_t> &DAG, ValuePair J); 321234285Sdim 322249423Sdim void findBestDAGFor( 323249423Sdim DenseMap<Value *, std::vector<Value *> > &CandidatePairs, 324249423Sdim DenseSet<ValuePair> &CandidatePairsSet, 325249423Sdim DenseMap<ValuePair, int> &CandidatePairCostSavings, 326249423Sdim std::vector<Value *> &PairableInsts, 327249423Sdim DenseSet<ValuePair> &FixedOrderPairs, 328249423Sdim DenseMap<VPPair, unsigned> &PairConnectionTypes, 329249423Sdim DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, 330249423Sdim DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps, 331249423Sdim DenseSet<ValuePair> &PairableInstUsers, 332249423Sdim DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap, 333249423Sdim DenseSet<VPPair> &PairableInstUserPairSet, 334249423Sdim DenseMap<Value *, Value *> &ChosenPairs, 335249423Sdim DenseSet<ValuePair> &BestDAG, size_t &BestMaxDepth, 336249423Sdim int &BestEffSize, Value *II, std::vector<Value *>&JJ, 337249423Sdim bool UseCycleCheck); 338234285Sdim 339234285Sdim Value *getReplacementPointerInput(LLVMContext& Context, Instruction *I, 340243830Sdim Instruction *J, unsigned o); 341234285Sdim 342234285Sdim void fillNewShuffleMask(LLVMContext& Context, Instruction *J, 343239462Sdim unsigned MaskOffset, unsigned NumInElem, 344239462Sdim unsigned NumInElem1, unsigned IdxOffset, 345239462Sdim std::vector<Constant*> &Mask); 346234285Sdim 347234285Sdim Value *getReplacementShuffleMask(LLVMContext& Context, Instruction *I, 348234285Sdim Instruction *J); 349234285Sdim 350239462Sdim bool expandIEChain(LLVMContext& Context, Instruction *I, Instruction *J, 351239462Sdim unsigned o, Value *&LOp, unsigned numElemL, 352243830Sdim Type *ArgTypeL, Type *ArgTypeR, bool IBeforeJ, 353239462Sdim unsigned IdxOff = 0); 354239462Sdim 355234285Sdim Value *getReplacementInput(LLVMContext& Context, Instruction *I, 356243830Sdim Instruction *J, unsigned o, bool IBeforeJ); 357234285Sdim 358234285Sdim void getReplacementInputsForPair(LLVMContext& Context, Instruction *I, 359263508Sdim Instruction *J, SmallVectorImpl<Value *> &ReplacedOperands, 360243830Sdim bool IBeforeJ); 361234285Sdim 362234285Sdim void replaceOutputsOfPair(LLVMContext& Context, Instruction *I, 363234285Sdim Instruction *J, Instruction *K, 364234285Sdim Instruction *&InsertionPt, Instruction *&K1, 365243830Sdim Instruction *&K2); 366234285Sdim 367234285Sdim void collectPairLoadMoveSet(BasicBlock &BB, 368234285Sdim DenseMap<Value *, Value *> &ChosenPairs, 369249423Sdim DenseMap<Value *, std::vector<Value *> > &LoadMoveSet, 370249423Sdim DenseSet<ValuePair> &LoadMoveSetPairs, 371234285Sdim Instruction *I); 372234285Sdim 373234285Sdim void collectLoadMoveSet(BasicBlock &BB, 374234285Sdim std::vector<Value *> &PairableInsts, 375234285Sdim DenseMap<Value *, Value *> &ChosenPairs, 376249423Sdim DenseMap<Value *, std::vector<Value *> > &LoadMoveSet, 377249423Sdim DenseSet<ValuePair> &LoadMoveSetPairs); 378234285Sdim 379234285Sdim bool canMoveUsesOfIAfterJ(BasicBlock &BB, 380249423Sdim DenseSet<ValuePair> &LoadMoveSetPairs, 381234285Sdim Instruction *I, Instruction *J); 382234285Sdim 383234285Sdim void moveUsesOfIAfterJ(BasicBlock &BB, 384249423Sdim DenseSet<ValuePair> &LoadMoveSetPairs, 385234285Sdim Instruction *&InsertionPt, 386234285Sdim Instruction *I, Instruction *J); 387234285Sdim 388239462Sdim void combineMetadata(Instruction *K, const Instruction *J); 389239462Sdim 390234285Sdim bool vectorizeBB(BasicBlock &BB) { 391243830Sdim if (!DT->isReachableFromEntry(&BB)) { 392243830Sdim DEBUG(dbgs() << "BBV: skipping unreachable " << BB.getName() << 393243830Sdim " in " << BB.getParent()->getName() << "\n"); 394243830Sdim return false; 395243830Sdim } 396243830Sdim 397249423Sdim DEBUG(if (TTI) dbgs() << "BBV: using target information\n"); 398243830Sdim 399234285Sdim bool changed = false; 400234285Sdim // Iterate a sufficient number of times to merge types of size 1 bit, 401234285Sdim // then 2 bits, then 4, etc. up to half of the target vector width of the 402234285Sdim // target vector register. 403239462Sdim unsigned n = 1; 404239462Sdim for (unsigned v = 2; 405249423Sdim (TTI || v <= Config.VectorBits) && 406243830Sdim (!Config.MaxIter || n <= Config.MaxIter); 407234285Sdim v *= 2, ++n) { 408234285Sdim DEBUG(dbgs() << "BBV: fusing loop #" << n << 409234285Sdim " for " << BB.getName() << " in " << 410234285Sdim BB.getParent()->getName() << "...\n"); 411234285Sdim if (vectorizePairs(BB)) 412234285Sdim changed = true; 413234285Sdim else 414234285Sdim break; 415234285Sdim } 416234285Sdim 417239462Sdim if (changed && !Pow2LenOnly) { 418239462Sdim ++n; 419239462Sdim for (; !Config.MaxIter || n <= Config.MaxIter; ++n) { 420239462Sdim DEBUG(dbgs() << "BBV: fusing for non-2^n-length vectors loop #: " << 421239462Sdim n << " for " << BB.getName() << " in " << 422239462Sdim BB.getParent()->getName() << "...\n"); 423239462Sdim if (!vectorizePairs(BB, true)) break; 424239462Sdim } 425239462Sdim } 426239462Sdim 427234285Sdim DEBUG(dbgs() << "BBV: done!\n"); 428234285Sdim return changed; 429234285Sdim } 430234285Sdim 431234285Sdim virtual bool runOnBasicBlock(BasicBlock &BB) { 432234285Sdim AA = &getAnalysis<AliasAnalysis>(); 433243830Sdim DT = &getAnalysis<DominatorTree>(); 434234285Sdim SE = &getAnalysis<ScalarEvolution>(); 435243830Sdim TD = getAnalysisIfAvailable<DataLayout>(); 436249423Sdim TTI = IgnoreTargetInfo ? 0 : &getAnalysis<TargetTransformInfo>(); 437234285Sdim 438234285Sdim return vectorizeBB(BB); 439234285Sdim } 440234285Sdim 441234285Sdim virtual void getAnalysisUsage(AnalysisUsage &AU) const { 442234285Sdim BasicBlockPass::getAnalysisUsage(AU); 443234285Sdim AU.addRequired<AliasAnalysis>(); 444243830Sdim AU.addRequired<DominatorTree>(); 445234285Sdim AU.addRequired<ScalarEvolution>(); 446249423Sdim AU.addRequired<TargetTransformInfo>(); 447234285Sdim AU.addPreserved<AliasAnalysis>(); 448243830Sdim AU.addPreserved<DominatorTree>(); 449234285Sdim AU.addPreserved<ScalarEvolution>(); 450234285Sdim AU.setPreservesCFG(); 451234285Sdim } 452234285Sdim 453239462Sdim static inline VectorType *getVecTypeForPair(Type *ElemTy, Type *Elem2Ty) { 454239462Sdim assert(ElemTy->getScalarType() == Elem2Ty->getScalarType() && 455239462Sdim "Cannot form vector from incompatible scalar types"); 456239462Sdim Type *STy = ElemTy->getScalarType(); 457239462Sdim 458239462Sdim unsigned numElem; 459234285Sdim if (VectorType *VTy = dyn_cast<VectorType>(ElemTy)) { 460239462Sdim numElem = VTy->getNumElements(); 461239462Sdim } else { 462239462Sdim numElem = 1; 463234285Sdim } 464234285Sdim 465239462Sdim if (VectorType *VTy = dyn_cast<VectorType>(Elem2Ty)) { 466239462Sdim numElem += VTy->getNumElements(); 467239462Sdim } else { 468239462Sdim numElem += 1; 469239462Sdim } 470239462Sdim 471239462Sdim return VectorType::get(STy, numElem); 472234285Sdim } 473234285Sdim 474239462Sdim static inline void getInstructionTypes(Instruction *I, 475239462Sdim Type *&T1, Type *&T2) { 476249423Sdim if (StoreInst *SI = dyn_cast<StoreInst>(I)) { 477239462Sdim // For stores, it is the value type, not the pointer type that matters 478239462Sdim // because the value is what will come from a vector register. 479239462Sdim 480249423Sdim Value *IVal = SI->getValueOperand(); 481239462Sdim T1 = IVal->getType(); 482239462Sdim } else { 483239462Sdim T1 = I->getType(); 484239462Sdim } 485239462Sdim 486249423Sdim if (CastInst *CI = dyn_cast<CastInst>(I)) 487249423Sdim T2 = CI->getSrcTy(); 488239462Sdim else 489239462Sdim T2 = T1; 490243830Sdim 491243830Sdim if (SelectInst *SI = dyn_cast<SelectInst>(I)) { 492243830Sdim T2 = SI->getCondition()->getType(); 493243830Sdim } else if (ShuffleVectorInst *SI = dyn_cast<ShuffleVectorInst>(I)) { 494243830Sdim T2 = SI->getOperand(0)->getType(); 495243830Sdim } else if (CmpInst *CI = dyn_cast<CmpInst>(I)) { 496243830Sdim T2 = CI->getOperand(0)->getType(); 497243830Sdim } 498239462Sdim } 499239462Sdim 500234285Sdim // Returns the weight associated with the provided value. A chain of 501234285Sdim // candidate pairs has a length given by the sum of the weights of its 502234285Sdim // members (one weight per pair; the weight of each member of the pair 503234285Sdim // is assumed to be the same). This length is then compared to the 504234285Sdim // chain-length threshold to determine if a given chain is significant 505234285Sdim // enough to be vectorized. The length is also used in comparing 506234285Sdim // candidate chains where longer chains are considered to be better. 507234285Sdim // Note: when this function returns 0, the resulting instructions are 508234285Sdim // not actually fused. 509234285Sdim inline size_t getDepthFactor(Value *V) { 510234285Sdim // InsertElement and ExtractElement have a depth factor of zero. This is 511234285Sdim // for two reasons: First, they cannot be usefully fused. Second, because 512234285Sdim // the pass generates a lot of these, they can confuse the simple metric 513249423Sdim // used to compare the dags in the next iteration. Thus, giving them a 514234285Sdim // weight of zero allows the pass to essentially ignore them in 515234285Sdim // subsequent iterations when looking for vectorization opportunities 516234285Sdim // while still tracking dependency chains that flow through those 517234285Sdim // instructions. 518234285Sdim if (isa<InsertElementInst>(V) || isa<ExtractElementInst>(V)) 519234285Sdim return 0; 520234285Sdim 521234285Sdim // Give a load or store half of the required depth so that load/store 522234285Sdim // pairs will vectorize. 523234285Sdim if (!Config.NoMemOpBoost && (isa<LoadInst>(V) || isa<StoreInst>(V))) 524234285Sdim return Config.ReqChainDepth/2; 525234285Sdim 526234285Sdim return 1; 527234285Sdim } 528234285Sdim 529249423Sdim // Returns the cost of the provided instruction using TTI. 530243830Sdim // This does not handle loads and stores. 531243830Sdim unsigned getInstrCost(unsigned Opcode, Type *T1, Type *T2) { 532243830Sdim switch (Opcode) { 533243830Sdim default: break; 534243830Sdim case Instruction::GetElementPtr: 535243830Sdim // We mark this instruction as zero-cost because scalar GEPs are usually 536263508Sdim // lowered to the instruction addressing mode. At the moment we don't 537243830Sdim // generate vector GEPs. 538243830Sdim return 0; 539243830Sdim case Instruction::Br: 540249423Sdim return TTI->getCFInstrCost(Opcode); 541243830Sdim case Instruction::PHI: 542243830Sdim return 0; 543243830Sdim case Instruction::Add: 544243830Sdim case Instruction::FAdd: 545243830Sdim case Instruction::Sub: 546243830Sdim case Instruction::FSub: 547243830Sdim case Instruction::Mul: 548243830Sdim case Instruction::FMul: 549243830Sdim case Instruction::UDiv: 550243830Sdim case Instruction::SDiv: 551243830Sdim case Instruction::FDiv: 552243830Sdim case Instruction::URem: 553243830Sdim case Instruction::SRem: 554243830Sdim case Instruction::FRem: 555243830Sdim case Instruction::Shl: 556243830Sdim case Instruction::LShr: 557243830Sdim case Instruction::AShr: 558243830Sdim case Instruction::And: 559243830Sdim case Instruction::Or: 560243830Sdim case Instruction::Xor: 561249423Sdim return TTI->getArithmeticInstrCost(Opcode, T1); 562243830Sdim case Instruction::Select: 563243830Sdim case Instruction::ICmp: 564243830Sdim case Instruction::FCmp: 565249423Sdim return TTI->getCmpSelInstrCost(Opcode, T1, T2); 566243830Sdim case Instruction::ZExt: 567243830Sdim case Instruction::SExt: 568243830Sdim case Instruction::FPToUI: 569243830Sdim case Instruction::FPToSI: 570243830Sdim case Instruction::FPExt: 571243830Sdim case Instruction::PtrToInt: 572243830Sdim case Instruction::IntToPtr: 573243830Sdim case Instruction::SIToFP: 574243830Sdim case Instruction::UIToFP: 575243830Sdim case Instruction::Trunc: 576243830Sdim case Instruction::FPTrunc: 577243830Sdim case Instruction::BitCast: 578243830Sdim case Instruction::ShuffleVector: 579249423Sdim return TTI->getCastInstrCost(Opcode, T1, T2); 580243830Sdim } 581243830Sdim 582243830Sdim return 1; 583243830Sdim } 584243830Sdim 585234285Sdim // This determines the relative offset of two loads or stores, returning 586234285Sdim // true if the offset could be determined to be some constant value. 587234285Sdim // For example, if OffsetInElmts == 1, then J accesses the memory directly 588234285Sdim // after I; if OffsetInElmts == -1 then I accesses the memory 589239462Sdim // directly after J. 590234285Sdim bool getPairPtrInfo(Instruction *I, Instruction *J, 591234285Sdim Value *&IPtr, Value *&JPtr, unsigned &IAlignment, unsigned &JAlignment, 592243830Sdim unsigned &IAddressSpace, unsigned &JAddressSpace, 593243830Sdim int64_t &OffsetInElmts, bool ComputeOffset = true) { 594234285Sdim OffsetInElmts = 0; 595243830Sdim if (LoadInst *LI = dyn_cast<LoadInst>(I)) { 596243830Sdim LoadInst *LJ = cast<LoadInst>(J); 597243830Sdim IPtr = LI->getPointerOperand(); 598243830Sdim JPtr = LJ->getPointerOperand(); 599243830Sdim IAlignment = LI->getAlignment(); 600243830Sdim JAlignment = LJ->getAlignment(); 601243830Sdim IAddressSpace = LI->getPointerAddressSpace(); 602243830Sdim JAddressSpace = LJ->getPointerAddressSpace(); 603234285Sdim } else { 604243830Sdim StoreInst *SI = cast<StoreInst>(I), *SJ = cast<StoreInst>(J); 605243830Sdim IPtr = SI->getPointerOperand(); 606243830Sdim JPtr = SJ->getPointerOperand(); 607243830Sdim IAlignment = SI->getAlignment(); 608243830Sdim JAlignment = SJ->getAlignment(); 609243830Sdim IAddressSpace = SI->getPointerAddressSpace(); 610243830Sdim JAddressSpace = SJ->getPointerAddressSpace(); 611234285Sdim } 612234285Sdim 613243830Sdim if (!ComputeOffset) 614243830Sdim return true; 615243830Sdim 616234285Sdim const SCEV *IPtrSCEV = SE->getSCEV(IPtr); 617234285Sdim const SCEV *JPtrSCEV = SE->getSCEV(JPtr); 618234285Sdim 619234285Sdim // If this is a trivial offset, then we'll get something like 620234285Sdim // 1*sizeof(type). With target data, which we need anyway, this will get 621234285Sdim // constant folded into a number. 622234285Sdim const SCEV *OffsetSCEV = SE->getMinusSCEV(JPtrSCEV, IPtrSCEV); 623234285Sdim if (const SCEVConstant *ConstOffSCEV = 624234285Sdim dyn_cast<SCEVConstant>(OffsetSCEV)) { 625234285Sdim ConstantInt *IntOff = ConstOffSCEV->getValue(); 626234285Sdim int64_t Offset = IntOff->getSExtValue(); 627234285Sdim 628263508Sdim Type *VTy = IPtr->getType()->getPointerElementType(); 629234285Sdim int64_t VTyTSS = (int64_t) TD->getTypeStoreSize(VTy); 630234285Sdim 631263508Sdim Type *VTy2 = JPtr->getType()->getPointerElementType(); 632239462Sdim if (VTy != VTy2 && Offset < 0) { 633239462Sdim int64_t VTy2TSS = (int64_t) TD->getTypeStoreSize(VTy2); 634239462Sdim OffsetInElmts = Offset/VTy2TSS; 635239462Sdim return (abs64(Offset) % VTy2TSS) == 0; 636239462Sdim } 637234285Sdim 638234285Sdim OffsetInElmts = Offset/VTyTSS; 639234285Sdim return (abs64(Offset) % VTyTSS) == 0; 640234285Sdim } 641234285Sdim 642234285Sdim return false; 643234285Sdim } 644234285Sdim 645234285Sdim // Returns true if the provided CallInst represents an intrinsic that can 646234285Sdim // be vectorized. 647234285Sdim bool isVectorizableIntrinsic(CallInst* I) { 648234285Sdim Function *F = I->getCalledFunction(); 649234285Sdim if (!F) return false; 650234285Sdim 651249423Sdim Intrinsic::ID IID = (Intrinsic::ID) F->getIntrinsicID(); 652234285Sdim if (!IID) return false; 653234285Sdim 654234285Sdim switch(IID) { 655234285Sdim default: 656234285Sdim return false; 657234285Sdim case Intrinsic::sqrt: 658234285Sdim case Intrinsic::powi: 659234285Sdim case Intrinsic::sin: 660234285Sdim case Intrinsic::cos: 661234285Sdim case Intrinsic::log: 662234285Sdim case Intrinsic::log2: 663234285Sdim case Intrinsic::log10: 664234285Sdim case Intrinsic::exp: 665234285Sdim case Intrinsic::exp2: 666234285Sdim case Intrinsic::pow: 667234285Sdim return Config.VectorizeMath; 668234285Sdim case Intrinsic::fma: 669249423Sdim case Intrinsic::fmuladd: 670234285Sdim return Config.VectorizeFMA; 671234285Sdim } 672234285Sdim } 673234285Sdim 674243830Sdim bool isPureIEChain(InsertElementInst *IE) { 675243830Sdim InsertElementInst *IENext = IE; 676243830Sdim do { 677243830Sdim if (!isa<UndefValue>(IENext->getOperand(0)) && 678243830Sdim !isa<InsertElementInst>(IENext->getOperand(0))) { 679243830Sdim return false; 680243830Sdim } 681243830Sdim } while ((IENext = 682243830Sdim dyn_cast<InsertElementInst>(IENext->getOperand(0)))); 683243830Sdim 684243830Sdim return true; 685243830Sdim } 686234285Sdim }; 687234285Sdim 688234285Sdim // This function implements one vectorization iteration on the provided 689234285Sdim // basic block. It returns true if the block is changed. 690239462Sdim bool BBVectorize::vectorizePairs(BasicBlock &BB, bool NonPow2Len) { 691234285Sdim bool ShouldContinue; 692234285Sdim BasicBlock::iterator Start = BB.getFirstInsertionPt(); 693234285Sdim 694234285Sdim std::vector<Value *> AllPairableInsts; 695234285Sdim DenseMap<Value *, Value *> AllChosenPairs; 696243830Sdim DenseSet<ValuePair> AllFixedOrderPairs; 697243830Sdim DenseMap<VPPair, unsigned> AllPairConnectionTypes; 698249423Sdim DenseMap<ValuePair, std::vector<ValuePair> > AllConnectedPairs, 699249423Sdim AllConnectedPairDeps; 700234285Sdim 701234285Sdim do { 702234285Sdim std::vector<Value *> PairableInsts; 703249423Sdim DenseMap<Value *, std::vector<Value *> > CandidatePairs; 704243830Sdim DenseSet<ValuePair> FixedOrderPairs; 705243830Sdim DenseMap<ValuePair, int> CandidatePairCostSavings; 706234285Sdim ShouldContinue = getCandidatePairs(BB, Start, CandidatePairs, 707243830Sdim FixedOrderPairs, 708243830Sdim CandidatePairCostSavings, 709239462Sdim PairableInsts, NonPow2Len); 710234285Sdim if (PairableInsts.empty()) continue; 711234285Sdim 712249423Sdim // Build the candidate pair set for faster lookups. 713249423Sdim DenseSet<ValuePair> CandidatePairsSet; 714249423Sdim for (DenseMap<Value *, std::vector<Value *> >::iterator I = 715249423Sdim CandidatePairs.begin(), E = CandidatePairs.end(); I != E; ++I) 716249423Sdim for (std::vector<Value *>::iterator J = I->second.begin(), 717249423Sdim JE = I->second.end(); J != JE; ++J) 718249423Sdim CandidatePairsSet.insert(ValuePair(I->first, *J)); 719249423Sdim 720234285Sdim // Now we have a map of all of the pairable instructions and we need to 721234285Sdim // select the best possible pairing. A good pairing is one such that the 722234285Sdim // users of the pair are also paired. This defines a (directed) forest 723234285Sdim // over the pairs such that two pairs are connected iff the second pair 724234285Sdim // uses the first. 725234285Sdim 726234285Sdim // Note that it only matters that both members of the second pair use some 727234285Sdim // element of the first pair (to allow for splatting). 728234285Sdim 729249423Sdim DenseMap<ValuePair, std::vector<ValuePair> > ConnectedPairs, 730249423Sdim ConnectedPairDeps; 731243830Sdim DenseMap<VPPair, unsigned> PairConnectionTypes; 732249423Sdim computeConnectedPairs(CandidatePairs, CandidatePairsSet, 733249423Sdim PairableInsts, ConnectedPairs, PairConnectionTypes); 734234285Sdim if (ConnectedPairs.empty()) continue; 735234285Sdim 736249423Sdim for (DenseMap<ValuePair, std::vector<ValuePair> >::iterator 737243830Sdim I = ConnectedPairs.begin(), IE = ConnectedPairs.end(); 738249423Sdim I != IE; ++I) 739249423Sdim for (std::vector<ValuePair>::iterator J = I->second.begin(), 740249423Sdim JE = I->second.end(); J != JE; ++J) 741249423Sdim ConnectedPairDeps[*J].push_back(I->first); 742243830Sdim 743234285Sdim // Build the pairable-instruction dependency map 744234285Sdim DenseSet<ValuePair> PairableInstUsers; 745234285Sdim buildDepMap(BB, CandidatePairs, PairableInsts, PairableInstUsers); 746234285Sdim 747234285Sdim // There is now a graph of the connected pairs. For each variable, pick 748249423Sdim // the pairing with the largest dag meeting the depth requirement on at 749249423Sdim // least one branch. Then select all pairings that are part of that dag 750234285Sdim // and remove them from the list of available pairings and pairable 751234285Sdim // variables. 752234285Sdim 753234285Sdim DenseMap<Value *, Value *> ChosenPairs; 754249423Sdim choosePairs(CandidatePairs, CandidatePairsSet, 755249423Sdim CandidatePairCostSavings, 756243830Sdim PairableInsts, FixedOrderPairs, PairConnectionTypes, 757243830Sdim ConnectedPairs, ConnectedPairDeps, 758234285Sdim PairableInstUsers, ChosenPairs); 759234285Sdim 760234285Sdim if (ChosenPairs.empty()) continue; 761234285Sdim AllPairableInsts.insert(AllPairableInsts.end(), PairableInsts.begin(), 762234285Sdim PairableInsts.end()); 763234285Sdim AllChosenPairs.insert(ChosenPairs.begin(), ChosenPairs.end()); 764243830Sdim 765243830Sdim // Only for the chosen pairs, propagate information on fixed-order pairs, 766243830Sdim // pair connections, and their types to the data structures used by the 767243830Sdim // pair fusion procedures. 768243830Sdim for (DenseMap<Value *, Value *>::iterator I = ChosenPairs.begin(), 769243830Sdim IE = ChosenPairs.end(); I != IE; ++I) { 770243830Sdim if (FixedOrderPairs.count(*I)) 771243830Sdim AllFixedOrderPairs.insert(*I); 772243830Sdim else if (FixedOrderPairs.count(ValuePair(I->second, I->first))) 773243830Sdim AllFixedOrderPairs.insert(ValuePair(I->second, I->first)); 774243830Sdim 775243830Sdim for (DenseMap<Value *, Value *>::iterator J = ChosenPairs.begin(); 776243830Sdim J != IE; ++J) { 777243830Sdim DenseMap<VPPair, unsigned>::iterator K = 778243830Sdim PairConnectionTypes.find(VPPair(*I, *J)); 779243830Sdim if (K != PairConnectionTypes.end()) { 780243830Sdim AllPairConnectionTypes.insert(*K); 781243830Sdim } else { 782243830Sdim K = PairConnectionTypes.find(VPPair(*J, *I)); 783243830Sdim if (K != PairConnectionTypes.end()) 784243830Sdim AllPairConnectionTypes.insert(*K); 785243830Sdim } 786243830Sdim } 787243830Sdim } 788243830Sdim 789249423Sdim for (DenseMap<ValuePair, std::vector<ValuePair> >::iterator 790243830Sdim I = ConnectedPairs.begin(), IE = ConnectedPairs.end(); 791249423Sdim I != IE; ++I) 792249423Sdim for (std::vector<ValuePair>::iterator J = I->second.begin(), 793249423Sdim JE = I->second.end(); J != JE; ++J) 794249423Sdim if (AllPairConnectionTypes.count(VPPair(I->first, *J))) { 795249423Sdim AllConnectedPairs[I->first].push_back(*J); 796249423Sdim AllConnectedPairDeps[*J].push_back(I->first); 797249423Sdim } 798234285Sdim } while (ShouldContinue); 799234285Sdim 800234285Sdim if (AllChosenPairs.empty()) return false; 801234285Sdim NumFusedOps += AllChosenPairs.size(); 802234285Sdim 803234285Sdim // A set of pairs has now been selected. It is now necessary to replace the 804234285Sdim // paired instructions with vector instructions. For this procedure each 805234285Sdim // operand must be replaced with a vector operand. This vector is formed 806234285Sdim // by using build_vector on the old operands. The replaced values are then 807234285Sdim // replaced with a vector_extract on the result. Subsequent optimization 808234285Sdim // passes should coalesce the build/extract combinations. 809234285Sdim 810243830Sdim fuseChosenPairs(BB, AllPairableInsts, AllChosenPairs, AllFixedOrderPairs, 811243830Sdim AllPairConnectionTypes, 812243830Sdim AllConnectedPairs, AllConnectedPairDeps); 813239462Sdim 814239462Sdim // It is important to cleanup here so that future iterations of this 815239462Sdim // function have less work to do. 816243830Sdim (void) SimplifyInstructionsInBlock(&BB, TD, AA->getTargetLibraryInfo()); 817234285Sdim return true; 818234285Sdim } 819234285Sdim 820234285Sdim // This function returns true if the provided instruction is capable of being 821234285Sdim // fused into a vector instruction. This determination is based only on the 822234285Sdim // type and other attributes of the instruction. 823234285Sdim bool BBVectorize::isInstVectorizable(Instruction *I, 824234285Sdim bool &IsSimpleLoadStore) { 825234285Sdim IsSimpleLoadStore = false; 826234285Sdim 827234285Sdim if (CallInst *C = dyn_cast<CallInst>(I)) { 828234285Sdim if (!isVectorizableIntrinsic(C)) 829234285Sdim return false; 830234285Sdim } else if (LoadInst *L = dyn_cast<LoadInst>(I)) { 831234285Sdim // Vectorize simple loads if possbile: 832234285Sdim IsSimpleLoadStore = L->isSimple(); 833234285Sdim if (!IsSimpleLoadStore || !Config.VectorizeMemOps) 834234285Sdim return false; 835234285Sdim } else if (StoreInst *S = dyn_cast<StoreInst>(I)) { 836234285Sdim // Vectorize simple stores if possbile: 837234285Sdim IsSimpleLoadStore = S->isSimple(); 838234285Sdim if (!IsSimpleLoadStore || !Config.VectorizeMemOps) 839234285Sdim return false; 840234285Sdim } else if (CastInst *C = dyn_cast<CastInst>(I)) { 841234285Sdim // We can vectorize casts, but not casts of pointer types, etc. 842234285Sdim if (!Config.VectorizeCasts) 843234285Sdim return false; 844234285Sdim 845234285Sdim Type *SrcTy = C->getSrcTy(); 846234982Sdim if (!SrcTy->isSingleValueType()) 847234285Sdim return false; 848234285Sdim 849234285Sdim Type *DestTy = C->getDestTy(); 850234982Sdim if (!DestTy->isSingleValueType()) 851234285Sdim return false; 852234982Sdim } else if (isa<SelectInst>(I)) { 853234982Sdim if (!Config.VectorizeSelect) 854234982Sdim return false; 855239462Sdim } else if (isa<CmpInst>(I)) { 856239462Sdim if (!Config.VectorizeCmp) 857239462Sdim return false; 858234982Sdim } else if (GetElementPtrInst *G = dyn_cast<GetElementPtrInst>(I)) { 859234982Sdim if (!Config.VectorizeGEP) 860234982Sdim return false; 861234982Sdim 862234982Sdim // Currently, vector GEPs exist only with one index. 863234982Sdim if (G->getNumIndices() != 1) 864234982Sdim return false; 865234285Sdim } else if (!(I->isBinaryOp() || isa<ShuffleVectorInst>(I) || 866234285Sdim isa<ExtractElementInst>(I) || isa<InsertElementInst>(I))) { 867234285Sdim return false; 868234285Sdim } 869234285Sdim 870234285Sdim // We can't vectorize memory operations without target data 871234285Sdim if (TD == 0 && IsSimpleLoadStore) 872234285Sdim return false; 873234285Sdim 874234285Sdim Type *T1, *T2; 875239462Sdim getInstructionTypes(I, T1, T2); 876234285Sdim 877234285Sdim // Not every type can be vectorized... 878234285Sdim if (!(VectorType::isValidElementType(T1) || T1->isVectorTy()) || 879234285Sdim !(VectorType::isValidElementType(T2) || T2->isVectorTy())) 880234285Sdim return false; 881234285Sdim 882243830Sdim if (T1->getScalarSizeInBits() == 1) { 883239462Sdim if (!Config.VectorizeBools) 884239462Sdim return false; 885239462Sdim } else { 886243830Sdim if (!Config.VectorizeInts && T1->isIntOrIntVectorTy()) 887239462Sdim return false; 888239462Sdim } 889243830Sdim 890243830Sdim if (T2->getScalarSizeInBits() == 1) { 891243830Sdim if (!Config.VectorizeBools) 892243830Sdim return false; 893243830Sdim } else { 894243830Sdim if (!Config.VectorizeInts && T2->isIntOrIntVectorTy()) 895243830Sdim return false; 896243830Sdim } 897243830Sdim 898234285Sdim if (!Config.VectorizeFloats 899234285Sdim && (T1->isFPOrFPVectorTy() || T2->isFPOrFPVectorTy())) 900234285Sdim return false; 901234285Sdim 902239462Sdim // Don't vectorize target-specific types. 903239462Sdim if (T1->isX86_FP80Ty() || T1->isPPC_FP128Ty() || T1->isX86_MMXTy()) 904239462Sdim return false; 905239462Sdim if (T2->isX86_FP80Ty() || T2->isPPC_FP128Ty() || T2->isX86_MMXTy()) 906239462Sdim return false; 907239462Sdim 908234982Sdim if ((!Config.VectorizePointers || TD == 0) && 909234982Sdim (T1->getScalarType()->isPointerTy() || 910234982Sdim T2->getScalarType()->isPointerTy())) 911234982Sdim return false; 912234982Sdim 913249423Sdim if (!TTI && (T1->getPrimitiveSizeInBits() >= Config.VectorBits || 914249423Sdim T2->getPrimitiveSizeInBits() >= Config.VectorBits)) 915234285Sdim return false; 916234285Sdim 917234285Sdim return true; 918234285Sdim } 919234285Sdim 920234285Sdim // This function returns true if the two provided instructions are compatible 921234285Sdim // (meaning that they can be fused into a vector instruction). This assumes 922234285Sdim // that I has already been determined to be vectorizable and that J is not 923249423Sdim // in the use dag of I. 924234285Sdim bool BBVectorize::areInstsCompatible(Instruction *I, Instruction *J, 925243830Sdim bool IsSimpleLoadStore, bool NonPow2Len, 926243830Sdim int &CostSavings, int &FixedOrder) { 927234285Sdim DEBUG(if (DebugInstructionExamination) dbgs() << "BBV: looking at " << *I << 928234285Sdim " <-> " << *J << "\n"); 929234285Sdim 930243830Sdim CostSavings = 0; 931243830Sdim FixedOrder = 0; 932243830Sdim 933234285Sdim // Loads and stores can be merged if they have different alignments, 934234285Sdim // but are otherwise the same. 935239462Sdim if (!J->isSameOperationAs(I, Instruction::CompareIgnoringAlignment | 936239462Sdim (NonPow2Len ? Instruction::CompareUsingScalarTypes : 0))) 937239462Sdim return false; 938234285Sdim 939239462Sdim Type *IT1, *IT2, *JT1, *JT2; 940239462Sdim getInstructionTypes(I, IT1, IT2); 941239462Sdim getInstructionTypes(J, JT1, JT2); 942239462Sdim unsigned MaxTypeBits = std::max( 943239462Sdim IT1->getPrimitiveSizeInBits() + JT1->getPrimitiveSizeInBits(), 944239462Sdim IT2->getPrimitiveSizeInBits() + JT2->getPrimitiveSizeInBits()); 945249423Sdim if (!TTI && MaxTypeBits > Config.VectorBits) 946234285Sdim return false; 947239462Sdim 948234285Sdim // FIXME: handle addsub-type operations! 949234285Sdim 950234285Sdim if (IsSimpleLoadStore) { 951234285Sdim Value *IPtr, *JPtr; 952243830Sdim unsigned IAlignment, JAlignment, IAddressSpace, JAddressSpace; 953234285Sdim int64_t OffsetInElmts = 0; 954234285Sdim if (getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment, 955243830Sdim IAddressSpace, JAddressSpace, 956234285Sdim OffsetInElmts) && abs64(OffsetInElmts) == 1) { 957243830Sdim FixedOrder = (int) OffsetInElmts; 958243830Sdim unsigned BottomAlignment = IAlignment; 959243830Sdim if (OffsetInElmts < 0) BottomAlignment = JAlignment; 960243830Sdim 961243830Sdim Type *aTypeI = isa<StoreInst>(I) ? 962243830Sdim cast<StoreInst>(I)->getValueOperand()->getType() : I->getType(); 963243830Sdim Type *aTypeJ = isa<StoreInst>(J) ? 964243830Sdim cast<StoreInst>(J)->getValueOperand()->getType() : J->getType(); 965243830Sdim Type *VType = getVecTypeForPair(aTypeI, aTypeJ); 966243830Sdim 967234285Sdim if (Config.AlignedOnly) { 968234285Sdim // An aligned load or store is possible only if the instruction 969234285Sdim // with the lower offset has an alignment suitable for the 970234285Sdim // vector type. 971234285Sdim 972234285Sdim unsigned VecAlignment = TD->getPrefTypeAlignment(VType); 973234285Sdim if (BottomAlignment < VecAlignment) 974234285Sdim return false; 975234285Sdim } 976243830Sdim 977249423Sdim if (TTI) { 978249423Sdim unsigned ICost = TTI->getMemoryOpCost(I->getOpcode(), aTypeI, 979249423Sdim IAlignment, IAddressSpace); 980249423Sdim unsigned JCost = TTI->getMemoryOpCost(J->getOpcode(), aTypeJ, 981249423Sdim JAlignment, JAddressSpace); 982249423Sdim unsigned VCost = TTI->getMemoryOpCost(I->getOpcode(), VType, 983249423Sdim BottomAlignment, 984249423Sdim IAddressSpace); 985249423Sdim 986249423Sdim ICost += TTI->getAddressComputationCost(aTypeI); 987249423Sdim JCost += TTI->getAddressComputationCost(aTypeJ); 988249423Sdim VCost += TTI->getAddressComputationCost(VType); 989249423Sdim 990243830Sdim if (VCost > ICost + JCost) 991243830Sdim return false; 992243830Sdim 993243830Sdim // We don't want to fuse to a type that will be split, even 994243830Sdim // if the two input types will also be split and there is no other 995243830Sdim // associated cost. 996249423Sdim unsigned VParts = TTI->getNumberOfParts(VType); 997243830Sdim if (VParts > 1) 998243830Sdim return false; 999243830Sdim else if (!VParts && VCost == ICost + JCost) 1000243830Sdim return false; 1001243830Sdim 1002243830Sdim CostSavings = ICost + JCost - VCost; 1003243830Sdim } 1004234285Sdim } else { 1005234285Sdim return false; 1006234285Sdim } 1007249423Sdim } else if (TTI) { 1008243830Sdim unsigned ICost = getInstrCost(I->getOpcode(), IT1, IT2); 1009243830Sdim unsigned JCost = getInstrCost(J->getOpcode(), JT1, JT2); 1010243830Sdim Type *VT1 = getVecTypeForPair(IT1, JT1), 1011243830Sdim *VT2 = getVecTypeForPair(IT2, JT2); 1012249423Sdim 1013249423Sdim // Note that this procedure is incorrect for insert and extract element 1014249423Sdim // instructions (because combining these often results in a shuffle), 1015249423Sdim // but this cost is ignored (because insert and extract element 1016249423Sdim // instructions are assigned a zero depth factor and are not really 1017249423Sdim // fused in general). 1018243830Sdim unsigned VCost = getInstrCost(I->getOpcode(), VT1, VT2); 1019243830Sdim 1020243830Sdim if (VCost > ICost + JCost) 1021243830Sdim return false; 1022243830Sdim 1023243830Sdim // We don't want to fuse to a type that will be split, even 1024243830Sdim // if the two input types will also be split and there is no other 1025243830Sdim // associated cost. 1026249423Sdim unsigned VParts1 = TTI->getNumberOfParts(VT1), 1027249423Sdim VParts2 = TTI->getNumberOfParts(VT2); 1028243830Sdim if (VParts1 > 1 || VParts2 > 1) 1029243830Sdim return false; 1030243830Sdim else if ((!VParts1 || !VParts2) && VCost == ICost + JCost) 1031243830Sdim return false; 1032243830Sdim 1033243830Sdim CostSavings = ICost + JCost - VCost; 1034234285Sdim } 1035234285Sdim 1036234285Sdim // The powi intrinsic is special because only the first argument is 1037234285Sdim // vectorized, the second arguments must be equal. 1038234285Sdim CallInst *CI = dyn_cast<CallInst>(I); 1039234285Sdim Function *FI; 1040249423Sdim if (CI && (FI = CI->getCalledFunction())) { 1041249423Sdim Intrinsic::ID IID = (Intrinsic::ID) FI->getIntrinsicID(); 1042249423Sdim if (IID == Intrinsic::powi) { 1043249423Sdim Value *A1I = CI->getArgOperand(1), 1044249423Sdim *A1J = cast<CallInst>(J)->getArgOperand(1); 1045249423Sdim const SCEV *A1ISCEV = SE->getSCEV(A1I), 1046249423Sdim *A1JSCEV = SE->getSCEV(A1J); 1047249423Sdim return (A1ISCEV == A1JSCEV); 1048249423Sdim } 1049234285Sdim 1050249423Sdim if (IID && TTI) { 1051249423Sdim SmallVector<Type*, 4> Tys; 1052249423Sdim for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) 1053249423Sdim Tys.push_back(CI->getArgOperand(i)->getType()); 1054249423Sdim unsigned ICost = TTI->getIntrinsicInstrCost(IID, IT1, Tys); 1055249423Sdim 1056249423Sdim Tys.clear(); 1057249423Sdim CallInst *CJ = cast<CallInst>(J); 1058249423Sdim for (unsigned i = 0, ie = CJ->getNumArgOperands(); i != ie; ++i) 1059249423Sdim Tys.push_back(CJ->getArgOperand(i)->getType()); 1060249423Sdim unsigned JCost = TTI->getIntrinsicInstrCost(IID, JT1, Tys); 1061249423Sdim 1062249423Sdim Tys.clear(); 1063249423Sdim assert(CI->getNumArgOperands() == CJ->getNumArgOperands() && 1064249423Sdim "Intrinsic argument counts differ"); 1065249423Sdim for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) { 1066249423Sdim if (IID == Intrinsic::powi && i == 1) 1067249423Sdim Tys.push_back(CI->getArgOperand(i)->getType()); 1068249423Sdim else 1069249423Sdim Tys.push_back(getVecTypeForPair(CI->getArgOperand(i)->getType(), 1070249423Sdim CJ->getArgOperand(i)->getType())); 1071249423Sdim } 1072249423Sdim 1073249423Sdim Type *RetTy = getVecTypeForPair(IT1, JT1); 1074249423Sdim unsigned VCost = TTI->getIntrinsicInstrCost(IID, RetTy, Tys); 1075249423Sdim 1076249423Sdim if (VCost > ICost + JCost) 1077249423Sdim return false; 1078249423Sdim 1079249423Sdim // We don't want to fuse to a type that will be split, even 1080249423Sdim // if the two input types will also be split and there is no other 1081249423Sdim // associated cost. 1082249423Sdim unsigned RetParts = TTI->getNumberOfParts(RetTy); 1083249423Sdim if (RetParts > 1) 1084249423Sdim return false; 1085249423Sdim else if (!RetParts && VCost == ICost + JCost) 1086249423Sdim return false; 1087249423Sdim 1088249423Sdim for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) { 1089249423Sdim if (!Tys[i]->isVectorTy()) 1090249423Sdim continue; 1091249423Sdim 1092249423Sdim unsigned NumParts = TTI->getNumberOfParts(Tys[i]); 1093249423Sdim if (NumParts > 1) 1094249423Sdim return false; 1095249423Sdim else if (!NumParts && VCost == ICost + JCost) 1096249423Sdim return false; 1097249423Sdim } 1098249423Sdim 1099249423Sdim CostSavings = ICost + JCost - VCost; 1100249423Sdim } 1101234285Sdim } 1102234285Sdim 1103234285Sdim return true; 1104234285Sdim } 1105234285Sdim 1106234285Sdim // Figure out whether or not J uses I and update the users and write-set 1107234285Sdim // structures associated with I. Specifically, Users represents the set of 1108234285Sdim // instructions that depend on I. WriteSet represents the set 1109234285Sdim // of memory locations that are dependent on I. If UpdateUsers is true, 1110234285Sdim // and J uses I, then Users is updated to contain J and WriteSet is updated 1111234285Sdim // to contain any memory locations to which J writes. The function returns 1112234285Sdim // true if J uses I. By default, alias analysis is used to determine 1113234285Sdim // whether J reads from memory that overlaps with a location in WriteSet. 1114249423Sdim // If LoadMoveSet is not null, then it is a previously-computed map 1115234285Sdim // where the key is the memory-based user instruction and the value is 1116234285Sdim // the instruction to be compared with I. So, if LoadMoveSet is provided, 1117234285Sdim // then the alias analysis is not used. This is necessary because this 1118234285Sdim // function is called during the process of moving instructions during 1119234285Sdim // vectorization and the results of the alias analysis are not stable during 1120234285Sdim // that process. 1121234285Sdim bool BBVectorize::trackUsesOfI(DenseSet<Value *> &Users, 1122234285Sdim AliasSetTracker &WriteSet, Instruction *I, 1123234285Sdim Instruction *J, bool UpdateUsers, 1124249423Sdim DenseSet<ValuePair> *LoadMoveSetPairs) { 1125234285Sdim bool UsesI = false; 1126234285Sdim 1127234285Sdim // This instruction may already be marked as a user due, for example, to 1128234285Sdim // being a member of a selected pair. 1129234285Sdim if (Users.count(J)) 1130234285Sdim UsesI = true; 1131234285Sdim 1132234285Sdim if (!UsesI) 1133234285Sdim for (User::op_iterator JU = J->op_begin(), JE = J->op_end(); 1134234285Sdim JU != JE; ++JU) { 1135234285Sdim Value *V = *JU; 1136234285Sdim if (I == V || Users.count(V)) { 1137234285Sdim UsesI = true; 1138234285Sdim break; 1139234285Sdim } 1140234285Sdim } 1141234285Sdim if (!UsesI && J->mayReadFromMemory()) { 1142249423Sdim if (LoadMoveSetPairs) { 1143249423Sdim UsesI = LoadMoveSetPairs->count(ValuePair(J, I)); 1144234285Sdim } else { 1145234285Sdim for (AliasSetTracker::iterator W = WriteSet.begin(), 1146234285Sdim WE = WriteSet.end(); W != WE; ++W) { 1147234285Sdim if (W->aliasesUnknownInst(J, *AA)) { 1148234285Sdim UsesI = true; 1149234285Sdim break; 1150234285Sdim } 1151234285Sdim } 1152234285Sdim } 1153234285Sdim } 1154234285Sdim 1155234285Sdim if (UsesI && UpdateUsers) { 1156234285Sdim if (J->mayWriteToMemory()) WriteSet.add(J); 1157234285Sdim Users.insert(J); 1158234285Sdim } 1159234285Sdim 1160234285Sdim return UsesI; 1161234285Sdim } 1162234285Sdim 1163234285Sdim // This function iterates over all instruction pairs in the provided 1164234285Sdim // basic block and collects all candidate pairs for vectorization. 1165234285Sdim bool BBVectorize::getCandidatePairs(BasicBlock &BB, 1166234285Sdim BasicBlock::iterator &Start, 1167249423Sdim DenseMap<Value *, std::vector<Value *> > &CandidatePairs, 1168243830Sdim DenseSet<ValuePair> &FixedOrderPairs, 1169243830Sdim DenseMap<ValuePair, int> &CandidatePairCostSavings, 1170239462Sdim std::vector<Value *> &PairableInsts, bool NonPow2Len) { 1171249423Sdim size_t TotalPairs = 0; 1172234285Sdim BasicBlock::iterator E = BB.end(); 1173234285Sdim if (Start == E) return false; 1174234285Sdim 1175234285Sdim bool ShouldContinue = false, IAfterStart = false; 1176234285Sdim for (BasicBlock::iterator I = Start++; I != E; ++I) { 1177234285Sdim if (I == Start) IAfterStart = true; 1178234285Sdim 1179234285Sdim bool IsSimpleLoadStore; 1180234285Sdim if (!isInstVectorizable(I, IsSimpleLoadStore)) continue; 1181234285Sdim 1182234285Sdim // Look for an instruction with which to pair instruction *I... 1183234285Sdim DenseSet<Value *> Users; 1184234285Sdim AliasSetTracker WriteSet(*AA); 1185263508Sdim if (I->mayWriteToMemory()) WriteSet.add(I); 1186263508Sdim 1187234285Sdim bool JAfterStart = IAfterStart; 1188234285Sdim BasicBlock::iterator J = llvm::next(I); 1189234285Sdim for (unsigned ss = 0; J != E && ss <= Config.SearchLimit; ++J, ++ss) { 1190234285Sdim if (J == Start) JAfterStart = true; 1191234285Sdim 1192234285Sdim // Determine if J uses I, if so, exit the loop. 1193234285Sdim bool UsesI = trackUsesOfI(Users, WriteSet, I, J, !Config.FastDep); 1194234285Sdim if (Config.FastDep) { 1195234285Sdim // Note: For this heuristic to be effective, independent operations 1196234285Sdim // must tend to be intermixed. This is likely to be true from some 1197234285Sdim // kinds of grouped loop unrolling (but not the generic LLVM pass), 1198234285Sdim // but otherwise may require some kind of reordering pass. 1199234285Sdim 1200234285Sdim // When using fast dependency analysis, 1201234285Sdim // stop searching after first use: 1202234285Sdim if (UsesI) break; 1203234285Sdim } else { 1204234285Sdim if (UsesI) continue; 1205234285Sdim } 1206234285Sdim 1207234285Sdim // J does not use I, and comes before the first use of I, so it can be 1208234285Sdim // merged with I if the instructions are compatible. 1209243830Sdim int CostSavings, FixedOrder; 1210243830Sdim if (!areInstsCompatible(I, J, IsSimpleLoadStore, NonPow2Len, 1211243830Sdim CostSavings, FixedOrder)) continue; 1212234285Sdim 1213234285Sdim // J is a candidate for merging with I. 1214234285Sdim if (!PairableInsts.size() || 1215234285Sdim PairableInsts[PairableInsts.size()-1] != I) { 1216234285Sdim PairableInsts.push_back(I); 1217234285Sdim } 1218234285Sdim 1219249423Sdim CandidatePairs[I].push_back(J); 1220249423Sdim ++TotalPairs; 1221249423Sdim if (TTI) 1222243830Sdim CandidatePairCostSavings.insert(ValuePairWithCost(ValuePair(I, J), 1223243830Sdim CostSavings)); 1224234285Sdim 1225243830Sdim if (FixedOrder == 1) 1226243830Sdim FixedOrderPairs.insert(ValuePair(I, J)); 1227243830Sdim else if (FixedOrder == -1) 1228243830Sdim FixedOrderPairs.insert(ValuePair(J, I)); 1229243830Sdim 1230234285Sdim // The next call to this function must start after the last instruction 1231234285Sdim // selected during this invocation. 1232234285Sdim if (JAfterStart) { 1233234285Sdim Start = llvm::next(J); 1234234285Sdim IAfterStart = JAfterStart = false; 1235234285Sdim } 1236234285Sdim 1237234285Sdim DEBUG(if (DebugCandidateSelection) dbgs() << "BBV: candidate pair " 1238243830Sdim << *I << " <-> " << *J << " (cost savings: " << 1239243830Sdim CostSavings << ")\n"); 1240234285Sdim 1241234285Sdim // If we have already found too many pairs, break here and this function 1242234285Sdim // will be called again starting after the last instruction selected 1243234285Sdim // during this invocation. 1244249423Sdim if (PairableInsts.size() >= Config.MaxInsts || 1245249423Sdim TotalPairs >= Config.MaxPairs) { 1246234285Sdim ShouldContinue = true; 1247234285Sdim break; 1248234285Sdim } 1249234285Sdim } 1250234285Sdim 1251234285Sdim if (ShouldContinue) 1252234285Sdim break; 1253234285Sdim } 1254234285Sdim 1255234285Sdim DEBUG(dbgs() << "BBV: found " << PairableInsts.size() 1256234285Sdim << " instructions with candidate pairs\n"); 1257234285Sdim 1258234285Sdim return ShouldContinue; 1259234285Sdim } 1260234285Sdim 1261234285Sdim // Finds candidate pairs connected to the pair P = <PI, PJ>. This means that 1262234285Sdim // it looks for pairs such that both members have an input which is an 1263234285Sdim // output of PI or PJ. 1264234285Sdim void BBVectorize::computePairsConnectedTo( 1265249423Sdim DenseMap<Value *, std::vector<Value *> > &CandidatePairs, 1266249423Sdim DenseSet<ValuePair> &CandidatePairsSet, 1267249423Sdim std::vector<Value *> &PairableInsts, 1268249423Sdim DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, 1269249423Sdim DenseMap<VPPair, unsigned> &PairConnectionTypes, 1270249423Sdim ValuePair P) { 1271234982Sdim StoreInst *SI, *SJ; 1272234982Sdim 1273234285Sdim // For each possible pairing for this variable, look at the uses of 1274234285Sdim // the first value... 1275234285Sdim for (Value::use_iterator I = P.first->use_begin(), 1276234285Sdim E = P.first->use_end(); I != E; ++I) { 1277234982Sdim if (isa<LoadInst>(*I)) { 1278234982Sdim // A pair cannot be connected to a load because the load only takes one 1279234982Sdim // operand (the address) and it is a scalar even after vectorization. 1280234982Sdim continue; 1281234982Sdim } else if ((SI = dyn_cast<StoreInst>(*I)) && 1282234982Sdim P.first == SI->getPointerOperand()) { 1283234982Sdim // Similarly, a pair cannot be connected to a store through its 1284234982Sdim // pointer operand. 1285234982Sdim continue; 1286234982Sdim } 1287234982Sdim 1288234285Sdim // For each use of the first variable, look for uses of the second 1289234285Sdim // variable... 1290234285Sdim for (Value::use_iterator J = P.second->use_begin(), 1291234285Sdim E2 = P.second->use_end(); J != E2; ++J) { 1292234982Sdim if ((SJ = dyn_cast<StoreInst>(*J)) && 1293234982Sdim P.second == SJ->getPointerOperand()) 1294234982Sdim continue; 1295234982Sdim 1296234285Sdim // Look for <I, J>: 1297249423Sdim if (CandidatePairsSet.count(ValuePair(*I, *J))) { 1298243830Sdim VPPair VP(P, ValuePair(*I, *J)); 1299249423Sdim ConnectedPairs[VP.first].push_back(VP.second); 1300243830Sdim PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionDirect)); 1301243830Sdim } 1302234285Sdim 1303234285Sdim // Look for <J, I>: 1304249423Sdim if (CandidatePairsSet.count(ValuePair(*J, *I))) { 1305243830Sdim VPPair VP(P, ValuePair(*J, *I)); 1306249423Sdim ConnectedPairs[VP.first].push_back(VP.second); 1307243830Sdim PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSwap)); 1308243830Sdim } 1309234285Sdim } 1310234285Sdim 1311234285Sdim if (Config.SplatBreaksChain) continue; 1312234285Sdim // Look for cases where just the first value in the pair is used by 1313234285Sdim // both members of another pair (splatting). 1314234285Sdim for (Value::use_iterator J = P.first->use_begin(); J != E; ++J) { 1315234982Sdim if ((SJ = dyn_cast<StoreInst>(*J)) && 1316234982Sdim P.first == SJ->getPointerOperand()) 1317234982Sdim continue; 1318234982Sdim 1319249423Sdim if (CandidatePairsSet.count(ValuePair(*I, *J))) { 1320243830Sdim VPPair VP(P, ValuePair(*I, *J)); 1321249423Sdim ConnectedPairs[VP.first].push_back(VP.second); 1322243830Sdim PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSplat)); 1323243830Sdim } 1324234285Sdim } 1325234285Sdim } 1326234285Sdim 1327234285Sdim if (Config.SplatBreaksChain) return; 1328234285Sdim // Look for cases where just the second value in the pair is used by 1329234285Sdim // both members of another pair (splatting). 1330234285Sdim for (Value::use_iterator I = P.second->use_begin(), 1331234285Sdim E = P.second->use_end(); I != E; ++I) { 1332234982Sdim if (isa<LoadInst>(*I)) 1333234982Sdim continue; 1334234982Sdim else if ((SI = dyn_cast<StoreInst>(*I)) && 1335234982Sdim P.second == SI->getPointerOperand()) 1336234982Sdim continue; 1337234982Sdim 1338234285Sdim for (Value::use_iterator J = P.second->use_begin(); J != E; ++J) { 1339234982Sdim if ((SJ = dyn_cast<StoreInst>(*J)) && 1340234982Sdim P.second == SJ->getPointerOperand()) 1341234982Sdim continue; 1342234982Sdim 1343249423Sdim if (CandidatePairsSet.count(ValuePair(*I, *J))) { 1344243830Sdim VPPair VP(P, ValuePair(*I, *J)); 1345249423Sdim ConnectedPairs[VP.first].push_back(VP.second); 1346243830Sdim PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSplat)); 1347243830Sdim } 1348234285Sdim } 1349234285Sdim } 1350234285Sdim } 1351234285Sdim 1352234285Sdim // This function figures out which pairs are connected. Two pairs are 1353234285Sdim // connected if some output of the first pair forms an input to both members 1354234285Sdim // of the second pair. 1355234285Sdim void BBVectorize::computeConnectedPairs( 1356249423Sdim DenseMap<Value *, std::vector<Value *> > &CandidatePairs, 1357249423Sdim DenseSet<ValuePair> &CandidatePairsSet, 1358249423Sdim std::vector<Value *> &PairableInsts, 1359249423Sdim DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, 1360249423Sdim DenseMap<VPPair, unsigned> &PairConnectionTypes) { 1361234285Sdim for (std::vector<Value *>::iterator PI = PairableInsts.begin(), 1362234285Sdim PE = PairableInsts.end(); PI != PE; ++PI) { 1363249423Sdim DenseMap<Value *, std::vector<Value *> >::iterator PP = 1364249423Sdim CandidatePairs.find(*PI); 1365249423Sdim if (PP == CandidatePairs.end()) 1366249423Sdim continue; 1367234285Sdim 1368249423Sdim for (std::vector<Value *>::iterator P = PP->second.begin(), 1369249423Sdim E = PP->second.end(); P != E; ++P) 1370249423Sdim computePairsConnectedTo(CandidatePairs, CandidatePairsSet, 1371249423Sdim PairableInsts, ConnectedPairs, 1372249423Sdim PairConnectionTypes, ValuePair(*PI, *P)); 1373234285Sdim } 1374234285Sdim 1375249423Sdim DEBUG(size_t TotalPairs = 0; 1376249423Sdim for (DenseMap<ValuePair, std::vector<ValuePair> >::iterator I = 1377249423Sdim ConnectedPairs.begin(), IE = ConnectedPairs.end(); I != IE; ++I) 1378249423Sdim TotalPairs += I->second.size(); 1379249423Sdim dbgs() << "BBV: found " << TotalPairs 1380234285Sdim << " pair connections.\n"); 1381234285Sdim } 1382234285Sdim 1383234285Sdim // This function builds a set of use tuples such that <A, B> is in the set 1384249423Sdim // if B is in the use dag of A. If B is in the use dag of A, then B 1385234285Sdim // depends on the output of A. 1386234285Sdim void BBVectorize::buildDepMap( 1387234285Sdim BasicBlock &BB, 1388249423Sdim DenseMap<Value *, std::vector<Value *> > &CandidatePairs, 1389234285Sdim std::vector<Value *> &PairableInsts, 1390234285Sdim DenseSet<ValuePair> &PairableInstUsers) { 1391234285Sdim DenseSet<Value *> IsInPair; 1392249423Sdim for (DenseMap<Value *, std::vector<Value *> >::iterator C = 1393249423Sdim CandidatePairs.begin(), E = CandidatePairs.end(); C != E; ++C) { 1394234285Sdim IsInPair.insert(C->first); 1395249423Sdim IsInPair.insert(C->second.begin(), C->second.end()); 1396234285Sdim } 1397234285Sdim 1398249423Sdim // Iterate through the basic block, recording all users of each 1399234285Sdim // pairable instruction. 1400234285Sdim 1401249423Sdim BasicBlock::iterator E = BB.end(), EL = 1402249423Sdim BasicBlock::iterator(cast<Instruction>(PairableInsts.back())); 1403234285Sdim for (BasicBlock::iterator I = BB.getFirstInsertionPt(); I != E; ++I) { 1404234285Sdim if (IsInPair.find(I) == IsInPair.end()) continue; 1405234285Sdim 1406234285Sdim DenseSet<Value *> Users; 1407234285Sdim AliasSetTracker WriteSet(*AA); 1408263508Sdim if (I->mayWriteToMemory()) WriteSet.add(I); 1409263508Sdim 1410249423Sdim for (BasicBlock::iterator J = llvm::next(I); J != E; ++J) { 1411234285Sdim (void) trackUsesOfI(Users, WriteSet, I, J); 1412234285Sdim 1413249423Sdim if (J == EL) 1414249423Sdim break; 1415249423Sdim } 1416249423Sdim 1417234285Sdim for (DenseSet<Value *>::iterator U = Users.begin(), E = Users.end(); 1418249423Sdim U != E; ++U) { 1419249423Sdim if (IsInPair.find(*U) == IsInPair.end()) continue; 1420234285Sdim PairableInstUsers.insert(ValuePair(I, *U)); 1421249423Sdim } 1422249423Sdim 1423249423Sdim if (I == EL) 1424249423Sdim break; 1425234285Sdim } 1426234285Sdim } 1427234285Sdim 1428234285Sdim // Returns true if an input to pair P is an output of pair Q and also an 1429234285Sdim // input of pair Q is an output of pair P. If this is the case, then these 1430234285Sdim // two pairs cannot be simultaneously fused. 1431234285Sdim bool BBVectorize::pairsConflict(ValuePair P, ValuePair Q, 1432249423Sdim DenseSet<ValuePair> &PairableInstUsers, 1433249423Sdim DenseMap<ValuePair, std::vector<ValuePair> > *PairableInstUserMap, 1434249423Sdim DenseSet<VPPair> *PairableInstUserPairSet) { 1435234285Sdim // Two pairs are in conflict if they are mutual Users of eachother. 1436234285Sdim bool QUsesP = PairableInstUsers.count(ValuePair(P.first, Q.first)) || 1437234285Sdim PairableInstUsers.count(ValuePair(P.first, Q.second)) || 1438234285Sdim PairableInstUsers.count(ValuePair(P.second, Q.first)) || 1439234285Sdim PairableInstUsers.count(ValuePair(P.second, Q.second)); 1440234285Sdim bool PUsesQ = PairableInstUsers.count(ValuePair(Q.first, P.first)) || 1441234285Sdim PairableInstUsers.count(ValuePair(Q.first, P.second)) || 1442234285Sdim PairableInstUsers.count(ValuePair(Q.second, P.first)) || 1443234285Sdim PairableInstUsers.count(ValuePair(Q.second, P.second)); 1444234285Sdim if (PairableInstUserMap) { 1445234285Sdim // FIXME: The expensive part of the cycle check is not so much the cycle 1446234285Sdim // check itself but this edge insertion procedure. This needs some 1447249423Sdim // profiling and probably a different data structure. 1448234285Sdim if (PUsesQ) { 1449249423Sdim if (PairableInstUserPairSet->insert(VPPair(Q, P)).second) 1450249423Sdim (*PairableInstUserMap)[Q].push_back(P); 1451234285Sdim } 1452234285Sdim if (QUsesP) { 1453249423Sdim if (PairableInstUserPairSet->insert(VPPair(P, Q)).second) 1454249423Sdim (*PairableInstUserMap)[P].push_back(Q); 1455234285Sdim } 1456234285Sdim } 1457234285Sdim 1458234285Sdim return (QUsesP && PUsesQ); 1459234285Sdim } 1460234285Sdim 1461234285Sdim // This function walks the use graph of current pairs to see if, starting 1462234285Sdim // from P, the walk returns to P. 1463234285Sdim bool BBVectorize::pairWillFormCycle(ValuePair P, 1464249423Sdim DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap, 1465249423Sdim DenseSet<ValuePair> &CurrentPairs) { 1466234285Sdim DEBUG(if (DebugCycleCheck) 1467234285Sdim dbgs() << "BBV: starting cycle check for : " << *P.first << " <-> " 1468234285Sdim << *P.second << "\n"); 1469234285Sdim // A lookup table of visisted pairs is kept because the PairableInstUserMap 1470234285Sdim // contains non-direct associations. 1471234285Sdim DenseSet<ValuePair> Visited; 1472234285Sdim SmallVector<ValuePair, 32> Q; 1473234285Sdim // General depth-first post-order traversal: 1474234285Sdim Q.push_back(P); 1475234285Sdim do { 1476234285Sdim ValuePair QTop = Q.pop_back_val(); 1477234285Sdim Visited.insert(QTop); 1478234285Sdim 1479234285Sdim DEBUG(if (DebugCycleCheck) 1480234285Sdim dbgs() << "BBV: cycle check visiting: " << *QTop.first << " <-> " 1481234285Sdim << *QTop.second << "\n"); 1482249423Sdim DenseMap<ValuePair, std::vector<ValuePair> >::iterator QQ = 1483249423Sdim PairableInstUserMap.find(QTop); 1484249423Sdim if (QQ == PairableInstUserMap.end()) 1485249423Sdim continue; 1486249423Sdim 1487249423Sdim for (std::vector<ValuePair>::iterator C = QQ->second.begin(), 1488249423Sdim CE = QQ->second.end(); C != CE; ++C) { 1489249423Sdim if (*C == P) { 1490234285Sdim DEBUG(dbgs() 1491234285Sdim << "BBV: rejected to prevent non-trivial cycle formation: " 1492249423Sdim << QTop.first << " <-> " << C->second << "\n"); 1493234285Sdim return true; 1494234285Sdim } 1495234285Sdim 1496249423Sdim if (CurrentPairs.count(*C) && !Visited.count(*C)) 1497249423Sdim Q.push_back(*C); 1498234285Sdim } 1499234285Sdim } while (!Q.empty()); 1500234285Sdim 1501234285Sdim return false; 1502234285Sdim } 1503234285Sdim 1504249423Sdim // This function builds the initial dag of connected pairs with the 1505234285Sdim // pair J at the root. 1506249423Sdim void BBVectorize::buildInitialDAGFor( 1507249423Sdim DenseMap<Value *, std::vector<Value *> > &CandidatePairs, 1508249423Sdim DenseSet<ValuePair> &CandidatePairsSet, 1509249423Sdim std::vector<Value *> &PairableInsts, 1510249423Sdim DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, 1511249423Sdim DenseSet<ValuePair> &PairableInstUsers, 1512249423Sdim DenseMap<Value *, Value *> &ChosenPairs, 1513249423Sdim DenseMap<ValuePair, size_t> &DAG, ValuePair J) { 1514249423Sdim // Each of these pairs is viewed as the root node of a DAG. The DAG 1515234285Sdim // is then walked (depth-first). As this happens, we keep track of 1516249423Sdim // the pairs that compose the DAG and the maximum depth of the DAG. 1517234285Sdim SmallVector<ValuePairWithDepth, 32> Q; 1518234285Sdim // General depth-first post-order traversal: 1519234285Sdim Q.push_back(ValuePairWithDepth(J, getDepthFactor(J.first))); 1520234285Sdim do { 1521234285Sdim ValuePairWithDepth QTop = Q.back(); 1522234285Sdim 1523234285Sdim // Push each child onto the queue: 1524234285Sdim bool MoreChildren = false; 1525234285Sdim size_t MaxChildDepth = QTop.second; 1526249423Sdim DenseMap<ValuePair, std::vector<ValuePair> >::iterator QQ = 1527249423Sdim ConnectedPairs.find(QTop.first); 1528249423Sdim if (QQ != ConnectedPairs.end()) 1529249423Sdim for (std::vector<ValuePair>::iterator k = QQ->second.begin(), 1530249423Sdim ke = QQ->second.end(); k != ke; ++k) { 1531249423Sdim // Make sure that this child pair is still a candidate: 1532249423Sdim if (CandidatePairsSet.count(*k)) { 1533249423Sdim DenseMap<ValuePair, size_t>::iterator C = DAG.find(*k); 1534249423Sdim if (C == DAG.end()) { 1535249423Sdim size_t d = getDepthFactor(k->first); 1536249423Sdim Q.push_back(ValuePairWithDepth(*k, QTop.second+d)); 1537249423Sdim MoreChildren = true; 1538249423Sdim } else { 1539249423Sdim MaxChildDepth = std::max(MaxChildDepth, C->second); 1540249423Sdim } 1541234285Sdim } 1542234285Sdim } 1543234285Sdim 1544234285Sdim if (!MoreChildren) { 1545249423Sdim // Record the current pair as part of the DAG: 1546249423Sdim DAG.insert(ValuePairWithDepth(QTop.first, MaxChildDepth)); 1547234285Sdim Q.pop_back(); 1548234285Sdim } 1549234285Sdim } while (!Q.empty()); 1550234285Sdim } 1551234285Sdim 1552249423Sdim // Given some initial dag, prune it by removing conflicting pairs (pairs 1553234285Sdim // that cannot be simultaneously chosen for vectorization). 1554249423Sdim void BBVectorize::pruneDAGFor( 1555249423Sdim DenseMap<Value *, std::vector<Value *> > &CandidatePairs, 1556249423Sdim std::vector<Value *> &PairableInsts, 1557249423Sdim DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, 1558249423Sdim DenseSet<ValuePair> &PairableInstUsers, 1559249423Sdim DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap, 1560249423Sdim DenseSet<VPPair> &PairableInstUserPairSet, 1561249423Sdim DenseMap<Value *, Value *> &ChosenPairs, 1562249423Sdim DenseMap<ValuePair, size_t> &DAG, 1563249423Sdim DenseSet<ValuePair> &PrunedDAG, ValuePair J, 1564249423Sdim bool UseCycleCheck) { 1565234285Sdim SmallVector<ValuePairWithDepth, 32> Q; 1566234285Sdim // General depth-first post-order traversal: 1567234285Sdim Q.push_back(ValuePairWithDepth(J, getDepthFactor(J.first))); 1568234285Sdim do { 1569234285Sdim ValuePairWithDepth QTop = Q.pop_back_val(); 1570249423Sdim PrunedDAG.insert(QTop.first); 1571234285Sdim 1572234285Sdim // Visit each child, pruning as necessary... 1573243830Sdim SmallVector<ValuePairWithDepth, 8> BestChildren; 1574249423Sdim DenseMap<ValuePair, std::vector<ValuePair> >::iterator QQ = 1575249423Sdim ConnectedPairs.find(QTop.first); 1576249423Sdim if (QQ == ConnectedPairs.end()) 1577249423Sdim continue; 1578234285Sdim 1579249423Sdim for (std::vector<ValuePair>::iterator K = QQ->second.begin(), 1580249423Sdim KE = QQ->second.end(); K != KE; ++K) { 1581249423Sdim DenseMap<ValuePair, size_t>::iterator C = DAG.find(*K); 1582249423Sdim if (C == DAG.end()) continue; 1583249423Sdim 1584249423Sdim // This child is in the DAG, now we need to make sure it is the 1585234285Sdim // best of any conflicting children. There could be multiple 1586234285Sdim // conflicting children, so first, determine if we're keeping 1587234285Sdim // this child, then delete conflicting children as necessary. 1588234285Sdim 1589234285Sdim // It is also necessary to guard against pairing-induced 1590234285Sdim // dependencies. Consider instructions a .. x .. y .. b 1591234285Sdim // such that (a,b) are to be fused and (x,y) are to be fused 1592234285Sdim // but a is an input to x and b is an output from y. This 1593234285Sdim // means that y cannot be moved after b but x must be moved 1594234285Sdim // after b for (a,b) to be fused. In other words, after 1595234285Sdim // fusing (a,b) we have y .. a/b .. x where y is an input 1596234285Sdim // to a/b and x is an output to a/b: x and y can no longer 1597234285Sdim // be legally fused. To prevent this condition, we must 1598249423Sdim // make sure that a child pair added to the DAG is not 1599234285Sdim // both an input and output of an already-selected pair. 1600234285Sdim 1601234285Sdim // Pairing-induced dependencies can also form from more complicated 1602234285Sdim // cycles. The pair vs. pair conflicts are easy to check, and so 1603234285Sdim // that is done explicitly for "fast rejection", and because for 1604234285Sdim // child vs. child conflicts, we may prefer to keep the current 1605234285Sdim // pair in preference to the already-selected child. 1606234285Sdim DenseSet<ValuePair> CurrentPairs; 1607234285Sdim 1608234285Sdim bool CanAdd = true; 1609263508Sdim for (SmallVectorImpl<ValuePairWithDepth>::iterator C2 1610234285Sdim = BestChildren.begin(), E2 = BestChildren.end(); 1611234285Sdim C2 != E2; ++C2) { 1612234285Sdim if (C2->first.first == C->first.first || 1613234285Sdim C2->first.first == C->first.second || 1614234285Sdim C2->first.second == C->first.first || 1615234285Sdim C2->first.second == C->first.second || 1616234285Sdim pairsConflict(C2->first, C->first, PairableInstUsers, 1617249423Sdim UseCycleCheck ? &PairableInstUserMap : 0, 1618249423Sdim UseCycleCheck ? &PairableInstUserPairSet : 0)) { 1619234285Sdim if (C2->second >= C->second) { 1620234285Sdim CanAdd = false; 1621234285Sdim break; 1622234285Sdim } 1623234285Sdim 1624234285Sdim CurrentPairs.insert(C2->first); 1625234285Sdim } 1626234285Sdim } 1627234285Sdim if (!CanAdd) continue; 1628234285Sdim 1629234285Sdim // Even worse, this child could conflict with another node already 1630249423Sdim // selected for the DAG. If that is the case, ignore this child. 1631249423Sdim for (DenseSet<ValuePair>::iterator T = PrunedDAG.begin(), 1632249423Sdim E2 = PrunedDAG.end(); T != E2; ++T) { 1633234285Sdim if (T->first == C->first.first || 1634234285Sdim T->first == C->first.second || 1635234285Sdim T->second == C->first.first || 1636234285Sdim T->second == C->first.second || 1637234285Sdim pairsConflict(*T, C->first, PairableInstUsers, 1638249423Sdim UseCycleCheck ? &PairableInstUserMap : 0, 1639249423Sdim UseCycleCheck ? &PairableInstUserPairSet : 0)) { 1640234285Sdim CanAdd = false; 1641234285Sdim break; 1642234285Sdim } 1643234285Sdim 1644234285Sdim CurrentPairs.insert(*T); 1645234285Sdim } 1646234285Sdim if (!CanAdd) continue; 1647234285Sdim 1648234285Sdim // And check the queue too... 1649263508Sdim for (SmallVectorImpl<ValuePairWithDepth>::iterator C2 = Q.begin(), 1650234285Sdim E2 = Q.end(); C2 != E2; ++C2) { 1651234285Sdim if (C2->first.first == C->first.first || 1652234285Sdim C2->first.first == C->first.second || 1653234285Sdim C2->first.second == C->first.first || 1654234285Sdim C2->first.second == C->first.second || 1655234285Sdim pairsConflict(C2->first, C->first, PairableInstUsers, 1656249423Sdim UseCycleCheck ? &PairableInstUserMap : 0, 1657249423Sdim UseCycleCheck ? &PairableInstUserPairSet : 0)) { 1658234285Sdim CanAdd = false; 1659234285Sdim break; 1660234285Sdim } 1661234285Sdim 1662234285Sdim CurrentPairs.insert(C2->first); 1663234285Sdim } 1664234285Sdim if (!CanAdd) continue; 1665234285Sdim 1666234285Sdim // Last but not least, check for a conflict with any of the 1667234285Sdim // already-chosen pairs. 1668234285Sdim for (DenseMap<Value *, Value *>::iterator C2 = 1669234285Sdim ChosenPairs.begin(), E2 = ChosenPairs.end(); 1670234285Sdim C2 != E2; ++C2) { 1671234285Sdim if (pairsConflict(*C2, C->first, PairableInstUsers, 1672249423Sdim UseCycleCheck ? &PairableInstUserMap : 0, 1673249423Sdim UseCycleCheck ? &PairableInstUserPairSet : 0)) { 1674234285Sdim CanAdd = false; 1675234285Sdim break; 1676234285Sdim } 1677234285Sdim 1678234285Sdim CurrentPairs.insert(*C2); 1679234285Sdim } 1680234285Sdim if (!CanAdd) continue; 1681234285Sdim 1682234285Sdim // To check for non-trivial cycles formed by the addition of the 1683234285Sdim // current pair we've formed a list of all relevant pairs, now use a 1684234285Sdim // graph walk to check for a cycle. We start from the current pair and 1685249423Sdim // walk the use dag to see if we again reach the current pair. If we 1686234285Sdim // do, then the current pair is rejected. 1687234285Sdim 1688234285Sdim // FIXME: It may be more efficient to use a topological-ordering 1689234285Sdim // algorithm to improve the cycle check. This should be investigated. 1690234285Sdim if (UseCycleCheck && 1691234285Sdim pairWillFormCycle(C->first, PairableInstUserMap, CurrentPairs)) 1692234285Sdim continue; 1693234285Sdim 1694234285Sdim // This child can be added, but we may have chosen it in preference 1695234285Sdim // to an already-selected child. Check for this here, and if a 1696234285Sdim // conflict is found, then remove the previously-selected child 1697234285Sdim // before adding this one in its place. 1698263508Sdim for (SmallVectorImpl<ValuePairWithDepth>::iterator C2 1699234285Sdim = BestChildren.begin(); C2 != BestChildren.end();) { 1700234285Sdim if (C2->first.first == C->first.first || 1701234285Sdim C2->first.first == C->first.second || 1702234285Sdim C2->first.second == C->first.first || 1703234285Sdim C2->first.second == C->first.second || 1704234285Sdim pairsConflict(C2->first, C->first, PairableInstUsers)) 1705243830Sdim C2 = BestChildren.erase(C2); 1706234285Sdim else 1707234285Sdim ++C2; 1708234285Sdim } 1709234285Sdim 1710243830Sdim BestChildren.push_back(ValuePairWithDepth(C->first, C->second)); 1711234285Sdim } 1712234285Sdim 1713263508Sdim for (SmallVectorImpl<ValuePairWithDepth>::iterator C 1714234285Sdim = BestChildren.begin(), E2 = BestChildren.end(); 1715234285Sdim C != E2; ++C) { 1716234285Sdim size_t DepthF = getDepthFactor(C->first.first); 1717234285Sdim Q.push_back(ValuePairWithDepth(C->first, QTop.second+DepthF)); 1718234285Sdim } 1719234285Sdim } while (!Q.empty()); 1720234285Sdim } 1721234285Sdim 1722249423Sdim // This function finds the best dag of mututally-compatible connected 1723234285Sdim // pairs, given the choice of root pairs as an iterator range. 1724249423Sdim void BBVectorize::findBestDAGFor( 1725249423Sdim DenseMap<Value *, std::vector<Value *> > &CandidatePairs, 1726249423Sdim DenseSet<ValuePair> &CandidatePairsSet, 1727249423Sdim DenseMap<ValuePair, int> &CandidatePairCostSavings, 1728249423Sdim std::vector<Value *> &PairableInsts, 1729249423Sdim DenseSet<ValuePair> &FixedOrderPairs, 1730249423Sdim DenseMap<VPPair, unsigned> &PairConnectionTypes, 1731249423Sdim DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, 1732249423Sdim DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps, 1733249423Sdim DenseSet<ValuePair> &PairableInstUsers, 1734249423Sdim DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap, 1735249423Sdim DenseSet<VPPair> &PairableInstUserPairSet, 1736249423Sdim DenseMap<Value *, Value *> &ChosenPairs, 1737249423Sdim DenseSet<ValuePair> &BestDAG, size_t &BestMaxDepth, 1738249423Sdim int &BestEffSize, Value *II, std::vector<Value *>&JJ, 1739249423Sdim bool UseCycleCheck) { 1740249423Sdim for (std::vector<Value *>::iterator J = JJ.begin(), JE = JJ.end(); 1741249423Sdim J != JE; ++J) { 1742249423Sdim ValuePair IJ(II, *J); 1743249423Sdim if (!CandidatePairsSet.count(IJ)) 1744249423Sdim continue; 1745234285Sdim 1746234285Sdim // Before going any further, make sure that this pair does not 1747234285Sdim // conflict with any already-selected pairs (see comment below 1748249423Sdim // near the DAG pruning for more details). 1749234285Sdim DenseSet<ValuePair> ChosenPairSet; 1750234285Sdim bool DoesConflict = false; 1751234285Sdim for (DenseMap<Value *, Value *>::iterator C = ChosenPairs.begin(), 1752234285Sdim E = ChosenPairs.end(); C != E; ++C) { 1753249423Sdim if (pairsConflict(*C, IJ, PairableInstUsers, 1754249423Sdim UseCycleCheck ? &PairableInstUserMap : 0, 1755249423Sdim UseCycleCheck ? &PairableInstUserPairSet : 0)) { 1756234285Sdim DoesConflict = true; 1757234285Sdim break; 1758234285Sdim } 1759234285Sdim 1760234285Sdim ChosenPairSet.insert(*C); 1761234285Sdim } 1762234285Sdim if (DoesConflict) continue; 1763234285Sdim 1764234285Sdim if (UseCycleCheck && 1765249423Sdim pairWillFormCycle(IJ, PairableInstUserMap, ChosenPairSet)) 1766234285Sdim continue; 1767234285Sdim 1768249423Sdim DenseMap<ValuePair, size_t> DAG; 1769249423Sdim buildInitialDAGFor(CandidatePairs, CandidatePairsSet, 1770249423Sdim PairableInsts, ConnectedPairs, 1771249423Sdim PairableInstUsers, ChosenPairs, DAG, IJ); 1772234285Sdim 1773234285Sdim // Because we'll keep the child with the largest depth, the largest 1774249423Sdim // depth is still the same in the unpruned DAG. 1775249423Sdim size_t MaxDepth = DAG.lookup(IJ); 1776234285Sdim 1777249423Sdim DEBUG(if (DebugPairSelection) dbgs() << "BBV: found DAG for pair {" 1778249423Sdim << *IJ.first << " <-> " << *IJ.second << "} of depth " << 1779249423Sdim MaxDepth << " and size " << DAG.size() << "\n"); 1780234285Sdim 1781249423Sdim // At this point the DAG has been constructed, but, may contain 1782234285Sdim // contradictory children (meaning that different children of 1783249423Sdim // some dag node may be attempting to fuse the same instruction). 1784249423Sdim // So now we walk the dag again, in the case of a conflict, 1785234285Sdim // keep only the child with the largest depth. To break a tie, 1786234285Sdim // favor the first child. 1787234285Sdim 1788249423Sdim DenseSet<ValuePair> PrunedDAG; 1789249423Sdim pruneDAGFor(CandidatePairs, PairableInsts, ConnectedPairs, 1790249423Sdim PairableInstUsers, PairableInstUserMap, 1791249423Sdim PairableInstUserPairSet, 1792249423Sdim ChosenPairs, DAG, PrunedDAG, IJ, UseCycleCheck); 1793234285Sdim 1794243830Sdim int EffSize = 0; 1795249423Sdim if (TTI) { 1796249423Sdim DenseSet<Value *> PrunedDAGInstrs; 1797249423Sdim for (DenseSet<ValuePair>::iterator S = PrunedDAG.begin(), 1798249423Sdim E = PrunedDAG.end(); S != E; ++S) { 1799249423Sdim PrunedDAGInstrs.insert(S->first); 1800249423Sdim PrunedDAGInstrs.insert(S->second); 1801243830Sdim } 1802234285Sdim 1803243830Sdim // The set of pairs that have already contributed to the total cost. 1804243830Sdim DenseSet<ValuePair> IncomingPairs; 1805243830Sdim 1806243830Sdim // If the cost model were perfect, this might not be necessary; but we 1807243830Sdim // need to make sure that we don't get stuck vectorizing our own 1808243830Sdim // shuffle chains. 1809243830Sdim bool HasNontrivialInsts = false; 1810243830Sdim 1811243830Sdim // The node weights represent the cost savings associated with 1812243830Sdim // fusing the pair of instructions. 1813249423Sdim for (DenseSet<ValuePair>::iterator S = PrunedDAG.begin(), 1814249423Sdim E = PrunedDAG.end(); S != E; ++S) { 1815243830Sdim if (!isa<ShuffleVectorInst>(S->first) && 1816243830Sdim !isa<InsertElementInst>(S->first) && 1817243830Sdim !isa<ExtractElementInst>(S->first)) 1818243830Sdim HasNontrivialInsts = true; 1819243830Sdim 1820243830Sdim bool FlipOrder = false; 1821243830Sdim 1822243830Sdim if (getDepthFactor(S->first)) { 1823243830Sdim int ESContrib = CandidatePairCostSavings.find(*S)->second; 1824243830Sdim DEBUG(if (DebugPairSelection) dbgs() << "\tweight {" 1825243830Sdim << *S->first << " <-> " << *S->second << "} = " << 1826243830Sdim ESContrib << "\n"); 1827243830Sdim EffSize += ESContrib; 1828243830Sdim } 1829243830Sdim 1830243830Sdim // The edge weights contribute in a negative sense: they represent 1831243830Sdim // the cost of shuffles. 1832249423Sdim DenseMap<ValuePair, std::vector<ValuePair> >::iterator SS = 1833249423Sdim ConnectedPairDeps.find(*S); 1834249423Sdim if (SS != ConnectedPairDeps.end()) { 1835243830Sdim unsigned NumDepsDirect = 0, NumDepsSwap = 0; 1836249423Sdim for (std::vector<ValuePair>::iterator T = SS->second.begin(), 1837249423Sdim TE = SS->second.end(); T != TE; ++T) { 1838249423Sdim VPPair Q(*S, *T); 1839249423Sdim if (!PrunedDAG.count(Q.second)) 1840243830Sdim continue; 1841243830Sdim DenseMap<VPPair, unsigned>::iterator R = 1842249423Sdim PairConnectionTypes.find(VPPair(Q.second, Q.first)); 1843243830Sdim assert(R != PairConnectionTypes.end() && 1844243830Sdim "Cannot find pair connection type"); 1845243830Sdim if (R->second == PairConnectionDirect) 1846243830Sdim ++NumDepsDirect; 1847243830Sdim else if (R->second == PairConnectionSwap) 1848243830Sdim ++NumDepsSwap; 1849243830Sdim } 1850243830Sdim 1851243830Sdim // If there are more swaps than direct connections, then 1852243830Sdim // the pair order will be flipped during fusion. So the real 1853243830Sdim // number of swaps is the minimum number. 1854243830Sdim FlipOrder = !FixedOrderPairs.count(*S) && 1855243830Sdim ((NumDepsSwap > NumDepsDirect) || 1856243830Sdim FixedOrderPairs.count(ValuePair(S->second, S->first))); 1857243830Sdim 1858249423Sdim for (std::vector<ValuePair>::iterator T = SS->second.begin(), 1859249423Sdim TE = SS->second.end(); T != TE; ++T) { 1860249423Sdim VPPair Q(*S, *T); 1861249423Sdim if (!PrunedDAG.count(Q.second)) 1862243830Sdim continue; 1863243830Sdim DenseMap<VPPair, unsigned>::iterator R = 1864249423Sdim PairConnectionTypes.find(VPPair(Q.second, Q.first)); 1865243830Sdim assert(R != PairConnectionTypes.end() && 1866243830Sdim "Cannot find pair connection type"); 1867249423Sdim Type *Ty1 = Q.second.first->getType(), 1868249423Sdim *Ty2 = Q.second.second->getType(); 1869243830Sdim Type *VTy = getVecTypeForPair(Ty1, Ty2); 1870243830Sdim if ((R->second == PairConnectionDirect && FlipOrder) || 1871243830Sdim (R->second == PairConnectionSwap && !FlipOrder) || 1872243830Sdim R->second == PairConnectionSplat) { 1873243830Sdim int ESContrib = (int) getInstrCost(Instruction::ShuffleVector, 1874243830Sdim VTy, VTy); 1875249423Sdim 1876249423Sdim if (VTy->getVectorNumElements() == 2) { 1877249423Sdim if (R->second == PairConnectionSplat) 1878249423Sdim ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost( 1879249423Sdim TargetTransformInfo::SK_Broadcast, VTy)); 1880249423Sdim else 1881249423Sdim ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost( 1882249423Sdim TargetTransformInfo::SK_Reverse, VTy)); 1883249423Sdim } 1884249423Sdim 1885243830Sdim DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" << 1886249423Sdim *Q.second.first << " <-> " << *Q.second.second << 1887243830Sdim "} -> {" << 1888243830Sdim *S->first << " <-> " << *S->second << "} = " << 1889243830Sdim ESContrib << "\n"); 1890243830Sdim EffSize -= ESContrib; 1891243830Sdim } 1892243830Sdim } 1893243830Sdim } 1894243830Sdim 1895243830Sdim // Compute the cost of outgoing edges. We assume that edges outgoing 1896243830Sdim // to shuffles, inserts or extracts can be merged, and so contribute 1897243830Sdim // no additional cost. 1898243830Sdim if (!S->first->getType()->isVoidTy()) { 1899243830Sdim Type *Ty1 = S->first->getType(), 1900243830Sdim *Ty2 = S->second->getType(); 1901243830Sdim Type *VTy = getVecTypeForPair(Ty1, Ty2); 1902243830Sdim 1903243830Sdim bool NeedsExtraction = false; 1904243830Sdim for (Value::use_iterator I = S->first->use_begin(), 1905243830Sdim IE = S->first->use_end(); I != IE; ++I) { 1906243830Sdim if (ShuffleVectorInst *SI = dyn_cast<ShuffleVectorInst>(*I)) { 1907243830Sdim // Shuffle can be folded if it has no other input 1908243830Sdim if (isa<UndefValue>(SI->getOperand(1))) 1909243830Sdim continue; 1910243830Sdim } 1911243830Sdim if (isa<ExtractElementInst>(*I)) 1912243830Sdim continue; 1913249423Sdim if (PrunedDAGInstrs.count(*I)) 1914243830Sdim continue; 1915243830Sdim NeedsExtraction = true; 1916243830Sdim break; 1917243830Sdim } 1918243830Sdim 1919243830Sdim if (NeedsExtraction) { 1920243830Sdim int ESContrib; 1921249423Sdim if (Ty1->isVectorTy()) { 1922243830Sdim ESContrib = (int) getInstrCost(Instruction::ShuffleVector, 1923243830Sdim Ty1, VTy); 1924249423Sdim ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost( 1925249423Sdim TargetTransformInfo::SK_ExtractSubvector, VTy, 0, Ty1)); 1926249423Sdim } else 1927249423Sdim ESContrib = (int) TTI->getVectorInstrCost( 1928243830Sdim Instruction::ExtractElement, VTy, 0); 1929243830Sdim 1930243830Sdim DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" << 1931243830Sdim *S->first << "} = " << ESContrib << "\n"); 1932243830Sdim EffSize -= ESContrib; 1933243830Sdim } 1934243830Sdim 1935243830Sdim NeedsExtraction = false; 1936243830Sdim for (Value::use_iterator I = S->second->use_begin(), 1937243830Sdim IE = S->second->use_end(); I != IE; ++I) { 1938243830Sdim if (ShuffleVectorInst *SI = dyn_cast<ShuffleVectorInst>(*I)) { 1939243830Sdim // Shuffle can be folded if it has no other input 1940243830Sdim if (isa<UndefValue>(SI->getOperand(1))) 1941243830Sdim continue; 1942243830Sdim } 1943243830Sdim if (isa<ExtractElementInst>(*I)) 1944243830Sdim continue; 1945249423Sdim if (PrunedDAGInstrs.count(*I)) 1946243830Sdim continue; 1947243830Sdim NeedsExtraction = true; 1948243830Sdim break; 1949243830Sdim } 1950243830Sdim 1951243830Sdim if (NeedsExtraction) { 1952243830Sdim int ESContrib; 1953249423Sdim if (Ty2->isVectorTy()) { 1954243830Sdim ESContrib = (int) getInstrCost(Instruction::ShuffleVector, 1955243830Sdim Ty2, VTy); 1956249423Sdim ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost( 1957249423Sdim TargetTransformInfo::SK_ExtractSubvector, VTy, 1958249423Sdim Ty1->isVectorTy() ? Ty1->getVectorNumElements() : 1, Ty2)); 1959249423Sdim } else 1960249423Sdim ESContrib = (int) TTI->getVectorInstrCost( 1961243830Sdim Instruction::ExtractElement, VTy, 1); 1962243830Sdim DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" << 1963243830Sdim *S->second << "} = " << ESContrib << "\n"); 1964243830Sdim EffSize -= ESContrib; 1965243830Sdim } 1966243830Sdim } 1967243830Sdim 1968243830Sdim // Compute the cost of incoming edges. 1969243830Sdim if (!isa<LoadInst>(S->first) && !isa<StoreInst>(S->first)) { 1970243830Sdim Instruction *S1 = cast<Instruction>(S->first), 1971243830Sdim *S2 = cast<Instruction>(S->second); 1972243830Sdim for (unsigned o = 0; o < S1->getNumOperands(); ++o) { 1973243830Sdim Value *O1 = S1->getOperand(o), *O2 = S2->getOperand(o); 1974243830Sdim 1975243830Sdim // Combining constants into vector constants (or small vector 1976243830Sdim // constants into larger ones are assumed free). 1977243830Sdim if (isa<Constant>(O1) && isa<Constant>(O2)) 1978243830Sdim continue; 1979243830Sdim 1980243830Sdim if (FlipOrder) 1981243830Sdim std::swap(O1, O2); 1982243830Sdim 1983243830Sdim ValuePair VP = ValuePair(O1, O2); 1984243830Sdim ValuePair VPR = ValuePair(O2, O1); 1985243830Sdim 1986243830Sdim // Internal edges are not handled here. 1987249423Sdim if (PrunedDAG.count(VP) || PrunedDAG.count(VPR)) 1988243830Sdim continue; 1989243830Sdim 1990243830Sdim Type *Ty1 = O1->getType(), 1991243830Sdim *Ty2 = O2->getType(); 1992243830Sdim Type *VTy = getVecTypeForPair(Ty1, Ty2); 1993243830Sdim 1994243830Sdim // Combining vector operations of the same type is also assumed 1995243830Sdim // folded with other operations. 1996243830Sdim if (Ty1 == Ty2) { 1997243830Sdim // If both are insert elements, then both can be widened. 1998243830Sdim InsertElementInst *IEO1 = dyn_cast<InsertElementInst>(O1), 1999243830Sdim *IEO2 = dyn_cast<InsertElementInst>(O2); 2000243830Sdim if (IEO1 && IEO2 && isPureIEChain(IEO1) && isPureIEChain(IEO2)) 2001243830Sdim continue; 2002243830Sdim // If both are extract elements, and both have the same input 2003243830Sdim // type, then they can be replaced with a shuffle 2004243830Sdim ExtractElementInst *EIO1 = dyn_cast<ExtractElementInst>(O1), 2005243830Sdim *EIO2 = dyn_cast<ExtractElementInst>(O2); 2006243830Sdim if (EIO1 && EIO2 && 2007243830Sdim EIO1->getOperand(0)->getType() == 2008243830Sdim EIO2->getOperand(0)->getType()) 2009243830Sdim continue; 2010243830Sdim // If both are a shuffle with equal operand types and only two 2011243830Sdim // unqiue operands, then they can be replaced with a single 2012243830Sdim // shuffle 2013243830Sdim ShuffleVectorInst *SIO1 = dyn_cast<ShuffleVectorInst>(O1), 2014243830Sdim *SIO2 = dyn_cast<ShuffleVectorInst>(O2); 2015243830Sdim if (SIO1 && SIO2 && 2016243830Sdim SIO1->getOperand(0)->getType() == 2017243830Sdim SIO2->getOperand(0)->getType()) { 2018243830Sdim SmallSet<Value *, 4> SIOps; 2019243830Sdim SIOps.insert(SIO1->getOperand(0)); 2020243830Sdim SIOps.insert(SIO1->getOperand(1)); 2021243830Sdim SIOps.insert(SIO2->getOperand(0)); 2022243830Sdim SIOps.insert(SIO2->getOperand(1)); 2023243830Sdim if (SIOps.size() <= 2) 2024243830Sdim continue; 2025243830Sdim } 2026243830Sdim } 2027243830Sdim 2028243830Sdim int ESContrib; 2029243830Sdim // This pair has already been formed. 2030243830Sdim if (IncomingPairs.count(VP)) { 2031243830Sdim continue; 2032243830Sdim } else if (IncomingPairs.count(VPR)) { 2033243830Sdim ESContrib = (int) getInstrCost(Instruction::ShuffleVector, 2034243830Sdim VTy, VTy); 2035249423Sdim 2036249423Sdim if (VTy->getVectorNumElements() == 2) 2037249423Sdim ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost( 2038249423Sdim TargetTransformInfo::SK_Reverse, VTy)); 2039243830Sdim } else if (!Ty1->isVectorTy() && !Ty2->isVectorTy()) { 2040249423Sdim ESContrib = (int) TTI->getVectorInstrCost( 2041243830Sdim Instruction::InsertElement, VTy, 0); 2042249423Sdim ESContrib += (int) TTI->getVectorInstrCost( 2043243830Sdim Instruction::InsertElement, VTy, 1); 2044243830Sdim } else if (!Ty1->isVectorTy()) { 2045243830Sdim // O1 needs to be inserted into a vector of size O2, and then 2046243830Sdim // both need to be shuffled together. 2047249423Sdim ESContrib = (int) TTI->getVectorInstrCost( 2048243830Sdim Instruction::InsertElement, Ty2, 0); 2049243830Sdim ESContrib += (int) getInstrCost(Instruction::ShuffleVector, 2050243830Sdim VTy, Ty2); 2051243830Sdim } else if (!Ty2->isVectorTy()) { 2052243830Sdim // O2 needs to be inserted into a vector of size O1, and then 2053243830Sdim // both need to be shuffled together. 2054249423Sdim ESContrib = (int) TTI->getVectorInstrCost( 2055243830Sdim Instruction::InsertElement, Ty1, 0); 2056243830Sdim ESContrib += (int) getInstrCost(Instruction::ShuffleVector, 2057243830Sdim VTy, Ty1); 2058243830Sdim } else { 2059243830Sdim Type *TyBig = Ty1, *TySmall = Ty2; 2060243830Sdim if (Ty2->getVectorNumElements() > Ty1->getVectorNumElements()) 2061243830Sdim std::swap(TyBig, TySmall); 2062243830Sdim 2063243830Sdim ESContrib = (int) getInstrCost(Instruction::ShuffleVector, 2064243830Sdim VTy, TyBig); 2065243830Sdim if (TyBig != TySmall) 2066243830Sdim ESContrib += (int) getInstrCost(Instruction::ShuffleVector, 2067243830Sdim TyBig, TySmall); 2068243830Sdim } 2069243830Sdim 2070243830Sdim DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" 2071243830Sdim << *O1 << " <-> " << *O2 << "} = " << 2072243830Sdim ESContrib << "\n"); 2073243830Sdim EffSize -= ESContrib; 2074243830Sdim IncomingPairs.insert(VP); 2075243830Sdim } 2076243830Sdim } 2077243830Sdim } 2078243830Sdim 2079243830Sdim if (!HasNontrivialInsts) { 2080243830Sdim DEBUG(if (DebugPairSelection) dbgs() << 2081249423Sdim "\tNo non-trivial instructions in DAG;" 2082243830Sdim " override to zero effective size\n"); 2083243830Sdim EffSize = 0; 2084243830Sdim } 2085243830Sdim } else { 2086249423Sdim for (DenseSet<ValuePair>::iterator S = PrunedDAG.begin(), 2087249423Sdim E = PrunedDAG.end(); S != E; ++S) 2088243830Sdim EffSize += (int) getDepthFactor(S->first); 2089243830Sdim } 2090243830Sdim 2091234285Sdim DEBUG(if (DebugPairSelection) 2092249423Sdim dbgs() << "BBV: found pruned DAG for pair {" 2093249423Sdim << *IJ.first << " <-> " << *IJ.second << "} of depth " << 2094249423Sdim MaxDepth << " and size " << PrunedDAG.size() << 2095234285Sdim " (effective size: " << EffSize << ")\n"); 2096249423Sdim if (((TTI && !UseChainDepthWithTI) || 2097243830Sdim MaxDepth >= Config.ReqChainDepth) && 2098243830Sdim EffSize > 0 && EffSize > BestEffSize) { 2099234285Sdim BestMaxDepth = MaxDepth; 2100234285Sdim BestEffSize = EffSize; 2101249423Sdim BestDAG = PrunedDAG; 2102234285Sdim } 2103234285Sdim } 2104234285Sdim } 2105234285Sdim 2106234285Sdim // Given the list of candidate pairs, this function selects those 2107234285Sdim // that will be fused into vector instructions. 2108234285Sdim void BBVectorize::choosePairs( 2109249423Sdim DenseMap<Value *, std::vector<Value *> > &CandidatePairs, 2110249423Sdim DenseSet<ValuePair> &CandidatePairsSet, 2111249423Sdim DenseMap<ValuePair, int> &CandidatePairCostSavings, 2112249423Sdim std::vector<Value *> &PairableInsts, 2113249423Sdim DenseSet<ValuePair> &FixedOrderPairs, 2114249423Sdim DenseMap<VPPair, unsigned> &PairConnectionTypes, 2115249423Sdim DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, 2116249423Sdim DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps, 2117249423Sdim DenseSet<ValuePair> &PairableInstUsers, 2118249423Sdim DenseMap<Value *, Value *>& ChosenPairs) { 2119234285Sdim bool UseCycleCheck = 2120249423Sdim CandidatePairsSet.size() <= Config.MaxCandPairsForCycleCheck; 2121249423Sdim 2122249423Sdim DenseMap<Value *, std::vector<Value *> > CandidatePairs2; 2123249423Sdim for (DenseSet<ValuePair>::iterator I = CandidatePairsSet.begin(), 2124249423Sdim E = CandidatePairsSet.end(); I != E; ++I) { 2125249423Sdim std::vector<Value *> &JJ = CandidatePairs2[I->second]; 2126249423Sdim if (JJ.empty()) JJ.reserve(32); 2127249423Sdim JJ.push_back(I->first); 2128249423Sdim } 2129249423Sdim 2130249423Sdim DenseMap<ValuePair, std::vector<ValuePair> > PairableInstUserMap; 2131249423Sdim DenseSet<VPPair> PairableInstUserPairSet; 2132234285Sdim for (std::vector<Value *>::iterator I = PairableInsts.begin(), 2133234285Sdim E = PairableInsts.end(); I != E; ++I) { 2134234285Sdim // The number of possible pairings for this variable: 2135249423Sdim size_t NumChoices = CandidatePairs.lookup(*I).size(); 2136234285Sdim if (!NumChoices) continue; 2137234285Sdim 2138249423Sdim std::vector<Value *> &JJ = CandidatePairs[*I]; 2139234285Sdim 2140249423Sdim // The best pair to choose and its dag: 2141243830Sdim size_t BestMaxDepth = 0; 2142243830Sdim int BestEffSize = 0; 2143249423Sdim DenseSet<ValuePair> BestDAG; 2144249423Sdim findBestDAGFor(CandidatePairs, CandidatePairsSet, 2145249423Sdim CandidatePairCostSavings, 2146243830Sdim PairableInsts, FixedOrderPairs, PairConnectionTypes, 2147243830Sdim ConnectedPairs, ConnectedPairDeps, 2148249423Sdim PairableInstUsers, PairableInstUserMap, 2149249423Sdim PairableInstUserPairSet, ChosenPairs, 2150249423Sdim BestDAG, BestMaxDepth, BestEffSize, *I, JJ, 2151234285Sdim UseCycleCheck); 2152234285Sdim 2153249423Sdim if (BestDAG.empty()) 2154249423Sdim continue; 2155249423Sdim 2156249423Sdim // A dag has been chosen (or not) at this point. If no dag was 2157234285Sdim // chosen, then this instruction, I, cannot be paired (and is no longer 2158234285Sdim // considered). 2159234285Sdim 2160249423Sdim DEBUG(dbgs() << "BBV: selected pairs in the best DAG for: " 2161249423Sdim << *cast<Instruction>(*I) << "\n"); 2162234285Sdim 2163249423Sdim for (DenseSet<ValuePair>::iterator S = BestDAG.begin(), 2164249423Sdim SE2 = BestDAG.end(); S != SE2; ++S) { 2165249423Sdim // Insert the members of this dag into the list of chosen pairs. 2166234285Sdim ChosenPairs.insert(ValuePair(S->first, S->second)); 2167234285Sdim DEBUG(dbgs() << "BBV: selected pair: " << *S->first << " <-> " << 2168234285Sdim *S->second << "\n"); 2169234285Sdim 2170249423Sdim // Remove all candidate pairs that have values in the chosen dag. 2171249423Sdim std::vector<Value *> &KK = CandidatePairs[S->first]; 2172249423Sdim for (std::vector<Value *>::iterator K = KK.begin(), KE = KK.end(); 2173249423Sdim K != KE; ++K) { 2174249423Sdim if (*K == S->second) 2175249423Sdim continue; 2176249423Sdim 2177249423Sdim CandidatePairsSet.erase(ValuePair(S->first, *K)); 2178234285Sdim } 2179249423Sdim 2180249423Sdim std::vector<Value *> &LL = CandidatePairs2[S->second]; 2181249423Sdim for (std::vector<Value *>::iterator L = LL.begin(), LE = LL.end(); 2182249423Sdim L != LE; ++L) { 2183249423Sdim if (*L == S->first) 2184249423Sdim continue; 2185249423Sdim 2186249423Sdim CandidatePairsSet.erase(ValuePair(*L, S->second)); 2187249423Sdim } 2188249423Sdim 2189249423Sdim std::vector<Value *> &MM = CandidatePairs[S->second]; 2190249423Sdim for (std::vector<Value *>::iterator M = MM.begin(), ME = MM.end(); 2191249423Sdim M != ME; ++M) { 2192249423Sdim assert(*M != S->first && "Flipped pair in candidate list?"); 2193249423Sdim CandidatePairsSet.erase(ValuePair(S->second, *M)); 2194249423Sdim } 2195249423Sdim 2196249423Sdim std::vector<Value *> &NN = CandidatePairs2[S->first]; 2197249423Sdim for (std::vector<Value *>::iterator N = NN.begin(), NE = NN.end(); 2198249423Sdim N != NE; ++N) { 2199249423Sdim assert(*N != S->second && "Flipped pair in candidate list?"); 2200249423Sdim CandidatePairsSet.erase(ValuePair(*N, S->first)); 2201249423Sdim } 2202234285Sdim } 2203234285Sdim } 2204234285Sdim 2205234285Sdim DEBUG(dbgs() << "BBV: selected " << ChosenPairs.size() << " pairs.\n"); 2206234285Sdim } 2207234285Sdim 2208234285Sdim std::string getReplacementName(Instruction *I, bool IsInput, unsigned o, 2209234285Sdim unsigned n = 0) { 2210234285Sdim if (!I->hasName()) 2211234285Sdim return ""; 2212234285Sdim 2213234285Sdim return (I->getName() + (IsInput ? ".v.i" : ".v.r") + utostr(o) + 2214234285Sdim (n > 0 ? "." + utostr(n) : "")).str(); 2215234285Sdim } 2216234285Sdim 2217234285Sdim // Returns the value that is to be used as the pointer input to the vector 2218234285Sdim // instruction that fuses I with J. 2219234285Sdim Value *BBVectorize::getReplacementPointerInput(LLVMContext& Context, 2220243830Sdim Instruction *I, Instruction *J, unsigned o) { 2221234285Sdim Value *IPtr, *JPtr; 2222243830Sdim unsigned IAlignment, JAlignment, IAddressSpace, JAddressSpace; 2223234285Sdim int64_t OffsetInElmts; 2224239462Sdim 2225243830Sdim // Note: the analysis might fail here, that is why the pair order has 2226239462Sdim // been precomputed (OffsetInElmts must be unused here). 2227234285Sdim (void) getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment, 2228243830Sdim IAddressSpace, JAddressSpace, 2229243830Sdim OffsetInElmts, false); 2230234285Sdim 2231234285Sdim // The pointer value is taken to be the one with the lowest offset. 2232243830Sdim Value *VPtr = IPtr; 2233234285Sdim 2234263508Sdim Type *ArgTypeI = IPtr->getType()->getPointerElementType(); 2235263508Sdim Type *ArgTypeJ = JPtr->getType()->getPointerElementType(); 2236239462Sdim Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ); 2237263508Sdim Type *VArgPtrType 2238263508Sdim = PointerType::get(VArgType, 2239263508Sdim IPtr->getType()->getPointerAddressSpace()); 2240234285Sdim return new BitCastInst(VPtr, VArgPtrType, getReplacementName(I, true, o), 2241243830Sdim /* insert before */ I); 2242234285Sdim } 2243234285Sdim 2244234285Sdim void BBVectorize::fillNewShuffleMask(LLVMContext& Context, Instruction *J, 2245239462Sdim unsigned MaskOffset, unsigned NumInElem, 2246239462Sdim unsigned NumInElem1, unsigned IdxOffset, 2247239462Sdim std::vector<Constant*> &Mask) { 2248263508Sdim unsigned NumElem1 = J->getType()->getVectorNumElements(); 2249239462Sdim for (unsigned v = 0; v < NumElem1; ++v) { 2250234285Sdim int m = cast<ShuffleVectorInst>(J)->getMaskValue(v); 2251234285Sdim if (m < 0) { 2252234285Sdim Mask[v+MaskOffset] = UndefValue::get(Type::getInt32Ty(Context)); 2253234285Sdim } else { 2254234285Sdim unsigned mm = m + (int) IdxOffset; 2255239462Sdim if (m >= (int) NumInElem1) 2256234285Sdim mm += (int) NumInElem; 2257234285Sdim 2258234285Sdim Mask[v+MaskOffset] = 2259234285Sdim ConstantInt::get(Type::getInt32Ty(Context), mm); 2260234285Sdim } 2261234285Sdim } 2262234285Sdim } 2263234285Sdim 2264234285Sdim // Returns the value that is to be used as the vector-shuffle mask to the 2265234285Sdim // vector instruction that fuses I with J. 2266234285Sdim Value *BBVectorize::getReplacementShuffleMask(LLVMContext& Context, 2267234285Sdim Instruction *I, Instruction *J) { 2268234285Sdim // This is the shuffle mask. We need to append the second 2269234285Sdim // mask to the first, and the numbers need to be adjusted. 2270234285Sdim 2271239462Sdim Type *ArgTypeI = I->getType(); 2272239462Sdim Type *ArgTypeJ = J->getType(); 2273239462Sdim Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ); 2274234285Sdim 2275263508Sdim unsigned NumElemI = ArgTypeI->getVectorNumElements(); 2276239462Sdim 2277234285Sdim // Get the total number of elements in the fused vector type. 2278234285Sdim // By definition, this must equal the number of elements in 2279234285Sdim // the final mask. 2280263508Sdim unsigned NumElem = VArgType->getVectorNumElements(); 2281234285Sdim std::vector<Constant*> Mask(NumElem); 2282234285Sdim 2283239462Sdim Type *OpTypeI = I->getOperand(0)->getType(); 2284263508Sdim unsigned NumInElemI = OpTypeI->getVectorNumElements(); 2285239462Sdim Type *OpTypeJ = J->getOperand(0)->getType(); 2286263508Sdim unsigned NumInElemJ = OpTypeJ->getVectorNumElements(); 2287234285Sdim 2288239462Sdim // The fused vector will be: 2289239462Sdim // ----------------------------------------------------- 2290239462Sdim // | NumInElemI | NumInElemJ | NumInElemI | NumInElemJ | 2291239462Sdim // ----------------------------------------------------- 2292239462Sdim // from which we'll extract NumElem total elements (where the first NumElemI 2293239462Sdim // of them come from the mask in I and the remainder come from the mask 2294239462Sdim // in J. 2295239462Sdim 2296234285Sdim // For the mask from the first pair... 2297239462Sdim fillNewShuffleMask(Context, I, 0, NumInElemJ, NumInElemI, 2298239462Sdim 0, Mask); 2299234285Sdim 2300234285Sdim // For the mask from the second pair... 2301239462Sdim fillNewShuffleMask(Context, J, NumElemI, NumInElemI, NumInElemJ, 2302239462Sdim NumInElemI, Mask); 2303234285Sdim 2304234285Sdim return ConstantVector::get(Mask); 2305234285Sdim } 2306234285Sdim 2307239462Sdim bool BBVectorize::expandIEChain(LLVMContext& Context, Instruction *I, 2308239462Sdim Instruction *J, unsigned o, Value *&LOp, 2309239462Sdim unsigned numElemL, 2310239462Sdim Type *ArgTypeL, Type *ArgTypeH, 2311243830Sdim bool IBeforeJ, unsigned IdxOff) { 2312239462Sdim bool ExpandedIEChain = false; 2313239462Sdim if (InsertElementInst *LIE = dyn_cast<InsertElementInst>(LOp)) { 2314239462Sdim // If we have a pure insertelement chain, then this can be rewritten 2315239462Sdim // into a chain that directly builds the larger type. 2316243830Sdim if (isPureIEChain(LIE)) { 2317239462Sdim SmallVector<Value *, 8> VectElemts(numElemL, 2318239462Sdim UndefValue::get(ArgTypeL->getScalarType())); 2319239462Sdim InsertElementInst *LIENext = LIE; 2320239462Sdim do { 2321239462Sdim unsigned Idx = 2322239462Sdim cast<ConstantInt>(LIENext->getOperand(2))->getSExtValue(); 2323239462Sdim VectElemts[Idx] = LIENext->getOperand(1); 2324239462Sdim } while ((LIENext = 2325239462Sdim dyn_cast<InsertElementInst>(LIENext->getOperand(0)))); 2326239462Sdim 2327239462Sdim LIENext = 0; 2328239462Sdim Value *LIEPrev = UndefValue::get(ArgTypeH); 2329239462Sdim for (unsigned i = 0; i < numElemL; ++i) { 2330239462Sdim if (isa<UndefValue>(VectElemts[i])) continue; 2331239462Sdim LIENext = InsertElementInst::Create(LIEPrev, VectElemts[i], 2332239462Sdim ConstantInt::get(Type::getInt32Ty(Context), 2333239462Sdim i + IdxOff), 2334243830Sdim getReplacementName(IBeforeJ ? I : J, 2335243830Sdim true, o, i+1)); 2336243830Sdim LIENext->insertBefore(IBeforeJ ? J : I); 2337239462Sdim LIEPrev = LIENext; 2338239462Sdim } 2339239462Sdim 2340239462Sdim LOp = LIENext ? (Value*) LIENext : UndefValue::get(ArgTypeH); 2341239462Sdim ExpandedIEChain = true; 2342239462Sdim } 2343239462Sdim } 2344239462Sdim 2345239462Sdim return ExpandedIEChain; 2346239462Sdim } 2347239462Sdim 2348263508Sdim static unsigned getNumScalarElements(Type *Ty) { 2349263508Sdim if (VectorType *VecTy = dyn_cast<VectorType>(Ty)) 2350263508Sdim return VecTy->getNumElements(); 2351263508Sdim return 1; 2352263508Sdim } 2353263508Sdim 2354234285Sdim // Returns the value to be used as the specified operand of the vector 2355234285Sdim // instruction that fuses I with J. 2356234285Sdim Value *BBVectorize::getReplacementInput(LLVMContext& Context, Instruction *I, 2357243830Sdim Instruction *J, unsigned o, bool IBeforeJ) { 2358234285Sdim Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0); 2359234285Sdim Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), 1); 2360234285Sdim 2361239462Sdim // Compute the fused vector type for this operand 2362239462Sdim Type *ArgTypeI = I->getOperand(o)->getType(); 2363239462Sdim Type *ArgTypeJ = J->getOperand(o)->getType(); 2364239462Sdim VectorType *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ); 2365234285Sdim 2366234285Sdim Instruction *L = I, *H = J; 2367239462Sdim Type *ArgTypeL = ArgTypeI, *ArgTypeH = ArgTypeJ; 2368234285Sdim 2369263508Sdim unsigned numElemL = getNumScalarElements(ArgTypeL); 2370263508Sdim unsigned numElemH = getNumScalarElements(ArgTypeH); 2371234285Sdim 2372239462Sdim Value *LOp = L->getOperand(o); 2373239462Sdim Value *HOp = H->getOperand(o); 2374239462Sdim unsigned numElem = VArgType->getNumElements(); 2375239462Sdim 2376239462Sdim // First, we check if we can reuse the "original" vector outputs (if these 2377239462Sdim // exist). We might need a shuffle. 2378239462Sdim ExtractElementInst *LEE = dyn_cast<ExtractElementInst>(LOp); 2379239462Sdim ExtractElementInst *HEE = dyn_cast<ExtractElementInst>(HOp); 2380239462Sdim ShuffleVectorInst *LSV = dyn_cast<ShuffleVectorInst>(LOp); 2381239462Sdim ShuffleVectorInst *HSV = dyn_cast<ShuffleVectorInst>(HOp); 2382239462Sdim 2383239462Sdim // FIXME: If we're fusing shuffle instructions, then we can't apply this 2384239462Sdim // optimization. The input vectors to the shuffle might be a different 2385239462Sdim // length from the shuffle outputs. Unfortunately, the replacement 2386239462Sdim // shuffle mask has already been formed, and the mask entries are sensitive 2387239462Sdim // to the sizes of the inputs. 2388239462Sdim bool IsSizeChangeShuffle = 2389239462Sdim isa<ShuffleVectorInst>(L) && 2390239462Sdim (LOp->getType() != L->getType() || HOp->getType() != H->getType()); 2391239462Sdim 2392239462Sdim if ((LEE || LSV) && (HEE || HSV) && !IsSizeChangeShuffle) { 2393239462Sdim // We can have at most two unique vector inputs. 2394239462Sdim bool CanUseInputs = true; 2395239462Sdim Value *I1, *I2 = 0; 2396239462Sdim if (LEE) { 2397239462Sdim I1 = LEE->getOperand(0); 2398239462Sdim } else { 2399239462Sdim I1 = LSV->getOperand(0); 2400239462Sdim I2 = LSV->getOperand(1); 2401239462Sdim if (I2 == I1 || isa<UndefValue>(I2)) 2402239462Sdim I2 = 0; 2403239462Sdim } 2404239462Sdim 2405239462Sdim if (HEE) { 2406239462Sdim Value *I3 = HEE->getOperand(0); 2407239462Sdim if (!I2 && I3 != I1) 2408239462Sdim I2 = I3; 2409239462Sdim else if (I3 != I1 && I3 != I2) 2410239462Sdim CanUseInputs = false; 2411239462Sdim } else { 2412239462Sdim Value *I3 = HSV->getOperand(0); 2413239462Sdim if (!I2 && I3 != I1) 2414239462Sdim I2 = I3; 2415239462Sdim else if (I3 != I1 && I3 != I2) 2416239462Sdim CanUseInputs = false; 2417239462Sdim 2418239462Sdim if (CanUseInputs) { 2419239462Sdim Value *I4 = HSV->getOperand(1); 2420239462Sdim if (!isa<UndefValue>(I4)) { 2421239462Sdim if (!I2 && I4 != I1) 2422239462Sdim I2 = I4; 2423239462Sdim else if (I4 != I1 && I4 != I2) 2424239462Sdim CanUseInputs = false; 2425239462Sdim } 2426239462Sdim } 2427239462Sdim } 2428239462Sdim 2429239462Sdim if (CanUseInputs) { 2430239462Sdim unsigned LOpElem = 2431263508Sdim cast<Instruction>(LOp)->getOperand(0)->getType() 2432263508Sdim ->getVectorNumElements(); 2433263508Sdim 2434239462Sdim unsigned HOpElem = 2435263508Sdim cast<Instruction>(HOp)->getOperand(0)->getType() 2436263508Sdim ->getVectorNumElements(); 2437239462Sdim 2438239462Sdim // We have one or two input vectors. We need to map each index of the 2439239462Sdim // operands to the index of the original vector. 2440239462Sdim SmallVector<std::pair<int, int>, 8> II(numElem); 2441239462Sdim for (unsigned i = 0; i < numElemL; ++i) { 2442239462Sdim int Idx, INum; 2443239462Sdim if (LEE) { 2444239462Sdim Idx = 2445239462Sdim cast<ConstantInt>(LEE->getOperand(1))->getSExtValue(); 2446239462Sdim INum = LEE->getOperand(0) == I1 ? 0 : 1; 2447239462Sdim } else { 2448239462Sdim Idx = LSV->getMaskValue(i); 2449239462Sdim if (Idx < (int) LOpElem) { 2450239462Sdim INum = LSV->getOperand(0) == I1 ? 0 : 1; 2451239462Sdim } else { 2452239462Sdim Idx -= LOpElem; 2453239462Sdim INum = LSV->getOperand(1) == I1 ? 0 : 1; 2454239462Sdim } 2455239462Sdim } 2456239462Sdim 2457239462Sdim II[i] = std::pair<int, int>(Idx, INum); 2458239462Sdim } 2459239462Sdim for (unsigned i = 0; i < numElemH; ++i) { 2460239462Sdim int Idx, INum; 2461239462Sdim if (HEE) { 2462239462Sdim Idx = 2463239462Sdim cast<ConstantInt>(HEE->getOperand(1))->getSExtValue(); 2464239462Sdim INum = HEE->getOperand(0) == I1 ? 0 : 1; 2465239462Sdim } else { 2466239462Sdim Idx = HSV->getMaskValue(i); 2467239462Sdim if (Idx < (int) HOpElem) { 2468239462Sdim INum = HSV->getOperand(0) == I1 ? 0 : 1; 2469239462Sdim } else { 2470239462Sdim Idx -= HOpElem; 2471239462Sdim INum = HSV->getOperand(1) == I1 ? 0 : 1; 2472239462Sdim } 2473239462Sdim } 2474239462Sdim 2475239462Sdim II[i + numElemL] = std::pair<int, int>(Idx, INum); 2476239462Sdim } 2477239462Sdim 2478239462Sdim // We now have an array which tells us from which index of which 2479239462Sdim // input vector each element of the operand comes. 2480239462Sdim VectorType *I1T = cast<VectorType>(I1->getType()); 2481239462Sdim unsigned I1Elem = I1T->getNumElements(); 2482239462Sdim 2483239462Sdim if (!I2) { 2484239462Sdim // In this case there is only one underlying vector input. Check for 2485239462Sdim // the trivial case where we can use the input directly. 2486239462Sdim if (I1Elem == numElem) { 2487239462Sdim bool ElemInOrder = true; 2488239462Sdim for (unsigned i = 0; i < numElem; ++i) { 2489239462Sdim if (II[i].first != (int) i && II[i].first != -1) { 2490239462Sdim ElemInOrder = false; 2491239462Sdim break; 2492239462Sdim } 2493239462Sdim } 2494239462Sdim 2495239462Sdim if (ElemInOrder) 2496239462Sdim return I1; 2497239462Sdim } 2498239462Sdim 2499239462Sdim // A shuffle is needed. 2500239462Sdim std::vector<Constant *> Mask(numElem); 2501239462Sdim for (unsigned i = 0; i < numElem; ++i) { 2502239462Sdim int Idx = II[i].first; 2503239462Sdim if (Idx == -1) 2504239462Sdim Mask[i] = UndefValue::get(Type::getInt32Ty(Context)); 2505239462Sdim else 2506239462Sdim Mask[i] = ConstantInt::get(Type::getInt32Ty(Context), Idx); 2507239462Sdim } 2508239462Sdim 2509239462Sdim Instruction *S = 2510239462Sdim new ShuffleVectorInst(I1, UndefValue::get(I1T), 2511239462Sdim ConstantVector::get(Mask), 2512243830Sdim getReplacementName(IBeforeJ ? I : J, 2513243830Sdim true, o)); 2514243830Sdim S->insertBefore(IBeforeJ ? J : I); 2515239462Sdim return S; 2516239462Sdim } 2517239462Sdim 2518239462Sdim VectorType *I2T = cast<VectorType>(I2->getType()); 2519239462Sdim unsigned I2Elem = I2T->getNumElements(); 2520239462Sdim 2521239462Sdim // This input comes from two distinct vectors. The first step is to 2522239462Sdim // make sure that both vectors are the same length. If not, the 2523239462Sdim // smaller one will need to grow before they can be shuffled together. 2524239462Sdim if (I1Elem < I2Elem) { 2525239462Sdim std::vector<Constant *> Mask(I2Elem); 2526239462Sdim unsigned v = 0; 2527239462Sdim for (; v < I1Elem; ++v) 2528239462Sdim Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v); 2529239462Sdim for (; v < I2Elem; ++v) 2530239462Sdim Mask[v] = UndefValue::get(Type::getInt32Ty(Context)); 2531239462Sdim 2532239462Sdim Instruction *NewI1 = 2533239462Sdim new ShuffleVectorInst(I1, UndefValue::get(I1T), 2534239462Sdim ConstantVector::get(Mask), 2535243830Sdim getReplacementName(IBeforeJ ? I : J, 2536243830Sdim true, o, 1)); 2537243830Sdim NewI1->insertBefore(IBeforeJ ? J : I); 2538239462Sdim I1 = NewI1; 2539239462Sdim I1T = I2T; 2540239462Sdim I1Elem = I2Elem; 2541239462Sdim } else if (I1Elem > I2Elem) { 2542239462Sdim std::vector<Constant *> Mask(I1Elem); 2543239462Sdim unsigned v = 0; 2544239462Sdim for (; v < I2Elem; ++v) 2545239462Sdim Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v); 2546239462Sdim for (; v < I1Elem; ++v) 2547239462Sdim Mask[v] = UndefValue::get(Type::getInt32Ty(Context)); 2548239462Sdim 2549239462Sdim Instruction *NewI2 = 2550239462Sdim new ShuffleVectorInst(I2, UndefValue::get(I2T), 2551239462Sdim ConstantVector::get(Mask), 2552243830Sdim getReplacementName(IBeforeJ ? I : J, 2553243830Sdim true, o, 1)); 2554243830Sdim NewI2->insertBefore(IBeforeJ ? J : I); 2555239462Sdim I2 = NewI2; 2556239462Sdim I2T = I1T; 2557239462Sdim I2Elem = I1Elem; 2558239462Sdim } 2559239462Sdim 2560239462Sdim // Now that both I1 and I2 are the same length we can shuffle them 2561239462Sdim // together (and use the result). 2562239462Sdim std::vector<Constant *> Mask(numElem); 2563239462Sdim for (unsigned v = 0; v < numElem; ++v) { 2564239462Sdim if (II[v].first == -1) { 2565239462Sdim Mask[v] = UndefValue::get(Type::getInt32Ty(Context)); 2566239462Sdim } else { 2567239462Sdim int Idx = II[v].first + II[v].second * I1Elem; 2568239462Sdim Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), Idx); 2569239462Sdim } 2570239462Sdim } 2571239462Sdim 2572239462Sdim Instruction *NewOp = 2573239462Sdim new ShuffleVectorInst(I1, I2, ConstantVector::get(Mask), 2574243830Sdim getReplacementName(IBeforeJ ? I : J, true, o)); 2575243830Sdim NewOp->insertBefore(IBeforeJ ? J : I); 2576239462Sdim return NewOp; 2577239462Sdim } 2578234285Sdim } 2579234285Sdim 2580239462Sdim Type *ArgType = ArgTypeL; 2581239462Sdim if (numElemL < numElemH) { 2582239462Sdim if (numElemL == 1 && expandIEChain(Context, I, J, o, HOp, numElemH, 2583243830Sdim ArgTypeL, VArgType, IBeforeJ, 1)) { 2584239462Sdim // This is another short-circuit case: we're combining a scalar into 2585239462Sdim // a vector that is formed by an IE chain. We've just expanded the IE 2586239462Sdim // chain, now insert the scalar and we're done. 2587234285Sdim 2588239462Sdim Instruction *S = InsertElementInst::Create(HOp, LOp, CV0, 2589243830Sdim getReplacementName(IBeforeJ ? I : J, true, o)); 2590243830Sdim S->insertBefore(IBeforeJ ? J : I); 2591239462Sdim return S; 2592239462Sdim } else if (!expandIEChain(Context, I, J, o, LOp, numElemL, ArgTypeL, 2593243830Sdim ArgTypeH, IBeforeJ)) { 2594239462Sdim // The two vector inputs to the shuffle must be the same length, 2595239462Sdim // so extend the smaller vector to be the same length as the larger one. 2596239462Sdim Instruction *NLOp; 2597239462Sdim if (numElemL > 1) { 2598239462Sdim 2599239462Sdim std::vector<Constant *> Mask(numElemH); 2600239462Sdim unsigned v = 0; 2601239462Sdim for (; v < numElemL; ++v) 2602239462Sdim Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v); 2603239462Sdim for (; v < numElemH; ++v) 2604239462Sdim Mask[v] = UndefValue::get(Type::getInt32Ty(Context)); 2605239462Sdim 2606239462Sdim NLOp = new ShuffleVectorInst(LOp, UndefValue::get(ArgTypeL), 2607239462Sdim ConstantVector::get(Mask), 2608243830Sdim getReplacementName(IBeforeJ ? I : J, 2609243830Sdim true, o, 1)); 2610239462Sdim } else { 2611239462Sdim NLOp = InsertElementInst::Create(UndefValue::get(ArgTypeH), LOp, CV0, 2612243830Sdim getReplacementName(IBeforeJ ? I : J, 2613243830Sdim true, o, 1)); 2614239462Sdim } 2615239462Sdim 2616243830Sdim NLOp->insertBefore(IBeforeJ ? J : I); 2617239462Sdim LOp = NLOp; 2618239462Sdim } 2619234285Sdim 2620239462Sdim ArgType = ArgTypeH; 2621239462Sdim } else if (numElemL > numElemH) { 2622239462Sdim if (numElemH == 1 && expandIEChain(Context, I, J, o, LOp, numElemL, 2623243830Sdim ArgTypeH, VArgType, IBeforeJ)) { 2624239462Sdim Instruction *S = 2625239462Sdim InsertElementInst::Create(LOp, HOp, 2626239462Sdim ConstantInt::get(Type::getInt32Ty(Context), 2627239462Sdim numElemL), 2628243830Sdim getReplacementName(IBeforeJ ? I : J, 2629243830Sdim true, o)); 2630243830Sdim S->insertBefore(IBeforeJ ? J : I); 2631239462Sdim return S; 2632239462Sdim } else if (!expandIEChain(Context, I, J, o, HOp, numElemH, ArgTypeH, 2633243830Sdim ArgTypeL, IBeforeJ)) { 2634239462Sdim Instruction *NHOp; 2635239462Sdim if (numElemH > 1) { 2636239462Sdim std::vector<Constant *> Mask(numElemL); 2637239462Sdim unsigned v = 0; 2638239462Sdim for (; v < numElemH; ++v) 2639239462Sdim Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v); 2640239462Sdim for (; v < numElemL; ++v) 2641239462Sdim Mask[v] = UndefValue::get(Type::getInt32Ty(Context)); 2642239462Sdim 2643239462Sdim NHOp = new ShuffleVectorInst(HOp, UndefValue::get(ArgTypeH), 2644239462Sdim ConstantVector::get(Mask), 2645243830Sdim getReplacementName(IBeforeJ ? I : J, 2646243830Sdim true, o, 1)); 2647239462Sdim } else { 2648239462Sdim NHOp = InsertElementInst::Create(UndefValue::get(ArgTypeL), HOp, CV0, 2649243830Sdim getReplacementName(IBeforeJ ? I : J, 2650243830Sdim true, o, 1)); 2651239462Sdim } 2652263508Sdim 2653243830Sdim NHOp->insertBefore(IBeforeJ ? J : I); 2654239462Sdim HOp = NHOp; 2655239462Sdim } 2656239462Sdim } 2657234285Sdim 2658239462Sdim if (ArgType->isVectorTy()) { 2659263508Sdim unsigned numElem = VArgType->getVectorNumElements(); 2660239462Sdim std::vector<Constant*> Mask(numElem); 2661239462Sdim for (unsigned v = 0; v < numElem; ++v) { 2662239462Sdim unsigned Idx = v; 2663239462Sdim // If the low vector was expanded, we need to skip the extra 2664239462Sdim // undefined entries. 2665239462Sdim if (v >= numElemL && numElemH > numElemL) 2666239462Sdim Idx += (numElemH - numElemL); 2667239462Sdim Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), Idx); 2668234285Sdim } 2669234285Sdim 2670239462Sdim Instruction *BV = new ShuffleVectorInst(LOp, HOp, 2671243830Sdim ConstantVector::get(Mask), 2672243830Sdim getReplacementName(IBeforeJ ? I : J, true, o)); 2673243830Sdim BV->insertBefore(IBeforeJ ? J : I); 2674234285Sdim return BV; 2675234285Sdim } 2676234285Sdim 2677234285Sdim Instruction *BV1 = InsertElementInst::Create( 2678239462Sdim UndefValue::get(VArgType), LOp, CV0, 2679243830Sdim getReplacementName(IBeforeJ ? I : J, 2680243830Sdim true, o, 1)); 2681243830Sdim BV1->insertBefore(IBeforeJ ? J : I); 2682239462Sdim Instruction *BV2 = InsertElementInst::Create(BV1, HOp, CV1, 2683243830Sdim getReplacementName(IBeforeJ ? I : J, 2684243830Sdim true, o, 2)); 2685243830Sdim BV2->insertBefore(IBeforeJ ? J : I); 2686234285Sdim return BV2; 2687234285Sdim } 2688234285Sdim 2689234285Sdim // This function creates an array of values that will be used as the inputs 2690234285Sdim // to the vector instruction that fuses I with J. 2691234285Sdim void BBVectorize::getReplacementInputsForPair(LLVMContext& Context, 2692234285Sdim Instruction *I, Instruction *J, 2693263508Sdim SmallVectorImpl<Value *> &ReplacedOperands, 2694243830Sdim bool IBeforeJ) { 2695234285Sdim unsigned NumOperands = I->getNumOperands(); 2696234285Sdim 2697234285Sdim for (unsigned p = 0, o = NumOperands-1; p < NumOperands; ++p, --o) { 2698234285Sdim // Iterate backward so that we look at the store pointer 2699234285Sdim // first and know whether or not we need to flip the inputs. 2700234285Sdim 2701234285Sdim if (isa<LoadInst>(I) || (o == 1 && isa<StoreInst>(I))) { 2702234285Sdim // This is the pointer for a load/store instruction. 2703243830Sdim ReplacedOperands[o] = getReplacementPointerInput(Context, I, J, o); 2704234285Sdim continue; 2705234285Sdim } else if (isa<CallInst>(I)) { 2706234285Sdim Function *F = cast<CallInst>(I)->getCalledFunction(); 2707249423Sdim Intrinsic::ID IID = (Intrinsic::ID) F->getIntrinsicID(); 2708234285Sdim if (o == NumOperands-1) { 2709234285Sdim BasicBlock &BB = *I->getParent(); 2710234285Sdim 2711234285Sdim Module *M = BB.getParent()->getParent(); 2712239462Sdim Type *ArgTypeI = I->getType(); 2713239462Sdim Type *ArgTypeJ = J->getType(); 2714239462Sdim Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ); 2715234285Sdim 2716249423Sdim ReplacedOperands[o] = Intrinsic::getDeclaration(M, IID, VArgType); 2717234285Sdim continue; 2718234285Sdim } else if (IID == Intrinsic::powi && o == 1) { 2719234285Sdim // The second argument of powi is a single integer and we've already 2720234285Sdim // checked that both arguments are equal. As a result, we just keep 2721234285Sdim // I's second argument. 2722234285Sdim ReplacedOperands[o] = I->getOperand(o); 2723234285Sdim continue; 2724234285Sdim } 2725234285Sdim } else if (isa<ShuffleVectorInst>(I) && o == NumOperands-1) { 2726234285Sdim ReplacedOperands[o] = getReplacementShuffleMask(Context, I, J); 2727234285Sdim continue; 2728234285Sdim } 2729234285Sdim 2730243830Sdim ReplacedOperands[o] = getReplacementInput(Context, I, J, o, IBeforeJ); 2731234285Sdim } 2732234285Sdim } 2733234285Sdim 2734234285Sdim // This function creates two values that represent the outputs of the 2735234285Sdim // original I and J instructions. These are generally vector shuffles 2736234285Sdim // or extracts. In many cases, these will end up being unused and, thus, 2737234285Sdim // eliminated by later passes. 2738234285Sdim void BBVectorize::replaceOutputsOfPair(LLVMContext& Context, Instruction *I, 2739234285Sdim Instruction *J, Instruction *K, 2740234285Sdim Instruction *&InsertionPt, 2741243830Sdim Instruction *&K1, Instruction *&K2) { 2742234285Sdim if (isa<StoreInst>(I)) { 2743234285Sdim AA->replaceWithNewValue(I, K); 2744234285Sdim AA->replaceWithNewValue(J, K); 2745234285Sdim } else { 2746234285Sdim Type *IType = I->getType(); 2747239462Sdim Type *JType = J->getType(); 2748234285Sdim 2749239462Sdim VectorType *VType = getVecTypeForPair(IType, JType); 2750239462Sdim unsigned numElem = VType->getNumElements(); 2751239462Sdim 2752263508Sdim unsigned numElemI = getNumScalarElements(IType); 2753263508Sdim unsigned numElemJ = getNumScalarElements(JType); 2754239462Sdim 2755234285Sdim if (IType->isVectorTy()) { 2756239462Sdim std::vector<Constant*> Mask1(numElemI), Mask2(numElemI); 2757239462Sdim for (unsigned v = 0; v < numElemI; ++v) { 2758239462Sdim Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v); 2759239462Sdim Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElemJ+v); 2760239462Sdim } 2761234285Sdim 2762239462Sdim K1 = new ShuffleVectorInst(K, UndefValue::get(VType), 2763243830Sdim ConstantVector::get( Mask1), 2764239462Sdim getReplacementName(K, false, 1)); 2765234285Sdim } else { 2766239462Sdim Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0); 2767243830Sdim K1 = ExtractElementInst::Create(K, CV0, 2768234285Sdim getReplacementName(K, false, 1)); 2769239462Sdim } 2770239462Sdim 2771239462Sdim if (JType->isVectorTy()) { 2772239462Sdim std::vector<Constant*> Mask1(numElemJ), Mask2(numElemJ); 2773239462Sdim for (unsigned v = 0; v < numElemJ; ++v) { 2774239462Sdim Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v); 2775239462Sdim Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElemI+v); 2776239462Sdim } 2777239462Sdim 2778239462Sdim K2 = new ShuffleVectorInst(K, UndefValue::get(VType), 2779243830Sdim ConstantVector::get( Mask2), 2780239462Sdim getReplacementName(K, false, 2)); 2781239462Sdim } else { 2782239462Sdim Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), numElem-1); 2783243830Sdim K2 = ExtractElementInst::Create(K, CV1, 2784234285Sdim getReplacementName(K, false, 2)); 2785234285Sdim } 2786234285Sdim 2787234285Sdim K1->insertAfter(K); 2788234285Sdim K2->insertAfter(K1); 2789234285Sdim InsertionPt = K2; 2790234285Sdim } 2791234285Sdim } 2792234285Sdim 2793234285Sdim // Move all uses of the function I (including pairing-induced uses) after J. 2794234285Sdim bool BBVectorize::canMoveUsesOfIAfterJ(BasicBlock &BB, 2795249423Sdim DenseSet<ValuePair> &LoadMoveSetPairs, 2796234285Sdim Instruction *I, Instruction *J) { 2797234285Sdim // Skip to the first instruction past I. 2798234285Sdim BasicBlock::iterator L = llvm::next(BasicBlock::iterator(I)); 2799234285Sdim 2800234285Sdim DenseSet<Value *> Users; 2801234285Sdim AliasSetTracker WriteSet(*AA); 2802263508Sdim if (I->mayWriteToMemory()) WriteSet.add(I); 2803263508Sdim 2804234285Sdim for (; cast<Instruction>(L) != J; ++L) 2805249423Sdim (void) trackUsesOfI(Users, WriteSet, I, L, true, &LoadMoveSetPairs); 2806234285Sdim 2807234285Sdim assert(cast<Instruction>(L) == J && 2808234285Sdim "Tracking has not proceeded far enough to check for dependencies"); 2809234285Sdim // If J is now in the use set of I, then trackUsesOfI will return true 2810234285Sdim // and we have a dependency cycle (and the fusing operation must abort). 2811249423Sdim return !trackUsesOfI(Users, WriteSet, I, J, true, &LoadMoveSetPairs); 2812234285Sdim } 2813234285Sdim 2814234285Sdim // Move all uses of the function I (including pairing-induced uses) after J. 2815234285Sdim void BBVectorize::moveUsesOfIAfterJ(BasicBlock &BB, 2816249423Sdim DenseSet<ValuePair> &LoadMoveSetPairs, 2817234285Sdim Instruction *&InsertionPt, 2818234285Sdim Instruction *I, Instruction *J) { 2819234285Sdim // Skip to the first instruction past I. 2820234285Sdim BasicBlock::iterator L = llvm::next(BasicBlock::iterator(I)); 2821234285Sdim 2822234285Sdim DenseSet<Value *> Users; 2823234285Sdim AliasSetTracker WriteSet(*AA); 2824263508Sdim if (I->mayWriteToMemory()) WriteSet.add(I); 2825263508Sdim 2826234285Sdim for (; cast<Instruction>(L) != J;) { 2827249423Sdim if (trackUsesOfI(Users, WriteSet, I, L, true, &LoadMoveSetPairs)) { 2828234285Sdim // Move this instruction 2829234285Sdim Instruction *InstToMove = L; ++L; 2830234285Sdim 2831234285Sdim DEBUG(dbgs() << "BBV: moving: " << *InstToMove << 2832234285Sdim " to after " << *InsertionPt << "\n"); 2833234285Sdim InstToMove->removeFromParent(); 2834234285Sdim InstToMove->insertAfter(InsertionPt); 2835234285Sdim InsertionPt = InstToMove; 2836234285Sdim } else { 2837234285Sdim ++L; 2838234285Sdim } 2839234285Sdim } 2840234285Sdim } 2841234285Sdim 2842234285Sdim // Collect all load instruction that are in the move set of a given first 2843234285Sdim // pair member. These loads depend on the first instruction, I, and so need 2844234285Sdim // to be moved after J (the second instruction) when the pair is fused. 2845234285Sdim void BBVectorize::collectPairLoadMoveSet(BasicBlock &BB, 2846234285Sdim DenseMap<Value *, Value *> &ChosenPairs, 2847249423Sdim DenseMap<Value *, std::vector<Value *> > &LoadMoveSet, 2848249423Sdim DenseSet<ValuePair> &LoadMoveSetPairs, 2849234285Sdim Instruction *I) { 2850234285Sdim // Skip to the first instruction past I. 2851234285Sdim BasicBlock::iterator L = llvm::next(BasicBlock::iterator(I)); 2852234285Sdim 2853234285Sdim DenseSet<Value *> Users; 2854234285Sdim AliasSetTracker WriteSet(*AA); 2855263508Sdim if (I->mayWriteToMemory()) WriteSet.add(I); 2856234285Sdim 2857234285Sdim // Note: We cannot end the loop when we reach J because J could be moved 2858234285Sdim // farther down the use chain by another instruction pairing. Also, J 2859234285Sdim // could be before I if this is an inverted input. 2860234285Sdim for (BasicBlock::iterator E = BB.end(); cast<Instruction>(L) != E; ++L) { 2861234285Sdim if (trackUsesOfI(Users, WriteSet, I, L)) { 2862249423Sdim if (L->mayReadFromMemory()) { 2863249423Sdim LoadMoveSet[L].push_back(I); 2864249423Sdim LoadMoveSetPairs.insert(ValuePair(L, I)); 2865249423Sdim } 2866234285Sdim } 2867234285Sdim } 2868234285Sdim } 2869234285Sdim 2870234285Sdim // In cases where both load/stores and the computation of their pointers 2871234285Sdim // are chosen for vectorization, we can end up in a situation where the 2872234285Sdim // aliasing analysis starts returning different query results as the 2873234285Sdim // process of fusing instruction pairs continues. Because the algorithm 2874249423Sdim // relies on finding the same use dags here as were found earlier, we'll 2875234285Sdim // need to precompute the necessary aliasing information here and then 2876234285Sdim // manually update it during the fusion process. 2877234285Sdim void BBVectorize::collectLoadMoveSet(BasicBlock &BB, 2878234285Sdim std::vector<Value *> &PairableInsts, 2879234285Sdim DenseMap<Value *, Value *> &ChosenPairs, 2880249423Sdim DenseMap<Value *, std::vector<Value *> > &LoadMoveSet, 2881249423Sdim DenseSet<ValuePair> &LoadMoveSetPairs) { 2882234285Sdim for (std::vector<Value *>::iterator PI = PairableInsts.begin(), 2883234285Sdim PIE = PairableInsts.end(); PI != PIE; ++PI) { 2884234285Sdim DenseMap<Value *, Value *>::iterator P = ChosenPairs.find(*PI); 2885234285Sdim if (P == ChosenPairs.end()) continue; 2886234285Sdim 2887234285Sdim Instruction *I = cast<Instruction>(P->first); 2888249423Sdim collectPairLoadMoveSet(BB, ChosenPairs, LoadMoveSet, 2889249423Sdim LoadMoveSetPairs, I); 2890234285Sdim } 2891234285Sdim } 2892234285Sdim 2893239462Sdim // When the first instruction in each pair is cloned, it will inherit its 2894239462Sdim // parent's metadata. This metadata must be combined with that of the other 2895239462Sdim // instruction in a safe way. 2896239462Sdim void BBVectorize::combineMetadata(Instruction *K, const Instruction *J) { 2897239462Sdim SmallVector<std::pair<unsigned, MDNode*>, 4> Metadata; 2898239462Sdim K->getAllMetadataOtherThanDebugLoc(Metadata); 2899239462Sdim for (unsigned i = 0, n = Metadata.size(); i < n; ++i) { 2900239462Sdim unsigned Kind = Metadata[i].first; 2901239462Sdim MDNode *JMD = J->getMetadata(Kind); 2902239462Sdim MDNode *KMD = Metadata[i].second; 2903239462Sdim 2904239462Sdim switch (Kind) { 2905239462Sdim default: 2906239462Sdim K->setMetadata(Kind, 0); // Remove unknown metadata 2907239462Sdim break; 2908239462Sdim case LLVMContext::MD_tbaa: 2909239462Sdim K->setMetadata(Kind, MDNode::getMostGenericTBAA(JMD, KMD)); 2910239462Sdim break; 2911239462Sdim case LLVMContext::MD_fpmath: 2912239462Sdim K->setMetadata(Kind, MDNode::getMostGenericFPMath(JMD, KMD)); 2913239462Sdim break; 2914239462Sdim } 2915239462Sdim } 2916239462Sdim } 2917239462Sdim 2918234285Sdim // This function fuses the chosen instruction pairs into vector instructions, 2919234285Sdim // taking care preserve any needed scalar outputs and, then, it reorders the 2920234285Sdim // remaining instructions as needed (users of the first member of the pair 2921234285Sdim // need to be moved to after the location of the second member of the pair 2922234285Sdim // because the vector instruction is inserted in the location of the pair's 2923234285Sdim // second member). 2924234285Sdim void BBVectorize::fuseChosenPairs(BasicBlock &BB, 2925249423Sdim std::vector<Value *> &PairableInsts, 2926249423Sdim DenseMap<Value *, Value *> &ChosenPairs, 2927249423Sdim DenseSet<ValuePair> &FixedOrderPairs, 2928249423Sdim DenseMap<VPPair, unsigned> &PairConnectionTypes, 2929249423Sdim DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, 2930249423Sdim DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps) { 2931234285Sdim LLVMContext& Context = BB.getContext(); 2932234285Sdim 2933234285Sdim // During the vectorization process, the order of the pairs to be fused 2934234285Sdim // could be flipped. So we'll add each pair, flipped, into the ChosenPairs 2935234285Sdim // list. After a pair is fused, the flipped pair is removed from the list. 2936243830Sdim DenseSet<ValuePair> FlippedPairs; 2937234285Sdim for (DenseMap<Value *, Value *>::iterator P = ChosenPairs.begin(), 2938234285Sdim E = ChosenPairs.end(); P != E; ++P) 2939243830Sdim FlippedPairs.insert(ValuePair(P->second, P->first)); 2940243830Sdim for (DenseSet<ValuePair>::iterator P = FlippedPairs.begin(), 2941234285Sdim E = FlippedPairs.end(); P != E; ++P) 2942234285Sdim ChosenPairs.insert(*P); 2943234285Sdim 2944249423Sdim DenseMap<Value *, std::vector<Value *> > LoadMoveSet; 2945249423Sdim DenseSet<ValuePair> LoadMoveSetPairs; 2946249423Sdim collectLoadMoveSet(BB, PairableInsts, ChosenPairs, 2947249423Sdim LoadMoveSet, LoadMoveSetPairs); 2948234285Sdim 2949234285Sdim DEBUG(dbgs() << "BBV: initial: \n" << BB << "\n"); 2950234285Sdim 2951234285Sdim for (BasicBlock::iterator PI = BB.getFirstInsertionPt(); PI != BB.end();) { 2952234285Sdim DenseMap<Value *, Value *>::iterator P = ChosenPairs.find(PI); 2953234285Sdim if (P == ChosenPairs.end()) { 2954234285Sdim ++PI; 2955234285Sdim continue; 2956234285Sdim } 2957234285Sdim 2958234285Sdim if (getDepthFactor(P->first) == 0) { 2959234285Sdim // These instructions are not really fused, but are tracked as though 2960234285Sdim // they are. Any case in which it would be interesting to fuse them 2961234285Sdim // will be taken care of by InstCombine. 2962234285Sdim --NumFusedOps; 2963234285Sdim ++PI; 2964234285Sdim continue; 2965234285Sdim } 2966234285Sdim 2967234285Sdim Instruction *I = cast<Instruction>(P->first), 2968234285Sdim *J = cast<Instruction>(P->second); 2969234285Sdim 2970234285Sdim DEBUG(dbgs() << "BBV: fusing: " << *I << 2971234285Sdim " <-> " << *J << "\n"); 2972234285Sdim 2973234285Sdim // Remove the pair and flipped pair from the list. 2974234285Sdim DenseMap<Value *, Value *>::iterator FP = ChosenPairs.find(P->second); 2975234285Sdim assert(FP != ChosenPairs.end() && "Flipped pair not found in list"); 2976234285Sdim ChosenPairs.erase(FP); 2977234285Sdim ChosenPairs.erase(P); 2978234285Sdim 2979249423Sdim if (!canMoveUsesOfIAfterJ(BB, LoadMoveSetPairs, I, J)) { 2980234285Sdim DEBUG(dbgs() << "BBV: fusion of: " << *I << 2981234285Sdim " <-> " << *J << 2982234285Sdim " aborted because of non-trivial dependency cycle\n"); 2983234285Sdim --NumFusedOps; 2984234285Sdim ++PI; 2985234285Sdim continue; 2986234285Sdim } 2987234285Sdim 2988243830Sdim // If the pair must have the other order, then flip it. 2989243830Sdim bool FlipPairOrder = FixedOrderPairs.count(ValuePair(J, I)); 2990243830Sdim if (!FlipPairOrder && !FixedOrderPairs.count(ValuePair(I, J))) { 2991243830Sdim // This pair does not have a fixed order, and so we might want to 2992243830Sdim // flip it if that will yield fewer shuffles. We count the number 2993243830Sdim // of dependencies connected via swaps, and those directly connected, 2994243830Sdim // and flip the order if the number of swaps is greater. 2995243830Sdim bool OrigOrder = true; 2996249423Sdim DenseMap<ValuePair, std::vector<ValuePair> >::iterator IJ = 2997249423Sdim ConnectedPairDeps.find(ValuePair(I, J)); 2998249423Sdim if (IJ == ConnectedPairDeps.end()) { 2999249423Sdim IJ = ConnectedPairDeps.find(ValuePair(J, I)); 3000243830Sdim OrigOrder = false; 3001243830Sdim } 3002239462Sdim 3003249423Sdim if (IJ != ConnectedPairDeps.end()) { 3004243830Sdim unsigned NumDepsDirect = 0, NumDepsSwap = 0; 3005249423Sdim for (std::vector<ValuePair>::iterator T = IJ->second.begin(), 3006249423Sdim TE = IJ->second.end(); T != TE; ++T) { 3007249423Sdim VPPair Q(IJ->first, *T); 3008243830Sdim DenseMap<VPPair, unsigned>::iterator R = 3009249423Sdim PairConnectionTypes.find(VPPair(Q.second, Q.first)); 3010243830Sdim assert(R != PairConnectionTypes.end() && 3011243830Sdim "Cannot find pair connection type"); 3012243830Sdim if (R->second == PairConnectionDirect) 3013243830Sdim ++NumDepsDirect; 3014243830Sdim else if (R->second == PairConnectionSwap) 3015243830Sdim ++NumDepsSwap; 3016243830Sdim } 3017243830Sdim 3018243830Sdim if (!OrigOrder) 3019243830Sdim std::swap(NumDepsDirect, NumDepsSwap); 3020243830Sdim 3021243830Sdim if (NumDepsSwap > NumDepsDirect) { 3022243830Sdim FlipPairOrder = true; 3023243830Sdim DEBUG(dbgs() << "BBV: reordering pair: " << *I << 3024243830Sdim " <-> " << *J << "\n"); 3025243830Sdim } 3026243830Sdim } 3027243830Sdim } 3028243830Sdim 3029243830Sdim Instruction *L = I, *H = J; 3030243830Sdim if (FlipPairOrder) 3031243830Sdim std::swap(H, L); 3032243830Sdim 3033243830Sdim // If the pair being fused uses the opposite order from that in the pair 3034243830Sdim // connection map, then we need to flip the types. 3035249423Sdim DenseMap<ValuePair, std::vector<ValuePair> >::iterator HL = 3036249423Sdim ConnectedPairs.find(ValuePair(H, L)); 3037249423Sdim if (HL != ConnectedPairs.end()) 3038249423Sdim for (std::vector<ValuePair>::iterator T = HL->second.begin(), 3039249423Sdim TE = HL->second.end(); T != TE; ++T) { 3040249423Sdim VPPair Q(HL->first, *T); 3041249423Sdim DenseMap<VPPair, unsigned>::iterator R = PairConnectionTypes.find(Q); 3042249423Sdim assert(R != PairConnectionTypes.end() && 3043249423Sdim "Cannot find pair connection type"); 3044249423Sdim if (R->second == PairConnectionDirect) 3045249423Sdim R->second = PairConnectionSwap; 3046249423Sdim else if (R->second == PairConnectionSwap) 3047249423Sdim R->second = PairConnectionDirect; 3048249423Sdim } 3049243830Sdim 3050243830Sdim bool LBeforeH = !FlipPairOrder; 3051234285Sdim unsigned NumOperands = I->getNumOperands(); 3052234285Sdim SmallVector<Value *, 3> ReplacedOperands(NumOperands); 3053243830Sdim getReplacementInputsForPair(Context, L, H, ReplacedOperands, 3054243830Sdim LBeforeH); 3055234285Sdim 3056234285Sdim // Make a copy of the original operation, change its type to the vector 3057234285Sdim // type and replace its operands with the vector operands. 3058243830Sdim Instruction *K = L->clone(); 3059243830Sdim if (L->hasName()) 3060243830Sdim K->takeName(L); 3061243830Sdim else if (H->hasName()) 3062243830Sdim K->takeName(H); 3063234285Sdim 3064234285Sdim if (!isa<StoreInst>(K)) 3065243830Sdim K->mutateType(getVecTypeForPair(L->getType(), H->getType())); 3066234285Sdim 3067243830Sdim combineMetadata(K, H); 3068243830Sdim K->intersectOptionalDataWith(H); 3069239462Sdim 3070234285Sdim for (unsigned o = 0; o < NumOperands; ++o) 3071234285Sdim K->setOperand(o, ReplacedOperands[o]); 3072234285Sdim 3073234285Sdim K->insertAfter(J); 3074234285Sdim 3075234285Sdim // Instruction insertion point: 3076234285Sdim Instruction *InsertionPt = K; 3077234285Sdim Instruction *K1 = 0, *K2 = 0; 3078243830Sdim replaceOutputsOfPair(Context, L, H, K, InsertionPt, K1, K2); 3079234285Sdim 3080249423Sdim // The use dag of the first original instruction must be moved to after 3081249423Sdim // the location of the second instruction. The entire use dag of the 3082249423Sdim // first instruction is disjoint from the input dag of the second 3083234285Sdim // (by definition), and so commutes with it. 3084234285Sdim 3085249423Sdim moveUsesOfIAfterJ(BB, LoadMoveSetPairs, InsertionPt, I, J); 3086234285Sdim 3087234285Sdim if (!isa<StoreInst>(I)) { 3088243830Sdim L->replaceAllUsesWith(K1); 3089243830Sdim H->replaceAllUsesWith(K2); 3090243830Sdim AA->replaceWithNewValue(L, K1); 3091243830Sdim AA->replaceWithNewValue(H, K2); 3092234285Sdim } 3093234285Sdim 3094234285Sdim // Instructions that may read from memory may be in the load move set. 3095234285Sdim // Once an instruction is fused, we no longer need its move set, and so 3096234285Sdim // the values of the map never need to be updated. However, when a load 3097234285Sdim // is fused, we need to merge the entries from both instructions in the 3098234285Sdim // pair in case those instructions were in the move set of some other 3099234285Sdim // yet-to-be-fused pair. The loads in question are the keys of the map. 3100234285Sdim if (I->mayReadFromMemory()) { 3101234285Sdim std::vector<ValuePair> NewSetMembers; 3102249423Sdim DenseMap<Value *, std::vector<Value *> >::iterator II = 3103249423Sdim LoadMoveSet.find(I); 3104249423Sdim if (II != LoadMoveSet.end()) 3105249423Sdim for (std::vector<Value *>::iterator N = II->second.begin(), 3106249423Sdim NE = II->second.end(); N != NE; ++N) 3107249423Sdim NewSetMembers.push_back(ValuePair(K, *N)); 3108249423Sdim DenseMap<Value *, std::vector<Value *> >::iterator JJ = 3109249423Sdim LoadMoveSet.find(J); 3110249423Sdim if (JJ != LoadMoveSet.end()) 3111249423Sdim for (std::vector<Value *>::iterator N = JJ->second.begin(), 3112249423Sdim NE = JJ->second.end(); N != NE; ++N) 3113249423Sdim NewSetMembers.push_back(ValuePair(K, *N)); 3114234285Sdim for (std::vector<ValuePair>::iterator A = NewSetMembers.begin(), 3115249423Sdim AE = NewSetMembers.end(); A != AE; ++A) { 3116249423Sdim LoadMoveSet[A->first].push_back(A->second); 3117249423Sdim LoadMoveSetPairs.insert(*A); 3118249423Sdim } 3119234285Sdim } 3120234285Sdim 3121234285Sdim // Before removing I, set the iterator to the next instruction. 3122234285Sdim PI = llvm::next(BasicBlock::iterator(I)); 3123234285Sdim if (cast<Instruction>(PI) == J) 3124234285Sdim ++PI; 3125234285Sdim 3126234285Sdim SE->forgetValue(I); 3127234285Sdim SE->forgetValue(J); 3128234285Sdim I->eraseFromParent(); 3129234285Sdim J->eraseFromParent(); 3130243830Sdim 3131243830Sdim DEBUG(if (PrintAfterEveryPair) dbgs() << "BBV: block is now: \n" << 3132243830Sdim BB << "\n"); 3133234285Sdim } 3134234285Sdim 3135234285Sdim DEBUG(dbgs() << "BBV: final: \n" << BB << "\n"); 3136234285Sdim } 3137234285Sdim} 3138234285Sdim 3139234285Sdimchar BBVectorize::ID = 0; 3140234285Sdimstatic const char bb_vectorize_name[] = "Basic-Block Vectorization"; 3141234285SdimINITIALIZE_PASS_BEGIN(BBVectorize, BBV_NAME, bb_vectorize_name, false, false) 3142234285SdimINITIALIZE_AG_DEPENDENCY(AliasAnalysis) 3143249423SdimINITIALIZE_AG_DEPENDENCY(TargetTransformInfo) 3144243830SdimINITIALIZE_PASS_DEPENDENCY(DominatorTree) 3145234285SdimINITIALIZE_PASS_DEPENDENCY(ScalarEvolution) 3146234285SdimINITIALIZE_PASS_END(BBVectorize, BBV_NAME, bb_vectorize_name, false, false) 3147234285Sdim 3148234285SdimBasicBlockPass *llvm::createBBVectorizePass(const VectorizeConfig &C) { 3149234285Sdim return new BBVectorize(C); 3150234285Sdim} 3151234285Sdim 3152234285Sdimbool 3153234285Sdimllvm::vectorizeBasicBlock(Pass *P, BasicBlock &BB, const VectorizeConfig &C) { 3154234285Sdim BBVectorize BBVectorizer(P, C); 3155234285Sdim return BBVectorizer.vectorizeBB(BB); 3156234285Sdim} 3157234285Sdim 3158234285Sdim//===----------------------------------------------------------------------===// 3159234285SdimVectorizeConfig::VectorizeConfig() { 3160234285Sdim VectorBits = ::VectorBits; 3161239462Sdim VectorizeBools = !::NoBools; 3162234285Sdim VectorizeInts = !::NoInts; 3163234285Sdim VectorizeFloats = !::NoFloats; 3164234982Sdim VectorizePointers = !::NoPointers; 3165234285Sdim VectorizeCasts = !::NoCasts; 3166234285Sdim VectorizeMath = !::NoMath; 3167234285Sdim VectorizeFMA = !::NoFMA; 3168234982Sdim VectorizeSelect = !::NoSelect; 3169239462Sdim VectorizeCmp = !::NoCmp; 3170234982Sdim VectorizeGEP = !::NoGEP; 3171234285Sdim VectorizeMemOps = !::NoMemOps; 3172234285Sdim AlignedOnly = ::AlignedOnly; 3173234285Sdim ReqChainDepth= ::ReqChainDepth; 3174234285Sdim SearchLimit = ::SearchLimit; 3175234285Sdim MaxCandPairsForCycleCheck = ::MaxCandPairsForCycleCheck; 3176234285Sdim SplatBreaksChain = ::SplatBreaksChain; 3177234285Sdim MaxInsts = ::MaxInsts; 3178249423Sdim MaxPairs = ::MaxPairs; 3179234285Sdim MaxIter = ::MaxIter; 3180239462Sdim Pow2LenOnly = ::Pow2LenOnly; 3181234285Sdim NoMemOpBoost = ::NoMemOpBoost; 3182234285Sdim FastDep = ::FastDep; 3183234285Sdim} 3184