1//===- LoopDistribute.cpp - Loop Distribution Pass ------------------------===// 2// 3// The LLVM Compiler Infrastructure 4// 5// This file is distributed under the University of Illinois Open Source 6// License. See LICENSE.TXT for details. 7// 8//===----------------------------------------------------------------------===// 9// 10// This file implements the Loop Distribution Pass. Its main focus is to 11// distribute loops that cannot be vectorized due to dependence cycles. It 12// tries to isolate the offending dependences into a new loop allowing 13// vectorization of the remaining parts. 14// 15// For dependence analysis, the pass uses the LoopVectorizer's 16// LoopAccessAnalysis. Because this analysis presumes no change in the order of 17// memory operations, special care is taken to preserve the lexical order of 18// these operations. 19// 20// Similarly to the Vectorizer, the pass also supports loop versioning to 21// run-time disambiguate potentially overlapping arrays. 22// 23//===----------------------------------------------------------------------===// 24 25#include "llvm/ADT/DepthFirstIterator.h" 26#include "llvm/ADT/EquivalenceClasses.h" 27#include "llvm/ADT/STLExtras.h" 28#include "llvm/ADT/Statistic.h" 29#include "llvm/Analysis/LoopAccessAnalysis.h" 30#include "llvm/Analysis/LoopInfo.h" 31#include "llvm/IR/Dominators.h" 32#include "llvm/Pass.h" 33#include "llvm/Support/CommandLine.h" 34#include "llvm/Support/Debug.h" 35#include "llvm/Transforms/Utils/BasicBlockUtils.h" 36#include "llvm/Transforms/Utils/Cloning.h" 37#include "llvm/Transforms/Utils/LoopUtils.h" 38#include "llvm/Transforms/Utils/LoopVersioning.h" 39#include <list> 40 41#define LDIST_NAME "loop-distribute" 42#define DEBUG_TYPE LDIST_NAME 43 44using namespace llvm; 45 46static cl::opt<bool> 47 LDistVerify("loop-distribute-verify", cl::Hidden, 48 cl::desc("Turn on DominatorTree and LoopInfo verification " 49 "after Loop Distribution"), 50 cl::init(false)); 51 52static cl::opt<bool> DistributeNonIfConvertible( 53 "loop-distribute-non-if-convertible", cl::Hidden, 54 cl::desc("Whether to distribute into a loop that may not be " 55 "if-convertible by the loop vectorizer"), 56 cl::init(false)); 57 58static cl::opt<unsigned> DistributeSCEVCheckThreshold( 59 "loop-distribute-scev-check-threshold", cl::init(8), cl::Hidden, 60 cl::desc("The maximum number of SCEV checks allowed for Loop " 61 "Distribution")); 62 63STATISTIC(NumLoopsDistributed, "Number of loops distributed"); 64 65namespace { 66/// \brief Maintains the set of instructions of the loop for a partition before 67/// cloning. After cloning, it hosts the new loop. 68class InstPartition { 69 typedef SmallPtrSet<Instruction *, 8> InstructionSet; 70 71public: 72 InstPartition(Instruction *I, Loop *L, bool DepCycle = false) 73 : DepCycle(DepCycle), OrigLoop(L), ClonedLoop(nullptr) { 74 Set.insert(I); 75 } 76 77 /// \brief Returns whether this partition contains a dependence cycle. 78 bool hasDepCycle() const { return DepCycle; } 79 80 /// \brief Adds an instruction to this partition. 81 void add(Instruction *I) { Set.insert(I); } 82 83 /// \brief Collection accessors. 84 InstructionSet::iterator begin() { return Set.begin(); } 85 InstructionSet::iterator end() { return Set.end(); } 86 InstructionSet::const_iterator begin() const { return Set.begin(); } 87 InstructionSet::const_iterator end() const { return Set.end(); } 88 bool empty() const { return Set.empty(); } 89 90 /// \brief Moves this partition into \p Other. This partition becomes empty 91 /// after this. 92 void moveTo(InstPartition &Other) { 93 Other.Set.insert(Set.begin(), Set.end()); 94 Set.clear(); 95 Other.DepCycle |= DepCycle; 96 } 97 98 /// \brief Populates the partition with a transitive closure of all the 99 /// instructions that the seeded instructions dependent on. 100 void populateUsedSet() { 101 // FIXME: We currently don't use control-dependence but simply include all 102 // blocks (possibly empty at the end) and let simplifycfg mostly clean this 103 // up. 104 for (auto *B : OrigLoop->getBlocks()) 105 Set.insert(B->getTerminator()); 106 107 // Follow the use-def chains to form a transitive closure of all the 108 // instructions that the originally seeded instructions depend on. 109 SmallVector<Instruction *, 8> Worklist(Set.begin(), Set.end()); 110 while (!Worklist.empty()) { 111 Instruction *I = Worklist.pop_back_val(); 112 // Insert instructions from the loop that we depend on. 113 for (Value *V : I->operand_values()) { 114 auto *I = dyn_cast<Instruction>(V); 115 if (I && OrigLoop->contains(I->getParent()) && Set.insert(I).second) 116 Worklist.push_back(I); 117 } 118 } 119 } 120 121 /// \brief Clones the original loop. 122 /// 123 /// Updates LoopInfo and DominatorTree using the information that block \p 124 /// LoopDomBB dominates the loop. 125 Loop *cloneLoopWithPreheader(BasicBlock *InsertBefore, BasicBlock *LoopDomBB, 126 unsigned Index, LoopInfo *LI, 127 DominatorTree *DT) { 128 ClonedLoop = ::cloneLoopWithPreheader(InsertBefore, LoopDomBB, OrigLoop, 129 VMap, Twine(".ldist") + Twine(Index), 130 LI, DT, ClonedLoopBlocks); 131 return ClonedLoop; 132 } 133 134 /// \brief The cloned loop. If this partition is mapped to the original loop, 135 /// this is null. 136 const Loop *getClonedLoop() const { return ClonedLoop; } 137 138 /// \brief Returns the loop where this partition ends up after distribution. 139 /// If this partition is mapped to the original loop then use the block from 140 /// the loop. 141 const Loop *getDistributedLoop() const { 142 return ClonedLoop ? ClonedLoop : OrigLoop; 143 } 144 145 /// \brief The VMap that is populated by cloning and then used in 146 /// remapinstruction to remap the cloned instructions. 147 ValueToValueMapTy &getVMap() { return VMap; } 148 149 /// \brief Remaps the cloned instructions using VMap. 150 void remapInstructions() { 151 remapInstructionsInBlocks(ClonedLoopBlocks, VMap); 152 } 153 154 /// \brief Based on the set of instructions selected for this partition, 155 /// removes the unnecessary ones. 156 void removeUnusedInsts() { 157 SmallVector<Instruction *, 8> Unused; 158 159 for (auto *Block : OrigLoop->getBlocks()) 160 for (auto &Inst : *Block) 161 if (!Set.count(&Inst)) { 162 Instruction *NewInst = &Inst; 163 if (!VMap.empty()) 164 NewInst = cast<Instruction>(VMap[NewInst]); 165 166 assert(!isa<BranchInst>(NewInst) && 167 "Branches are marked used early on"); 168 Unused.push_back(NewInst); 169 } 170 171 // Delete the instructions backwards, as it has a reduced likelihood of 172 // having to update as many def-use and use-def chains. 173 for (auto *Inst : make_range(Unused.rbegin(), Unused.rend())) { 174 if (!Inst->use_empty()) 175 Inst->replaceAllUsesWith(UndefValue::get(Inst->getType())); 176 Inst->eraseFromParent(); 177 } 178 } 179 180 void print() const { 181 if (DepCycle) 182 dbgs() << " (cycle)\n"; 183 for (auto *I : Set) 184 // Prefix with the block name. 185 dbgs() << " " << I->getParent()->getName() << ":" << *I << "\n"; 186 } 187 188 void printBlocks() const { 189 for (auto *BB : getDistributedLoop()->getBlocks()) 190 dbgs() << *BB; 191 } 192 193private: 194 /// \brief Instructions from OrigLoop selected for this partition. 195 InstructionSet Set; 196 197 /// \brief Whether this partition contains a dependence cycle. 198 bool DepCycle; 199 200 /// \brief The original loop. 201 Loop *OrigLoop; 202 203 /// \brief The cloned loop. If this partition is mapped to the original loop, 204 /// this is null. 205 Loop *ClonedLoop; 206 207 /// \brief The blocks of ClonedLoop including the preheader. If this 208 /// partition is mapped to the original loop, this is empty. 209 SmallVector<BasicBlock *, 8> ClonedLoopBlocks; 210 211 /// \brief These gets populated once the set of instructions have been 212 /// finalized. If this partition is mapped to the original loop, these are not 213 /// set. 214 ValueToValueMapTy VMap; 215}; 216 217/// \brief Holds the set of Partitions. It populates them, merges them and then 218/// clones the loops. 219class InstPartitionContainer { 220 typedef DenseMap<Instruction *, int> InstToPartitionIdT; 221 222public: 223 InstPartitionContainer(Loop *L, LoopInfo *LI, DominatorTree *DT) 224 : L(L), LI(LI), DT(DT) {} 225 226 /// \brief Returns the number of partitions. 227 unsigned getSize() const { return PartitionContainer.size(); } 228 229 /// \brief Adds \p Inst into the current partition if that is marked to 230 /// contain cycles. Otherwise start a new partition for it. 231 void addToCyclicPartition(Instruction *Inst) { 232 // If the current partition is non-cyclic. Start a new one. 233 if (PartitionContainer.empty() || !PartitionContainer.back().hasDepCycle()) 234 PartitionContainer.emplace_back(Inst, L, /*DepCycle=*/true); 235 else 236 PartitionContainer.back().add(Inst); 237 } 238 239 /// \brief Adds \p Inst into a partition that is not marked to contain 240 /// dependence cycles. 241 /// 242 // Initially we isolate memory instructions into as many partitions as 243 // possible, then later we may merge them back together. 244 void addToNewNonCyclicPartition(Instruction *Inst) { 245 PartitionContainer.emplace_back(Inst, L); 246 } 247 248 /// \brief Merges adjacent non-cyclic partitions. 249 /// 250 /// The idea is that we currently only want to isolate the non-vectorizable 251 /// partition. We could later allow more distribution among these partition 252 /// too. 253 void mergeAdjacentNonCyclic() { 254 mergeAdjacentPartitionsIf( 255 [](const InstPartition *P) { return !P->hasDepCycle(); }); 256 } 257 258 /// \brief If a partition contains only conditional stores, we won't vectorize 259 /// it. Try to merge it with a previous cyclic partition. 260 void mergeNonIfConvertible() { 261 mergeAdjacentPartitionsIf([&](const InstPartition *Partition) { 262 if (Partition->hasDepCycle()) 263 return true; 264 265 // Now, check if all stores are conditional in this partition. 266 bool seenStore = false; 267 268 for (auto *Inst : *Partition) 269 if (isa<StoreInst>(Inst)) { 270 seenStore = true; 271 if (!LoopAccessInfo::blockNeedsPredication(Inst->getParent(), L, DT)) 272 return false; 273 } 274 return seenStore; 275 }); 276 } 277 278 /// \brief Merges the partitions according to various heuristics. 279 void mergeBeforePopulating() { 280 mergeAdjacentNonCyclic(); 281 if (!DistributeNonIfConvertible) 282 mergeNonIfConvertible(); 283 } 284 285 /// \brief Merges partitions in order to ensure that no loads are duplicated. 286 /// 287 /// We can't duplicate loads because that could potentially reorder them. 288 /// LoopAccessAnalysis provides dependency information with the context that 289 /// the order of memory operation is preserved. 290 /// 291 /// Return if any partitions were merged. 292 bool mergeToAvoidDuplicatedLoads() { 293 typedef DenseMap<Instruction *, InstPartition *> LoadToPartitionT; 294 typedef EquivalenceClasses<InstPartition *> ToBeMergedT; 295 296 LoadToPartitionT LoadToPartition; 297 ToBeMergedT ToBeMerged; 298 299 // Step through the partitions and create equivalence between partitions 300 // that contain the same load. Also put partitions in between them in the 301 // same equivalence class to avoid reordering of memory operations. 302 for (PartitionContainerT::iterator I = PartitionContainer.begin(), 303 E = PartitionContainer.end(); 304 I != E; ++I) { 305 auto *PartI = &*I; 306 307 // If a load occurs in two partitions PartI and PartJ, merge all 308 // partitions (PartI, PartJ] into PartI. 309 for (Instruction *Inst : *PartI) 310 if (isa<LoadInst>(Inst)) { 311 bool NewElt; 312 LoadToPartitionT::iterator LoadToPart; 313 314 std::tie(LoadToPart, NewElt) = 315 LoadToPartition.insert(std::make_pair(Inst, PartI)); 316 if (!NewElt) { 317 DEBUG(dbgs() << "Merging partitions due to this load in multiple " 318 << "partitions: " << PartI << ", " 319 << LoadToPart->second << "\n" << *Inst << "\n"); 320 321 auto PartJ = I; 322 do { 323 --PartJ; 324 ToBeMerged.unionSets(PartI, &*PartJ); 325 } while (&*PartJ != LoadToPart->second); 326 } 327 } 328 } 329 if (ToBeMerged.empty()) 330 return false; 331 332 // Merge the member of an equivalence class into its class leader. This 333 // makes the members empty. 334 for (ToBeMergedT::iterator I = ToBeMerged.begin(), E = ToBeMerged.end(); 335 I != E; ++I) { 336 if (!I->isLeader()) 337 continue; 338 339 auto PartI = I->getData(); 340 for (auto PartJ : make_range(std::next(ToBeMerged.member_begin(I)), 341 ToBeMerged.member_end())) { 342 PartJ->moveTo(*PartI); 343 } 344 } 345 346 // Remove the empty partitions. 347 PartitionContainer.remove_if( 348 [](const InstPartition &P) { return P.empty(); }); 349 350 return true; 351 } 352 353 /// \brief Sets up the mapping between instructions to partitions. If the 354 /// instruction is duplicated across multiple partitions, set the entry to -1. 355 void setupPartitionIdOnInstructions() { 356 int PartitionID = 0; 357 for (const auto &Partition : PartitionContainer) { 358 for (Instruction *Inst : Partition) { 359 bool NewElt; 360 InstToPartitionIdT::iterator Iter; 361 362 std::tie(Iter, NewElt) = 363 InstToPartitionId.insert(std::make_pair(Inst, PartitionID)); 364 if (!NewElt) 365 Iter->second = -1; 366 } 367 ++PartitionID; 368 } 369 } 370 371 /// \brief Populates the partition with everything that the seeding 372 /// instructions require. 373 void populateUsedSet() { 374 for (auto &P : PartitionContainer) 375 P.populateUsedSet(); 376 } 377 378 /// \brief This performs the main chunk of the work of cloning the loops for 379 /// the partitions. 380 void cloneLoops() { 381 BasicBlock *OrigPH = L->getLoopPreheader(); 382 // At this point the predecessor of the preheader is either the memcheck 383 // block or the top part of the original preheader. 384 BasicBlock *Pred = OrigPH->getSinglePredecessor(); 385 assert(Pred && "Preheader does not have a single predecessor"); 386 BasicBlock *ExitBlock = L->getExitBlock(); 387 assert(ExitBlock && "No single exit block"); 388 Loop *NewLoop; 389 390 assert(!PartitionContainer.empty() && "at least two partitions expected"); 391 // We're cloning the preheader along with the loop so we already made sure 392 // it was empty. 393 assert(&*OrigPH->begin() == OrigPH->getTerminator() && 394 "preheader not empty"); 395 396 // Create a loop for each partition except the last. Clone the original 397 // loop before PH along with adding a preheader for the cloned loop. Then 398 // update PH to point to the newly added preheader. 399 BasicBlock *TopPH = OrigPH; 400 unsigned Index = getSize() - 1; 401 for (auto I = std::next(PartitionContainer.rbegin()), 402 E = PartitionContainer.rend(); 403 I != E; ++I, --Index, TopPH = NewLoop->getLoopPreheader()) { 404 auto *Part = &*I; 405 406 NewLoop = Part->cloneLoopWithPreheader(TopPH, Pred, Index, LI, DT); 407 408 Part->getVMap()[ExitBlock] = TopPH; 409 Part->remapInstructions(); 410 } 411 Pred->getTerminator()->replaceUsesOfWith(OrigPH, TopPH); 412 413 // Now go in forward order and update the immediate dominator for the 414 // preheaders with the exiting block of the previous loop. Dominance 415 // within the loop is updated in cloneLoopWithPreheader. 416 for (auto Curr = PartitionContainer.cbegin(), 417 Next = std::next(PartitionContainer.cbegin()), 418 E = PartitionContainer.cend(); 419 Next != E; ++Curr, ++Next) 420 DT->changeImmediateDominator( 421 Next->getDistributedLoop()->getLoopPreheader(), 422 Curr->getDistributedLoop()->getExitingBlock()); 423 } 424 425 /// \brief Removes the dead instructions from the cloned loops. 426 void removeUnusedInsts() { 427 for (auto &Partition : PartitionContainer) 428 Partition.removeUnusedInsts(); 429 } 430 431 /// \brief For each memory pointer, it computes the partitionId the pointer is 432 /// used in. 433 /// 434 /// This returns an array of int where the I-th entry corresponds to I-th 435 /// entry in LAI.getRuntimePointerCheck(). If the pointer is used in multiple 436 /// partitions its entry is set to -1. 437 SmallVector<int, 8> 438 computePartitionSetForPointers(const LoopAccessInfo &LAI) { 439 const RuntimePointerChecking *RtPtrCheck = LAI.getRuntimePointerChecking(); 440 441 unsigned N = RtPtrCheck->Pointers.size(); 442 SmallVector<int, 8> PtrToPartitions(N); 443 for (unsigned I = 0; I < N; ++I) { 444 Value *Ptr = RtPtrCheck->Pointers[I].PointerValue; 445 auto Instructions = 446 LAI.getInstructionsForAccess(Ptr, RtPtrCheck->Pointers[I].IsWritePtr); 447 448 int &Partition = PtrToPartitions[I]; 449 // First set it to uninitialized. 450 Partition = -2; 451 for (Instruction *Inst : Instructions) { 452 // Note that this could be -1 if Inst is duplicated across multiple 453 // partitions. 454 int ThisPartition = this->InstToPartitionId[Inst]; 455 if (Partition == -2) 456 Partition = ThisPartition; 457 // -1 means belonging to multiple partitions. 458 else if (Partition == -1) 459 break; 460 else if (Partition != (int)ThisPartition) 461 Partition = -1; 462 } 463 assert(Partition != -2 && "Pointer not belonging to any partition"); 464 } 465 466 return PtrToPartitions; 467 } 468 469 void print(raw_ostream &OS) const { 470 unsigned Index = 0; 471 for (const auto &P : PartitionContainer) { 472 OS << "Partition " << Index++ << " (" << &P << "):\n"; 473 P.print(); 474 } 475 } 476 477 void dump() const { print(dbgs()); } 478 479#ifndef NDEBUG 480 friend raw_ostream &operator<<(raw_ostream &OS, 481 const InstPartitionContainer &Partitions) { 482 Partitions.print(OS); 483 return OS; 484 } 485#endif 486 487 void printBlocks() const { 488 unsigned Index = 0; 489 for (const auto &P : PartitionContainer) { 490 dbgs() << "\nPartition " << Index++ << " (" << &P << "):\n"; 491 P.printBlocks(); 492 } 493 } 494 495private: 496 typedef std::list<InstPartition> PartitionContainerT; 497 498 /// \brief List of partitions. 499 PartitionContainerT PartitionContainer; 500 501 /// \brief Mapping from Instruction to partition Id. If the instruction 502 /// belongs to multiple partitions the entry contains -1. 503 InstToPartitionIdT InstToPartitionId; 504 505 Loop *L; 506 LoopInfo *LI; 507 DominatorTree *DT; 508 509 /// \brief The control structure to merge adjacent partitions if both satisfy 510 /// the \p Predicate. 511 template <class UnaryPredicate> 512 void mergeAdjacentPartitionsIf(UnaryPredicate Predicate) { 513 InstPartition *PrevMatch = nullptr; 514 for (auto I = PartitionContainer.begin(); I != PartitionContainer.end();) { 515 auto DoesMatch = Predicate(&*I); 516 if (PrevMatch == nullptr && DoesMatch) { 517 PrevMatch = &*I; 518 ++I; 519 } else if (PrevMatch != nullptr && DoesMatch) { 520 I->moveTo(*PrevMatch); 521 I = PartitionContainer.erase(I); 522 } else { 523 PrevMatch = nullptr; 524 ++I; 525 } 526 } 527 } 528}; 529 530/// \brief For each memory instruction, this class maintains difference of the 531/// number of unsafe dependences that start out from this instruction minus 532/// those that end here. 533/// 534/// By traversing the memory instructions in program order and accumulating this 535/// number, we know whether any unsafe dependence crosses over a program point. 536class MemoryInstructionDependences { 537 typedef MemoryDepChecker::Dependence Dependence; 538 539public: 540 struct Entry { 541 Instruction *Inst; 542 unsigned NumUnsafeDependencesStartOrEnd; 543 544 Entry(Instruction *Inst) : Inst(Inst), NumUnsafeDependencesStartOrEnd(0) {} 545 }; 546 547 typedef SmallVector<Entry, 8> AccessesType; 548 549 AccessesType::const_iterator begin() const { return Accesses.begin(); } 550 AccessesType::const_iterator end() const { return Accesses.end(); } 551 552 MemoryInstructionDependences( 553 const SmallVectorImpl<Instruction *> &Instructions, 554 const SmallVectorImpl<Dependence> &Dependences) { 555 Accesses.append(Instructions.begin(), Instructions.end()); 556 557 DEBUG(dbgs() << "Backward dependences:\n"); 558 for (auto &Dep : Dependences) 559 if (Dep.isPossiblyBackward()) { 560 // Note that the designations source and destination follow the program 561 // order, i.e. source is always first. (The direction is given by the 562 // DepType.) 563 ++Accesses[Dep.Source].NumUnsafeDependencesStartOrEnd; 564 --Accesses[Dep.Destination].NumUnsafeDependencesStartOrEnd; 565 566 DEBUG(Dep.print(dbgs(), 2, Instructions)); 567 } 568 } 569 570private: 571 AccessesType Accesses; 572}; 573 574/// \brief The pass class. 575class LoopDistribute : public FunctionPass { 576public: 577 LoopDistribute() : FunctionPass(ID) { 578 initializeLoopDistributePass(*PassRegistry::getPassRegistry()); 579 } 580 581 bool runOnFunction(Function &F) override { 582 LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 583 LAA = &getAnalysis<LoopAccessAnalysis>(); 584 DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 585 SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 586 587 // Build up a worklist of inner-loops to vectorize. This is necessary as the 588 // act of distributing a loop creates new loops and can invalidate iterators 589 // across the loops. 590 SmallVector<Loop *, 8> Worklist; 591 592 for (Loop *TopLevelLoop : *LI) 593 for (Loop *L : depth_first(TopLevelLoop)) 594 // We only handle inner-most loops. 595 if (L->empty()) 596 Worklist.push_back(L); 597 598 // Now walk the identified inner loops. 599 bool Changed = false; 600 for (Loop *L : Worklist) 601 Changed |= processLoop(L); 602 603 // Process each loop nest in the function. 604 return Changed; 605 } 606 607 void getAnalysisUsage(AnalysisUsage &AU) const override { 608 AU.addRequired<ScalarEvolutionWrapperPass>(); 609 AU.addRequired<LoopInfoWrapperPass>(); 610 AU.addPreserved<LoopInfoWrapperPass>(); 611 AU.addRequired<LoopAccessAnalysis>(); 612 AU.addRequired<DominatorTreeWrapperPass>(); 613 AU.addPreserved<DominatorTreeWrapperPass>(); 614 } 615 616 static char ID; 617 618private: 619 /// \brief Filter out checks between pointers from the same partition. 620 /// 621 /// \p PtrToPartition contains the partition number for pointers. Partition 622 /// number -1 means that the pointer is used in multiple partitions. In this 623 /// case we can't safely omit the check. 624 SmallVector<RuntimePointerChecking::PointerCheck, 4> 625 includeOnlyCrossPartitionChecks( 626 const SmallVectorImpl<RuntimePointerChecking::PointerCheck> &AllChecks, 627 const SmallVectorImpl<int> &PtrToPartition, 628 const RuntimePointerChecking *RtPtrChecking) { 629 SmallVector<RuntimePointerChecking::PointerCheck, 4> Checks; 630 631 std::copy_if(AllChecks.begin(), AllChecks.end(), std::back_inserter(Checks), 632 [&](const RuntimePointerChecking::PointerCheck &Check) { 633 for (unsigned PtrIdx1 : Check.first->Members) 634 for (unsigned PtrIdx2 : Check.second->Members) 635 // Only include this check if there is a pair of pointers 636 // that require checking and the pointers fall into 637 // separate partitions. 638 // 639 // (Note that we already know at this point that the two 640 // pointer groups need checking but it doesn't follow 641 // that each pair of pointers within the two groups need 642 // checking as well. 643 // 644 // In other words we don't want to include a check just 645 // because there is a pair of pointers between the two 646 // pointer groups that require checks and a different 647 // pair whose pointers fall into different partitions.) 648 if (RtPtrChecking->needsChecking(PtrIdx1, PtrIdx2) && 649 !RuntimePointerChecking::arePointersInSamePartition( 650 PtrToPartition, PtrIdx1, PtrIdx2)) 651 return true; 652 return false; 653 }); 654 655 return Checks; 656 } 657 658 /// \brief Try to distribute an inner-most loop. 659 bool processLoop(Loop *L) { 660 assert(L->empty() && "Only process inner loops."); 661 662 DEBUG(dbgs() << "\nLDist: In \"" << L->getHeader()->getParent()->getName() 663 << "\" checking " << *L << "\n"); 664 665 BasicBlock *PH = L->getLoopPreheader(); 666 if (!PH) { 667 DEBUG(dbgs() << "Skipping; no preheader"); 668 return false; 669 } 670 if (!L->getExitBlock()) { 671 DEBUG(dbgs() << "Skipping; multiple exit blocks"); 672 return false; 673 } 674 // LAA will check that we only have a single exiting block. 675 676 const LoopAccessInfo &LAI = LAA->getInfo(L, ValueToValueMap()); 677 678 // Currently, we only distribute to isolate the part of the loop with 679 // dependence cycles to enable partial vectorization. 680 if (LAI.canVectorizeMemory()) { 681 DEBUG(dbgs() << "Skipping; memory operations are safe for vectorization"); 682 return false; 683 } 684 auto *Dependences = LAI.getDepChecker().getDependences(); 685 if (!Dependences || Dependences->empty()) { 686 DEBUG(dbgs() << "Skipping; No unsafe dependences to isolate"); 687 return false; 688 } 689 690 InstPartitionContainer Partitions(L, LI, DT); 691 692 // First, go through each memory operation and assign them to consecutive 693 // partitions (the order of partitions follows program order). Put those 694 // with unsafe dependences into "cyclic" partition otherwise put each store 695 // in its own "non-cyclic" partition (we'll merge these later). 696 // 697 // Note that a memory operation (e.g. Load2 below) at a program point that 698 // has an unsafe dependence (Store3->Load1) spanning over it must be 699 // included in the same cyclic partition as the dependent operations. This 700 // is to preserve the original program order after distribution. E.g.: 701 // 702 // NumUnsafeDependencesStartOrEnd NumUnsafeDependencesActive 703 // Load1 -. 1 0->1 704 // Load2 | /Unsafe/ 0 1 705 // Store3 -' -1 1->0 706 // Load4 0 0 707 // 708 // NumUnsafeDependencesActive > 0 indicates this situation and in this case 709 // we just keep assigning to the same cyclic partition until 710 // NumUnsafeDependencesActive reaches 0. 711 const MemoryDepChecker &DepChecker = LAI.getDepChecker(); 712 MemoryInstructionDependences MID(DepChecker.getMemoryInstructions(), 713 *Dependences); 714 715 int NumUnsafeDependencesActive = 0; 716 for (auto &InstDep : MID) { 717 Instruction *I = InstDep.Inst; 718 // We update NumUnsafeDependencesActive post-instruction, catch the 719 // start of a dependence directly via NumUnsafeDependencesStartOrEnd. 720 if (NumUnsafeDependencesActive || 721 InstDep.NumUnsafeDependencesStartOrEnd > 0) 722 Partitions.addToCyclicPartition(I); 723 else 724 Partitions.addToNewNonCyclicPartition(I); 725 NumUnsafeDependencesActive += InstDep.NumUnsafeDependencesStartOrEnd; 726 assert(NumUnsafeDependencesActive >= 0 && 727 "Negative number of dependences active"); 728 } 729 730 // Add partitions for values used outside. These partitions can be out of 731 // order from the original program order. This is OK because if the 732 // partition uses a load we will merge this partition with the original 733 // partition of the load that we set up in the previous loop (see 734 // mergeToAvoidDuplicatedLoads). 735 auto DefsUsedOutside = findDefsUsedOutsideOfLoop(L); 736 for (auto *Inst : DefsUsedOutside) 737 Partitions.addToNewNonCyclicPartition(Inst); 738 739 DEBUG(dbgs() << "Seeded partitions:\n" << Partitions); 740 if (Partitions.getSize() < 2) 741 return false; 742 743 // Run the merge heuristics: Merge non-cyclic adjacent partitions since we 744 // should be able to vectorize these together. 745 Partitions.mergeBeforePopulating(); 746 DEBUG(dbgs() << "\nMerged partitions:\n" << Partitions); 747 if (Partitions.getSize() < 2) 748 return false; 749 750 // Now, populate the partitions with non-memory operations. 751 Partitions.populateUsedSet(); 752 DEBUG(dbgs() << "\nPopulated partitions:\n" << Partitions); 753 754 // In order to preserve original lexical order for loads, keep them in the 755 // partition that we set up in the MemoryInstructionDependences loop. 756 if (Partitions.mergeToAvoidDuplicatedLoads()) { 757 DEBUG(dbgs() << "\nPartitions merged to ensure unique loads:\n" 758 << Partitions); 759 if (Partitions.getSize() < 2) 760 return false; 761 } 762 763 // Don't distribute the loop if we need too many SCEV run-time checks. 764 const SCEVUnionPredicate &Pred = LAI.PSE.getUnionPredicate(); 765 if (Pred.getComplexity() > DistributeSCEVCheckThreshold) { 766 DEBUG(dbgs() << "Too many SCEV run-time checks needed.\n"); 767 return false; 768 } 769 770 DEBUG(dbgs() << "\nDistributing loop: " << *L << "\n"); 771 // We're done forming the partitions set up the reverse mapping from 772 // instructions to partitions. 773 Partitions.setupPartitionIdOnInstructions(); 774 775 // To keep things simple have an empty preheader before we version or clone 776 // the loop. (Also split if this has no predecessor, i.e. entry, because we 777 // rely on PH having a predecessor.) 778 if (!PH->getSinglePredecessor() || &*PH->begin() != PH->getTerminator()) 779 SplitBlock(PH, PH->getTerminator(), DT, LI); 780 781 // If we need run-time checks, version the loop now. 782 auto PtrToPartition = Partitions.computePartitionSetForPointers(LAI); 783 const auto *RtPtrChecking = LAI.getRuntimePointerChecking(); 784 const auto &AllChecks = RtPtrChecking->getChecks(); 785 auto Checks = includeOnlyCrossPartitionChecks(AllChecks, PtrToPartition, 786 RtPtrChecking); 787 788 if (!Pred.isAlwaysTrue() || !Checks.empty()) { 789 DEBUG(dbgs() << "\nPointers:\n"); 790 DEBUG(LAI.getRuntimePointerChecking()->printChecks(dbgs(), Checks)); 791 LoopVersioning LVer(LAI, L, LI, DT, SE, false); 792 LVer.setAliasChecks(std::move(Checks)); 793 LVer.setSCEVChecks(LAI.PSE.getUnionPredicate()); 794 LVer.versionLoop(DefsUsedOutside); 795 } 796 797 // Create identical copies of the original loop for each partition and hook 798 // them up sequentially. 799 Partitions.cloneLoops(); 800 801 // Now, we remove the instruction from each loop that don't belong to that 802 // partition. 803 Partitions.removeUnusedInsts(); 804 DEBUG(dbgs() << "\nAfter removing unused Instrs:\n"); 805 DEBUG(Partitions.printBlocks()); 806 807 if (LDistVerify) { 808 LI->verify(); 809 DT->verifyDomTree(); 810 } 811 812 ++NumLoopsDistributed; 813 return true; 814 } 815 816 // Analyses used. 817 LoopInfo *LI; 818 LoopAccessAnalysis *LAA; 819 DominatorTree *DT; 820 ScalarEvolution *SE; 821}; 822} // anonymous namespace 823 824char LoopDistribute::ID; 825static const char ldist_name[] = "Loop Distribition"; 826 827INITIALIZE_PASS_BEGIN(LoopDistribute, LDIST_NAME, ldist_name, false, false) 828INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 829INITIALIZE_PASS_DEPENDENCY(LoopAccessAnalysis) 830INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 831INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 832INITIALIZE_PASS_END(LoopDistribute, LDIST_NAME, ldist_name, false, false) 833 834namespace llvm { 835FunctionPass *createLoopDistributePass() { return new LoopDistribute(); } 836} 837