//===-- LoopUnrollAndJam.cpp - Loop unrolling utilities -------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements loop unroll and jam as a routine, much like
// LoopUnroll.cpp implements loop unroll.
//
//===----------------------------------------------------------------------===//

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Sequence.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/DependenceAnalysis.h"
#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/MustExecute.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/ValueMap.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/GenericDomTree.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/UnrollLoop.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <assert.h>
#include <memory>
#include <type_traits>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "loop-unroll-and-jam"

STATISTIC(NumUnrolledAndJammed, "Number of loops unroll and jammed");
STATISTIC(NumCompletelyUnrolledAndJammed,
          "Number of loops fully unroll and jammed");

typedef SmallPtrSet<BasicBlock *, 4> BasicBlockSet;

// Partition blocks in an outer/inner loop pair into blocks before and after
// the loop
static bool partitionLoopBlocks(Loop &L, BasicBlockSet &ForeBlocks,
                                BasicBlockSet &AftBlocks, DominatorTree &DT) {
  Loop *SubLoop = L.getSubLoops()[0];
  BasicBlock *SubLoopLatch = SubLoop->getLoopLatch();

  for (BasicBlock *BB : L.blocks()) {
    if (!SubLoop->contains(BB)) {
      if (DT.dominates(SubLoopLatch, BB))
        AftBlocks.insert(BB);
      else
        ForeBlocks.insert(BB);
    }
  }

  // Check that all blocks in ForeBlocks together dominate the subloop
  // TODO: This might be done better with dominator/postdominator information.
  BasicBlock *SubLoopPreHeader = SubLoop->getLoopPreheader();
  for (BasicBlock *BB : ForeBlocks) {
    if (BB == SubLoopPreHeader)
      continue;
    Instruction *TI = BB->getTerminator();
    for (BasicBlock *Succ : successors(TI))
      if (!ForeBlocks.count(Succ))
        return false;
  }

  return true;
}

/// Partition blocks in a loop nest into blocks before and after each inner
/// loop.
static bool partitionOuterLoopBlocks(
    Loop &Root, Loop &JamLoop, BasicBlockSet &JamLoopBlocks,
    DenseMap<Loop *, BasicBlockSet> &ForeBlocksMap,
    DenseMap<Loop *, BasicBlockSet> &AftBlocksMap, DominatorTree &DT) {
  JamLoopBlocks.insert(JamLoop.block_begin(), JamLoop.block_end());

  for (Loop *L : Root.getLoopsInPreorder()) {
    if (L == &JamLoop)
      break;

    if (!partitionLoopBlocks(*L, ForeBlocksMap[L], AftBlocksMap[L], DT))
      return false;
  }

  return true;
}

// TODO: Remove when UnrollAndJamLoop is changed to support unroll and jamming
// of loop nests more than two levels deep.
static bool partitionOuterLoopBlocks(Loop *L, Loop *SubLoop,
                                     BasicBlockSet &ForeBlocks,
                                     BasicBlockSet &SubLoopBlocks,
                                     BasicBlockSet &AftBlocks,
                                     DominatorTree *DT) {
  SubLoopBlocks.insert(SubLoop->block_begin(), SubLoop->block_end());
  return partitionLoopBlocks(*L, ForeBlocks, AftBlocks, *DT);
}

// Looks at the phi nodes in Header for values coming from Latch. For these
// instructions and all their operands it calls Visit on them, continuing
// through any operands defined in AftBlocks. Returns false if Visit returns
// false, otherwise returns true. This is used to process the instructions in
// the Aft blocks that need to be moved before the subloop. It is used in two
// places: first to check that the required set of instructions can be moved
// before the loop, then to collect the instructions to actually move in
// moveHeaderPhiOperandsToForeBlocks.
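//
// For example (illustrative only, not a case drawn from a particular test):
// the outer loop's induction increment typically lives in the latch, which is
// an Aft block, and feeds the header phi. This walk would visit that increment
// (and any of its operands also defined in Aft blocks), since creating the
// unrolled Fore(i+1) blocks before the jammed subloop requires those
// instructions to be movable into the Fore blocks.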
template <typename T>
static bool processHeaderPhiOperands(BasicBlock *Header, BasicBlock *Latch,
                                     BasicBlockSet &AftBlocks, T Visit) {
  SmallVector<Instruction *, 8> Worklist;
  for (auto &Phi : Header->phis()) {
    Value *V = Phi.getIncomingValueForBlock(Latch);
    if (Instruction *I = dyn_cast<Instruction>(V))
      Worklist.push_back(I);
  }

  while (!Worklist.empty()) {
    Instruction *I = Worklist.back();
    Worklist.pop_back();
    if (!Visit(I))
      return false;

    if (AftBlocks.count(I->getParent()))
      for (auto &U : I->operands())
        if (Instruction *II = dyn_cast<Instruction>(U))
          Worklist.push_back(II);
  }

  return true;
}

// Move the phi operands of Header from Latch out of AftBlocks to InsertLoc.
static void moveHeaderPhiOperandsToForeBlocks(BasicBlock *Header,
                                              BasicBlock *Latch,
                                              Instruction *InsertLoc,
                                              BasicBlockSet &AftBlocks) {
  // We need to ensure we move the instructions in the correct order,
  // starting with the earliest required instruction and moving forward.
  std::vector<Instruction *> Visited;
  processHeaderPhiOperands(Header, Latch, AftBlocks,
                           [&Visited, &AftBlocks](Instruction *I) {
                             if (AftBlocks.count(I->getParent()))
                               Visited.push_back(I);
                             return true;
                           });

  // Move all instructions in program order to before the InsertLoc
  BasicBlock *InsertLocBB = InsertLoc->getParent();
  for (Instruction *I : reverse(Visited)) {
    if (I->getParent() != InsertLocBB)
      I->moveBefore(InsertLoc);
  }
}

/*
  This method performs Unroll and Jam. For a simple loop like:
  for (i = ..)
    Fore(i)
    for (j = ..)
      SubLoop(i, j)
    Aft(i)

  Instead of doing normal inner or outer unrolling, we do:
  for (i = .., i+=2)
    Fore(i)
    Fore(i+1)
    for (j = ..)
      SubLoop(i, j)
      SubLoop(i+1, j)
    Aft(i)
    Aft(i+1)

  So the outer loop is essentially unrolled and then the inner loops are fused
  ("jammed") together into a single loop. This can increase speed when there
  are loads in SubLoop that are invariant to i, as they become shared between
  the now jammed inner loops.

  We do this by splitting the blocks in the loop into Fore, SubLoop and Aft.
  Fore blocks are those before the inner loop, Aft are those after. Normal
  Unroll code is used to copy each of these sets of blocks and the results are
  combined together into the final form above.

  isSafeToUnrollAndJam should be used prior to calling this to make sure the
  unrolling will be valid. Checking profitability is also advisable.

  If EpilogueLoop is non-null, it receives the epilogue loop (if it was
  necessary to create one and the loop was not fully unrolled).
*/
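/*
  For illustration only (a hypothetical example, not drawn from a particular
  caller): given a loop nest such as
    for (i = ..)
      for (j = ..)
        B[i][j] = A[j] + C[i][j];
  the load of A[j] is invariant to i. After unroll-and-jam by 2, the jammed
  inner loop computes B[i][j] and B[i+1][j] in the same iteration, so a single
  load of A[j] can feed both computations, roughly halving the loads from A.
*/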
LoopUnrollResult
llvm::UnrollAndJamLoop(Loop *L, unsigned Count, unsigned TripCount,
                       unsigned TripMultiple, bool UnrollRemainder,
                       LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
                       AssumptionCache *AC, const TargetTransformInfo *TTI,
                       OptimizationRemarkEmitter *ORE, Loop **EpilogueLoop) {

  // When we enter here we should have already checked that it is safe
  BasicBlock *Header = L->getHeader();
  assert(Header && "No header.");
  assert(L->getSubLoops().size() == 1);
  Loop *SubLoop = *L->begin();

  // Don't enter the unroll code if there is nothing to do.
  if (TripCount == 0 && Count < 2) {
    LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; almost nothing to do\n");
    return LoopUnrollResult::Unmodified;
  }

  assert(Count > 0);
  assert(TripMultiple > 0);
  assert(TripCount == 0 || TripCount % TripMultiple == 0);

  // Are we eliminating the loop control altogether?
  bool CompletelyUnroll = (Count == TripCount);

  // We use the runtime remainder in cases where we don't know the trip
  // multiple.
  if (TripMultiple == 1 || TripMultiple % Count != 0) {
    if (!UnrollRuntimeLoopRemainder(L, Count, /*AllowExpensiveTripCount*/ false,
                                    /*UseEpilogRemainder*/ true,
                                    UnrollRemainder, /*ForgetAllSCEV*/ false,
                                    LI, SE, DT, AC, TTI, true, EpilogueLoop)) {
      LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; remainder loop could not be "
                           "generated when assuming runtime trip count\n");
      return LoopUnrollResult::Unmodified;
    }
  }

  // Notify ScalarEvolution that the loop will be substantially changed,
  // if not outright eliminated.
  if (SE) {
    SE->forgetLoop(L);
    SE->forgetLoop(SubLoop);
  }

  using namespace ore;
  // Report the unrolling decision.
  if (CompletelyUnroll) {
    LLVM_DEBUG(dbgs() << "COMPLETELY UNROLL AND JAMMING loop %"
                      << Header->getName() << " with trip count " << TripCount
                      << "!\n");
    ORE->emit(OptimizationRemark(DEBUG_TYPE, "FullyUnrolled", L->getStartLoc(),
                                 L->getHeader())
              << "completely unroll and jammed loop with "
              << NV("UnrollCount", TripCount) << " iterations");
  } else {
    auto DiagBuilder = [&]() {
      OptimizationRemark Diag(DEBUG_TYPE, "PartialUnrolled", L->getStartLoc(),
                              L->getHeader());
      return Diag << "unroll and jammed loop by a factor of "
                  << NV("UnrollCount", Count);
    };

    LLVM_DEBUG(dbgs() << "UNROLL AND JAMMING loop %" << Header->getName()
                      << " by " << Count);
    if (TripMultiple != 1) {
      LLVM_DEBUG(dbgs() << " with " << TripMultiple << " trips per branch");
      ORE->emit([&]() {
        return DiagBuilder() << " with " << NV("TripMultiple", TripMultiple)
                             << " trips per branch";
      });
    } else {
      LLVM_DEBUG(dbgs() << " with run-time trip count");
      ORE->emit([&]() { return DiagBuilder() << " with run-time trip count"; });
    }
    LLVM_DEBUG(dbgs() << "!\n");
  }

  BasicBlock *Preheader = L->getLoopPreheader();
  BasicBlock *LatchBlock = L->getLoopLatch();
  assert(Preheader && "No preheader");
  assert(LatchBlock && "No latch block");
  BranchInst *BI = dyn_cast<BranchInst>(LatchBlock->getTerminator());
  assert(BI && !BI->isUnconditional());
  bool ContinueOnTrue = L->contains(BI->getSuccessor(0));
  BasicBlock *LoopExit = BI->getSuccessor(ContinueOnTrue);
  bool SubLoopContinueOnTrue = SubLoop->contains(
      SubLoop->getLoopLatch()->getTerminator()->getSuccessor(0));

  // Partition blocks in an outer/inner loop pair into blocks before and after
  // the loop
  BasicBlockSet SubLoopBlocks;
  BasicBlockSet ForeBlocks;
  BasicBlockSet AftBlocks;
  partitionOuterLoopBlocks(L, SubLoop, ForeBlocks, SubLoopBlocks, AftBlocks,
                           DT);

  // We keep track of the entering/first and exiting/last block of each of
  // Fore/SubLoop/Aft in each iteration. This helps make the stapling up of
  // blocks easier.
  std::vector<BasicBlock *> ForeBlocksFirst;
  std::vector<BasicBlock *> ForeBlocksLast;
  std::vector<BasicBlock *> SubLoopBlocksFirst;
  std::vector<BasicBlock *> SubLoopBlocksLast;
  std::vector<BasicBlock *> AftBlocksFirst;
  std::vector<BasicBlock *> AftBlocksLast;
  ForeBlocksFirst.push_back(Header);
  ForeBlocksLast.push_back(SubLoop->getLoopPreheader());
  SubLoopBlocksFirst.push_back(SubLoop->getHeader());
  SubLoopBlocksLast.push_back(SubLoop->getExitingBlock());
  AftBlocksFirst.push_back(SubLoop->getExitBlock());
  AftBlocksLast.push_back(L->getExitingBlock());
  // Maps Blocks[0] -> Blocks[It]
  ValueToValueMapTy LastValueMap;

  // Move any instructions that the fore-header phi operands depend on from
  // AftBlocks into the Fore blocks.
  moveHeaderPhiOperandsToForeBlocks(
      Header, LatchBlock, ForeBlocksLast[0]->getTerminator(), AftBlocks);

  // The current on-the-fly SSA update requires blocks to be processed in
  // reverse postorder so that LastValueMap contains the correct value at each
  // exit.
  LoopBlocksDFS DFS(L);
  DFS.perform(LI);
  // Stash the DFS iterators before adding blocks to the loop.
  LoopBlocksDFS::RPOIterator BlockBegin = DFS.beginRPO();
  LoopBlocksDFS::RPOIterator BlockEnd = DFS.endRPO();

  if (Header->getParent()->isDebugInfoForProfiling())
    for (BasicBlock *BB : L->getBlocks())
      for (Instruction &I : *BB)
        if (!isa<DbgInfoIntrinsic>(&I))
          if (const DILocation *DIL = I.getDebugLoc()) {
            auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(Count);
            if (NewDIL)
              I.setDebugLoc(NewDIL.getValue());
            else
              LLVM_DEBUG(dbgs()
                         << "Failed to create new discriminator: "
                         << DIL->getFilename() << " Line: " << DIL->getLine());
          }

  // Copy all blocks
  for (unsigned It = 1; It != Count; ++It) {
    SmallVector<BasicBlock *, 8> NewBlocks;
    // Maps Blocks[It] -> Blocks[It-1]
    DenseMap<Value *, Value *> PrevItValueMap;
    SmallDenseMap<const Loop *, Loop *, 4> NewLoops;
    NewLoops[L] = L;
    NewLoops[SubLoop] = SubLoop;

    for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) {
      ValueToValueMapTy VMap;
      BasicBlock *New = CloneBasicBlock(*BB, VMap, "." + Twine(It));
      Header->getParent()->getBasicBlockList().push_back(New);

      // Tell LI about New.
      addClonedBlockToLoopInfo(*BB, New, LI, NewLoops);

      if (ForeBlocks.count(*BB)) {
        if (*BB == ForeBlocksFirst[0])
          ForeBlocksFirst.push_back(New);
        if (*BB == ForeBlocksLast[0])
          ForeBlocksLast.push_back(New);
      } else if (SubLoopBlocks.count(*BB)) {
        if (*BB == SubLoopBlocksFirst[0])
          SubLoopBlocksFirst.push_back(New);
        if (*BB == SubLoopBlocksLast[0])
          SubLoopBlocksLast.push_back(New);
      } else if (AftBlocks.count(*BB)) {
        if (*BB == AftBlocksFirst[0])
          AftBlocksFirst.push_back(New);
        if (*BB == AftBlocksLast[0])
          AftBlocksLast.push_back(New);
      } else {
        llvm_unreachable("BB being cloned should be in Fore/Sub/Aft");
      }

      // Update our running maps of newest clones
      PrevItValueMap[New] = (It == 1 ? *BB : LastValueMap[*BB]);
      LastValueMap[*BB] = New;
      for (ValueToValueMapTy::iterator VI = VMap.begin(), VE = VMap.end();
           VI != VE; ++VI) {
        PrevItValueMap[VI->second] =
            const_cast<Value *>(It == 1 ? VI->first : LastValueMap[VI->first]);
        LastValueMap[VI->first] = VI->second;
      }

      NewBlocks.push_back(New);

      // Update DomTree:
      if (*BB == ForeBlocksFirst[0])
        DT->addNewBlock(New, ForeBlocksLast[It - 1]);
      else if (*BB == SubLoopBlocksFirst[0])
        DT->addNewBlock(New, SubLoopBlocksLast[It - 1]);
      else if (*BB == AftBlocksFirst[0])
        DT->addNewBlock(New, AftBlocksLast[It - 1]);
      else {
        // Each set of blocks (Fore/Sub/Aft) will have the same internal domtree
        // structure.
        auto BBDomNode = DT->getNode(*BB);
        auto BBIDom = BBDomNode->getIDom();
        BasicBlock *OriginalBBIDom = BBIDom->getBlock();
        assert(OriginalBBIDom);
        assert(LastValueMap[cast<Value>(OriginalBBIDom)]);
        DT->addNewBlock(
            New, cast<BasicBlock>(LastValueMap[cast<Value>(OriginalBBIDom)]));
      }
    }

    // Remap all instructions in the most recent iteration
    remapInstructionsInBlocks(NewBlocks, LastValueMap);
    for (BasicBlock *NewBlock : NewBlocks) {
      for (Instruction &I : *NewBlock) {
        if (auto *II = dyn_cast<IntrinsicInst>(&I))
          if (II->getIntrinsicID() == Intrinsic::assume)
            AC->registerAssumption(II);
      }
    }

    // Alter the ForeBlocks phi's, pointing them at the latest version of the
    // value from the previous iteration's phis
    for (PHINode &Phi : ForeBlocksFirst[It]->phis()) {
      Value *OldValue = Phi.getIncomingValueForBlock(AftBlocksLast[It]);
      assert(OldValue && "should have incoming edge from Aft[It]");
      Value *NewValue = OldValue;
      if (Value *PrevValue = PrevItValueMap[OldValue])
        NewValue = PrevValue;

      assert(Phi.getNumOperands() == 2);
      Phi.setIncomingBlock(0, ForeBlocksLast[It - 1]);
      Phi.setIncomingValue(0, NewValue);
      Phi.removeIncomingValue(1);
    }
  }

  // Now that all the basic blocks for the unrolled iterations are in place,
  // finish up connecting the blocks and phi nodes. At this point LastValueMap
  // holds the last unrolled iteration's values.

  // Update Phis in BB from OldBB to point to NewBB
  auto updatePHIBlocks = [](BasicBlock *BB, BasicBlock *OldBB,
                            BasicBlock *NewBB) {
    for (PHINode &Phi : BB->phis()) {
      int I = Phi.getBasicBlockIndex(OldBB);
      Phi.setIncomingBlock(I, NewBB);
    }
  };
  // Update Phis in BB from OldBB to point to NewBB and use the latest value
  // from LastValueMap
  auto updatePHIBlocksAndValues = [](BasicBlock *BB, BasicBlock *OldBB,
                                     BasicBlock *NewBB,
                                     ValueToValueMapTy &LastValueMap) {
    for (PHINode &Phi : BB->phis()) {
      for (unsigned b = 0; b < Phi.getNumIncomingValues(); ++b) {
        if (Phi.getIncomingBlock(b) == OldBB) {
          Value *OldValue = Phi.getIncomingValue(b);
          if (Value *LastValue = LastValueMap[OldValue])
            Phi.setIncomingValue(b, LastValue);
          Phi.setIncomingBlock(b, NewBB);
          break;
        }
      }
    }
  };
  // Move all the phis from Src into Dest
  auto movePHIs = [](BasicBlock *Src, BasicBlock *Dest) {
    Instruction *insertPoint = Dest->getFirstNonPHI();
    while (PHINode *Phi = dyn_cast<PHINode>(Src->begin()))
      Phi->moveBefore(insertPoint);
  };

  // Update the PHI values outside the loop to point to the last block
  updatePHIBlocksAndValues(LoopExit, AftBlocksLast[0], AftBlocksLast.back(),
                           LastValueMap);

  // Update ForeBlocks successors and phi nodes
  BranchInst *ForeTerm =
      cast<BranchInst>(ForeBlocksLast.back()->getTerminator());
  assert(ForeTerm->getNumSuccessors() == 1 && "Expecting one successor");
  ForeTerm->setSuccessor(0, SubLoopBlocksFirst[0]);

  if (CompletelyUnroll) {
    while (PHINode *Phi = dyn_cast<PHINode>(ForeBlocksFirst[0]->begin())) {
      Phi->replaceAllUsesWith(Phi->getIncomingValueForBlock(Preheader));
      Phi->getParent()->getInstList().erase(Phi);
    }
  } else {
    // Update the PHI values to point to the last aft block
    updatePHIBlocksAndValues(ForeBlocksFirst[0], AftBlocksLast[0],
                             AftBlocksLast.back(), LastValueMap);
  }

  for (unsigned It = 1; It != Count; It++) {
    // Remap ForeBlock successors from previous iteration to this
    BranchInst *ForeTerm =
        cast<BranchInst>(ForeBlocksLast[It - 1]->getTerminator());
    assert(ForeTerm->getNumSuccessors() == 1 && "Expecting one successor");
    ForeTerm->setSuccessor(0, ForeBlocksFirst[It]);
  }

  // Subloop successors and phis
  BranchInst *SubTerm =
      cast<BranchInst>(SubLoopBlocksLast.back()->getTerminator());
  SubTerm->setSuccessor(!SubLoopContinueOnTrue, SubLoopBlocksFirst[0]);
  SubTerm->setSuccessor(SubLoopContinueOnTrue, AftBlocksFirst[0]);
  updatePHIBlocks(SubLoopBlocksFirst[0], ForeBlocksLast[0],
                  ForeBlocksLast.back());
  updatePHIBlocks(SubLoopBlocksFirst[0], SubLoopBlocksLast[0],
                  SubLoopBlocksLast.back());

  for (unsigned It = 1; It != Count; It++) {
    // Replace the conditional branch of the previous iteration subloop with an
    // unconditional one to this one
    BranchInst *SubTerm =
        cast<BranchInst>(SubLoopBlocksLast[It - 1]->getTerminator());
    BranchInst::Create(SubLoopBlocksFirst[It], SubTerm);
    SubTerm->eraseFromParent();

    updatePHIBlocks(SubLoopBlocksFirst[It], ForeBlocksLast[It],
                    ForeBlocksLast.back());
    updatePHIBlocks(SubLoopBlocksFirst[It], SubLoopBlocksLast[It],
                    SubLoopBlocksLast.back());
    movePHIs(SubLoopBlocksFirst[It], SubLoopBlocksFirst[0]);
  }

  // Aft blocks successors and phis
  BranchInst *AftTerm = cast<BranchInst>(AftBlocksLast.back()->getTerminator());
  if (CompletelyUnroll) {
    BranchInst::Create(LoopExit, AftTerm);
    AftTerm->eraseFromParent();
  } else {
    AftTerm->setSuccessor(!ContinueOnTrue, ForeBlocksFirst[0]);
    assert(AftTerm->getSuccessor(ContinueOnTrue) == LoopExit &&
           "Expecting the ContinueOnTrue successor of AftTerm to be LoopExit");
  }
  updatePHIBlocks(AftBlocksFirst[0], SubLoopBlocksLast[0],
                  SubLoopBlocksLast.back());

  for (unsigned It = 1; It != Count; It++) {
    // Replace the terminator of the previous iteration's aft blocks with an
    // unconditional branch to this iteration's aft blocks
    BranchInst *AftTerm =
        cast<BranchInst>(AftBlocksLast[It - 1]->getTerminator());
    BranchInst::Create(AftBlocksFirst[It], AftTerm);
    AftTerm->eraseFromParent();

    updatePHIBlocks(AftBlocksFirst[It], SubLoopBlocksLast[It],
                    SubLoopBlocksLast.back());
    movePHIs(AftBlocksFirst[It], AftBlocksFirst[0]);
  }

  DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
  // Dominator Tree. Remove the old links between Fore, Sub and Aft, adding the
  // new ones required.
  if (Count != 1) {
    SmallVector<DominatorTree::UpdateType, 4> DTUpdates;
    DTUpdates.emplace_back(DominatorTree::UpdateKind::Delete, ForeBlocksLast[0],
                           SubLoopBlocksFirst[0]);
    DTUpdates.emplace_back(DominatorTree::UpdateKind::Delete,
                           SubLoopBlocksLast[0], AftBlocksFirst[0]);

    DTUpdates.emplace_back(DominatorTree::UpdateKind::Insert,
                           ForeBlocksLast.back(), SubLoopBlocksFirst[0]);
    DTUpdates.emplace_back(DominatorTree::UpdateKind::Insert,
                           SubLoopBlocksLast.back(), AftBlocksFirst[0]);
    DTU.applyUpdatesPermissive(DTUpdates);
  }

  // Merge adjacent basic blocks, if possible.
  SmallPtrSet<BasicBlock *, 16> MergeBlocks;
  MergeBlocks.insert(ForeBlocksLast.begin(), ForeBlocksLast.end());
  MergeBlocks.insert(SubLoopBlocksLast.begin(), SubLoopBlocksLast.end());
  MergeBlocks.insert(AftBlocksLast.begin(), AftBlocksLast.end());

  MergeBlockSuccessorsIntoGivenBlocks(MergeBlocks, L, &DTU, LI);

  // Apply updates to the DomTree.
  DT = &DTU.getDomTree();

  // At this point, the code is well formed.  We now do a quick sweep over the
  // inserted code, doing constant propagation and dead code elimination as we
  // go.
  simplifyLoopAfterUnroll(SubLoop, true, LI, SE, DT, AC, TTI);
  simplifyLoopAfterUnroll(L, !CompletelyUnroll && Count > 1, LI, SE, DT, AC,
                          TTI);

  NumCompletelyUnrolledAndJammed += CompletelyUnroll;
  ++NumUnrolledAndJammed;

  // Update LoopInfo if the loop is completely removed.
  if (CompletelyUnroll)
    LI->erase(L);

#ifndef NDEBUG
  // We shouldn't have done anything to break loop simplify form or LCSSA.
  Loop *OutestLoop = SubLoop->getParentLoop()
                         ? SubLoop->getParentLoop()->getParentLoop()
                               ? SubLoop->getParentLoop()->getParentLoop()
                               : SubLoop->getParentLoop()
                         : SubLoop;
  assert(DT->verify());
  LI->verify(*DT);
  assert(OutestLoop->isRecursivelyLCSSAForm(*DT, *LI));
  if (!CompletelyUnroll)
    assert(L->isLoopSimplifyForm());
  assert(SubLoop->isLoopSimplifyForm());
  SE->verify();
#endif

  return CompletelyUnroll ? LoopUnrollResult::FullyUnrolled
                          : LoopUnrollResult::PartiallyUnrolled;
}

static bool getLoadsAndStores(BasicBlockSet &Blocks,
                              SmallVector<Instruction *, 4> &MemInstr) {
  // Scan the BBs and collect legal loads and stores.
  // Returns false if non-simple loads/stores are found.
  for (BasicBlock *BB : Blocks) {
    for (Instruction &I : *BB) {
      if (auto *Ld = dyn_cast<LoadInst>(&I)) {
        if (!Ld->isSimple())
          return false;
        MemInstr.push_back(&I);
      } else if (auto *St = dyn_cast<StoreInst>(&I)) {
        if (!St->isSimple())
          return false;
        MemInstr.push_back(&I);
      } else if (I.mayReadOrWriteMemory()) {
        return false;
      }
    }
  }
  return true;
}

static bool preservesForwardDependence(Instruction *Src, Instruction *Dst,
                                       unsigned UnrollLevel, unsigned JamLevel,
                                       bool Sequentialized, Dependence *D) {
  // UnrollLevel might carry the dependency Src --> Dst.
  // Does a different (inner) loop carry it after unrolling?
  for (unsigned CurLoopDepth = UnrollLevel + 1; CurLoopDepth <= JamLevel;
       ++CurLoopDepth) {
    auto JammedDir = D->getDirection(CurLoopDepth);
    if (JammedDir == Dependence::DVEntry::LT)
      return true;

    if (JammedDir & Dependence::DVEntry::GT)
      return false;
  }

  return true;
}

static bool preservesBackwardDependence(Instruction *Src, Instruction *Dst,
                                        unsigned UnrollLevel, unsigned JamLevel,
                                        bool Sequentialized, Dependence *D) {
  // UnrollLevel might carry the dependency Dst --> Src
  for (unsigned CurLoopDepth = UnrollLevel + 1; CurLoopDepth <= JamLevel;
       ++CurLoopDepth) {
    auto JammedDir = D->getDirection(CurLoopDepth);
    if (JammedDir == Dependence::DVEntry::GT)
      return true;

    if (JammedDir & Dependence::DVEntry::LT)
      return false;
  }

  // Backward dependencies are only preserved if not interleaved.
  return Sequentialized;
}

// Check whether unroll-and-jam is semantically safe for Src and Dst,
// considering any potential dependency between them.
//
// @param UnrollLevel The level of the loop being unrolled
// @param JamLevel    The level of the loop being jammed; if Src and Dst are on
// different levels, the outermost common loop counts as jammed level
//
// @return true if it is safe and false if there is a dependency violation.
static bool checkDependency(Instruction *Src, Instruction *Dst,
                            unsigned UnrollLevel, unsigned JamLevel,
                            bool Sequentialized, DependenceInfo &DI) {
  assert(UnrollLevel <= JamLevel &&
         "Expecting JamLevel to be at least UnrollLevel");

  if (Src == Dst)
    return true;
  // Ignore Input dependencies.
  if (isa<LoadInst>(Src) && isa<LoadInst>(Dst))
    return true;

  // Check whether unroll-and-jam may violate a dependency.
  // By construction, every dependency will be lexicographically non-negative
  // (if it were negative, it would violate the current execution order), such
  // as
  //   (0,0,>,*,*)
  // Unroll-and-jam moves two executions that were in different (GT-related)
  // iterations of the chosen unroll level into the same iteration. That is, a
  // GT dependence becomes a GE dependence (or EQ, if we fully unrolled the
  // loop) at the loop's position:
  //   (0,0,>=,*,*)
  // Now, the dependency is not necessarily non-negative anymore, i.e.
  // unroll-and-jam may violate correctness.
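  // For illustration only (a hypothetical direction vector, not one computed
  // here): if the unrolled loop is at level 3 and the vector is (=,=,>,<,*),
  // jamming can turn the level-3 '>' into '=' for copies placed in the same
  // jammed iteration, leaving the '<' at level 4 as the leading non-'=' entry,
  // i.e. a lexicographically negative (violated) dependence.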
  std::unique_ptr<Dependence> D = DI.depends(Src, Dst, true);
  if (!D)
    return true;
  assert(D->isOrdered() && "Expected an output, flow or anti dep.");

  if (D->isConfused()) {
    LLVM_DEBUG(dbgs() << "  Confused dependency between:\n"
                      << "  " << *Src << "\n"
                      << "  " << *Dst << "\n");
    return false;
  }

  // If outer levels (levels enclosing the loop being unroll-and-jammed) have a
  // non-equal direction, then the locations accessed in the inner levels cannot
  // overlap in memory. We assume the indexes never overlap into neighboring
  // dimensions.
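  // For example (hypothetical, for illustration only): with UnrollLevel 2 and
  // a direction vector (<,=,>), the '<' at level 1 means the dependence only
  // connects executions in different iterations of the outermost loop, and
  // unroll-and-jam of level 2 never reorders executions across distinct
  // level-1 iterations, so such a dependence cannot be violated.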
  for (unsigned CurLoopDepth = 1; CurLoopDepth < UnrollLevel; ++CurLoopDepth)
    if (!(D->getDirection(CurLoopDepth) & Dependence::DVEntry::EQ))
      return true;

  auto UnrollDirection = D->getDirection(UnrollLevel);

  // If the distance carried by the unrolled loop is 0, then after unrolling
  // that distance will become non-zero resulting in non-overlapping accesses in
  // the inner loops.
  if (UnrollDirection == Dependence::DVEntry::EQ)
    return true;

  if (UnrollDirection & Dependence::DVEntry::LT &&
      !preservesForwardDependence(Src, Dst, UnrollLevel, JamLevel,
                                  Sequentialized, D.get()))
    return false;

  if (UnrollDirection & Dependence::DVEntry::GT &&
      !preservesBackwardDependence(Src, Dst, UnrollLevel, JamLevel,
                                   Sequentialized, D.get()))
    return false;

  return true;
}

static bool
checkDependencies(Loop &Root, const BasicBlockSet &SubLoopBlocks,
                  const DenseMap<Loop *, BasicBlockSet> &ForeBlocksMap,
                  const DenseMap<Loop *, BasicBlockSet> &AftBlocksMap,
                  DependenceInfo &DI, LoopInfo &LI) {
  SmallVector<BasicBlockSet, 8> AllBlocks;
  for (Loop *L : Root.getLoopsInPreorder())
    if (ForeBlocksMap.find(L) != ForeBlocksMap.end())
      AllBlocks.push_back(ForeBlocksMap.lookup(L));
  AllBlocks.push_back(SubLoopBlocks);
  for (Loop *L : Root.getLoopsInPreorder())
    if (AftBlocksMap.find(L) != AftBlocksMap.end())
      AllBlocks.push_back(AftBlocksMap.lookup(L));

  unsigned LoopDepth = Root.getLoopDepth();
  SmallVector<Instruction *, 4> EarlierLoadsAndStores;
  SmallVector<Instruction *, 4> CurrentLoadsAndStores;
  for (BasicBlockSet &Blocks : AllBlocks) {
    CurrentLoadsAndStores.clear();
    if (!getLoadsAndStores(Blocks, CurrentLoadsAndStores))
      return false;

    Loop *CurLoop = LI.getLoopFor((*Blocks.begin())->front().getParent());
    unsigned CurLoopDepth = CurLoop->getLoopDepth();

    for (auto *Earlier : EarlierLoadsAndStores) {
      Loop *EarlierLoop = LI.getLoopFor(Earlier->getParent());
      unsigned EarlierDepth = EarlierLoop->getLoopDepth();
      unsigned CommonLoopDepth = std::min(EarlierDepth, CurLoopDepth);
      for (auto *Later : CurrentLoadsAndStores) {
        if (!checkDependency(Earlier, Later, LoopDepth, CommonLoopDepth, false,
                             DI))
          return false;
      }
    }

    size_t NumInsts = CurrentLoadsAndStores.size();
    for (size_t I = 0; I < NumInsts; ++I) {
      for (size_t J = I; J < NumInsts; ++J) {
        if (!checkDependency(CurrentLoadsAndStores[I], CurrentLoadsAndStores[J],
                             LoopDepth, CurLoopDepth, true, DI))
          return false;
      }
    }

    EarlierLoadsAndStores.append(CurrentLoadsAndStores.begin(),
                                 CurrentLoadsAndStores.end());
  }
  return true;
}

static bool isEligibleLoopForm(const Loop &Root) {
  // Root must have a child.
  if (Root.getSubLoops().size() != 1)
    return false;

  const Loop *L = &Root;
  do {
    // All loops in Root need to be in simplify and rotated form.
    if (!L->isLoopSimplifyForm())
      return false;

    if (!L->isRotatedForm())
      return false;

    if (L->getHeader()->hasAddressTaken()) {
      LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Address taken\n");
      return false;
    }

    unsigned SubLoopsSize = L->getSubLoops().size();
    if (SubLoopsSize == 0)
      return true;

    // Only one child is allowed.
    if (SubLoopsSize != 1)
      return false;

    L = L->getSubLoops()[0];
  } while (L);

  return true;
}

static Loop *getInnerMostLoop(Loop *L) {
  while (!L->getSubLoops().empty())
    L = L->getSubLoops()[0];
  return L;
}

bool llvm::isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE, DominatorTree &DT,
                                DependenceInfo &DI, LoopInfo &LI) {
  if (!isEligibleLoopForm(*L)) {
    LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Ineligible loop form\n");
    return false;
  }

  /* We currently handle outer loops like this:
        |
    ForeFirst    <------\   }
     Blocks             |   } ForeBlocks of L
    ForeLast            |   }
        |               |
       ...              |
        |               |
    ForeFirst    <----\ |   }
     Blocks           | |   } ForeBlocks of an inner loop of L
    ForeLast          | |   }
        |             | |
    JamLoopFirst  <\  | |   }
     Blocks        |  | |   } JamLoopBlocks of the innermost loop
    JamLoopLast   -/  | |   }
        |             | |
    AftFirst          | |   }
     Blocks           | |   } AftBlocks of an inner loop of L
    AftLast     ------/ |   }
        |               |
       ...              |
        |               |
    AftFirst            |   }
     Blocks             |   } AftBlocks of L
    AftLast     --------/   }
        |

    There can be (theoretically) any number of blocks in ForeBlocks,
    SubLoopBlocks and AftBlocks, provided that there is one edge from Fores to
    SubLoops, one edge from SubLoops to Afts and a single outer loop exit (from
    Afts). In practice we currently limit Aft blocks to a single block, and
    limit things further in the profitability checks of the unroll and jam
    pass.

    Because of the way we rearrange basic blocks, we also require that
    the Fore blocks of L on all unrolled iterations are safe to move before the
    blocks of the direct child of L of all iterations. So we require that the
    phi node looping operands of ForeHeader can be moved to at least the end of
    ForeEnd, so that we can arrange cloned Fore Blocks before the subloop and
    match up Phi's correctly.

    i.e. The old order of blocks used to be
           (F1)1 (F2)1 J1_1 J1_2 (A2)1 (A1)1 (F1)2 (F2)2 J2_1 J2_2 (A2)2 (A1)2.
         It needs to be safe to transform this to
           (F1)1 (F1)2 (F2)1 (F2)2 J1_1 J1_2 J2_1 J2_2 (A2)1 (A2)2 (A1)1 (A1)2.

    There are then a number of checks along the lines of no calls, no
    exceptions, inner loop IV is consistent, etc. Note that for loops requiring
    runtime unrolling, UnrollRuntimeLoopRemainder can also fail in
    UnrollAndJamLoop if the trip count cannot be easily calculated.
  */

  // Split blocks into Fore/SubLoop/Aft based on dominators
  Loop *JamLoop = getInnerMostLoop(L);
  BasicBlockSet SubLoopBlocks;
  DenseMap<Loop *, BasicBlockSet> ForeBlocksMap;
  DenseMap<Loop *, BasicBlockSet> AftBlocksMap;
  if (!partitionOuterLoopBlocks(*L, *JamLoop, SubLoopBlocks, ForeBlocksMap,
                                AftBlocksMap, DT)) {
    LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Incompatible loop layout\n");
    return false;
  }

  // Aft blocks may need to move instructions to fore blocks, which becomes more
  // difficult if there are multiple (potentially conditionally executed)
  // blocks. For now we just exclude loops with multiple aft blocks.
  if (AftBlocksMap[L].size() != 1) {
    LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Can't currently handle "
                         "multiple blocks after the loop\n");
    return false;
  }

  // Check inner loop backedge count is consistent on all iterations of the
  // outer loop
  if (any_of(L->getLoopsInPreorder(), [&SE](Loop *SubLoop) {
        return !hasIterationCountInvariantInParent(SubLoop, SE);
      })) {
    LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Inner loop iteration count is "
                         "not consistent on each iteration\n");
    return false;
  }

  // Check the loop safety info for exceptions.
  SimpleLoopSafetyInfo LSI;
  LSI.computeLoopSafetyInfo(L);
  if (LSI.anyBlockMayThrow()) {
    LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Something may throw\n");
    return false;
  }

  // We've ruled out the easy stuff and now need to check that there are no
  // interdependencies which may prevent us from moving the:
  //  ForeBlocks before Subloop and AftBlocks.
  //  Subloop before AftBlocks.
  //  ForeBlock phi operands before the subloop
  // Make sure we can move all instructions we need to before the subloop
  BasicBlock *Header = L->getHeader();
  BasicBlock *Latch = L->getLoopLatch();
  BasicBlockSet AftBlocks = AftBlocksMap[L];
  Loop *SubLoop = L->getSubLoops()[0];
  if (!processHeaderPhiOperands(
          Header, Latch, AftBlocks, [&AftBlocks, &SubLoop](Instruction *I) {
            if (SubLoop->contains(I->getParent()))
              return false;
            if (AftBlocks.count(I->getParent())) {
              // If we hit a phi node in afts we know we are done (probably
              // LCSSA)
              if (isa<PHINode>(I))
                return false;
              // Can't move instructions with side effects or memory
              // reads/writes
              if (I->mayHaveSideEffects() || I->mayReadOrWriteMemory())
                return false;
            }
            // Keep going
            return true;
          })) {
    LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; can't move required "
                         "instructions after subloop to before it\n");
    return false;
  }

  // Check for memory dependencies which prohibit the unrolling we are doing.
  // Because of the way we are unrolling Fore/Sub/Aft blocks, we need to check
  // there are no dependencies between Fore-Sub, Fore-Aft, Sub-Aft and Sub-Sub.
  if (!checkDependencies(*L, SubLoopBlocks, ForeBlocksMap, AftBlocksMap, DI,
                         LI)) {
    LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; failed dependency check\n");
    return false;
  }

  return true;
}