1//===- MachinePipeliner.cpp - Machine Software Pipeliner Pass -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// An implementation of the Swing Modulo Scheduling (SMS) software pipeliner.
10//
11// This SMS implementation is a target-independent back-end pass. When enabled,
12// the pass runs just prior to the register allocation pass, while the machine
13// IR is in SSA form. If software pipelining is successful, then the original
14// loop is replaced by the optimized loop. The optimized loop contains one or
15// more prolog blocks, the pipelined kernel, and one or more epilog blocks. If
16// the instructions cannot be scheduled in a given MII, we increase the MII by
17// one and try again.
18//
19// The SMS implementation is an extension of the ScheduleDAGInstrs class. We
20// represent loop carried dependences in the DAG as order edges to the Phi
21// nodes. We also perform several passes over the DAG to eliminate unnecessary
22// edges that inhibit the ability to pipeline. The implementation uses the
23// DFAPacketizer class to compute the minimum initiation interval and the check
24// where an instruction may be inserted in the pipelined schedule.
25//
26// In order for the SMS pass to work, several target specific hooks need to be
27// implemented to get information about the loop structure and to rewrite
28// instructions.
29//
30//===----------------------------------------------------------------------===//
31
32#include "llvm/ADT/ArrayRef.h"
33#include "llvm/ADT/BitVector.h"
34#include "llvm/ADT/DenseMap.h"
35#include "llvm/ADT/MapVector.h"
36#include "llvm/ADT/PriorityQueue.h"
37#include "llvm/ADT/SetVector.h"
38#include "llvm/ADT/SmallPtrSet.h"
39#include "llvm/ADT/SmallSet.h"
40#include "llvm/ADT/SmallVector.h"
41#include "llvm/ADT/Statistic.h"
42#include "llvm/ADT/iterator_range.h"
43#include "llvm/Analysis/AliasAnalysis.h"
44#include "llvm/Analysis/MemoryLocation.h"
45#include "llvm/Analysis/ValueTracking.h"
46#include "llvm/CodeGen/DFAPacketizer.h"
47#include "llvm/CodeGen/LiveIntervals.h"
48#include "llvm/CodeGen/MachineBasicBlock.h"
49#include "llvm/CodeGen/MachineDominators.h"
50#include "llvm/CodeGen/MachineFunction.h"
51#include "llvm/CodeGen/MachineFunctionPass.h"
52#include "llvm/CodeGen/MachineInstr.h"
53#include "llvm/CodeGen/MachineInstrBuilder.h"
54#include "llvm/CodeGen/MachineLoopInfo.h"
55#include "llvm/CodeGen/MachineMemOperand.h"
56#include "llvm/CodeGen/MachineOperand.h"
57#include "llvm/CodeGen/MachinePipeliner.h"
58#include "llvm/CodeGen/MachineRegisterInfo.h"
59#include "llvm/CodeGen/ModuloSchedule.h"
60#include "llvm/CodeGen/RegisterPressure.h"
61#include "llvm/CodeGen/ScheduleDAG.h"
62#include "llvm/CodeGen/ScheduleDAGMutation.h"
63#include "llvm/CodeGen/TargetOpcodes.h"
64#include "llvm/CodeGen/TargetRegisterInfo.h"
65#include "llvm/CodeGen/TargetSubtargetInfo.h"
66#include "llvm/Config/llvm-config.h"
67#include "llvm/IR/Attributes.h"
68#include "llvm/IR/DebugLoc.h"
69#include "llvm/IR/Function.h"
70#include "llvm/MC/LaneBitmask.h"
71#include "llvm/MC/MCInstrDesc.h"
72#include "llvm/MC/MCInstrItineraries.h"
73#include "llvm/MC/MCRegisterInfo.h"
74#include "llvm/Pass.h"
75#include "llvm/Support/CommandLine.h"
76#include "llvm/Support/Compiler.h"
77#include "llvm/Support/Debug.h"
78#include "llvm/Support/MathExtras.h"
79#include "llvm/Support/raw_ostream.h"
80#include <algorithm>
81#include <cassert>
82#include <climits>
83#include <cstdint>
84#include <deque>
85#include <functional>
86#include <iterator>
87#include <map>
88#include <memory>
89#include <tuple>
90#include <utility>
91#include <vector>
92
93using namespace llvm;
94
95#define DEBUG_TYPE "pipeliner"
96
97STATISTIC(NumTrytoPipeline, "Number of loops that we attempt to pipeline");
98STATISTIC(NumPipelined, "Number of loops software pipelined");
99STATISTIC(NumNodeOrderIssues, "Number of node order issues found");
100STATISTIC(NumFailBranch, "Pipeliner abort due to unknown branch");
101STATISTIC(NumFailLoop, "Pipeliner abort due to unsupported loop");
102STATISTIC(NumFailPreheader, "Pipeliner abort due to missing preheader");
103STATISTIC(NumFailLargeMaxMII, "Pipeliner abort due to MaxMII too large");
104STATISTIC(NumFailZeroMII, "Pipeliner abort due to zero MII");
105STATISTIC(NumFailNoSchedule, "Pipeliner abort due to no schedule found");
106STATISTIC(NumFailZeroStage, "Pipeliner abort due to zero stage");
107STATISTIC(NumFailLargeMaxStage, "Pipeliner abort due to too many stages");
108
109/// A command line option to turn software pipelining on or off.
110static cl::opt<bool> EnableSWP("enable-pipeliner", cl::Hidden, cl::init(true),
111                               cl::ZeroOrMore,
112                               cl::desc("Enable Software Pipelining"));
113
114/// A command line option to enable SWP at -Os.
115static cl::opt<bool> EnableSWPOptSize("enable-pipeliner-opt-size",
116                                      cl::desc("Enable SWP at Os."), cl::Hidden,
117                                      cl::init(false));
118
119/// A command line argument to limit minimum initial interval for pipelining.
120static cl::opt<int> SwpMaxMii("pipeliner-max-mii",
121                              cl::desc("Size limit for the MII."),
122                              cl::Hidden, cl::init(27));
123
124/// A command line argument to limit the number of stages in the pipeline.
125static cl::opt<int>
126    SwpMaxStages("pipeliner-max-stages",
127                 cl::desc("Maximum stages allowed in the generated scheduled."),
128                 cl::Hidden, cl::init(3));
129
130/// A command line option to disable the pruning of chain dependences due to
131/// an unrelated Phi.
132static cl::opt<bool>
133    SwpPruneDeps("pipeliner-prune-deps",
134                 cl::desc("Prune dependences between unrelated Phi nodes."),
135                 cl::Hidden, cl::init(true));
136
137/// A command line option to disable the pruning of loop carried order
138/// dependences.
139static cl::opt<bool>
140    SwpPruneLoopCarried("pipeliner-prune-loop-carried",
141                        cl::desc("Prune loop carried order dependences."),
142                        cl::Hidden, cl::init(true));
143
144#ifndef NDEBUG
145static cl::opt<int> SwpLoopLimit("pipeliner-max", cl::Hidden, cl::init(-1));
146#endif
147
148static cl::opt<bool> SwpIgnoreRecMII("pipeliner-ignore-recmii",
149                                     cl::ReallyHidden, cl::init(false),
150                                     cl::ZeroOrMore, cl::desc("Ignore RecMII"));
151
152static cl::opt<bool> SwpShowResMask("pipeliner-show-mask", cl::Hidden,
153                                    cl::init(false));
154static cl::opt<bool> SwpDebugResource("pipeliner-dbg-res", cl::Hidden,
155                                      cl::init(false));
156
157static cl::opt<bool> EmitTestAnnotations(
158    "pipeliner-annotate-for-testing", cl::Hidden, cl::init(false),
159    cl::desc("Instead of emitting the pipelined code, annotate instructions "
160             "with the generated schedule for feeding into the "
161             "-modulo-schedule-test pass"));
162
163static cl::opt<bool> ExperimentalCodeGen(
164    "pipeliner-experimental-cg", cl::Hidden, cl::init(false),
165    cl::desc(
166        "Use the experimental peeling code generator for software pipelining"));
167
168namespace llvm {
169
170// A command line option to enable the CopyToPhi DAG mutation.
171cl::opt<bool>
172    SwpEnableCopyToPhi("pipeliner-enable-copytophi", cl::ReallyHidden,
173                       cl::init(true), cl::ZeroOrMore,
174                       cl::desc("Enable CopyToPhi DAG Mutation"));
175
176} // end namespace llvm
177
178unsigned SwingSchedulerDAG::Circuits::MaxPaths = 5;
179char MachinePipeliner::ID = 0;
180#ifndef NDEBUG
181int MachinePipeliner::NumTries = 0;
182#endif
183char &llvm::MachinePipelinerID = MachinePipeliner::ID;
184
185INITIALIZE_PASS_BEGIN(MachinePipeliner, DEBUG_TYPE,
186                      "Modulo Software Pipelining", false, false)
187INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
188INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
189INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
190INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
191INITIALIZE_PASS_END(MachinePipeliner, DEBUG_TYPE,
192                    "Modulo Software Pipelining", false, false)
193
194/// The "main" function for implementing Swing Modulo Scheduling.
195bool MachinePipeliner::runOnMachineFunction(MachineFunction &mf) {
196  if (skipFunction(mf.getFunction()))
197    return false;
198
199  if (!EnableSWP)
200    return false;
201
202  if (mf.getFunction().getAttributes().hasAttribute(
203          AttributeList::FunctionIndex, Attribute::OptimizeForSize) &&
204      !EnableSWPOptSize.getPosition())
205    return false;
206
207  if (!mf.getSubtarget().enableMachinePipeliner())
208    return false;
209
210  // Cannot pipeline loops without instruction itineraries if we are using
211  // DFA for the pipeliner.
212  if (mf.getSubtarget().useDFAforSMS() &&
213      (!mf.getSubtarget().getInstrItineraryData() ||
214       mf.getSubtarget().getInstrItineraryData()->isEmpty()))
215    return false;
216
217  MF = &mf;
218  MLI = &getAnalysis<MachineLoopInfo>();
219  MDT = &getAnalysis<MachineDominatorTree>();
220  ORE = &getAnalysis<MachineOptimizationRemarkEmitterPass>().getORE();
221  TII = MF->getSubtarget().getInstrInfo();
222  RegClassInfo.runOnMachineFunction(*MF);
223
224  for (auto &L : *MLI)
225    scheduleLoop(*L);
226
227  return false;
228}
229
230/// Attempt to perform the SMS algorithm on the specified loop. This function is
231/// the main entry point for the algorithm.  The function identifies candidate
232/// loops, calculates the minimum initiation interval, and attempts to schedule
233/// the loop.
234bool MachinePipeliner::scheduleLoop(MachineLoop &L) {
235  bool Changed = false;
236  for (auto &InnerLoop : L)
237    Changed |= scheduleLoop(*InnerLoop);
238
239#ifndef NDEBUG
240  // Stop trying after reaching the limit (if any).
241  int Limit = SwpLoopLimit;
242  if (Limit >= 0) {
243    if (NumTries >= SwpLoopLimit)
244      return Changed;
245    NumTries++;
246  }
247#endif
248
249  setPragmaPipelineOptions(L);
250  if (!canPipelineLoop(L)) {
251    LLVM_DEBUG(dbgs() << "\n!!! Can not pipeline loop.\n");
252    ORE->emit([&]() {
253      return MachineOptimizationRemarkMissed(DEBUG_TYPE, "canPipelineLoop",
254                                             L.getStartLoc(), L.getHeader())
255             << "Failed to pipeline loop";
256    });
257
258    return Changed;
259  }
260
261  ++NumTrytoPipeline;
262
263  Changed = swingModuloScheduler(L);
264
265  return Changed;
266}
267
268void MachinePipeliner::setPragmaPipelineOptions(MachineLoop &L) {
269  // Reset the pragma for the next loop in iteration.
270  disabledByPragma = false;
271
272  MachineBasicBlock *LBLK = L.getTopBlock();
273
274  if (LBLK == nullptr)
275    return;
276
277  const BasicBlock *BBLK = LBLK->getBasicBlock();
278  if (BBLK == nullptr)
279    return;
280
281  const Instruction *TI = BBLK->getTerminator();
282  if (TI == nullptr)
283    return;
284
285  MDNode *LoopID = TI->getMetadata(LLVMContext::MD_loop);
286  if (LoopID == nullptr)
287    return;
288
289  assert(LoopID->getNumOperands() > 0 && "requires atleast one operand");
290  assert(LoopID->getOperand(0) == LoopID && "invalid loop");
291
292  for (unsigned i = 1, e = LoopID->getNumOperands(); i < e; ++i) {
293    MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
294
295    if (MD == nullptr)
296      continue;
297
298    MDString *S = dyn_cast<MDString>(MD->getOperand(0));
299
300    if (S == nullptr)
301      continue;
302
303    if (S->getString() == "llvm.loop.pipeline.initiationinterval") {
304      assert(MD->getNumOperands() == 2 &&
305             "Pipeline initiation interval hint metadata should have two operands.");
306      II_setByPragma =
307          mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue();
308      assert(II_setByPragma >= 1 && "Pipeline initiation interval must be positive.");
309    } else if (S->getString() == "llvm.loop.pipeline.disable") {
310      disabledByPragma = true;
311    }
312  }
313}
314
315/// Return true if the loop can be software pipelined.  The algorithm is
316/// restricted to loops with a single basic block.  Make sure that the
317/// branch in the loop can be analyzed.
318bool MachinePipeliner::canPipelineLoop(MachineLoop &L) {
319  if (L.getNumBlocks() != 1) {
320    ORE->emit([&]() {
321      return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
322                                               L.getStartLoc(), L.getHeader())
323             << "Not a single basic block: "
324             << ore::NV("NumBlocks", L.getNumBlocks());
325    });
326    return false;
327  }
328
329  if (disabledByPragma) {
330    ORE->emit([&]() {
331      return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
332                                               L.getStartLoc(), L.getHeader())
333             << "Disabled by Pragma.";
334    });
335    return false;
336  }
337
338  // Check if the branch can't be understood because we can't do pipelining
339  // if that's the case.
340  LI.TBB = nullptr;
341  LI.FBB = nullptr;
342  LI.BrCond.clear();
343  if (TII->analyzeBranch(*L.getHeader(), LI.TBB, LI.FBB, LI.BrCond)) {
344    LLVM_DEBUG(dbgs() << "Unable to analyzeBranch, can NOT pipeline Loop\n");
345    NumFailBranch++;
346    ORE->emit([&]() {
347      return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
348                                               L.getStartLoc(), L.getHeader())
349             << "The branch can't be understood";
350    });
351    return false;
352  }
353
354  LI.LoopInductionVar = nullptr;
355  LI.LoopCompare = nullptr;
356  if (!TII->analyzeLoopForPipelining(L.getTopBlock())) {
357    LLVM_DEBUG(dbgs() << "Unable to analyzeLoop, can NOT pipeline Loop\n");
358    NumFailLoop++;
359    ORE->emit([&]() {
360      return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
361                                               L.getStartLoc(), L.getHeader())
362             << "The loop structure is not supported";
363    });
364    return false;
365  }
366
367  if (!L.getLoopPreheader()) {
368    LLVM_DEBUG(dbgs() << "Preheader not found, can NOT pipeline Loop\n");
369    NumFailPreheader++;
370    ORE->emit([&]() {
371      return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
372                                               L.getStartLoc(), L.getHeader())
373             << "No loop preheader found";
374    });
375    return false;
376  }
377
378  // Remove any subregisters from inputs to phi nodes.
379  preprocessPhiNodes(*L.getHeader());
380  return true;
381}
382
383void MachinePipeliner::preprocessPhiNodes(MachineBasicBlock &B) {
384  MachineRegisterInfo &MRI = MF->getRegInfo();
385  SlotIndexes &Slots = *getAnalysis<LiveIntervals>().getSlotIndexes();
386
387  for (MachineInstr &PI : make_range(B.begin(), B.getFirstNonPHI())) {
388    MachineOperand &DefOp = PI.getOperand(0);
389    assert(DefOp.getSubReg() == 0);
390    auto *RC = MRI.getRegClass(DefOp.getReg());
391
392    for (unsigned i = 1, n = PI.getNumOperands(); i != n; i += 2) {
393      MachineOperand &RegOp = PI.getOperand(i);
394      if (RegOp.getSubReg() == 0)
395        continue;
396
397      // If the operand uses a subregister, replace it with a new register
398      // without subregisters, and generate a copy to the new register.
399      Register NewReg = MRI.createVirtualRegister(RC);
400      MachineBasicBlock &PredB = *PI.getOperand(i+1).getMBB();
401      MachineBasicBlock::iterator At = PredB.getFirstTerminator();
402      const DebugLoc &DL = PredB.findDebugLoc(At);
403      auto Copy = BuildMI(PredB, At, DL, TII->get(TargetOpcode::COPY), NewReg)
404                    .addReg(RegOp.getReg(), getRegState(RegOp),
405                            RegOp.getSubReg());
406      Slots.insertMachineInstrInMaps(*Copy);
407      RegOp.setReg(NewReg);
408      RegOp.setSubReg(0);
409    }
410  }
411}
412
413/// The SMS algorithm consists of the following main steps:
414/// 1. Computation and analysis of the dependence graph.
415/// 2. Ordering of the nodes (instructions).
416/// 3. Attempt to Schedule the loop.
417bool MachinePipeliner::swingModuloScheduler(MachineLoop &L) {
418  assert(L.getBlocks().size() == 1 && "SMS works on single blocks only.");
419
420  SwingSchedulerDAG SMS(*this, L, getAnalysis<LiveIntervals>(), RegClassInfo,
421                        II_setByPragma);
422
423  MachineBasicBlock *MBB = L.getHeader();
424  // The kernel should not include any terminator instructions.  These
425  // will be added back later.
426  SMS.startBlock(MBB);
427
428  // Compute the number of 'real' instructions in the basic block by
429  // ignoring terminators.
430  unsigned size = MBB->size();
431  for (MachineBasicBlock::iterator I = MBB->getFirstTerminator(),
432                                   E = MBB->instr_end();
433       I != E; ++I, --size)
434    ;
435
436  SMS.enterRegion(MBB, MBB->begin(), MBB->getFirstTerminator(), size);
437  SMS.schedule();
438  SMS.exitRegion();
439
440  SMS.finishBlock();
441  return SMS.hasNewSchedule();
442}
443
444void SwingSchedulerDAG::setMII(unsigned ResMII, unsigned RecMII) {
445  if (II_setByPragma > 0)
446    MII = II_setByPragma;
447  else
448    MII = std::max(ResMII, RecMII);
449}
450
451void SwingSchedulerDAG::setMAX_II() {
452  if (II_setByPragma > 0)
453    MAX_II = II_setByPragma;
454  else
455    MAX_II = MII + 10;
456}
457
458/// We override the schedule function in ScheduleDAGInstrs to implement the
459/// scheduling part of the Swing Modulo Scheduling algorithm.
460void SwingSchedulerDAG::schedule() {
461  AliasAnalysis *AA = &Pass.getAnalysis<AAResultsWrapperPass>().getAAResults();
462  buildSchedGraph(AA);
463  addLoopCarriedDependences(AA);
464  updatePhiDependences();
465  Topo.InitDAGTopologicalSorting();
466  changeDependences();
467  postprocessDAG();
468  LLVM_DEBUG(dump());
469
470  NodeSetType NodeSets;
471  findCircuits(NodeSets);
472  NodeSetType Circuits = NodeSets;
473
474  // Calculate the MII.
475  unsigned ResMII = calculateResMII();
476  unsigned RecMII = calculateRecMII(NodeSets);
477
478  fuseRecs(NodeSets);
479
480  // This flag is used for testing and can cause correctness problems.
481  if (SwpIgnoreRecMII)
482    RecMII = 0;
483
484  setMII(ResMII, RecMII);
485  setMAX_II();
486
487  LLVM_DEBUG(dbgs() << "MII = " << MII << " MAX_II = " << MAX_II
488                    << " (rec=" << RecMII << ", res=" << ResMII << ")\n");
489
490  // Can't schedule a loop without a valid MII.
491  if (MII == 0) {
492    LLVM_DEBUG(dbgs() << "Invalid Minimal Initiation Interval: 0\n");
493    NumFailZeroMII++;
494    Pass.ORE->emit([&]() {
495      return MachineOptimizationRemarkAnalysis(
496                 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
497             << "Invalid Minimal Initiation Interval: 0";
498    });
499    return;
500  }
501
502  // Don't pipeline large loops.
503  if (SwpMaxMii != -1 && (int)MII > SwpMaxMii) {
504    LLVM_DEBUG(dbgs() << "MII > " << SwpMaxMii
505                      << ", we don't pipleline large loops\n");
506    NumFailLargeMaxMII++;
507    Pass.ORE->emit([&]() {
508      return MachineOptimizationRemarkAnalysis(
509                 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
510             << "Minimal Initiation Interval too large: "
511             << ore::NV("MII", (int)MII) << " > "
512             << ore::NV("SwpMaxMii", SwpMaxMii) << "."
513             << "Refer to -pipeliner-max-mii.";
514    });
515    return;
516  }
517
518  computeNodeFunctions(NodeSets);
519
520  registerPressureFilter(NodeSets);
521
522  colocateNodeSets(NodeSets);
523
524  checkNodeSets(NodeSets);
525
526  LLVM_DEBUG({
527    for (auto &I : NodeSets) {
528      dbgs() << "  Rec NodeSet ";
529      I.dump();
530    }
531  });
532
533  llvm::stable_sort(NodeSets, std::greater<NodeSet>());
534
535  groupRemainingNodes(NodeSets);
536
537  removeDuplicateNodes(NodeSets);
538
539  LLVM_DEBUG({
540    for (auto &I : NodeSets) {
541      dbgs() << "  NodeSet ";
542      I.dump();
543    }
544  });
545
546  computeNodeOrder(NodeSets);
547
548  // check for node order issues
549  checkValidNodeOrder(Circuits);
550
551  SMSchedule Schedule(Pass.MF);
552  Scheduled = schedulePipeline(Schedule);
553
554  if (!Scheduled){
555    LLVM_DEBUG(dbgs() << "No schedule found, return\n");
556    NumFailNoSchedule++;
557    Pass.ORE->emit([&]() {
558      return MachineOptimizationRemarkAnalysis(
559                 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
560             << "Unable to find schedule";
561    });
562    return;
563  }
564
565  unsigned numStages = Schedule.getMaxStageCount();
566  // No need to generate pipeline if there are no overlapped iterations.
567  if (numStages == 0) {
568    LLVM_DEBUG(dbgs() << "No overlapped iterations, skip.\n");
569    NumFailZeroStage++;
570    Pass.ORE->emit([&]() {
571      return MachineOptimizationRemarkAnalysis(
572                 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
573             << "No need to pipeline - no overlapped iterations in schedule.";
574    });
575    return;
576  }
577  // Check that the maximum stage count is less than user-defined limit.
578  if (SwpMaxStages > -1 && (int)numStages > SwpMaxStages) {
579    LLVM_DEBUG(dbgs() << "numStages:" << numStages << ">" << SwpMaxStages
580                      << " : too many stages, abort\n");
581    NumFailLargeMaxStage++;
582    Pass.ORE->emit([&]() {
583      return MachineOptimizationRemarkAnalysis(
584                 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
585             << "Too many stages in schedule: "
586             << ore::NV("numStages", (int)numStages) << " > "
587             << ore::NV("SwpMaxStages", SwpMaxStages)
588             << ". Refer to -pipeliner-max-stages.";
589    });
590    return;
591  }
592
593  Pass.ORE->emit([&]() {
594    return MachineOptimizationRemark(DEBUG_TYPE, "schedule", Loop.getStartLoc(),
595                                     Loop.getHeader())
596           << "Pipelined succesfully!";
597  });
598
599  // Generate the schedule as a ModuloSchedule.
600  DenseMap<MachineInstr *, int> Cycles, Stages;
601  std::vector<MachineInstr *> OrderedInsts;
602  for (int Cycle = Schedule.getFirstCycle(); Cycle <= Schedule.getFinalCycle();
603       ++Cycle) {
604    for (SUnit *SU : Schedule.getInstructions(Cycle)) {
605      OrderedInsts.push_back(SU->getInstr());
606      Cycles[SU->getInstr()] = Cycle;
607      Stages[SU->getInstr()] = Schedule.stageScheduled(SU);
608    }
609  }
610  DenseMap<MachineInstr *, std::pair<unsigned, int64_t>> NewInstrChanges;
611  for (auto &KV : NewMIs) {
612    Cycles[KV.first] = Cycles[KV.second];
613    Stages[KV.first] = Stages[KV.second];
614    NewInstrChanges[KV.first] = InstrChanges[getSUnit(KV.first)];
615  }
616
617  ModuloSchedule MS(MF, &Loop, std::move(OrderedInsts), std::move(Cycles),
618                    std::move(Stages));
619  if (EmitTestAnnotations) {
620    assert(NewInstrChanges.empty() &&
621           "Cannot serialize a schedule with InstrChanges!");
622    ModuloScheduleTestAnnotater MSTI(MF, MS);
623    MSTI.annotate();
624    return;
625  }
626  // The experimental code generator can't work if there are InstChanges.
627  if (ExperimentalCodeGen && NewInstrChanges.empty()) {
628    PeelingModuloScheduleExpander MSE(MF, MS, &LIS);
629    MSE.expand();
630  } else {
631    ModuloScheduleExpander MSE(MF, MS, LIS, std::move(NewInstrChanges));
632    MSE.expand();
633    MSE.cleanup();
634  }
635  ++NumPipelined;
636}
637
638/// Clean up after the software pipeliner runs.
639void SwingSchedulerDAG::finishBlock() {
640  for (auto &KV : NewMIs)
641    MF.DeleteMachineInstr(KV.second);
642  NewMIs.clear();
643
644  // Call the superclass.
645  ScheduleDAGInstrs::finishBlock();
646}
647
648/// Return the register values for  the operands of a Phi instruction.
649/// This function assume the instruction is a Phi.
650static void getPhiRegs(MachineInstr &Phi, MachineBasicBlock *Loop,
651                       unsigned &InitVal, unsigned &LoopVal) {
652  assert(Phi.isPHI() && "Expecting a Phi.");
653
654  InitVal = 0;
655  LoopVal = 0;
656  for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
657    if (Phi.getOperand(i + 1).getMBB() != Loop)
658      InitVal = Phi.getOperand(i).getReg();
659    else
660      LoopVal = Phi.getOperand(i).getReg();
661
662  assert(InitVal != 0 && LoopVal != 0 && "Unexpected Phi structure.");
663}
664
665/// Return the Phi register value that comes the loop block.
666static unsigned getLoopPhiReg(MachineInstr &Phi, MachineBasicBlock *LoopBB) {
667  for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
668    if (Phi.getOperand(i + 1).getMBB() == LoopBB)
669      return Phi.getOperand(i).getReg();
670  return 0;
671}
672
673/// Return true if SUb can be reached from SUa following the chain edges.
674static bool isSuccOrder(SUnit *SUa, SUnit *SUb) {
675  SmallPtrSet<SUnit *, 8> Visited;
676  SmallVector<SUnit *, 8> Worklist;
677  Worklist.push_back(SUa);
678  while (!Worklist.empty()) {
679    const SUnit *SU = Worklist.pop_back_val();
680    for (auto &SI : SU->Succs) {
681      SUnit *SuccSU = SI.getSUnit();
682      if (SI.getKind() == SDep::Order) {
683        if (Visited.count(SuccSU))
684          continue;
685        if (SuccSU == SUb)
686          return true;
687        Worklist.push_back(SuccSU);
688        Visited.insert(SuccSU);
689      }
690    }
691  }
692  return false;
693}
694
695/// Return true if the instruction causes a chain between memory
696/// references before and after it.
697static bool isDependenceBarrier(MachineInstr &MI, AliasAnalysis *AA) {
698  return MI.isCall() || MI.mayRaiseFPException() ||
699         MI.hasUnmodeledSideEffects() ||
700         (MI.hasOrderedMemoryRef() &&
701          (!MI.mayLoad() || !MI.isDereferenceableInvariantLoad(AA)));
702}
703
704/// Return the underlying objects for the memory references of an instruction.
705/// This function calls the code in ValueTracking, but first checks that the
706/// instruction has a memory operand.
707static void getUnderlyingObjects(const MachineInstr *MI,
708                                 SmallVectorImpl<const Value *> &Objs,
709                                 const DataLayout &DL) {
710  if (!MI->hasOneMemOperand())
711    return;
712  MachineMemOperand *MM = *MI->memoperands_begin();
713  if (!MM->getValue())
714    return;
715  GetUnderlyingObjects(MM->getValue(), Objs, DL);
716  for (const Value *V : Objs) {
717    if (!isIdentifiedObject(V)) {
718      Objs.clear();
719      return;
720    }
721    Objs.push_back(V);
722  }
723}
724
725/// Add a chain edge between a load and store if the store can be an
726/// alias of the load on a subsequent iteration, i.e., a loop carried
727/// dependence. This code is very similar to the code in ScheduleDAGInstrs
728/// but that code doesn't create loop carried dependences.
729void SwingSchedulerDAG::addLoopCarriedDependences(AliasAnalysis *AA) {
730  MapVector<const Value *, SmallVector<SUnit *, 4>> PendingLoads;
731  Value *UnknownValue =
732    UndefValue::get(Type::getVoidTy(MF.getFunction().getContext()));
733  for (auto &SU : SUnits) {
734    MachineInstr &MI = *SU.getInstr();
735    if (isDependenceBarrier(MI, AA))
736      PendingLoads.clear();
737    else if (MI.mayLoad()) {
738      SmallVector<const Value *, 4> Objs;
739      getUnderlyingObjects(&MI, Objs, MF.getDataLayout());
740      if (Objs.empty())
741        Objs.push_back(UnknownValue);
742      for (auto V : Objs) {
743        SmallVector<SUnit *, 4> &SUs = PendingLoads[V];
744        SUs.push_back(&SU);
745      }
746    } else if (MI.mayStore()) {
747      SmallVector<const Value *, 4> Objs;
748      getUnderlyingObjects(&MI, Objs, MF.getDataLayout());
749      if (Objs.empty())
750        Objs.push_back(UnknownValue);
751      for (auto V : Objs) {
752        MapVector<const Value *, SmallVector<SUnit *, 4>>::iterator I =
753            PendingLoads.find(V);
754        if (I == PendingLoads.end())
755          continue;
756        for (auto Load : I->second) {
757          if (isSuccOrder(Load, &SU))
758            continue;
759          MachineInstr &LdMI = *Load->getInstr();
760          // First, perform the cheaper check that compares the base register.
761          // If they are the same and the load offset is less than the store
762          // offset, then mark the dependence as loop carried potentially.
763          const MachineOperand *BaseOp1, *BaseOp2;
764          int64_t Offset1, Offset2;
765          bool Offset1IsScalable, Offset2IsScalable;
766          if (TII->getMemOperandWithOffset(LdMI, BaseOp1, Offset1,
767                                           Offset1IsScalable, TRI) &&
768              TII->getMemOperandWithOffset(MI, BaseOp2, Offset2,
769                                           Offset2IsScalable, TRI)) {
770            if (BaseOp1->isIdenticalTo(*BaseOp2) &&
771                Offset1IsScalable == Offset2IsScalable &&
772                (int)Offset1 < (int)Offset2) {
773              assert(TII->areMemAccessesTriviallyDisjoint(LdMI, MI) &&
774                     "What happened to the chain edge?");
775              SDep Dep(Load, SDep::Barrier);
776              Dep.setLatency(1);
777              SU.addPred(Dep);
778              continue;
779            }
780          }
781          // Second, the more expensive check that uses alias analysis on the
782          // base registers. If they alias, and the load offset is less than
783          // the store offset, the mark the dependence as loop carried.
784          if (!AA) {
785            SDep Dep(Load, SDep::Barrier);
786            Dep.setLatency(1);
787            SU.addPred(Dep);
788            continue;
789          }
790          MachineMemOperand *MMO1 = *LdMI.memoperands_begin();
791          MachineMemOperand *MMO2 = *MI.memoperands_begin();
792          if (!MMO1->getValue() || !MMO2->getValue()) {
793            SDep Dep(Load, SDep::Barrier);
794            Dep.setLatency(1);
795            SU.addPred(Dep);
796            continue;
797          }
798          if (MMO1->getValue() == MMO2->getValue() &&
799              MMO1->getOffset() <= MMO2->getOffset()) {
800            SDep Dep(Load, SDep::Barrier);
801            Dep.setLatency(1);
802            SU.addPred(Dep);
803            continue;
804          }
805          AliasResult AAResult = AA->alias(
806              MemoryLocation(MMO1->getValue(), LocationSize::unknown(),
807                             MMO1->getAAInfo()),
808              MemoryLocation(MMO2->getValue(), LocationSize::unknown(),
809                             MMO2->getAAInfo()));
810
811          if (AAResult != NoAlias) {
812            SDep Dep(Load, SDep::Barrier);
813            Dep.setLatency(1);
814            SU.addPred(Dep);
815          }
816        }
817      }
818    }
819  }
820}
821
822/// Update the phi dependences to the DAG because ScheduleDAGInstrs no longer
823/// processes dependences for PHIs. This function adds true dependences
824/// from a PHI to a use, and a loop carried dependence from the use to the
825/// PHI. The loop carried dependence is represented as an anti dependence
826/// edge. This function also removes chain dependences between unrelated
827/// PHIs.
828void SwingSchedulerDAG::updatePhiDependences() {
829  SmallVector<SDep, 4> RemoveDeps;
830  const TargetSubtargetInfo &ST = MF.getSubtarget<TargetSubtargetInfo>();
831
832  // Iterate over each DAG node.
833  for (SUnit &I : SUnits) {
834    RemoveDeps.clear();
835    // Set to true if the instruction has an operand defined by a Phi.
836    unsigned HasPhiUse = 0;
837    unsigned HasPhiDef = 0;
838    MachineInstr *MI = I.getInstr();
839    // Iterate over each operand, and we process the definitions.
840    for (MachineInstr::mop_iterator MOI = MI->operands_begin(),
841                                    MOE = MI->operands_end();
842         MOI != MOE; ++MOI) {
843      if (!MOI->isReg())
844        continue;
845      Register Reg = MOI->getReg();
846      if (MOI->isDef()) {
847        // If the register is used by a Phi, then create an anti dependence.
848        for (MachineRegisterInfo::use_instr_iterator
849                 UI = MRI.use_instr_begin(Reg),
850                 UE = MRI.use_instr_end();
851             UI != UE; ++UI) {
852          MachineInstr *UseMI = &*UI;
853          SUnit *SU = getSUnit(UseMI);
854          if (SU != nullptr && UseMI->isPHI()) {
855            if (!MI->isPHI()) {
856              SDep Dep(SU, SDep::Anti, Reg);
857              Dep.setLatency(1);
858              I.addPred(Dep);
859            } else {
860              HasPhiDef = Reg;
861              // Add a chain edge to a dependent Phi that isn't an existing
862              // predecessor.
863              if (SU->NodeNum < I.NodeNum && !I.isPred(SU))
864                I.addPred(SDep(SU, SDep::Barrier));
865            }
866          }
867        }
868      } else if (MOI->isUse()) {
869        // If the register is defined by a Phi, then create a true dependence.
870        MachineInstr *DefMI = MRI.getUniqueVRegDef(Reg);
871        if (DefMI == nullptr)
872          continue;
873        SUnit *SU = getSUnit(DefMI);
874        if (SU != nullptr && DefMI->isPHI()) {
875          if (!MI->isPHI()) {
876            SDep Dep(SU, SDep::Data, Reg);
877            Dep.setLatency(0);
878            ST.adjustSchedDependency(SU, 0, &I, MI->getOperandNo(MOI), Dep);
879            I.addPred(Dep);
880          } else {
881            HasPhiUse = Reg;
882            // Add a chain edge to a dependent Phi that isn't an existing
883            // predecessor.
884            if (SU->NodeNum < I.NodeNum && !I.isPred(SU))
885              I.addPred(SDep(SU, SDep::Barrier));
886          }
887        }
888      }
889    }
890    // Remove order dependences from an unrelated Phi.
891    if (!SwpPruneDeps)
892      continue;
893    for (auto &PI : I.Preds) {
894      MachineInstr *PMI = PI.getSUnit()->getInstr();
895      if (PMI->isPHI() && PI.getKind() == SDep::Order) {
896        if (I.getInstr()->isPHI()) {
897          if (PMI->getOperand(0).getReg() == HasPhiUse)
898            continue;
899          if (getLoopPhiReg(*PMI, PMI->getParent()) == HasPhiDef)
900            continue;
901        }
902        RemoveDeps.push_back(PI);
903      }
904    }
905    for (int i = 0, e = RemoveDeps.size(); i != e; ++i)
906      I.removePred(RemoveDeps[i]);
907  }
908}
909
910/// Iterate over each DAG node and see if we can change any dependences
911/// in order to reduce the recurrence MII.
912void SwingSchedulerDAG::changeDependences() {
913  // See if an instruction can use a value from the previous iteration.
914  // If so, we update the base and offset of the instruction and change
915  // the dependences.
916  for (SUnit &I : SUnits) {
917    unsigned BasePos = 0, OffsetPos = 0, NewBase = 0;
918    int64_t NewOffset = 0;
919    if (!canUseLastOffsetValue(I.getInstr(), BasePos, OffsetPos, NewBase,
920                               NewOffset))
921      continue;
922
923    // Get the MI and SUnit for the instruction that defines the original base.
924    Register OrigBase = I.getInstr()->getOperand(BasePos).getReg();
925    MachineInstr *DefMI = MRI.getUniqueVRegDef(OrigBase);
926    if (!DefMI)
927      continue;
928    SUnit *DefSU = getSUnit(DefMI);
929    if (!DefSU)
930      continue;
931    // Get the MI and SUnit for the instruction that defins the new base.
932    MachineInstr *LastMI = MRI.getUniqueVRegDef(NewBase);
933    if (!LastMI)
934      continue;
935    SUnit *LastSU = getSUnit(LastMI);
936    if (!LastSU)
937      continue;
938
939    if (Topo.IsReachable(&I, LastSU))
940      continue;
941
942    // Remove the dependence. The value now depends on a prior iteration.
943    SmallVector<SDep, 4> Deps;
944    for (SUnit::pred_iterator P = I.Preds.begin(), E = I.Preds.end(); P != E;
945         ++P)
946      if (P->getSUnit() == DefSU)
947        Deps.push_back(*P);
948    for (int i = 0, e = Deps.size(); i != e; i++) {
949      Topo.RemovePred(&I, Deps[i].getSUnit());
950      I.removePred(Deps[i]);
951    }
952    // Remove the chain dependence between the instructions.
953    Deps.clear();
954    for (auto &P : LastSU->Preds)
955      if (P.getSUnit() == &I && P.getKind() == SDep::Order)
956        Deps.push_back(P);
957    for (int i = 0, e = Deps.size(); i != e; i++) {
958      Topo.RemovePred(LastSU, Deps[i].getSUnit());
959      LastSU->removePred(Deps[i]);
960    }
961
962    // Add a dependence between the new instruction and the instruction
963    // that defines the new base.
964    SDep Dep(&I, SDep::Anti, NewBase);
965    Topo.AddPred(LastSU, &I);
966    LastSU->addPred(Dep);
967
968    // Remember the base and offset information so that we can update the
969    // instruction during code generation.
970    InstrChanges[&I] = std::make_pair(NewBase, NewOffset);
971  }
972}
973
974namespace {
975
976// FuncUnitSorter - Comparison operator used to sort instructions by
977// the number of functional unit choices.
978struct FuncUnitSorter {
979  const InstrItineraryData *InstrItins;
980  const MCSubtargetInfo *STI;
981  DenseMap<InstrStage::FuncUnits, unsigned> Resources;
982
983  FuncUnitSorter(const TargetSubtargetInfo &TSI)
984      : InstrItins(TSI.getInstrItineraryData()), STI(&TSI) {}
985
986  // Compute the number of functional unit alternatives needed
987  // at each stage, and take the minimum value. We prioritize the
988  // instructions by the least number of choices first.
989  unsigned minFuncUnits(const MachineInstr *Inst,
990                        InstrStage::FuncUnits &F) const {
991    unsigned SchedClass = Inst->getDesc().getSchedClass();
992    unsigned min = UINT_MAX;
993    if (InstrItins && !InstrItins->isEmpty()) {
994      for (const InstrStage &IS :
995           make_range(InstrItins->beginStage(SchedClass),
996                      InstrItins->endStage(SchedClass))) {
997        InstrStage::FuncUnits funcUnits = IS.getUnits();
998        unsigned numAlternatives = countPopulation(funcUnits);
999        if (numAlternatives < min) {
1000          min = numAlternatives;
1001          F = funcUnits;
1002        }
1003      }
1004      return min;
1005    }
1006    if (STI && STI->getSchedModel().hasInstrSchedModel()) {
1007      const MCSchedClassDesc *SCDesc =
1008          STI->getSchedModel().getSchedClassDesc(SchedClass);
1009      if (!SCDesc->isValid())
1010        // No valid Schedule Class Desc for schedClass, should be
1011        // Pseudo/PostRAPseudo
1012        return min;
1013
1014      for (const MCWriteProcResEntry &PRE :
1015           make_range(STI->getWriteProcResBegin(SCDesc),
1016                      STI->getWriteProcResEnd(SCDesc))) {
1017        if (!PRE.Cycles)
1018          continue;
1019        const MCProcResourceDesc *ProcResource =
1020            STI->getSchedModel().getProcResource(PRE.ProcResourceIdx);
1021        unsigned NumUnits = ProcResource->NumUnits;
1022        if (NumUnits < min) {
1023          min = NumUnits;
1024          F = PRE.ProcResourceIdx;
1025        }
1026      }
1027      return min;
1028    }
1029    llvm_unreachable("Should have non-empty InstrItins or hasInstrSchedModel!");
1030  }
1031
1032  // Compute the critical resources needed by the instruction. This
1033  // function records the functional units needed by instructions that
1034  // must use only one functional unit. We use this as a tie breaker
1035  // for computing the resource MII. The instrutions that require
1036  // the same, highly used, functional unit have high priority.
1037  void calcCriticalResources(MachineInstr &MI) {
1038    unsigned SchedClass = MI.getDesc().getSchedClass();
1039    if (InstrItins && !InstrItins->isEmpty()) {
1040      for (const InstrStage &IS :
1041           make_range(InstrItins->beginStage(SchedClass),
1042                      InstrItins->endStage(SchedClass))) {
1043        InstrStage::FuncUnits FuncUnits = IS.getUnits();
1044        if (countPopulation(FuncUnits) == 1)
1045          Resources[FuncUnits]++;
1046      }
1047      return;
1048    }
1049    if (STI && STI->getSchedModel().hasInstrSchedModel()) {
1050      const MCSchedClassDesc *SCDesc =
1051          STI->getSchedModel().getSchedClassDesc(SchedClass);
1052      if (!SCDesc->isValid())
1053        // No valid Schedule Class Desc for schedClass, should be
1054        // Pseudo/PostRAPseudo
1055        return;
1056
1057      for (const MCWriteProcResEntry &PRE :
1058           make_range(STI->getWriteProcResBegin(SCDesc),
1059                      STI->getWriteProcResEnd(SCDesc))) {
1060        if (!PRE.Cycles)
1061          continue;
1062        Resources[PRE.ProcResourceIdx]++;
1063      }
1064      return;
1065    }
1066    llvm_unreachable("Should have non-empty InstrItins or hasInstrSchedModel!");
1067  }
1068
1069  /// Return true if IS1 has less priority than IS2.
1070  bool operator()(const MachineInstr *IS1, const MachineInstr *IS2) const {
1071    InstrStage::FuncUnits F1 = 0, F2 = 0;
1072    unsigned MFUs1 = minFuncUnits(IS1, F1);
1073    unsigned MFUs2 = minFuncUnits(IS2, F2);
1074    if (MFUs1 == MFUs2)
1075      return Resources.lookup(F1) < Resources.lookup(F2);
1076    return MFUs1 > MFUs2;
1077  }
1078};
1079
1080} // end anonymous namespace
1081
1082/// Calculate the resource constrained minimum initiation interval for the
1083/// specified loop. We use the DFA to model the resources needed for
1084/// each instruction, and we ignore dependences. A different DFA is created
1085/// for each cycle that is required. When adding a new instruction, we attempt
1086/// to add it to each existing DFA, until a legal space is found. If the
1087/// instruction cannot be reserved in an existing DFA, we create a new one.
1088unsigned SwingSchedulerDAG::calculateResMII() {
1089
1090  LLVM_DEBUG(dbgs() << "calculateResMII:\n");
1091  SmallVector<ResourceManager*, 8> Resources;
1092  MachineBasicBlock *MBB = Loop.getHeader();
1093  Resources.push_back(new ResourceManager(&MF.getSubtarget()));
1094
1095  // Sort the instructions by the number of available choices for scheduling,
1096  // least to most. Use the number of critical resources as the tie breaker.
1097  FuncUnitSorter FUS = FuncUnitSorter(MF.getSubtarget());
1098  for (MachineBasicBlock::iterator I = MBB->getFirstNonPHI(),
1099                                   E = MBB->getFirstTerminator();
1100       I != E; ++I)
1101    FUS.calcCriticalResources(*I);
1102  PriorityQueue<MachineInstr *, std::vector<MachineInstr *>, FuncUnitSorter>
1103      FuncUnitOrder(FUS);
1104
1105  for (MachineBasicBlock::iterator I = MBB->getFirstNonPHI(),
1106                                   E = MBB->getFirstTerminator();
1107       I != E; ++I)
1108    FuncUnitOrder.push(&*I);
1109
1110  while (!FuncUnitOrder.empty()) {
1111    MachineInstr *MI = FuncUnitOrder.top();
1112    FuncUnitOrder.pop();
1113    if (TII->isZeroCost(MI->getOpcode()))
1114      continue;
1115    // Attempt to reserve the instruction in an existing DFA. At least one
1116    // DFA is needed for each cycle.
1117    unsigned NumCycles = getSUnit(MI)->Latency;
1118    unsigned ReservedCycles = 0;
1119    SmallVectorImpl<ResourceManager *>::iterator RI = Resources.begin();
1120    SmallVectorImpl<ResourceManager *>::iterator RE = Resources.end();
1121    LLVM_DEBUG({
1122      dbgs() << "Trying to reserve resource for " << NumCycles
1123             << " cycles for \n";
1124      MI->dump();
1125    });
1126    for (unsigned C = 0; C < NumCycles; ++C)
1127      while (RI != RE) {
1128        if ((*RI)->canReserveResources(*MI)) {
1129          (*RI)->reserveResources(*MI);
1130          ++ReservedCycles;
1131          break;
1132        }
1133        RI++;
1134      }
1135    LLVM_DEBUG(dbgs() << "ReservedCycles:" << ReservedCycles
1136                      << ", NumCycles:" << NumCycles << "\n");
1137    // Add new DFAs, if needed, to reserve resources.
1138    for (unsigned C = ReservedCycles; C < NumCycles; ++C) {
1139      LLVM_DEBUG(if (SwpDebugResource) dbgs()
1140                 << "NewResource created to reserve resources"
1141                 << "\n");
1142      ResourceManager *NewResource = new ResourceManager(&MF.getSubtarget());
1143      assert(NewResource->canReserveResources(*MI) && "Reserve error.");
1144      NewResource->reserveResources(*MI);
1145      Resources.push_back(NewResource);
1146    }
1147  }
1148  int Resmii = Resources.size();
1149  LLVM_DEBUG(dbgs() << "Return Res MII:" << Resmii << "\n");
1150  // Delete the memory for each of the DFAs that were created earlier.
1151  for (ResourceManager *RI : Resources) {
1152    ResourceManager *D = RI;
1153    delete D;
1154  }
1155  Resources.clear();
1156  return Resmii;
1157}
1158
1159/// Calculate the recurrence-constrainted minimum initiation interval.
1160/// Iterate over each circuit.  Compute the delay(c) and distance(c)
1161/// for each circuit. The II needs to satisfy the inequality
1162/// delay(c) - II*distance(c) <= 0. For each circuit, choose the smallest
1163/// II that satisfies the inequality, and the RecMII is the maximum
1164/// of those values.
1165unsigned SwingSchedulerDAG::calculateRecMII(NodeSetType &NodeSets) {
1166  unsigned RecMII = 0;
1167
1168  for (NodeSet &Nodes : NodeSets) {
1169    if (Nodes.empty())
1170      continue;
1171
1172    unsigned Delay = Nodes.getLatency();
1173    unsigned Distance = 1;
1174
1175    // ii = ceil(delay / distance)
1176    unsigned CurMII = (Delay + Distance - 1) / Distance;
1177    Nodes.setRecMII(CurMII);
1178    if (CurMII > RecMII)
1179      RecMII = CurMII;
1180  }
1181
1182  return RecMII;
1183}
1184
1185/// Swap all the anti dependences in the DAG. That means it is no longer a DAG,
1186/// but we do this to find the circuits, and then change them back.
1187static void swapAntiDependences(std::vector<SUnit> &SUnits) {
1188  SmallVector<std::pair<SUnit *, SDep>, 8> DepsAdded;
1189  for (unsigned i = 0, e = SUnits.size(); i != e; ++i) {
1190    SUnit *SU = &SUnits[i];
1191    for (SUnit::pred_iterator IP = SU->Preds.begin(), EP = SU->Preds.end();
1192         IP != EP; ++IP) {
1193      if (IP->getKind() != SDep::Anti)
1194        continue;
1195      DepsAdded.push_back(std::make_pair(SU, *IP));
1196    }
1197  }
1198  for (SmallVector<std::pair<SUnit *, SDep>, 8>::iterator I = DepsAdded.begin(),
1199                                                          E = DepsAdded.end();
1200       I != E; ++I) {
1201    // Remove this anti dependency and add one in the reverse direction.
1202    SUnit *SU = I->first;
1203    SDep &D = I->second;
1204    SUnit *TargetSU = D.getSUnit();
1205    unsigned Reg = D.getReg();
1206    unsigned Lat = D.getLatency();
1207    SU->removePred(D);
1208    SDep Dep(SU, SDep::Anti, Reg);
1209    Dep.setLatency(Lat);
1210    TargetSU->addPred(Dep);
1211  }
1212}
1213
1214/// Create the adjacency structure of the nodes in the graph.
1215void SwingSchedulerDAG::Circuits::createAdjacencyStructure(
1216    SwingSchedulerDAG *DAG) {
1217  BitVector Added(SUnits.size());
1218  DenseMap<int, int> OutputDeps;
1219  for (int i = 0, e = SUnits.size(); i != e; ++i) {
1220    Added.reset();
1221    // Add any successor to the adjacency matrix and exclude duplicates.
1222    for (auto &SI : SUnits[i].Succs) {
1223      // Only create a back-edge on the first and last nodes of a dependence
1224      // chain. This records any chains and adds them later.
1225      if (SI.getKind() == SDep::Output) {
1226        int N = SI.getSUnit()->NodeNum;
1227        int BackEdge = i;
1228        auto Dep = OutputDeps.find(BackEdge);
1229        if (Dep != OutputDeps.end()) {
1230          BackEdge = Dep->second;
1231          OutputDeps.erase(Dep);
1232        }
1233        OutputDeps[N] = BackEdge;
1234      }
1235      // Do not process a boundary node, an artificial node.
1236      // A back-edge is processed only if it goes to a Phi.
1237      if (SI.getSUnit()->isBoundaryNode() || SI.isArtificial() ||
1238          (SI.getKind() == SDep::Anti && !SI.getSUnit()->getInstr()->isPHI()))
1239        continue;
1240      int N = SI.getSUnit()->NodeNum;
1241      if (!Added.test(N)) {
1242        AdjK[i].push_back(N);
1243        Added.set(N);
1244      }
1245    }
1246    // A chain edge between a store and a load is treated as a back-edge in the
1247    // adjacency matrix.
1248    for (auto &PI : SUnits[i].Preds) {
1249      if (!SUnits[i].getInstr()->mayStore() ||
1250          !DAG->isLoopCarriedDep(&SUnits[i], PI, false))
1251        continue;
1252      if (PI.getKind() == SDep::Order && PI.getSUnit()->getInstr()->mayLoad()) {
1253        int N = PI.getSUnit()->NodeNum;
1254        if (!Added.test(N)) {
1255          AdjK[i].push_back(N);
1256          Added.set(N);
1257        }
1258      }
1259    }
1260  }
1261  // Add back-edges in the adjacency matrix for the output dependences.
1262  for (auto &OD : OutputDeps)
1263    if (!Added.test(OD.second)) {
1264      AdjK[OD.first].push_back(OD.second);
1265      Added.set(OD.second);
1266    }
1267}
1268
1269/// Identify an elementary circuit in the dependence graph starting at the
1270/// specified node.
1271bool SwingSchedulerDAG::Circuits::circuit(int V, int S, NodeSetType &NodeSets,
1272                                          bool HasBackedge) {
1273  SUnit *SV = &SUnits[V];
1274  bool F = false;
1275  Stack.insert(SV);
1276  Blocked.set(V);
1277
1278  for (auto W : AdjK[V]) {
1279    if (NumPaths > MaxPaths)
1280      break;
1281    if (W < S)
1282      continue;
1283    if (W == S) {
1284      if (!HasBackedge)
1285        NodeSets.push_back(NodeSet(Stack.begin(), Stack.end()));
1286      F = true;
1287      ++NumPaths;
1288      break;
1289    } else if (!Blocked.test(W)) {
1290      if (circuit(W, S, NodeSets,
1291                  Node2Idx->at(W) < Node2Idx->at(V) ? true : HasBackedge))
1292        F = true;
1293    }
1294  }
1295
1296  if (F)
1297    unblock(V);
1298  else {
1299    for (auto W : AdjK[V]) {
1300      if (W < S)
1301        continue;
1302      if (B[W].count(SV) == 0)
1303        B[W].insert(SV);
1304    }
1305  }
1306  Stack.pop_back();
1307  return F;
1308}
1309
1310/// Unblock a node in the circuit finding algorithm.
1311void SwingSchedulerDAG::Circuits::unblock(int U) {
1312  Blocked.reset(U);
1313  SmallPtrSet<SUnit *, 4> &BU = B[U];
1314  while (!BU.empty()) {
1315    SmallPtrSet<SUnit *, 4>::iterator SI = BU.begin();
1316    assert(SI != BU.end() && "Invalid B set.");
1317    SUnit *W = *SI;
1318    BU.erase(W);
1319    if (Blocked.test(W->NodeNum))
1320      unblock(W->NodeNum);
1321  }
1322}
1323
1324/// Identify all the elementary circuits in the dependence graph using
1325/// Johnson's circuit algorithm.
1326void SwingSchedulerDAG::findCircuits(NodeSetType &NodeSets) {
1327  // Swap all the anti dependences in the DAG. That means it is no longer a DAG,
1328  // but we do this to find the circuits, and then change them back.
1329  swapAntiDependences(SUnits);
1330
1331  Circuits Cir(SUnits, Topo);
1332  // Create the adjacency structure.
1333  Cir.createAdjacencyStructure(this);
1334  for (int i = 0, e = SUnits.size(); i != e; ++i) {
1335    Cir.reset();
1336    Cir.circuit(i, i, NodeSets);
1337  }
1338
1339  // Change the dependences back so that we've created a DAG again.
1340  swapAntiDependences(SUnits);
1341}
1342
1343// Create artificial dependencies between the source of COPY/REG_SEQUENCE that
1344// is loop-carried to the USE in next iteration. This will help pipeliner avoid
1345// additional copies that are needed across iterations. An artificial dependence
1346// edge is added from USE to SOURCE of COPY/REG_SEQUENCE.
1347
1348// PHI-------Anti-Dep-----> COPY/REG_SEQUENCE (loop-carried)
1349// SRCOfCopY------True-Dep---> COPY/REG_SEQUENCE
1350// PHI-------True-Dep------> USEOfPhi
1351
1352// The mutation creates
1353// USEOfPHI -------Artificial-Dep---> SRCOfCopy
1354
1355// This overall will ensure, the USEOfPHI is scheduled before SRCOfCopy
1356// (since USE is a predecessor), implies, the COPY/ REG_SEQUENCE is scheduled
1357// late  to avoid additional copies across iterations. The possible scheduling
1358// order would be
1359// USEOfPHI --- SRCOfCopy---  COPY/REG_SEQUENCE.
1360
1361void SwingSchedulerDAG::CopyToPhiMutation::apply(ScheduleDAGInstrs *DAG) {
1362  for (SUnit &SU : DAG->SUnits) {
1363    // Find the COPY/REG_SEQUENCE instruction.
1364    if (!SU.getInstr()->isCopy() && !SU.getInstr()->isRegSequence())
1365      continue;
1366
1367    // Record the loop carried PHIs.
1368    SmallVector<SUnit *, 4> PHISUs;
1369    // Record the SrcSUs that feed the COPY/REG_SEQUENCE instructions.
1370    SmallVector<SUnit *, 4> SrcSUs;
1371
1372    for (auto &Dep : SU.Preds) {
1373      SUnit *TmpSU = Dep.getSUnit();
1374      MachineInstr *TmpMI = TmpSU->getInstr();
1375      SDep::Kind DepKind = Dep.getKind();
1376      // Save the loop carried PHI.
1377      if (DepKind == SDep::Anti && TmpMI->isPHI())
1378        PHISUs.push_back(TmpSU);
1379      // Save the source of COPY/REG_SEQUENCE.
1380      // If the source has no pre-decessors, we will end up creating cycles.
1381      else if (DepKind == SDep::Data && !TmpMI->isPHI() && TmpSU->NumPreds > 0)
1382        SrcSUs.push_back(TmpSU);
1383    }
1384
1385    if (PHISUs.size() == 0 || SrcSUs.size() == 0)
1386      continue;
1387
1388    // Find the USEs of PHI. If the use is a PHI or REG_SEQUENCE, push back this
1389    // SUnit to the container.
1390    SmallVector<SUnit *, 8> UseSUs;
1391    // Do not use iterator based loop here as we are updating the container.
1392    for (size_t Index = 0; Index < PHISUs.size(); ++Index) {
1393      for (auto &Dep : PHISUs[Index]->Succs) {
1394        if (Dep.getKind() != SDep::Data)
1395          continue;
1396
1397        SUnit *TmpSU = Dep.getSUnit();
1398        MachineInstr *TmpMI = TmpSU->getInstr();
1399        if (TmpMI->isPHI() || TmpMI->isRegSequence()) {
1400          PHISUs.push_back(TmpSU);
1401          continue;
1402        }
1403        UseSUs.push_back(TmpSU);
1404      }
1405    }
1406
1407    if (UseSUs.size() == 0)
1408      continue;
1409
1410    SwingSchedulerDAG *SDAG = cast<SwingSchedulerDAG>(DAG);
1411    // Add the artificial dependencies if it does not form a cycle.
1412    for (auto I : UseSUs) {
1413      for (auto Src : SrcSUs) {
1414        if (!SDAG->Topo.IsReachable(I, Src) && Src != I) {
1415          Src->addPred(SDep(I, SDep::Artificial));
1416          SDAG->Topo.AddPred(Src, I);
1417        }
1418      }
1419    }
1420  }
1421}
1422
1423/// Return true for DAG nodes that we ignore when computing the cost functions.
1424/// We ignore the back-edge recurrence in order to avoid unbounded recursion
1425/// in the calculation of the ASAP, ALAP, etc functions.
1426static bool ignoreDependence(const SDep &D, bool isPred) {
1427  if (D.isArtificial())
1428    return true;
1429  return D.getKind() == SDep::Anti && isPred;
1430}
1431
1432/// Compute several functions need to order the nodes for scheduling.
1433///  ASAP - Earliest time to schedule a node.
1434///  ALAP - Latest time to schedule a node.
1435///  MOV - Mobility function, difference between ALAP and ASAP.
1436///  D - Depth of each node.
1437///  H - Height of each node.
1438void SwingSchedulerDAG::computeNodeFunctions(NodeSetType &NodeSets) {
1439  ScheduleInfo.resize(SUnits.size());
1440
1441  LLVM_DEBUG({
1442    for (ScheduleDAGTopologicalSort::const_iterator I = Topo.begin(),
1443                                                    E = Topo.end();
1444         I != E; ++I) {
1445      const SUnit &SU = SUnits[*I];
1446      dumpNode(SU);
1447    }
1448  });
1449
1450  int maxASAP = 0;
1451  // Compute ASAP and ZeroLatencyDepth.
1452  for (ScheduleDAGTopologicalSort::const_iterator I = Topo.begin(),
1453                                                  E = Topo.end();
1454       I != E; ++I) {
1455    int asap = 0;
1456    int zeroLatencyDepth = 0;
1457    SUnit *SU = &SUnits[*I];
1458    for (SUnit::const_pred_iterator IP = SU->Preds.begin(),
1459                                    EP = SU->Preds.end();
1460         IP != EP; ++IP) {
1461      SUnit *pred = IP->getSUnit();
1462      if (IP->getLatency() == 0)
1463        zeroLatencyDepth =
1464            std::max(zeroLatencyDepth, getZeroLatencyDepth(pred) + 1);
1465      if (ignoreDependence(*IP, true))
1466        continue;
1467      asap = std::max(asap, (int)(getASAP(pred) + IP->getLatency() -
1468                                  getDistance(pred, SU, *IP) * MII));
1469    }
1470    maxASAP = std::max(maxASAP, asap);
1471    ScheduleInfo[*I].ASAP = asap;
1472    ScheduleInfo[*I].ZeroLatencyDepth = zeroLatencyDepth;
1473  }
1474
1475  // Compute ALAP, ZeroLatencyHeight, and MOV.
1476  for (ScheduleDAGTopologicalSort::const_reverse_iterator I = Topo.rbegin(),
1477                                                          E = Topo.rend();
1478       I != E; ++I) {
1479    int alap = maxASAP;
1480    int zeroLatencyHeight = 0;
1481    SUnit *SU = &SUnits[*I];
1482    for (SUnit::const_succ_iterator IS = SU->Succs.begin(),
1483                                    ES = SU->Succs.end();
1484         IS != ES; ++IS) {
1485      SUnit *succ = IS->getSUnit();
1486      if (IS->getLatency() == 0)
1487        zeroLatencyHeight =
1488            std::max(zeroLatencyHeight, getZeroLatencyHeight(succ) + 1);
1489      if (ignoreDependence(*IS, true))
1490        continue;
1491      alap = std::min(alap, (int)(getALAP(succ) - IS->getLatency() +
1492                                  getDistance(SU, succ, *IS) * MII));
1493    }
1494
1495    ScheduleInfo[*I].ALAP = alap;
1496    ScheduleInfo[*I].ZeroLatencyHeight = zeroLatencyHeight;
1497  }
1498
1499  // After computing the node functions, compute the summary for each node set.
1500  for (NodeSet &I : NodeSets)
1501    I.computeNodeSetInfo(this);
1502
1503  LLVM_DEBUG({
1504    for (unsigned i = 0; i < SUnits.size(); i++) {
1505      dbgs() << "\tNode " << i << ":\n";
1506      dbgs() << "\t   ASAP = " << getASAP(&SUnits[i]) << "\n";
1507      dbgs() << "\t   ALAP = " << getALAP(&SUnits[i]) << "\n";
1508      dbgs() << "\t   MOV  = " << getMOV(&SUnits[i]) << "\n";
1509      dbgs() << "\t   D    = " << getDepth(&SUnits[i]) << "\n";
1510      dbgs() << "\t   H    = " << getHeight(&SUnits[i]) << "\n";
1511      dbgs() << "\t   ZLD  = " << getZeroLatencyDepth(&SUnits[i]) << "\n";
1512      dbgs() << "\t   ZLH  = " << getZeroLatencyHeight(&SUnits[i]) << "\n";
1513    }
1514  });
1515}
1516
1517/// Compute the Pred_L(O) set, as defined in the paper. The set is defined
1518/// as the predecessors of the elements of NodeOrder that are not also in
1519/// NodeOrder.
1520static bool pred_L(SetVector<SUnit *> &NodeOrder,
1521                   SmallSetVector<SUnit *, 8> &Preds,
1522                   const NodeSet *S = nullptr) {
1523  Preds.clear();
1524  for (SetVector<SUnit *>::iterator I = NodeOrder.begin(), E = NodeOrder.end();
1525       I != E; ++I) {
1526    for (SUnit::pred_iterator PI = (*I)->Preds.begin(), PE = (*I)->Preds.end();
1527         PI != PE; ++PI) {
1528      if (S && S->count(PI->getSUnit()) == 0)
1529        continue;
1530      if (ignoreDependence(*PI, true))
1531        continue;
1532      if (NodeOrder.count(PI->getSUnit()) == 0)
1533        Preds.insert(PI->getSUnit());
1534    }
1535    // Back-edges are predecessors with an anti-dependence.
1536    for (SUnit::const_succ_iterator IS = (*I)->Succs.begin(),
1537                                    ES = (*I)->Succs.end();
1538         IS != ES; ++IS) {
1539      if (IS->getKind() != SDep::Anti)
1540        continue;
1541      if (S && S->count(IS->getSUnit()) == 0)
1542        continue;
1543      if (NodeOrder.count(IS->getSUnit()) == 0)
1544        Preds.insert(IS->getSUnit());
1545    }
1546  }
1547  return !Preds.empty();
1548}
1549
1550/// Compute the Succ_L(O) set, as defined in the paper. The set is defined
1551/// as the successors of the elements of NodeOrder that are not also in
1552/// NodeOrder.
1553static bool succ_L(SetVector<SUnit *> &NodeOrder,
1554                   SmallSetVector<SUnit *, 8> &Succs,
1555                   const NodeSet *S = nullptr) {
1556  Succs.clear();
1557  for (SetVector<SUnit *>::iterator I = NodeOrder.begin(), E = NodeOrder.end();
1558       I != E; ++I) {
1559    for (SUnit::succ_iterator SI = (*I)->Succs.begin(), SE = (*I)->Succs.end();
1560         SI != SE; ++SI) {
1561      if (S && S->count(SI->getSUnit()) == 0)
1562        continue;
1563      if (ignoreDependence(*SI, false))
1564        continue;
1565      if (NodeOrder.count(SI->getSUnit()) == 0)
1566        Succs.insert(SI->getSUnit());
1567    }
1568    for (SUnit::const_pred_iterator PI = (*I)->Preds.begin(),
1569                                    PE = (*I)->Preds.end();
1570         PI != PE; ++PI) {
1571      if (PI->getKind() != SDep::Anti)
1572        continue;
1573      if (S && S->count(PI->getSUnit()) == 0)
1574        continue;
1575      if (NodeOrder.count(PI->getSUnit()) == 0)
1576        Succs.insert(PI->getSUnit());
1577    }
1578  }
1579  return !Succs.empty();
1580}
1581
1582/// Return true if there is a path from the specified node to any of the nodes
1583/// in DestNodes. Keep track and return the nodes in any path.
1584static bool computePath(SUnit *Cur, SetVector<SUnit *> &Path,
1585                        SetVector<SUnit *> &DestNodes,
1586                        SetVector<SUnit *> &Exclude,
1587                        SmallPtrSet<SUnit *, 8> &Visited) {
1588  if (Cur->isBoundaryNode())
1589    return false;
1590  if (Exclude.count(Cur) != 0)
1591    return false;
1592  if (DestNodes.count(Cur) != 0)
1593    return true;
1594  if (!Visited.insert(Cur).second)
1595    return Path.count(Cur) != 0;
1596  bool FoundPath = false;
1597  for (auto &SI : Cur->Succs)
1598    FoundPath |= computePath(SI.getSUnit(), Path, DestNodes, Exclude, Visited);
1599  for (auto &PI : Cur->Preds)
1600    if (PI.getKind() == SDep::Anti)
1601      FoundPath |=
1602          computePath(PI.getSUnit(), Path, DestNodes, Exclude, Visited);
1603  if (FoundPath)
1604    Path.insert(Cur);
1605  return FoundPath;
1606}
1607
1608/// Return true if Set1 is a subset of Set2.
1609template <class S1Ty, class S2Ty> static bool isSubset(S1Ty &Set1, S2Ty &Set2) {
1610  for (typename S1Ty::iterator I = Set1.begin(), E = Set1.end(); I != E; ++I)
1611    if (Set2.count(*I) == 0)
1612      return false;
1613  return true;
1614}
1615
1616/// Compute the live-out registers for the instructions in a node-set.
1617/// The live-out registers are those that are defined in the node-set,
1618/// but not used. Except for use operands of Phis.
1619static void computeLiveOuts(MachineFunction &MF, RegPressureTracker &RPTracker,
1620                            NodeSet &NS) {
1621  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
1622  MachineRegisterInfo &MRI = MF.getRegInfo();
1623  SmallVector<RegisterMaskPair, 8> LiveOutRegs;
1624  SmallSet<unsigned, 4> Uses;
1625  for (SUnit *SU : NS) {
1626    const MachineInstr *MI = SU->getInstr();
1627    if (MI->isPHI())
1628      continue;
1629    for (const MachineOperand &MO : MI->operands())
1630      if (MO.isReg() && MO.isUse()) {
1631        Register Reg = MO.getReg();
1632        if (Register::isVirtualRegister(Reg))
1633          Uses.insert(Reg);
1634        else if (MRI.isAllocatable(Reg))
1635          for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units)
1636            Uses.insert(*Units);
1637      }
1638  }
1639  for (SUnit *SU : NS)
1640    for (const MachineOperand &MO : SU->getInstr()->operands())
1641      if (MO.isReg() && MO.isDef() && !MO.isDead()) {
1642        Register Reg = MO.getReg();
1643        if (Register::isVirtualRegister(Reg)) {
1644          if (!Uses.count(Reg))
1645            LiveOutRegs.push_back(RegisterMaskPair(Reg,
1646                                                   LaneBitmask::getNone()));
1647        } else if (MRI.isAllocatable(Reg)) {
1648          for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units)
1649            if (!Uses.count(*Units))
1650              LiveOutRegs.push_back(RegisterMaskPair(*Units,
1651                                                     LaneBitmask::getNone()));
1652        }
1653      }
1654  RPTracker.addLiveRegs(LiveOutRegs);
1655}
1656
1657/// A heuristic to filter nodes in recurrent node-sets if the register
1658/// pressure of a set is too high.
1659void SwingSchedulerDAG::registerPressureFilter(NodeSetType &NodeSets) {
1660  for (auto &NS : NodeSets) {
1661    // Skip small node-sets since they won't cause register pressure problems.
1662    if (NS.size() <= 2)
1663      continue;
1664    IntervalPressure RecRegPressure;
1665    RegPressureTracker RecRPTracker(RecRegPressure);
1666    RecRPTracker.init(&MF, &RegClassInfo, &LIS, BB, BB->end(), false, true);
1667    computeLiveOuts(MF, RecRPTracker, NS);
1668    RecRPTracker.closeBottom();
1669
1670    std::vector<SUnit *> SUnits(NS.begin(), NS.end());
1671    llvm::sort(SUnits, [](const SUnit *A, const SUnit *B) {
1672      return A->NodeNum > B->NodeNum;
1673    });
1674
1675    for (auto &SU : SUnits) {
1676      // Since we're computing the register pressure for a subset of the
1677      // instructions in a block, we need to set the tracker for each
1678      // instruction in the node-set. The tracker is set to the instruction
1679      // just after the one we're interested in.
1680      MachineBasicBlock::const_iterator CurInstI = SU->getInstr();
1681      RecRPTracker.setPos(std::next(CurInstI));
1682
1683      RegPressureDelta RPDelta;
1684      ArrayRef<PressureChange> CriticalPSets;
1685      RecRPTracker.getMaxUpwardPressureDelta(SU->getInstr(), nullptr, RPDelta,
1686                                             CriticalPSets,
1687                                             RecRegPressure.MaxSetPressure);
1688      if (RPDelta.Excess.isValid()) {
1689        LLVM_DEBUG(
1690            dbgs() << "Excess register pressure: SU(" << SU->NodeNum << ") "
1691                   << TRI->getRegPressureSetName(RPDelta.Excess.getPSet())
1692                   << ":" << RPDelta.Excess.getUnitInc());
1693        NS.setExceedPressure(SU);
1694        break;
1695      }
1696      RecRPTracker.recede();
1697    }
1698  }
1699}
1700
1701/// A heuristic to colocate node sets that have the same set of
1702/// successors.
1703void SwingSchedulerDAG::colocateNodeSets(NodeSetType &NodeSets) {
1704  unsigned Colocate = 0;
1705  for (int i = 0, e = NodeSets.size(); i < e; ++i) {
1706    NodeSet &N1 = NodeSets[i];
1707    SmallSetVector<SUnit *, 8> S1;
1708    if (N1.empty() || !succ_L(N1, S1))
1709      continue;
1710    for (int j = i + 1; j < e; ++j) {
1711      NodeSet &N2 = NodeSets[j];
1712      if (N1.compareRecMII(N2) != 0)
1713        continue;
1714      SmallSetVector<SUnit *, 8> S2;
1715      if (N2.empty() || !succ_L(N2, S2))
1716        continue;
1717      if (isSubset(S1, S2) && S1.size() == S2.size()) {
1718        N1.setColocate(++Colocate);
1719        N2.setColocate(Colocate);
1720        break;
1721      }
1722    }
1723  }
1724}
1725
1726/// Check if the existing node-sets are profitable. If not, then ignore the
1727/// recurrent node-sets, and attempt to schedule all nodes together. This is
1728/// a heuristic. If the MII is large and all the recurrent node-sets are small,
1729/// then it's best to try to schedule all instructions together instead of
1730/// starting with the recurrent node-sets.
1731void SwingSchedulerDAG::checkNodeSets(NodeSetType &NodeSets) {
1732  // Look for loops with a large MII.
1733  if (MII < 17)
1734    return;
1735  // Check if the node-set contains only a simple add recurrence.
1736  for (auto &NS : NodeSets) {
1737    if (NS.getRecMII() > 2)
1738      return;
1739    if (NS.getMaxDepth() > MII)
1740      return;
1741  }
1742  NodeSets.clear();
1743  LLVM_DEBUG(dbgs() << "Clear recurrence node-sets\n");
1744  return;
1745}
1746
1747/// Add the nodes that do not belong to a recurrence set into groups
1748/// based upon connected componenets.
1749void SwingSchedulerDAG::groupRemainingNodes(NodeSetType &NodeSets) {
1750  SetVector<SUnit *> NodesAdded;
1751  SmallPtrSet<SUnit *, 8> Visited;
1752  // Add the nodes that are on a path between the previous node sets and
1753  // the current node set.
1754  for (NodeSet &I : NodeSets) {
1755    SmallSetVector<SUnit *, 8> N;
1756    // Add the nodes from the current node set to the previous node set.
1757    if (succ_L(I, N)) {
1758      SetVector<SUnit *> Path;
1759      for (SUnit *NI : N) {
1760        Visited.clear();
1761        computePath(NI, Path, NodesAdded, I, Visited);
1762      }
1763      if (!Path.empty())
1764        I.insert(Path.begin(), Path.end());
1765    }
1766    // Add the nodes from the previous node set to the current node set.
1767    N.clear();
1768    if (succ_L(NodesAdded, N)) {
1769      SetVector<SUnit *> Path;
1770      for (SUnit *NI : N) {
1771        Visited.clear();
1772        computePath(NI, Path, I, NodesAdded, Visited);
1773      }
1774      if (!Path.empty())
1775        I.insert(Path.begin(), Path.end());
1776    }
1777    NodesAdded.insert(I.begin(), I.end());
1778  }
1779
1780  // Create a new node set with the connected nodes of any successor of a node
1781  // in a recurrent set.
1782  NodeSet NewSet;
1783  SmallSetVector<SUnit *, 8> N;
1784  if (succ_L(NodesAdded, N))
1785    for (SUnit *I : N)
1786      addConnectedNodes(I, NewSet, NodesAdded);
1787  if (!NewSet.empty())
1788    NodeSets.push_back(NewSet);
1789
1790  // Create a new node set with the connected nodes of any predecessor of a node
1791  // in a recurrent set.
1792  NewSet.clear();
1793  if (pred_L(NodesAdded, N))
1794    for (SUnit *I : N)
1795      addConnectedNodes(I, NewSet, NodesAdded);
1796  if (!NewSet.empty())
1797    NodeSets.push_back(NewSet);
1798
1799  // Create new nodes sets with the connected nodes any remaining node that
1800  // has no predecessor.
1801  for (unsigned i = 0; i < SUnits.size(); ++i) {
1802    SUnit *SU = &SUnits[i];
1803    if (NodesAdded.count(SU) == 0) {
1804      NewSet.clear();
1805      addConnectedNodes(SU, NewSet, NodesAdded);
1806      if (!NewSet.empty())
1807        NodeSets.push_back(NewSet);
1808    }
1809  }
1810}
1811
1812/// Add the node to the set, and add all of its connected nodes to the set.
1813void SwingSchedulerDAG::addConnectedNodes(SUnit *SU, NodeSet &NewSet,
1814                                          SetVector<SUnit *> &NodesAdded) {
1815  NewSet.insert(SU);
1816  NodesAdded.insert(SU);
1817  for (auto &SI : SU->Succs) {
1818    SUnit *Successor = SI.getSUnit();
1819    if (!SI.isArtificial() && NodesAdded.count(Successor) == 0)
1820      addConnectedNodes(Successor, NewSet, NodesAdded);
1821  }
1822  for (auto &PI : SU->Preds) {
1823    SUnit *Predecessor = PI.getSUnit();
1824    if (!PI.isArtificial() && NodesAdded.count(Predecessor) == 0)
1825      addConnectedNodes(Predecessor, NewSet, NodesAdded);
1826  }
1827}
1828
1829/// Return true if Set1 contains elements in Set2. The elements in common
1830/// are returned in a different container.
1831static bool isIntersect(SmallSetVector<SUnit *, 8> &Set1, const NodeSet &Set2,
1832                        SmallSetVector<SUnit *, 8> &Result) {
1833  Result.clear();
1834  for (unsigned i = 0, e = Set1.size(); i != e; ++i) {
1835    SUnit *SU = Set1[i];
1836    if (Set2.count(SU) != 0)
1837      Result.insert(SU);
1838  }
1839  return !Result.empty();
1840}
1841
1842/// Merge the recurrence node sets that have the same initial node.
1843void SwingSchedulerDAG::fuseRecs(NodeSetType &NodeSets) {
1844  for (NodeSetType::iterator I = NodeSets.begin(), E = NodeSets.end(); I != E;
1845       ++I) {
1846    NodeSet &NI = *I;
1847    for (NodeSetType::iterator J = I + 1; J != E;) {
1848      NodeSet &NJ = *J;
1849      if (NI.getNode(0)->NodeNum == NJ.getNode(0)->NodeNum) {
1850        if (NJ.compareRecMII(NI) > 0)
1851          NI.setRecMII(NJ.getRecMII());
1852        for (NodeSet::iterator NII = J->begin(), ENI = J->end(); NII != ENI;
1853             ++NII)
1854          I->insert(*NII);
1855        NodeSets.erase(J);
1856        E = NodeSets.end();
1857      } else {
1858        ++J;
1859      }
1860    }
1861  }
1862}
1863
1864/// Remove nodes that have been scheduled in previous NodeSets.
1865void SwingSchedulerDAG::removeDuplicateNodes(NodeSetType &NodeSets) {
1866  for (NodeSetType::iterator I = NodeSets.begin(), E = NodeSets.end(); I != E;
1867       ++I)
1868    for (NodeSetType::iterator J = I + 1; J != E;) {
1869      J->remove_if([&](SUnit *SUJ) { return I->count(SUJ); });
1870
1871      if (J->empty()) {
1872        NodeSets.erase(J);
1873        E = NodeSets.end();
1874      } else {
1875        ++J;
1876      }
1877    }
1878}
1879
1880/// Compute an ordered list of the dependence graph nodes, which
1881/// indicates the order that the nodes will be scheduled.  This is a
1882/// two-level algorithm. First, a partial order is created, which
1883/// consists of a list of sets ordered from highest to lowest priority.
1884void SwingSchedulerDAG::computeNodeOrder(NodeSetType &NodeSets) {
1885  SmallSetVector<SUnit *, 8> R;
1886  NodeOrder.clear();
1887
1888  for (auto &Nodes : NodeSets) {
1889    LLVM_DEBUG(dbgs() << "NodeSet size " << Nodes.size() << "\n");
1890    OrderKind Order;
1891    SmallSetVector<SUnit *, 8> N;
1892    if (pred_L(NodeOrder, N) && isSubset(N, Nodes)) {
1893      R.insert(N.begin(), N.end());
1894      Order = BottomUp;
1895      LLVM_DEBUG(dbgs() << "  Bottom up (preds) ");
1896    } else if (succ_L(NodeOrder, N) && isSubset(N, Nodes)) {
1897      R.insert(N.begin(), N.end());
1898      Order = TopDown;
1899      LLVM_DEBUG(dbgs() << "  Top down (succs) ");
1900    } else if (isIntersect(N, Nodes, R)) {
1901      // If some of the successors are in the existing node-set, then use the
1902      // top-down ordering.
1903      Order = TopDown;
1904      LLVM_DEBUG(dbgs() << "  Top down (intersect) ");
1905    } else if (NodeSets.size() == 1) {
1906      for (auto &N : Nodes)
1907        if (N->Succs.size() == 0)
1908          R.insert(N);
1909      Order = BottomUp;
1910      LLVM_DEBUG(dbgs() << "  Bottom up (all) ");
1911    } else {
1912      // Find the node with the highest ASAP.
1913      SUnit *maxASAP = nullptr;
1914      for (SUnit *SU : Nodes) {
1915        if (maxASAP == nullptr || getASAP(SU) > getASAP(maxASAP) ||
1916            (getASAP(SU) == getASAP(maxASAP) && SU->NodeNum > maxASAP->NodeNum))
1917          maxASAP = SU;
1918      }
1919      R.insert(maxASAP);
1920      Order = BottomUp;
1921      LLVM_DEBUG(dbgs() << "  Bottom up (default) ");
1922    }
1923
1924    while (!R.empty()) {
1925      if (Order == TopDown) {
1926        // Choose the node with the maximum height.  If more than one, choose
1927        // the node wiTH the maximum ZeroLatencyHeight. If still more than one,
1928        // choose the node with the lowest MOV.
1929        while (!R.empty()) {
1930          SUnit *maxHeight = nullptr;
1931          for (SUnit *I : R) {
1932            if (maxHeight == nullptr || getHeight(I) > getHeight(maxHeight))
1933              maxHeight = I;
1934            else if (getHeight(I) == getHeight(maxHeight) &&
1935                     getZeroLatencyHeight(I) > getZeroLatencyHeight(maxHeight))
1936              maxHeight = I;
1937            else if (getHeight(I) == getHeight(maxHeight) &&
1938                     getZeroLatencyHeight(I) ==
1939                         getZeroLatencyHeight(maxHeight) &&
1940                     getMOV(I) < getMOV(maxHeight))
1941              maxHeight = I;
1942          }
1943          NodeOrder.insert(maxHeight);
1944          LLVM_DEBUG(dbgs() << maxHeight->NodeNum << " ");
1945          R.remove(maxHeight);
1946          for (const auto &I : maxHeight->Succs) {
1947            if (Nodes.count(I.getSUnit()) == 0)
1948              continue;
1949            if (NodeOrder.count(I.getSUnit()) != 0)
1950              continue;
1951            if (ignoreDependence(I, false))
1952              continue;
1953            R.insert(I.getSUnit());
1954          }
1955          // Back-edges are predecessors with an anti-dependence.
1956          for (const auto &I : maxHeight->Preds) {
1957            if (I.getKind() != SDep::Anti)
1958              continue;
1959            if (Nodes.count(I.getSUnit()) == 0)
1960              continue;
1961            if (NodeOrder.count(I.getSUnit()) != 0)
1962              continue;
1963            R.insert(I.getSUnit());
1964          }
1965        }
1966        Order = BottomUp;
1967        LLVM_DEBUG(dbgs() << "\n   Switching order to bottom up ");
1968        SmallSetVector<SUnit *, 8> N;
1969        if (pred_L(NodeOrder, N, &Nodes))
1970          R.insert(N.begin(), N.end());
1971      } else {
1972        // Choose the node with the maximum depth.  If more than one, choose
1973        // the node with the maximum ZeroLatencyDepth. If still more than one,
1974        // choose the node with the lowest MOV.
1975        while (!R.empty()) {
1976          SUnit *maxDepth = nullptr;
1977          for (SUnit *I : R) {
1978            if (maxDepth == nullptr || getDepth(I) > getDepth(maxDepth))
1979              maxDepth = I;
1980            else if (getDepth(I) == getDepth(maxDepth) &&
1981                     getZeroLatencyDepth(I) > getZeroLatencyDepth(maxDepth))
1982              maxDepth = I;
1983            else if (getDepth(I) == getDepth(maxDepth) &&
1984                     getZeroLatencyDepth(I) == getZeroLatencyDepth(maxDepth) &&
1985                     getMOV(I) < getMOV(maxDepth))
1986              maxDepth = I;
1987          }
1988          NodeOrder.insert(maxDepth);
1989          LLVM_DEBUG(dbgs() << maxDepth->NodeNum << " ");
1990          R.remove(maxDepth);
1991          if (Nodes.isExceedSU(maxDepth)) {
1992            Order = TopDown;
1993            R.clear();
1994            R.insert(Nodes.getNode(0));
1995            break;
1996          }
1997          for (const auto &I : maxDepth->Preds) {
1998            if (Nodes.count(I.getSUnit()) == 0)
1999              continue;
2000            if (NodeOrder.count(I.getSUnit()) != 0)
2001              continue;
2002            R.insert(I.getSUnit());
2003          }
2004          // Back-edges are predecessors with an anti-dependence.
2005          for (const auto &I : maxDepth->Succs) {
2006            if (I.getKind() != SDep::Anti)
2007              continue;
2008            if (Nodes.count(I.getSUnit()) == 0)
2009              continue;
2010            if (NodeOrder.count(I.getSUnit()) != 0)
2011              continue;
2012            R.insert(I.getSUnit());
2013          }
2014        }
2015        Order = TopDown;
2016        LLVM_DEBUG(dbgs() << "\n   Switching order to top down ");
2017        SmallSetVector<SUnit *, 8> N;
2018        if (succ_L(NodeOrder, N, &Nodes))
2019          R.insert(N.begin(), N.end());
2020      }
2021    }
2022    LLVM_DEBUG(dbgs() << "\nDone with Nodeset\n");
2023  }
2024
2025  LLVM_DEBUG({
2026    dbgs() << "Node order: ";
2027    for (SUnit *I : NodeOrder)
2028      dbgs() << " " << I->NodeNum << " ";
2029    dbgs() << "\n";
2030  });
2031}
2032
2033/// Process the nodes in the computed order and create the pipelined schedule
2034/// of the instructions, if possible. Return true if a schedule is found.
2035bool SwingSchedulerDAG::schedulePipeline(SMSchedule &Schedule) {
2036
2037  if (NodeOrder.empty()){
2038    LLVM_DEBUG(dbgs() << "NodeOrder is empty! abort scheduling\n" );
2039    return false;
2040  }
2041
2042  bool scheduleFound = false;
2043  unsigned II = 0;
2044  // Keep increasing II until a valid schedule is found.
2045  for (II = MII; II <= MAX_II && !scheduleFound; ++II) {
2046    Schedule.reset();
2047    Schedule.setInitiationInterval(II);
2048    LLVM_DEBUG(dbgs() << "Try to schedule with " << II << "\n");
2049
2050    SetVector<SUnit *>::iterator NI = NodeOrder.begin();
2051    SetVector<SUnit *>::iterator NE = NodeOrder.end();
2052    do {
2053      SUnit *SU = *NI;
2054
2055      // Compute the schedule time for the instruction, which is based
2056      // upon the scheduled time for any predecessors/successors.
2057      int EarlyStart = INT_MIN;
2058      int LateStart = INT_MAX;
2059      // These values are set when the size of the schedule window is limited
2060      // due to chain dependences.
2061      int SchedEnd = INT_MAX;
2062      int SchedStart = INT_MIN;
2063      Schedule.computeStart(SU, &EarlyStart, &LateStart, &SchedEnd, &SchedStart,
2064                            II, this);
2065      LLVM_DEBUG({
2066        dbgs() << "\n";
2067        dbgs() << "Inst (" << SU->NodeNum << ") ";
2068        SU->getInstr()->dump();
2069        dbgs() << "\n";
2070      });
2071      LLVM_DEBUG({
2072        dbgs() << format("\tes: %8x ls: %8x me: %8x ms: %8x\n", EarlyStart,
2073                         LateStart, SchedEnd, SchedStart);
2074      });
2075
2076      if (EarlyStart > LateStart || SchedEnd < EarlyStart ||
2077          SchedStart > LateStart)
2078        scheduleFound = false;
2079      else if (EarlyStart != INT_MIN && LateStart == INT_MAX) {
2080        SchedEnd = std::min(SchedEnd, EarlyStart + (int)II - 1);
2081        scheduleFound = Schedule.insert(SU, EarlyStart, SchedEnd, II);
2082      } else if (EarlyStart == INT_MIN && LateStart != INT_MAX) {
2083        SchedStart = std::max(SchedStart, LateStart - (int)II + 1);
2084        scheduleFound = Schedule.insert(SU, LateStart, SchedStart, II);
2085      } else if (EarlyStart != INT_MIN && LateStart != INT_MAX) {
2086        SchedEnd =
2087            std::min(SchedEnd, std::min(LateStart, EarlyStart + (int)II - 1));
2088        // When scheduling a Phi it is better to start at the late cycle and go
2089        // backwards. The default order may insert the Phi too far away from
2090        // its first dependence.
2091        if (SU->getInstr()->isPHI())
2092          scheduleFound = Schedule.insert(SU, SchedEnd, EarlyStart, II);
2093        else
2094          scheduleFound = Schedule.insert(SU, EarlyStart, SchedEnd, II);
2095      } else {
2096        int FirstCycle = Schedule.getFirstCycle();
2097        scheduleFound = Schedule.insert(SU, FirstCycle + getASAP(SU),
2098                                        FirstCycle + getASAP(SU) + II - 1, II);
2099      }
2100      // Even if we find a schedule, make sure the schedule doesn't exceed the
2101      // allowable number of stages. We keep trying if this happens.
2102      if (scheduleFound)
2103        if (SwpMaxStages > -1 &&
2104            Schedule.getMaxStageCount() > (unsigned)SwpMaxStages)
2105          scheduleFound = false;
2106
2107      LLVM_DEBUG({
2108        if (!scheduleFound)
2109          dbgs() << "\tCan't schedule\n";
2110      });
2111    } while (++NI != NE && scheduleFound);
2112
2113    // If a schedule is found, check if it is a valid schedule too.
2114    if (scheduleFound)
2115      scheduleFound = Schedule.isValidSchedule(this);
2116  }
2117
2118  LLVM_DEBUG(dbgs() << "Schedule Found? " << scheduleFound << " (II=" << II
2119                    << ")\n");
2120
2121  if (scheduleFound) {
2122    Schedule.finalizeSchedule(this);
2123    Pass.ORE->emit([&]() {
2124      return MachineOptimizationRemarkAnalysis(
2125                 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
2126             << "Schedule found with Initiation Interval: " << ore::NV("II", II)
2127             << ", MaxStageCount: "
2128             << ore::NV("MaxStageCount", Schedule.getMaxStageCount());
2129    });
2130  } else
2131    Schedule.reset();
2132
2133  return scheduleFound && Schedule.getMaxStageCount() > 0;
2134}
2135
2136/// Return true if we can compute the amount the instruction changes
2137/// during each iteration. Set Delta to the amount of the change.
2138bool SwingSchedulerDAG::computeDelta(MachineInstr &MI, unsigned &Delta) {
2139  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
2140  const MachineOperand *BaseOp;
2141  int64_t Offset;
2142  bool OffsetIsScalable;
2143  if (!TII->getMemOperandWithOffset(MI, BaseOp, Offset, OffsetIsScalable, TRI))
2144    return false;
2145
2146  // FIXME: This algorithm assumes instructions have fixed-size offsets.
2147  if (OffsetIsScalable)
2148    return false;
2149
2150  if (!BaseOp->isReg())
2151    return false;
2152
2153  Register BaseReg = BaseOp->getReg();
2154
2155  MachineRegisterInfo &MRI = MF.getRegInfo();
2156  // Check if there is a Phi. If so, get the definition in the loop.
2157  MachineInstr *BaseDef = MRI.getVRegDef(BaseReg);
2158  if (BaseDef && BaseDef->isPHI()) {
2159    BaseReg = getLoopPhiReg(*BaseDef, MI.getParent());
2160    BaseDef = MRI.getVRegDef(BaseReg);
2161  }
2162  if (!BaseDef)
2163    return false;
2164
2165  int D = 0;
2166  if (!TII->getIncrementValue(*BaseDef, D) && D >= 0)
2167    return false;
2168
2169  Delta = D;
2170  return true;
2171}
2172
2173/// Check if we can change the instruction to use an offset value from the
2174/// previous iteration. If so, return true and set the base and offset values
2175/// so that we can rewrite the load, if necessary.
2176///   v1 = Phi(v0, v3)
2177///   v2 = load v1, 0
2178///   v3 = post_store v1, 4, x
2179/// This function enables the load to be rewritten as v2 = load v3, 4.
2180bool SwingSchedulerDAG::canUseLastOffsetValue(MachineInstr *MI,
2181                                              unsigned &BasePos,
2182                                              unsigned &OffsetPos,
2183                                              unsigned &NewBase,
2184                                              int64_t &Offset) {
2185  // Get the load instruction.
2186  if (TII->isPostIncrement(*MI))
2187    return false;
2188  unsigned BasePosLd, OffsetPosLd;
2189  if (!TII->getBaseAndOffsetPosition(*MI, BasePosLd, OffsetPosLd))
2190    return false;
2191  Register BaseReg = MI->getOperand(BasePosLd).getReg();
2192
2193  // Look for the Phi instruction.
2194  MachineRegisterInfo &MRI = MI->getMF()->getRegInfo();
2195  MachineInstr *Phi = MRI.getVRegDef(BaseReg);
2196  if (!Phi || !Phi->isPHI())
2197    return false;
2198  // Get the register defined in the loop block.
2199  unsigned PrevReg = getLoopPhiReg(*Phi, MI->getParent());
2200  if (!PrevReg)
2201    return false;
2202
2203  // Check for the post-increment load/store instruction.
2204  MachineInstr *PrevDef = MRI.getVRegDef(PrevReg);
2205  if (!PrevDef || PrevDef == MI)
2206    return false;
2207
2208  if (!TII->isPostIncrement(*PrevDef))
2209    return false;
2210
2211  unsigned BasePos1 = 0, OffsetPos1 = 0;
2212  if (!TII->getBaseAndOffsetPosition(*PrevDef, BasePos1, OffsetPos1))
2213    return false;
2214
2215  // Make sure that the instructions do not access the same memory location in
2216  // the next iteration.
2217  int64_t LoadOffset = MI->getOperand(OffsetPosLd).getImm();
2218  int64_t StoreOffset = PrevDef->getOperand(OffsetPos1).getImm();
2219  MachineInstr *NewMI = MF.CloneMachineInstr(MI);
2220  NewMI->getOperand(OffsetPosLd).setImm(LoadOffset + StoreOffset);
2221  bool Disjoint = TII->areMemAccessesTriviallyDisjoint(*NewMI, *PrevDef);
2222  MF.DeleteMachineInstr(NewMI);
2223  if (!Disjoint)
2224    return false;
2225
2226  // Set the return value once we determine that we return true.
2227  BasePos = BasePosLd;
2228  OffsetPos = OffsetPosLd;
2229  NewBase = PrevReg;
2230  Offset = StoreOffset;
2231  return true;
2232}
2233
2234/// Apply changes to the instruction if needed. The changes are need
2235/// to improve the scheduling and depend up on the final schedule.
2236void SwingSchedulerDAG::applyInstrChange(MachineInstr *MI,
2237                                         SMSchedule &Schedule) {
2238  SUnit *SU = getSUnit(MI);
2239  DenseMap<SUnit *, std::pair<unsigned, int64_t>>::iterator It =
2240      InstrChanges.find(SU);
2241  if (It != InstrChanges.end()) {
2242    std::pair<unsigned, int64_t> RegAndOffset = It->second;
2243    unsigned BasePos, OffsetPos;
2244    if (!TII->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos))
2245      return;
2246    Register BaseReg = MI->getOperand(BasePos).getReg();
2247    MachineInstr *LoopDef = findDefInLoop(BaseReg);
2248    int DefStageNum = Schedule.stageScheduled(getSUnit(LoopDef));
2249    int DefCycleNum = Schedule.cycleScheduled(getSUnit(LoopDef));
2250    int BaseStageNum = Schedule.stageScheduled(SU);
2251    int BaseCycleNum = Schedule.cycleScheduled(SU);
2252    if (BaseStageNum < DefStageNum) {
2253      MachineInstr *NewMI = MF.CloneMachineInstr(MI);
2254      int OffsetDiff = DefStageNum - BaseStageNum;
2255      if (DefCycleNum < BaseCycleNum) {
2256        NewMI->getOperand(BasePos).setReg(RegAndOffset.first);
2257        if (OffsetDiff > 0)
2258          --OffsetDiff;
2259      }
2260      int64_t NewOffset =
2261          MI->getOperand(OffsetPos).getImm() + RegAndOffset.second * OffsetDiff;
2262      NewMI->getOperand(OffsetPos).setImm(NewOffset);
2263      SU->setInstr(NewMI);
2264      MISUnitMap[NewMI] = SU;
2265      NewMIs[MI] = NewMI;
2266    }
2267  }
2268}
2269
2270/// Return the instruction in the loop that defines the register.
2271/// If the definition is a Phi, then follow the Phi operand to
2272/// the instruction in the loop.
2273MachineInstr *SwingSchedulerDAG::findDefInLoop(unsigned Reg) {
2274  SmallPtrSet<MachineInstr *, 8> Visited;
2275  MachineInstr *Def = MRI.getVRegDef(Reg);
2276  while (Def->isPHI()) {
2277    if (!Visited.insert(Def).second)
2278      break;
2279    for (unsigned i = 1, e = Def->getNumOperands(); i < e; i += 2)
2280      if (Def->getOperand(i + 1).getMBB() == BB) {
2281        Def = MRI.getVRegDef(Def->getOperand(i).getReg());
2282        break;
2283      }
2284  }
2285  return Def;
2286}
2287
2288/// Return true for an order or output dependence that is loop carried
2289/// potentially. A dependence is loop carried if the destination defines a valu
2290/// that may be used or defined by the source in a subsequent iteration.
2291bool SwingSchedulerDAG::isLoopCarriedDep(SUnit *Source, const SDep &Dep,
2292                                         bool isSucc) {
2293  if ((Dep.getKind() != SDep::Order && Dep.getKind() != SDep::Output) ||
2294      Dep.isArtificial())
2295    return false;
2296
2297  if (!SwpPruneLoopCarried)
2298    return true;
2299
2300  if (Dep.getKind() == SDep::Output)
2301    return true;
2302
2303  MachineInstr *SI = Source->getInstr();
2304  MachineInstr *DI = Dep.getSUnit()->getInstr();
2305  if (!isSucc)
2306    std::swap(SI, DI);
2307  assert(SI != nullptr && DI != nullptr && "Expecting SUnit with an MI.");
2308
2309  // Assume ordered loads and stores may have a loop carried dependence.
2310  if (SI->hasUnmodeledSideEffects() || DI->hasUnmodeledSideEffects() ||
2311      SI->mayRaiseFPException() || DI->mayRaiseFPException() ||
2312      SI->hasOrderedMemoryRef() || DI->hasOrderedMemoryRef())
2313    return true;
2314
2315  // Only chain dependences between a load and store can be loop carried.
2316  if (!DI->mayStore() || !SI->mayLoad())
2317    return false;
2318
2319  unsigned DeltaS, DeltaD;
2320  if (!computeDelta(*SI, DeltaS) || !computeDelta(*DI, DeltaD))
2321    return true;
2322
2323  const MachineOperand *BaseOpS, *BaseOpD;
2324  int64_t OffsetS, OffsetD;
2325  bool OffsetSIsScalable, OffsetDIsScalable;
2326  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
2327  if (!TII->getMemOperandWithOffset(*SI, BaseOpS, OffsetS, OffsetSIsScalable,
2328                                    TRI) ||
2329      !TII->getMemOperandWithOffset(*DI, BaseOpD, OffsetD, OffsetDIsScalable,
2330                                    TRI))
2331    return true;
2332
2333  assert(!OffsetSIsScalable && !OffsetDIsScalable &&
2334         "Expected offsets to be byte offsets");
2335
2336  if (!BaseOpS->isIdenticalTo(*BaseOpD))
2337    return true;
2338
2339  // Check that the base register is incremented by a constant value for each
2340  // iteration.
2341  MachineInstr *Def = MRI.getVRegDef(BaseOpS->getReg());
2342  if (!Def || !Def->isPHI())
2343    return true;
2344  unsigned InitVal = 0;
2345  unsigned LoopVal = 0;
2346  getPhiRegs(*Def, BB, InitVal, LoopVal);
2347  MachineInstr *LoopDef = MRI.getVRegDef(LoopVal);
2348  int D = 0;
2349  if (!LoopDef || !TII->getIncrementValue(*LoopDef, D))
2350    return true;
2351
2352  uint64_t AccessSizeS = (*SI->memoperands_begin())->getSize();
2353  uint64_t AccessSizeD = (*DI->memoperands_begin())->getSize();
2354
2355  // This is the main test, which checks the offset values and the loop
2356  // increment value to determine if the accesses may be loop carried.
2357  if (AccessSizeS == MemoryLocation::UnknownSize ||
2358      AccessSizeD == MemoryLocation::UnknownSize)
2359    return true;
2360
2361  if (DeltaS != DeltaD || DeltaS < AccessSizeS || DeltaD < AccessSizeD)
2362    return true;
2363
2364  return (OffsetS + (int64_t)AccessSizeS < OffsetD + (int64_t)AccessSizeD);
2365}
2366
2367void SwingSchedulerDAG::postprocessDAG() {
2368  for (auto &M : Mutations)
2369    M->apply(this);
2370}
2371
2372/// Try to schedule the node at the specified StartCycle and continue
2373/// until the node is schedule or the EndCycle is reached.  This function
2374/// returns true if the node is scheduled.  This routine may search either
2375/// forward or backward for a place to insert the instruction based upon
2376/// the relative values of StartCycle and EndCycle.
2377bool SMSchedule::insert(SUnit *SU, int StartCycle, int EndCycle, int II) {
2378  bool forward = true;
2379  LLVM_DEBUG({
2380    dbgs() << "Trying to insert node between " << StartCycle << " and "
2381           << EndCycle << " II: " << II << "\n";
2382  });
2383  if (StartCycle > EndCycle)
2384    forward = false;
2385
2386  // The terminating condition depends on the direction.
2387  int termCycle = forward ? EndCycle + 1 : EndCycle - 1;
2388  for (int curCycle = StartCycle; curCycle != termCycle;
2389       forward ? ++curCycle : --curCycle) {
2390
2391    // Add the already scheduled instructions at the specified cycle to the
2392    // DFA.
2393    ProcItinResources.clearResources();
2394    for (int checkCycle = FirstCycle + ((curCycle - FirstCycle) % II);
2395         checkCycle <= LastCycle; checkCycle += II) {
2396      std::deque<SUnit *> &cycleInstrs = ScheduledInstrs[checkCycle];
2397
2398      for (std::deque<SUnit *>::iterator I = cycleInstrs.begin(),
2399                                         E = cycleInstrs.end();
2400           I != E; ++I) {
2401        if (ST.getInstrInfo()->isZeroCost((*I)->getInstr()->getOpcode()))
2402          continue;
2403        assert(ProcItinResources.canReserveResources(*(*I)->getInstr()) &&
2404               "These instructions have already been scheduled.");
2405        ProcItinResources.reserveResources(*(*I)->getInstr());
2406      }
2407    }
2408    if (ST.getInstrInfo()->isZeroCost(SU->getInstr()->getOpcode()) ||
2409        ProcItinResources.canReserveResources(*SU->getInstr())) {
2410      LLVM_DEBUG({
2411        dbgs() << "\tinsert at cycle " << curCycle << " ";
2412        SU->getInstr()->dump();
2413      });
2414
2415      ScheduledInstrs[curCycle].push_back(SU);
2416      InstrToCycle.insert(std::make_pair(SU, curCycle));
2417      if (curCycle > LastCycle)
2418        LastCycle = curCycle;
2419      if (curCycle < FirstCycle)
2420        FirstCycle = curCycle;
2421      return true;
2422    }
2423    LLVM_DEBUG({
2424      dbgs() << "\tfailed to insert at cycle " << curCycle << " ";
2425      SU->getInstr()->dump();
2426    });
2427  }
2428  return false;
2429}
2430
2431// Return the cycle of the earliest scheduled instruction in the chain.
2432int SMSchedule::earliestCycleInChain(const SDep &Dep) {
2433  SmallPtrSet<SUnit *, 8> Visited;
2434  SmallVector<SDep, 8> Worklist;
2435  Worklist.push_back(Dep);
2436  int EarlyCycle = INT_MAX;
2437  while (!Worklist.empty()) {
2438    const SDep &Cur = Worklist.pop_back_val();
2439    SUnit *PrevSU = Cur.getSUnit();
2440    if (Visited.count(PrevSU))
2441      continue;
2442    std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(PrevSU);
2443    if (it == InstrToCycle.end())
2444      continue;
2445    EarlyCycle = std::min(EarlyCycle, it->second);
2446    for (const auto &PI : PrevSU->Preds)
2447      if (PI.getKind() == SDep::Order || PI.getKind() == SDep::Output)
2448        Worklist.push_back(PI);
2449    Visited.insert(PrevSU);
2450  }
2451  return EarlyCycle;
2452}
2453
2454// Return the cycle of the latest scheduled instruction in the chain.
2455int SMSchedule::latestCycleInChain(const SDep &Dep) {
2456  SmallPtrSet<SUnit *, 8> Visited;
2457  SmallVector<SDep, 8> Worklist;
2458  Worklist.push_back(Dep);
2459  int LateCycle = INT_MIN;
2460  while (!Worklist.empty()) {
2461    const SDep &Cur = Worklist.pop_back_val();
2462    SUnit *SuccSU = Cur.getSUnit();
2463    if (Visited.count(SuccSU))
2464      continue;
2465    std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(SuccSU);
2466    if (it == InstrToCycle.end())
2467      continue;
2468    LateCycle = std::max(LateCycle, it->second);
2469    for (const auto &SI : SuccSU->Succs)
2470      if (SI.getKind() == SDep::Order || SI.getKind() == SDep::Output)
2471        Worklist.push_back(SI);
2472    Visited.insert(SuccSU);
2473  }
2474  return LateCycle;
2475}
2476
2477/// If an instruction has a use that spans multiple iterations, then
2478/// return true. These instructions are characterized by having a back-ege
2479/// to a Phi, which contains a reference to another Phi.
2480static SUnit *multipleIterations(SUnit *SU, SwingSchedulerDAG *DAG) {
2481  for (auto &P : SU->Preds)
2482    if (DAG->isBackedge(SU, P) && P.getSUnit()->getInstr()->isPHI())
2483      for (auto &S : P.getSUnit()->Succs)
2484        if (S.getKind() == SDep::Data && S.getSUnit()->getInstr()->isPHI())
2485          return P.getSUnit();
2486  return nullptr;
2487}
2488
2489/// Compute the scheduling start slot for the instruction.  The start slot
2490/// depends on any predecessor or successor nodes scheduled already.
2491void SMSchedule::computeStart(SUnit *SU, int *MaxEarlyStart, int *MinLateStart,
2492                              int *MinEnd, int *MaxStart, int II,
2493                              SwingSchedulerDAG *DAG) {
2494  // Iterate over each instruction that has been scheduled already.  The start
2495  // slot computation depends on whether the previously scheduled instruction
2496  // is a predecessor or successor of the specified instruction.
2497  for (int cycle = getFirstCycle(); cycle <= LastCycle; ++cycle) {
2498
2499    // Iterate over each instruction in the current cycle.
2500    for (SUnit *I : getInstructions(cycle)) {
2501      // Because we're processing a DAG for the dependences, we recognize
2502      // the back-edge in recurrences by anti dependences.
2503      for (unsigned i = 0, e = (unsigned)SU->Preds.size(); i != e; ++i) {
2504        const SDep &Dep = SU->Preds[i];
2505        if (Dep.getSUnit() == I) {
2506          if (!DAG->isBackedge(SU, Dep)) {
2507            int EarlyStart = cycle + Dep.getLatency() -
2508                             DAG->getDistance(Dep.getSUnit(), SU, Dep) * II;
2509            *MaxEarlyStart = std::max(*MaxEarlyStart, EarlyStart);
2510            if (DAG->isLoopCarriedDep(SU, Dep, false)) {
2511              int End = earliestCycleInChain(Dep) + (II - 1);
2512              *MinEnd = std::min(*MinEnd, End);
2513            }
2514          } else {
2515            int LateStart = cycle - Dep.getLatency() +
2516                            DAG->getDistance(SU, Dep.getSUnit(), Dep) * II;
2517            *MinLateStart = std::min(*MinLateStart, LateStart);
2518          }
2519        }
2520        // For instruction that requires multiple iterations, make sure that
2521        // the dependent instruction is not scheduled past the definition.
2522        SUnit *BE = multipleIterations(I, DAG);
2523        if (BE && Dep.getSUnit() == BE && !SU->getInstr()->isPHI() &&
2524            !SU->isPred(I))
2525          *MinLateStart = std::min(*MinLateStart, cycle);
2526      }
2527      for (unsigned i = 0, e = (unsigned)SU->Succs.size(); i != e; ++i) {
2528        if (SU->Succs[i].getSUnit() == I) {
2529          const SDep &Dep = SU->Succs[i];
2530          if (!DAG->isBackedge(SU, Dep)) {
2531            int LateStart = cycle - Dep.getLatency() +
2532                            DAG->getDistance(SU, Dep.getSUnit(), Dep) * II;
2533            *MinLateStart = std::min(*MinLateStart, LateStart);
2534            if (DAG->isLoopCarriedDep(SU, Dep)) {
2535              int Start = latestCycleInChain(Dep) + 1 - II;
2536              *MaxStart = std::max(*MaxStart, Start);
2537            }
2538          } else {
2539            int EarlyStart = cycle + Dep.getLatency() -
2540                             DAG->getDistance(Dep.getSUnit(), SU, Dep) * II;
2541            *MaxEarlyStart = std::max(*MaxEarlyStart, EarlyStart);
2542          }
2543        }
2544      }
2545    }
2546  }
2547}
2548
2549/// Order the instructions within a cycle so that the definitions occur
2550/// before the uses. Returns true if the instruction is added to the start
2551/// of the list, or false if added to the end.
2552void SMSchedule::orderDependence(SwingSchedulerDAG *SSD, SUnit *SU,
2553                                 std::deque<SUnit *> &Insts) {
2554  MachineInstr *MI = SU->getInstr();
2555  bool OrderBeforeUse = false;
2556  bool OrderAfterDef = false;
2557  bool OrderBeforeDef = false;
2558  unsigned MoveDef = 0;
2559  unsigned MoveUse = 0;
2560  int StageInst1 = stageScheduled(SU);
2561
2562  unsigned Pos = 0;
2563  for (std::deque<SUnit *>::iterator I = Insts.begin(), E = Insts.end(); I != E;
2564       ++I, ++Pos) {
2565    for (unsigned i = 0, e = MI->getNumOperands(); i < e; ++i) {
2566      MachineOperand &MO = MI->getOperand(i);
2567      if (!MO.isReg() || !Register::isVirtualRegister(MO.getReg()))
2568        continue;
2569
2570      Register Reg = MO.getReg();
2571      unsigned BasePos, OffsetPos;
2572      if (ST.getInstrInfo()->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos))
2573        if (MI->getOperand(BasePos).getReg() == Reg)
2574          if (unsigned NewReg = SSD->getInstrBaseReg(SU))
2575            Reg = NewReg;
2576      bool Reads, Writes;
2577      std::tie(Reads, Writes) =
2578          (*I)->getInstr()->readsWritesVirtualRegister(Reg);
2579      if (MO.isDef() && Reads && stageScheduled(*I) <= StageInst1) {
2580        OrderBeforeUse = true;
2581        if (MoveUse == 0)
2582          MoveUse = Pos;
2583      } else if (MO.isDef() && Reads && stageScheduled(*I) > StageInst1) {
2584        // Add the instruction after the scheduled instruction.
2585        OrderAfterDef = true;
2586        MoveDef = Pos;
2587      } else if (MO.isUse() && Writes && stageScheduled(*I) == StageInst1) {
2588        if (cycleScheduled(*I) == cycleScheduled(SU) && !(*I)->isSucc(SU)) {
2589          OrderBeforeUse = true;
2590          if (MoveUse == 0)
2591            MoveUse = Pos;
2592        } else {
2593          OrderAfterDef = true;
2594          MoveDef = Pos;
2595        }
2596      } else if (MO.isUse() && Writes && stageScheduled(*I) > StageInst1) {
2597        OrderBeforeUse = true;
2598        if (MoveUse == 0)
2599          MoveUse = Pos;
2600        if (MoveUse != 0) {
2601          OrderAfterDef = true;
2602          MoveDef = Pos - 1;
2603        }
2604      } else if (MO.isUse() && Writes && stageScheduled(*I) < StageInst1) {
2605        // Add the instruction before the scheduled instruction.
2606        OrderBeforeUse = true;
2607        if (MoveUse == 0)
2608          MoveUse = Pos;
2609      } else if (MO.isUse() && stageScheduled(*I) == StageInst1 &&
2610                 isLoopCarriedDefOfUse(SSD, (*I)->getInstr(), MO)) {
2611        if (MoveUse == 0) {
2612          OrderBeforeDef = true;
2613          MoveUse = Pos;
2614        }
2615      }
2616    }
2617    // Check for order dependences between instructions. Make sure the source
2618    // is ordered before the destination.
2619    for (auto &S : SU->Succs) {
2620      if (S.getSUnit() != *I)
2621        continue;
2622      if (S.getKind() == SDep::Order && stageScheduled(*I) == StageInst1) {
2623        OrderBeforeUse = true;
2624        if (Pos < MoveUse)
2625          MoveUse = Pos;
2626      }
2627      // We did not handle HW dependences in previous for loop,
2628      // and we normally set Latency = 0 for Anti deps,
2629      // so may have nodes in same cycle with Anti denpendent on HW regs.
2630      else if (S.getKind() == SDep::Anti && stageScheduled(*I) == StageInst1) {
2631        OrderBeforeUse = true;
2632        if ((MoveUse == 0) || (Pos < MoveUse))
2633          MoveUse = Pos;
2634      }
2635    }
2636    for (auto &P : SU->Preds) {
2637      if (P.getSUnit() != *I)
2638        continue;
2639      if (P.getKind() == SDep::Order && stageScheduled(*I) == StageInst1) {
2640        OrderAfterDef = true;
2641        MoveDef = Pos;
2642      }
2643    }
2644  }
2645
2646  // A circular dependence.
2647  if (OrderAfterDef && OrderBeforeUse && MoveUse == MoveDef)
2648    OrderBeforeUse = false;
2649
2650  // OrderAfterDef takes precedences over OrderBeforeDef. The latter is due
2651  // to a loop-carried dependence.
2652  if (OrderBeforeDef)
2653    OrderBeforeUse = !OrderAfterDef || (MoveUse > MoveDef);
2654
2655  // The uncommon case when the instruction order needs to be updated because
2656  // there is both a use and def.
2657  if (OrderBeforeUse && OrderAfterDef) {
2658    SUnit *UseSU = Insts.at(MoveUse);
2659    SUnit *DefSU = Insts.at(MoveDef);
2660    if (MoveUse > MoveDef) {
2661      Insts.erase(Insts.begin() + MoveUse);
2662      Insts.erase(Insts.begin() + MoveDef);
2663    } else {
2664      Insts.erase(Insts.begin() + MoveDef);
2665      Insts.erase(Insts.begin() + MoveUse);
2666    }
2667    orderDependence(SSD, UseSU, Insts);
2668    orderDependence(SSD, SU, Insts);
2669    orderDependence(SSD, DefSU, Insts);
2670    return;
2671  }
2672  // Put the new instruction first if there is a use in the list. Otherwise,
2673  // put it at the end of the list.
2674  if (OrderBeforeUse)
2675    Insts.push_front(SU);
2676  else
2677    Insts.push_back(SU);
2678}
2679
2680/// Return true if the scheduled Phi has a loop carried operand.
2681bool SMSchedule::isLoopCarried(SwingSchedulerDAG *SSD, MachineInstr &Phi) {
2682  if (!Phi.isPHI())
2683    return false;
2684  assert(Phi.isPHI() && "Expecting a Phi.");
2685  SUnit *DefSU = SSD->getSUnit(&Phi);
2686  unsigned DefCycle = cycleScheduled(DefSU);
2687  int DefStage = stageScheduled(DefSU);
2688
2689  unsigned InitVal = 0;
2690  unsigned LoopVal = 0;
2691  getPhiRegs(Phi, Phi.getParent(), InitVal, LoopVal);
2692  SUnit *UseSU = SSD->getSUnit(MRI.getVRegDef(LoopVal));
2693  if (!UseSU)
2694    return true;
2695  if (UseSU->getInstr()->isPHI())
2696    return true;
2697  unsigned LoopCycle = cycleScheduled(UseSU);
2698  int LoopStage = stageScheduled(UseSU);
2699  return (LoopCycle > DefCycle) || (LoopStage <= DefStage);
2700}
2701
2702/// Return true if the instruction is a definition that is loop carried
2703/// and defines the use on the next iteration.
2704///        v1 = phi(v2, v3)
2705///  (Def) v3 = op v1
2706///  (MO)   = v1
2707/// If MO appears before Def, then then v1 and v3 may get assigned to the same
2708/// register.
2709bool SMSchedule::isLoopCarriedDefOfUse(SwingSchedulerDAG *SSD,
2710                                       MachineInstr *Def, MachineOperand &MO) {
2711  if (!MO.isReg())
2712    return false;
2713  if (Def->isPHI())
2714    return false;
2715  MachineInstr *Phi = MRI.getVRegDef(MO.getReg());
2716  if (!Phi || !Phi->isPHI() || Phi->getParent() != Def->getParent())
2717    return false;
2718  if (!isLoopCarried(SSD, *Phi))
2719    return false;
2720  unsigned LoopReg = getLoopPhiReg(*Phi, Phi->getParent());
2721  for (unsigned i = 0, e = Def->getNumOperands(); i != e; ++i) {
2722    MachineOperand &DMO = Def->getOperand(i);
2723    if (!DMO.isReg() || !DMO.isDef())
2724      continue;
2725    if (DMO.getReg() == LoopReg)
2726      return true;
2727  }
2728  return false;
2729}
2730
2731// Check if the generated schedule is valid. This function checks if
2732// an instruction that uses a physical register is scheduled in a
2733// different stage than the definition. The pipeliner does not handle
2734// physical register values that may cross a basic block boundary.
2735bool SMSchedule::isValidSchedule(SwingSchedulerDAG *SSD) {
2736  for (int i = 0, e = SSD->SUnits.size(); i < e; ++i) {
2737    SUnit &SU = SSD->SUnits[i];
2738    if (!SU.hasPhysRegDefs)
2739      continue;
2740    int StageDef = stageScheduled(&SU);
2741    assert(StageDef != -1 && "Instruction should have been scheduled.");
2742    for (auto &SI : SU.Succs)
2743      if (SI.isAssignedRegDep())
2744        if (Register::isPhysicalRegister(SI.getReg()))
2745          if (stageScheduled(SI.getSUnit()) != StageDef)
2746            return false;
2747  }
2748  return true;
2749}
2750
2751/// A property of the node order in swing-modulo-scheduling is
2752/// that for nodes outside circuits the following holds:
2753/// none of them is scheduled after both a successor and a
2754/// predecessor.
2755/// The method below checks whether the property is met.
2756/// If not, debug information is printed and statistics information updated.
2757/// Note that we do not use an assert statement.
2758/// The reason is that although an invalid node oder may prevent
2759/// the pipeliner from finding a pipelined schedule for arbitrary II,
2760/// it does not lead to the generation of incorrect code.
2761void SwingSchedulerDAG::checkValidNodeOrder(const NodeSetType &Circuits) const {
2762
2763  // a sorted vector that maps each SUnit to its index in the NodeOrder
2764  typedef std::pair<SUnit *, unsigned> UnitIndex;
2765  std::vector<UnitIndex> Indices(NodeOrder.size(), std::make_pair(nullptr, 0));
2766
2767  for (unsigned i = 0, s = NodeOrder.size(); i < s; ++i)
2768    Indices.push_back(std::make_pair(NodeOrder[i], i));
2769
2770  auto CompareKey = [](UnitIndex i1, UnitIndex i2) {
2771    return std::get<0>(i1) < std::get<0>(i2);
2772  };
2773
2774  // sort, so that we can perform a binary search
2775  llvm::sort(Indices, CompareKey);
2776
2777  bool Valid = true;
2778  (void)Valid;
2779  // for each SUnit in the NodeOrder, check whether
2780  // it appears after both a successor and a predecessor
2781  // of the SUnit. If this is the case, and the SUnit
2782  // is not part of circuit, then the NodeOrder is not
2783  // valid.
2784  for (unsigned i = 0, s = NodeOrder.size(); i < s; ++i) {
2785    SUnit *SU = NodeOrder[i];
2786    unsigned Index = i;
2787
2788    bool PredBefore = false;
2789    bool SuccBefore = false;
2790
2791    SUnit *Succ;
2792    SUnit *Pred;
2793    (void)Succ;
2794    (void)Pred;
2795
2796    for (SDep &PredEdge : SU->Preds) {
2797      SUnit *PredSU = PredEdge.getSUnit();
2798      unsigned PredIndex = std::get<1>(
2799          *llvm::lower_bound(Indices, std::make_pair(PredSU, 0), CompareKey));
2800      if (!PredSU->getInstr()->isPHI() && PredIndex < Index) {
2801        PredBefore = true;
2802        Pred = PredSU;
2803        break;
2804      }
2805    }
2806
2807    for (SDep &SuccEdge : SU->Succs) {
2808      SUnit *SuccSU = SuccEdge.getSUnit();
2809      // Do not process a boundary node, it was not included in NodeOrder,
2810      // hence not in Indices either, call to std::lower_bound() below will
2811      // return Indices.end().
2812      if (SuccSU->isBoundaryNode())
2813        continue;
2814      unsigned SuccIndex = std::get<1>(
2815          *llvm::lower_bound(Indices, std::make_pair(SuccSU, 0), CompareKey));
2816      if (!SuccSU->getInstr()->isPHI() && SuccIndex < Index) {
2817        SuccBefore = true;
2818        Succ = SuccSU;
2819        break;
2820      }
2821    }
2822
2823    if (PredBefore && SuccBefore && !SU->getInstr()->isPHI()) {
2824      // instructions in circuits are allowed to be scheduled
2825      // after both a successor and predecessor.
2826      bool InCircuit = llvm::any_of(
2827          Circuits, [SU](const NodeSet &Circuit) { return Circuit.count(SU); });
2828      if (InCircuit)
2829        LLVM_DEBUG(dbgs() << "In a circuit, predecessor ";);
2830      else {
2831        Valid = false;
2832        NumNodeOrderIssues++;
2833        LLVM_DEBUG(dbgs() << "Predecessor ";);
2834      }
2835      LLVM_DEBUG(dbgs() << Pred->NodeNum << " and successor " << Succ->NodeNum
2836                        << " are scheduled before node " << SU->NodeNum
2837                        << "\n";);
2838    }
2839  }
2840
2841  LLVM_DEBUG({
2842    if (!Valid)
2843      dbgs() << "Invalid node order found!\n";
2844  });
2845}
2846
2847/// Attempt to fix the degenerate cases when the instruction serialization
2848/// causes the register lifetimes to overlap. For example,
2849///   p' = store_pi(p, b)
2850///      = load p, offset
2851/// In this case p and p' overlap, which means that two registers are needed.
2852/// Instead, this function changes the load to use p' and updates the offset.
2853void SwingSchedulerDAG::fixupRegisterOverlaps(std::deque<SUnit *> &Instrs) {
2854  unsigned OverlapReg = 0;
2855  unsigned NewBaseReg = 0;
2856  for (SUnit *SU : Instrs) {
2857    MachineInstr *MI = SU->getInstr();
2858    for (unsigned i = 0, e = MI->getNumOperands(); i < e; ++i) {
2859      const MachineOperand &MO = MI->getOperand(i);
2860      // Look for an instruction that uses p. The instruction occurs in the
2861      // same cycle but occurs later in the serialized order.
2862      if (MO.isReg() && MO.isUse() && MO.getReg() == OverlapReg) {
2863        // Check that the instruction appears in the InstrChanges structure,
2864        // which contains instructions that can have the offset updated.
2865        DenseMap<SUnit *, std::pair<unsigned, int64_t>>::iterator It =
2866          InstrChanges.find(SU);
2867        if (It != InstrChanges.end()) {
2868          unsigned BasePos, OffsetPos;
2869          // Update the base register and adjust the offset.
2870          if (TII->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos)) {
2871            MachineInstr *NewMI = MF.CloneMachineInstr(MI);
2872            NewMI->getOperand(BasePos).setReg(NewBaseReg);
2873            int64_t NewOffset =
2874                MI->getOperand(OffsetPos).getImm() - It->second.second;
2875            NewMI->getOperand(OffsetPos).setImm(NewOffset);
2876            SU->setInstr(NewMI);
2877            MISUnitMap[NewMI] = SU;
2878            NewMIs[MI] = NewMI;
2879          }
2880        }
2881        OverlapReg = 0;
2882        NewBaseReg = 0;
2883        break;
2884      }
2885      // Look for an instruction of the form p' = op(p), which uses and defines
2886      // two virtual registers that get allocated to the same physical register.
2887      unsigned TiedUseIdx = 0;
2888      if (MI->isRegTiedToUseOperand(i, &TiedUseIdx)) {
2889        // OverlapReg is p in the example above.
2890        OverlapReg = MI->getOperand(TiedUseIdx).getReg();
2891        // NewBaseReg is p' in the example above.
2892        NewBaseReg = MI->getOperand(i).getReg();
2893        break;
2894      }
2895    }
2896  }
2897}
2898
2899/// After the schedule has been formed, call this function to combine
2900/// the instructions from the different stages/cycles.  That is, this
2901/// function creates a schedule that represents a single iteration.
2902void SMSchedule::finalizeSchedule(SwingSchedulerDAG *SSD) {
2903  // Move all instructions to the first stage from later stages.
2904  for (int cycle = getFirstCycle(); cycle <= getFinalCycle(); ++cycle) {
2905    for (int stage = 1, lastStage = getMaxStageCount(); stage <= lastStage;
2906         ++stage) {
2907      std::deque<SUnit *> &cycleInstrs =
2908          ScheduledInstrs[cycle + (stage * InitiationInterval)];
2909      for (std::deque<SUnit *>::reverse_iterator I = cycleInstrs.rbegin(),
2910                                                 E = cycleInstrs.rend();
2911           I != E; ++I)
2912        ScheduledInstrs[cycle].push_front(*I);
2913    }
2914  }
2915
2916  // Erase all the elements in the later stages. Only one iteration should
2917  // remain in the scheduled list, and it contains all the instructions.
2918  for (int cycle = getFinalCycle() + 1; cycle <= LastCycle; ++cycle)
2919    ScheduledInstrs.erase(cycle);
2920
2921  // Change the registers in instruction as specified in the InstrChanges
2922  // map. We need to use the new registers to create the correct order.
2923  for (int i = 0, e = SSD->SUnits.size(); i != e; ++i) {
2924    SUnit *SU = &SSD->SUnits[i];
2925    SSD->applyInstrChange(SU->getInstr(), *this);
2926  }
2927
2928  // Reorder the instructions in each cycle to fix and improve the
2929  // generated code.
2930  for (int Cycle = getFirstCycle(), E = getFinalCycle(); Cycle <= E; ++Cycle) {
2931    std::deque<SUnit *> &cycleInstrs = ScheduledInstrs[Cycle];
2932    std::deque<SUnit *> newOrderPhi;
2933    for (unsigned i = 0, e = cycleInstrs.size(); i < e; ++i) {
2934      SUnit *SU = cycleInstrs[i];
2935      if (SU->getInstr()->isPHI())
2936        newOrderPhi.push_back(SU);
2937    }
2938    std::deque<SUnit *> newOrderI;
2939    for (unsigned i = 0, e = cycleInstrs.size(); i < e; ++i) {
2940      SUnit *SU = cycleInstrs[i];
2941      if (!SU->getInstr()->isPHI())
2942        orderDependence(SSD, SU, newOrderI);
2943    }
2944    // Replace the old order with the new order.
2945    cycleInstrs.swap(newOrderPhi);
2946    cycleInstrs.insert(cycleInstrs.end(), newOrderI.begin(), newOrderI.end());
2947    SSD->fixupRegisterOverlaps(cycleInstrs);
2948  }
2949
2950  LLVM_DEBUG(dump(););
2951}
2952
2953void NodeSet::print(raw_ostream &os) const {
2954  os << "Num nodes " << size() << " rec " << RecMII << " mov " << MaxMOV
2955     << " depth " << MaxDepth << " col " << Colocate << "\n";
2956  for (const auto &I : Nodes)
2957    os << "   SU(" << I->NodeNum << ") " << *(I->getInstr());
2958  os << "\n";
2959}
2960
2961#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2962/// Print the schedule information to the given output.
2963void SMSchedule::print(raw_ostream &os) const {
2964  // Iterate over each cycle.
2965  for (int cycle = getFirstCycle(); cycle <= getFinalCycle(); ++cycle) {
2966    // Iterate over each instruction in the cycle.
2967    const_sched_iterator cycleInstrs = ScheduledInstrs.find(cycle);
2968    for (SUnit *CI : cycleInstrs->second) {
2969      os << "cycle " << cycle << " (" << stageScheduled(CI) << ") ";
2970      os << "(" << CI->NodeNum << ") ";
2971      CI->getInstr()->print(os);
2972      os << "\n";
2973    }
2974  }
2975}
2976
2977/// Utility function used for debugging to print the schedule.
2978LLVM_DUMP_METHOD void SMSchedule::dump() const { print(dbgs()); }
2979LLVM_DUMP_METHOD void NodeSet::dump() const { print(dbgs()); }
2980
2981#endif
2982
2983void ResourceManager::initProcResourceVectors(
2984    const MCSchedModel &SM, SmallVectorImpl<uint64_t> &Masks) {
2985  unsigned ProcResourceID = 0;
2986
2987  // We currently limit the resource kinds to 64 and below so that we can use
2988  // uint64_t for Masks
2989  assert(SM.getNumProcResourceKinds() < 64 &&
2990         "Too many kinds of resources, unsupported");
2991  // Create a unique bitmask for every processor resource unit.
2992  // Skip resource at index 0, since it always references 'InvalidUnit'.
2993  Masks.resize(SM.getNumProcResourceKinds());
2994  for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
2995    const MCProcResourceDesc &Desc = *SM.getProcResource(I);
2996    if (Desc.SubUnitsIdxBegin)
2997      continue;
2998    Masks[I] = 1ULL << ProcResourceID;
2999    ProcResourceID++;
3000  }
3001  // Create a unique bitmask for every processor resource group.
3002  for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3003    const MCProcResourceDesc &Desc = *SM.getProcResource(I);
3004    if (!Desc.SubUnitsIdxBegin)
3005      continue;
3006    Masks[I] = 1ULL << ProcResourceID;
3007    for (unsigned U = 0; U < Desc.NumUnits; ++U)
3008      Masks[I] |= Masks[Desc.SubUnitsIdxBegin[U]];
3009    ProcResourceID++;
3010  }
3011  LLVM_DEBUG({
3012    if (SwpShowResMask) {
3013      dbgs() << "ProcResourceDesc:\n";
3014      for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3015        const MCProcResourceDesc *ProcResource = SM.getProcResource(I);
3016        dbgs() << format(" %16s(%2d): Mask: 0x%08x, NumUnits:%2d\n",
3017                         ProcResource->Name, I, Masks[I],
3018                         ProcResource->NumUnits);
3019      }
3020      dbgs() << " -----------------\n";
3021    }
3022  });
3023}
3024
3025bool ResourceManager::canReserveResources(const MCInstrDesc *MID) const {
3026
3027  LLVM_DEBUG({
3028    if (SwpDebugResource)
3029      dbgs() << "canReserveResources:\n";
3030  });
3031  if (UseDFA)
3032    return DFAResources->canReserveResources(MID);
3033
3034  unsigned InsnClass = MID->getSchedClass();
3035  const MCSchedClassDesc *SCDesc = SM.getSchedClassDesc(InsnClass);
3036  if (!SCDesc->isValid()) {
3037    LLVM_DEBUG({
3038      dbgs() << "No valid Schedule Class Desc for schedClass!\n";
3039      dbgs() << "isPseduo:" << MID->isPseudo() << "\n";
3040    });
3041    return true;
3042  }
3043
3044  const MCWriteProcResEntry *I = STI->getWriteProcResBegin(SCDesc);
3045  const MCWriteProcResEntry *E = STI->getWriteProcResEnd(SCDesc);
3046  for (; I != E; ++I) {
3047    if (!I->Cycles)
3048      continue;
3049    const MCProcResourceDesc *ProcResource =
3050        SM.getProcResource(I->ProcResourceIdx);
3051    unsigned NumUnits = ProcResource->NumUnits;
3052    LLVM_DEBUG({
3053      if (SwpDebugResource)
3054        dbgs() << format(" %16s(%2d): Count: %2d, NumUnits:%2d, Cycles:%2d\n",
3055                         ProcResource->Name, I->ProcResourceIdx,
3056                         ProcResourceCount[I->ProcResourceIdx], NumUnits,
3057                         I->Cycles);
3058    });
3059    if (ProcResourceCount[I->ProcResourceIdx] >= NumUnits)
3060      return false;
3061  }
3062  LLVM_DEBUG(if (SwpDebugResource) dbgs() << "return true\n\n";);
3063  return true;
3064}
3065
3066void ResourceManager::reserveResources(const MCInstrDesc *MID) {
3067  LLVM_DEBUG({
3068    if (SwpDebugResource)
3069      dbgs() << "reserveResources:\n";
3070  });
3071  if (UseDFA)
3072    return DFAResources->reserveResources(MID);
3073
3074  unsigned InsnClass = MID->getSchedClass();
3075  const MCSchedClassDesc *SCDesc = SM.getSchedClassDesc(InsnClass);
3076  if (!SCDesc->isValid()) {
3077    LLVM_DEBUG({
3078      dbgs() << "No valid Schedule Class Desc for schedClass!\n";
3079      dbgs() << "isPseduo:" << MID->isPseudo() << "\n";
3080    });
3081    return;
3082  }
3083  for (const MCWriteProcResEntry &PRE :
3084       make_range(STI->getWriteProcResBegin(SCDesc),
3085                  STI->getWriteProcResEnd(SCDesc))) {
3086    if (!PRE.Cycles)
3087      continue;
3088    ++ProcResourceCount[PRE.ProcResourceIdx];
3089    LLVM_DEBUG({
3090      if (SwpDebugResource) {
3091        const MCProcResourceDesc *ProcResource =
3092            SM.getProcResource(PRE.ProcResourceIdx);
3093        dbgs() << format(" %16s(%2d): Count: %2d, NumUnits:%2d, Cycles:%2d\n",
3094                         ProcResource->Name, PRE.ProcResourceIdx,
3095                         ProcResourceCount[PRE.ProcResourceIdx],
3096                         ProcResource->NumUnits, PRE.Cycles);
3097      }
3098    });
3099  }
3100  LLVM_DEBUG({
3101    if (SwpDebugResource)
3102      dbgs() << "reserveResources: done!\n\n";
3103  });
3104}
3105
3106bool ResourceManager::canReserveResources(const MachineInstr &MI) const {
3107  return canReserveResources(&MI.getDesc());
3108}
3109
3110void ResourceManager::reserveResources(const MachineInstr &MI) {
3111  return reserveResources(&MI.getDesc());
3112}
3113
3114void ResourceManager::clearResources() {
3115  if (UseDFA)
3116    return DFAResources->clearResources();
3117  std::fill(ProcResourceCount.begin(), ProcResourceCount.end(), 0);
3118}
3119