//===---- ScheduleDAGInstrs.cpp - MachineInstr Rescheduling ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This implements the ScheduleDAGInstrs class, which implements
/// re-scheduling of MachineInstrs.
//
//===----------------------------------------------------------------------===//

#include "llvm/CodeGen/ScheduleDAGInstrs.h"

#include "llvm/ADT/IntEqClasses.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/SparseSet.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBundle.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
#include "llvm/CodeGen/RegisterPressure.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/CodeGen/ScheduleDFS.h"
#include "llvm/CodeGen/SlotIndexes.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/MC/LaneBitmask.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <iterator>
#include <utility>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "machine-scheduler"

static cl::opt<bool>
    EnableAASchedMI("enable-aa-sched-mi", cl::Hidden,
                    cl::desc("Enable use of AA during MI DAG construction"));

static cl::opt<bool> UseTBAA("use-tbaa-in-sched-mi", cl::Hidden,
    cl::init(true), cl::desc("Enable use of TBAA during MI DAG construction"));

// Note: the two options below might be used in tuning compile time vs
// output quality. Setting HugeRegion so large that it will never be
// reached means best-effort, but may be slow.

// When Stores and Loads maps (or NonAliasStores and NonAliasLoads)
// together hold this many SUs, a reduction of maps will be done.
static cl::opt<unsigned> HugeRegion("dag-maps-huge-region", cl::Hidden,
    cl::init(1000), cl::desc("The limit to use while constructing the DAG "
                             "prior to scheduling, at which point a trade-off "
                             "is made to avoid excessive compile time."));

static cl::opt<unsigned> ReductionSize(
    "dag-maps-reduction-size", cl::Hidden,
    cl::desc("A huge scheduling region will have maps reduced by this many "
             "nodes at a time. Defaults to HugeRegion / 2."));
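
// These thresholds can be experimented with by passing the cl::opts above
// directly to tools such as llc (or via -mllvm when driving clang), e.g.:
//   llc -dag-maps-huge-region=2000 -dag-maps-reduction-size=500 foo.ll
// (usage sketch only; the exact invocation depends on how the compiler is
// driven).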

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
static cl::opt<bool> SchedPrintCycles(
    "sched-print-cycles", cl::Hidden, cl::init(false),
    cl::desc("Report top/bottom cycles when dumping SUnit instances"));
#endif

static unsigned getReductionSize() {
  // Always reduce a huge region with half of the elements, except
  // when the user sets this number explicitly.
  if (ReductionSize.getNumOccurrences() == 0)
    return HugeRegion / 2;
  return ReductionSize;
}

static void dumpSUList(const ScheduleDAGInstrs::SUList &L) {
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  dbgs() << "{ ";
  for (const SUnit *SU : L) {
    dbgs() << "SU(" << SU->NodeNum << ")";
    if (SU != L.back())
      dbgs() << ", ";
  }
  dbgs() << "}\n";
#endif
}

ScheduleDAGInstrs::ScheduleDAGInstrs(MachineFunction &mf,
                                     const MachineLoopInfo *mli,
                                     bool RemoveKillFlags)
    : ScheduleDAG(mf), MLI(mli), MFI(mf.getFrameInfo()),
      RemoveKillFlags(RemoveKillFlags),
      UnknownValue(UndefValue::get(
          Type::getVoidTy(mf.getFunction().getContext()))),
      Topo(SUnits, &ExitSU) {
  DbgValues.clear();

  const TargetSubtargetInfo &ST = mf.getSubtarget();
  SchedModel.init(&ST);
}

/// If this machine instr has memory reference information and it can be
/// tracked to a normal reference to a known object, return the Value
/// for that object. This function returns false if the memory location is
/// unknown or may alias anything.
static bool getUnderlyingObjectsForInstr(const MachineInstr *MI,
                                         const MachineFrameInfo &MFI,
                                         UnderlyingObjectsVector &Objects,
                                         const DataLayout &DL) {
  auto AllMMOsOkay = [&]() {
    for (const MachineMemOperand *MMO : MI->memoperands()) {
      // TODO: Figure out whether isAtomic is really necessary (see D57601).
      if (MMO->isVolatile() || MMO->isAtomic())
        return false;

      if (const PseudoSourceValue *PSV = MMO->getPseudoValue()) {
        // Functions that contain tail calls don't have unique PseudoSourceValue
        // objects. Two PseudoSourceValues might refer to the same or
        // overlapping locations. The client code calling this function assumes
        // this is not the case. So return a conservative answer of no known
        // object.
        if (MFI.hasTailCall())
          return false;

        // For now, ignore PseudoSourceValues which may alias LLVM IR values
        // because the code that uses this function has no way to cope with
        // such aliases.
        if (PSV->isAliased(&MFI))
          return false;

        bool MayAlias = PSV->mayAlias(&MFI);
        Objects.emplace_back(PSV, MayAlias);
      } else if (const Value *V = MMO->getValue()) {
        SmallVector<Value *, 4> Objs;
        if (!getUnderlyingObjectsForCodeGen(V, Objs))
          return false;

        for (Value *V : Objs) {
          assert(isIdentifiedObject(V));
          Objects.emplace_back(V, true);
        }
      } else
        return false;
    }
    return true;
  };

  if (!AllMMOsOkay()) {
    Objects.clear();
    return false;
  }

  return true;
}

void ScheduleDAGInstrs::startBlock(MachineBasicBlock *bb) {
  BB = bb;
}

void ScheduleDAGInstrs::finishBlock() {
  // Subclasses should no longer refer to the old block.
  BB = nullptr;
}

void ScheduleDAGInstrs::enterRegion(MachineBasicBlock *bb,
                                    MachineBasicBlock::iterator begin,
                                    MachineBasicBlock::iterator end,
                                    unsigned regioninstrs) {
  assert(bb == BB && "startBlock should set BB");
  RegionBegin = begin;
  RegionEnd = end;
  NumRegionInstrs = regioninstrs;
}

void ScheduleDAGInstrs::exitRegion() {
  // Nothing to do.
}

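/// Add dependencies from instructions in the current scheduling region to the
/// artificial exit node (ExitSU): registers read by the exit instruction, or
/// live into successor blocks when the region does not end in a call or
/// barrier, are modelled as uses of ExitSU.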
void ScheduleDAGInstrs::addSchedBarrierDeps() {
  MachineInstr *ExitMI =
      RegionEnd != BB->end()
          ? &*skipDebugInstructionsBackward(RegionEnd, RegionBegin)
          : nullptr;
  ExitSU.setInstr(ExitMI);
  // Add dependencies on the defs and uses of the instruction.
  if (ExitMI) {
    for (const MachineOperand &MO : ExitMI->operands()) {
      if (!MO.isReg() || MO.isDef()) continue;
      Register Reg = MO.getReg();
      if (Reg.isPhysical()) {
        Uses.insert(PhysRegSUOper(&ExitSU, -1, Reg));
      } else if (Reg.isVirtual() && MO.readsReg()) {
        addVRegUseDeps(&ExitSU, ExitMI->getOperandNo(&MO));
      }
    }
  }
  if (!ExitMI || (!ExitMI->isCall() && !ExitMI->isBarrier())) {
    // For others, e.g. fallthrough, conditional branch, assume the exit
    // uses all the registers that are livein to the successor blocks.
    for (const MachineBasicBlock *Succ : BB->successors()) {
      for (const auto &LI : Succ->liveins()) {
        if (!Uses.contains(LI.PhysReg))
          Uses.insert(PhysRegSUOper(&ExitSU, -1, LI.PhysReg));
      }
    }
  }
}

/// MO is an operand of SU's instruction that defines a physical register. Adds
/// data dependencies from SU to any uses of the physical register.
void ScheduleDAGInstrs::addPhysRegDataDeps(SUnit *SU, unsigned OperIdx) {
  const MachineOperand &MO = SU->getInstr()->getOperand(OperIdx);
  assert(MO.isDef() && "expect physreg def");

  // Ask the target if address-backscheduling is desirable, and if so how much.
  const TargetSubtargetInfo &ST = MF.getSubtarget();

  // Only use any non-zero latency for real defs/uses, in contrast to
  // "fake" operands added by regalloc.
  const MCInstrDesc *DefMIDesc = &SU->getInstr()->getDesc();
  bool ImplicitPseudoDef = (OperIdx >= DefMIDesc->getNumOperands() &&
                            !DefMIDesc->hasImplicitDefOfPhysReg(MO.getReg()));
  for (MCRegAliasIterator Alias(MO.getReg(), TRI, true);
       Alias.isValid(); ++Alias) {
    for (Reg2SUnitsMap::iterator I = Uses.find(*Alias); I != Uses.end(); ++I) {
      SUnit *UseSU = I->SU;
      if (UseSU == SU)
        continue;

      // Adjust the dependence latency using operand def/use information,
      // then allow the target to perform its own adjustments.
      int UseOp = I->OpIdx;
      MachineInstr *RegUse = nullptr;
      SDep Dep;
      if (UseOp < 0)
        Dep = SDep(SU, SDep::Artificial);
      else {
        // Set the hasPhysRegDefs only for physreg defs that have a use within
        // the scheduling region.
        SU->hasPhysRegDefs = true;
        Dep = SDep(SU, SDep::Data, *Alias);
        RegUse = UseSU->getInstr();
      }
      const MCInstrDesc *UseMIDesc =
          (RegUse ? &UseSU->getInstr()->getDesc() : nullptr);
      bool ImplicitPseudoUse =
          (UseMIDesc && UseOp >= ((int)UseMIDesc->getNumOperands()) &&
           !UseMIDesc->hasImplicitUseOfPhysReg(*Alias));
      if (!ImplicitPseudoDef && !ImplicitPseudoUse) {
        Dep.setLatency(SchedModel.computeOperandLatency(SU->getInstr(), OperIdx,
                                                        RegUse, UseOp));
      } else {
        Dep.setLatency(0);
      }
      ST.adjustSchedDependency(SU, OperIdx, UseSU, UseOp, Dep);
      UseSU->addPred(Dep);
    }
  }
}

/// Adds register dependencies (data, anti, and output) from this SUnit
/// to following instructions in the same scheduling region that depend on the
/// physical register referenced at OperIdx.
void ScheduleDAGInstrs::addPhysRegDeps(SUnit *SU, unsigned OperIdx) {
  MachineInstr *MI = SU->getInstr();
  MachineOperand &MO = MI->getOperand(OperIdx);
  Register Reg = MO.getReg();
  // We do not need to track any dependencies for constant registers.
  if (MRI.isConstantPhysReg(Reg))
    return;

  const TargetSubtargetInfo &ST = MF.getSubtarget();

  // Optionally add output and anti dependencies. For anti
  // dependencies we use a latency of 0 because for a multi-issue
  // target we want to allow the defining instruction to issue
  // in the same cycle as the using instruction.
  // TODO: Using a latency of 1 here for output dependencies assumes
  //       there's no cost for reusing registers.
  SDep::Kind Kind = MO.isUse() ? SDep::Anti : SDep::Output;
  for (MCRegAliasIterator Alias(Reg, TRI, true); Alias.isValid(); ++Alias) {
    if (!Defs.contains(*Alias))
      continue;
    for (Reg2SUnitsMap::iterator I = Defs.find(*Alias); I != Defs.end(); ++I) {
      SUnit *DefSU = I->SU;
      if (DefSU == &ExitSU)
        continue;
      if (DefSU != SU &&
          (Kind != SDep::Output || !MO.isDead() ||
           !DefSU->getInstr()->registerDefIsDead(*Alias))) {
        SDep Dep(SU, Kind, /*Reg=*/*Alias);
        if (Kind != SDep::Anti)
          Dep.setLatency(
            SchedModel.computeOutputLatency(MI, OperIdx, DefSU->getInstr()));
        ST.adjustSchedDependency(SU, OperIdx, DefSU, I->OpIdx, Dep);
        DefSU->addPred(Dep);
      }
    }
  }

  if (!MO.isDef()) {
    SU->hasPhysRegUses = true;
    // Either insert a new Reg2SUnits entry with an empty SUnits list, or
    // retrieve the existing SUnits list for this register's uses.
    // Push this SUnit on the use list.
    Uses.insert(PhysRegSUOper(SU, OperIdx, Reg));
    if (RemoveKillFlags)
      MO.setIsKill(false);
  } else {
    addPhysRegDataDeps(SU, OperIdx);

    // Clear previous uses and defs of this register and its subregisters.
    for (MCSubRegIterator SubReg(Reg, TRI, true); SubReg.isValid(); ++SubReg) {
      if (Uses.contains(*SubReg))
        Uses.eraseAll(*SubReg);
      if (!MO.isDead())
        Defs.eraseAll(*SubReg);
    }
    if (MO.isDead() && SU->isCall) {
      // Calls will not be reordered because of chain dependencies (see
      // below). Since call operands are dead, calls may continue to be added
      // to the DefList making dependence checking quadratic in the size of
      // the block. Instead, we leave only one call at the back of the
      // DefList.
      Reg2SUnitsMap::RangePair P = Defs.equal_range(Reg);
      Reg2SUnitsMap::iterator B = P.first;
      Reg2SUnitsMap::iterator I = P.second;
      for (bool isBegin = I == B; !isBegin; /* empty */) {
        isBegin = (--I) == B;
        if (!I->SU->isCall)
          break;
        I = Defs.erase(I);
      }
    }

    // Defs are pushed in the order they are visited and never reordered.
    Defs.insert(PhysRegSUOper(SU, OperIdx, Reg));
  }
}

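/// Returns the lane mask accessed by MO: all lanes when the register class has
/// no disjunct subregisters, the class's full lane mask when MO has no
/// subregister index, and otherwise the lane mask of MO's subregister index.
/// As a rough illustration, on a target whose 64-bit vregs consist of two
/// 32-bit subregisters, a def of just the low half would yield a mask covering
/// only the low lanes (exact masks are target-specific).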
LaneBitmask ScheduleDAGInstrs::getLaneMaskForMO(const MachineOperand &MO) const
{
  Register Reg = MO.getReg();
  // No point in tracking lanemasks if we don't have interesting subregisters.
  const TargetRegisterClass &RC = *MRI.getRegClass(Reg);
  if (!RC.HasDisjunctSubRegs)
    return LaneBitmask::getAll();

  unsigned SubReg = MO.getSubReg();
  if (SubReg == 0)
    return RC.getLaneMask();
  return TRI->getSubRegIndexLaneMask(SubReg);
}

bool ScheduleDAGInstrs::deadDefHasNoUse(const MachineOperand &MO) {
  auto RegUse = CurrentVRegUses.find(MO.getReg());
  if (RegUse == CurrentVRegUses.end())
    return true;
  return (RegUse->LaneMask & getLaneMaskForMO(MO)).none();
}

/// Adds register output and data dependencies from this SUnit to instructions
/// that occur later in the same scheduling region if they read from or write to
/// the virtual register defined at OperIdx.
///
/// TODO: Hoist loop induction variable increments. This has to be
/// reevaluated. Generally, IV scheduling should be done before coalescing.
void ScheduleDAGInstrs::addVRegDefDeps(SUnit *SU, unsigned OperIdx) {
  MachineInstr *MI = SU->getInstr();
  MachineOperand &MO = MI->getOperand(OperIdx);
  Register Reg = MO.getReg();

  LaneBitmask DefLaneMask;
  LaneBitmask KillLaneMask;
  if (TrackLaneMasks) {
    bool IsKill = MO.getSubReg() == 0 || MO.isUndef();
    DefLaneMask = getLaneMaskForMO(MO);
    // If we have a <read-undef> flag, none of the lane values comes from an
    // earlier instruction.
    KillLaneMask = IsKill ? LaneBitmask::getAll() : DefLaneMask;

    if (MO.getSubReg() != 0 && MO.isUndef()) {
      // There may be other subregister defs of the same register on the same
      // instruction in later operands. The lanes of those defs will be live
      // after this instruction, so they should not be treated as killed by the
      // instruction even though they appear to be killed in this one operand.
      for (const MachineOperand &OtherMO :
           llvm::drop_begin(MI->operands(), OperIdx + 1))
        if (OtherMO.isReg() && OtherMO.isDef() && OtherMO.getReg() == Reg)
          KillLaneMask &= ~getLaneMaskForMO(OtherMO);
    }

    // Clear undef flag, we'll re-add it later once we know which subregister
    // Def is first.
    MO.setIsUndef(false);
  } else {
    DefLaneMask = LaneBitmask::getAll();
    KillLaneMask = LaneBitmask::getAll();
  }

  if (MO.isDead()) {
    assert(deadDefHasNoUse(MO) && "Dead defs should have no uses");
  } else {
    // Add data dependence to all uses we found so far.
    const TargetSubtargetInfo &ST = MF.getSubtarget();
    for (VReg2SUnitOperIdxMultiMap::iterator I = CurrentVRegUses.find(Reg),
         E = CurrentVRegUses.end(); I != E; /*empty*/) {
      LaneBitmask LaneMask = I->LaneMask;
      // Ignore uses of other lanes.
      if ((LaneMask & KillLaneMask).none()) {
        ++I;
        continue;
      }

      if ((LaneMask & DefLaneMask).any()) {
        SUnit *UseSU = I->SU;
        MachineInstr *Use = UseSU->getInstr();
        SDep Dep(SU, SDep::Data, Reg);
        Dep.setLatency(SchedModel.computeOperandLatency(MI, OperIdx, Use,
                                                        I->OperandIndex));
        ST.adjustSchedDependency(SU, OperIdx, UseSU, I->OperandIndex, Dep);
        UseSU->addPred(Dep);
      }

      LaneMask &= ~KillLaneMask;
      // If we found a Def for all lanes of this use, remove it from the list.
      if (LaneMask.any()) {
        I->LaneMask = LaneMask;
        ++I;
      } else
        I = CurrentVRegUses.erase(I);
    }
  }

  // Shortcut: Singly defined vregs do not have output/anti dependencies.
  if (MRI.hasOneDef(Reg))
    return;

  // Add output dependence to the next nearest defs of this vreg.
  //
  // Unless this definition is dead, the output dependence should be
  // transitively redundant with antidependencies from this definition's
  // uses. We're conservative for now until we have a way to guarantee the uses
  // are not eliminated sometime during scheduling. The output dependence edge
  // is also useful if output latency exceeds def-use latency.
  LaneBitmask LaneMask = DefLaneMask;
  for (VReg2SUnit &V2SU : make_range(CurrentVRegDefs.find(Reg),
                                     CurrentVRegDefs.end())) {
    // Ignore defs for other lanes.
    if ((V2SU.LaneMask & LaneMask).none())
      continue;
    // Add an output dependence.
    SUnit *DefSU = V2SU.SU;
    // Ignore additional defs of the same lanes in one instruction. This can
    // happen because lanemasks are shared for targets with too many
    // subregisters. We also use some representation tricks/hacks where we
    // add super-register defs/uses, to imply that although we only access parts
    // of the reg we care about the full one.
    if (DefSU == SU)
      continue;
    SDep Dep(SU, SDep::Output, Reg);
    Dep.setLatency(
      SchedModel.computeOutputLatency(MI, OperIdx, DefSU->getInstr()));
    DefSU->addPred(Dep);

    // Update current definition. This can get tricky if the def covered a
    // bigger lanemask before. We then have to shrink it and create a new
    // VReg2SUnit for the non-overlapping part.
    LaneBitmask OverlapMask = V2SU.LaneMask & LaneMask;
    LaneBitmask NonOverlapMask = V2SU.LaneMask & ~LaneMask;
    V2SU.SU = SU;
    V2SU.LaneMask = OverlapMask;
    if (NonOverlapMask.any())
      CurrentVRegDefs.insert(VReg2SUnit(Reg, NonOverlapMask, DefSU));
  }
  // If there was no CurrentVRegDefs entry for some lanes yet, create one.
  if (LaneMask.any())
    CurrentVRegDefs.insert(VReg2SUnit(Reg, LaneMask, SU));
}

/// Adds a register data dependency if the instruction that defines the
/// virtual register used at OperIdx is mapped to an SUnit. Add a register
/// antidependency from this SUnit to instructions that occur later in the same
/// scheduling region if they write the virtual register.
///
/// TODO: Handle ExitSU "uses" properly.
void ScheduleDAGInstrs::addVRegUseDeps(SUnit *SU, unsigned OperIdx) {
  const MachineInstr *MI = SU->getInstr();
  assert(!MI->isDebugOrPseudoInstr());

  const MachineOperand &MO = MI->getOperand(OperIdx);
  Register Reg = MO.getReg();

  // Remember the use. Data dependencies will be added when we find the def.
  LaneBitmask LaneMask = TrackLaneMasks ? getLaneMaskForMO(MO)
                                        : LaneBitmask::getAll();
  CurrentVRegUses.insert(VReg2SUnitOperIdx(Reg, LaneMask, OperIdx, SU));

  // Add antidependences to the following defs of the vreg.
  for (VReg2SUnit &V2SU : make_range(CurrentVRegDefs.find(Reg),
                                     CurrentVRegDefs.end())) {
    // Ignore defs for unrelated lanes.
    LaneBitmask PrevDefLaneMask = V2SU.LaneMask;
    if ((PrevDefLaneMask & LaneMask).none())
      continue;
    if (V2SU.SU == SU)
      continue;

    V2SU.SU->addPred(SDep(SU, SDep::Anti, Reg));
  }
}

/// Returns true if MI is an instruction we are unable to reason about
/// (like a call or something with unmodeled side effects).
static inline bool isGlobalMemoryObject(MachineInstr *MI) {
  return MI->isCall() || MI->hasUnmodeledSideEffects() ||
         (MI->hasOrderedMemoryRef() && !MI->isDereferenceableInvariantLoad());
}

void ScheduleDAGInstrs::addChainDependency (SUnit *SUa, SUnit *SUb,
                                            unsigned Latency) {
  if (SUa->getInstr()->mayAlias(AAForDep, *SUb->getInstr(), UseTBAA)) {
    SDep Dep(SUa, SDep::MayAliasMem);
    Dep.setLatency(Latency);
    SUb->addPred(Dep);
  }
}

/// Creates an SUnit for each real instruction, numbered in top-down
/// topological order. The instruction order A < B implies that no edge exists
/// from B to A.
///
/// Map each real instruction to its SUnit.
///
/// After initSUnits, the SUnits vector cannot be resized and the scheduler may
/// hang onto SUnit pointers. We may relax this in the future by using SUnit IDs
/// instead of pointers.
///
/// MachineScheduler relies on initSUnits numbering the nodes by their order in
/// the original instruction list.
void ScheduleDAGInstrs::initSUnits() {
  // We'll be allocating one SUnit for each real instruction in the region,
  // which is contained within a basic block.
  SUnits.reserve(NumRegionInstrs);

  for (MachineInstr &MI : make_range(RegionBegin, RegionEnd)) {
    if (MI.isDebugOrPseudoInstr())
      continue;

    SUnit *SU = newSUnit(&MI);
    MISUnitMap[&MI] = SU;

    SU->isCall = MI.isCall();
    SU->isCommutable = MI.isCommutable();

    // Assign the Latency field of SU using target-provided information.
    SU->Latency = SchedModel.computeInstrLatency(SU->getInstr());

    // If this SUnit uses a reserved or unbuffered resource, mark it as such.
    //
    // Reserved resources block an instruction from issuing and stall the
    // entire pipeline. These are identified by BufferSize=0.
    //
    // Unbuffered resources prevent execution of subsequent instructions that
    // require the same resources. This is used for in-order execution pipelines
    // within an out-of-order core. These are identified by BufferSize=1.
    if (SchedModel.hasInstrSchedModel()) {
      const MCSchedClassDesc *SC = getSchedClass(SU);
      for (const MCWriteProcResEntry &PRE :
           make_range(SchedModel.getWriteProcResBegin(SC),
                      SchedModel.getWriteProcResEnd(SC))) {
        switch (SchedModel.getProcResource(PRE.ProcResourceIdx)->BufferSize) {
        case 0:
          SU->hasReservedResource = true;
          break;
        case 1:
          SU->isUnbuffered = true;
          break;
        default:
          break;
        }
      }
    }
  }
}

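/// A map from a memory Value (or PseudoSourceValue) to the list of SUnits that
/// access it, used below while constructing memory dependence (chain) edges.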
class ScheduleDAGInstrs::Value2SUsMap : public MapVector<ValueType, SUList> {
  /// Current total number of SUs in map.
  unsigned NumNodes = 0;

  /// 1 for loads, 0 for stores. (see comment in SUList)
  unsigned TrueMemOrderLatency;

public:
  Value2SUsMap(unsigned lat = 0) : TrueMemOrderLatency(lat) {}

  /// To keep NumNodes up to date, insert() is used instead of
  /// this operator w/ push_back().
  ValueType &operator[](const SUList &Key) {
    llvm_unreachable("Don't use. Use insert() instead.");
  }

  /// Adds SU to the SUList of V. If Map grows huge, reduce its size by calling
  /// reduce().
  void inline insert(SUnit *SU, ValueType V) {
    MapVector::operator[](V).push_back(SU);
    NumNodes++;
  }

  /// Clears the list of SUs mapped to V.
  void inline clearList(ValueType V) {
    iterator Itr = find(V);
    if (Itr != end()) {
      assert(NumNodes >= Itr->second.size());
      NumNodes -= Itr->second.size();

      Itr->second.clear();
    }
  }

  /// Clears map from all contents.
  void clear() {
    MapVector<ValueType, SUList>::clear();
    NumNodes = 0;
  }

  unsigned inline size() const { return NumNodes; }

  /// Counts the number of SUs in this map after a reduction.
  void reComputeSize() {
    NumNodes = 0;
    for (auto &I : *this)
      NumNodes += I.second.size();
  }

  unsigned inline getTrueMemOrderLatency() const {
    return TrueMemOrderLatency;
  }

  void dump();
};

void ScheduleDAGInstrs::addChainDependencies(SUnit *SU,
                                             Value2SUsMap &Val2SUsMap) {
  for (auto &I : Val2SUsMap)
    addChainDependencies(SU, I.second,
                         Val2SUsMap.getTrueMemOrderLatency());
}

void ScheduleDAGInstrs::addChainDependencies(SUnit *SU,
                                             Value2SUsMap &Val2SUsMap,
                                             ValueType V) {
  Value2SUsMap::iterator Itr = Val2SUsMap.find(V);
  if (Itr != Val2SUsMap.end())
    addChainDependencies(SU, Itr->second,
                         Val2SUsMap.getTrueMemOrderLatency());
}

void ScheduleDAGInstrs::addBarrierChain(Value2SUsMap &map) {
  assert(BarrierChain != nullptr);

  for (auto &[V, SUs] : map) {
    (void)V;
    for (auto *SU : SUs)
      SU->addPredBarrier(BarrierChain);
  }
  map.clear();
}

void ScheduleDAGInstrs::insertBarrierChain(Value2SUsMap &map) {
  assert(BarrierChain != nullptr);

  // Go through all lists of SUs.
  for (Value2SUsMap::iterator I = map.begin(), EE = map.end(); I != EE;) {
    Value2SUsMap::iterator CurrItr = I++;
    SUList &sus = CurrItr->second;
    SUList::iterator SUItr = sus.begin(), SUEE = sus.end();
    for (; SUItr != SUEE; ++SUItr) {
      // Stop on BarrierChain or any instruction above it.
      if ((*SUItr)->NodeNum <= BarrierChain->NodeNum)
        break;

      (*SUItr)->addPredBarrier(BarrierChain);
    }

    // Remove also the BarrierChain from list if present.
    if (SUItr != SUEE && *SUItr == BarrierChain)
      SUItr++;

    // Remove all SUs that are now successors of BarrierChain.
    if (SUItr != sus.begin())
      sus.erase(sus.begin(), SUItr);
  }

  // Remove all entries with empty su lists.
  map.remove_if([&](std::pair<ValueType, SUList> &mapEntry) {
      return (mapEntry.second.empty()); });

  // Recompute the size of the map (NumNodes).
  map.reComputeSize();
}

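/// Builds the dependence graph for the current scheduling region: creates an
/// SUnit per real instruction, then walks the region bottom-up adding register
/// (data, anti, output) dependencies and memory chain dependencies.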
void ScheduleDAGInstrs::buildSchedGraph(AAResults *AA,
                                        RegPressureTracker *RPTracker,
                                        PressureDiffs *PDiffs,
                                        LiveIntervals *LIS,
                                        bool TrackLaneMasks) {
  const TargetSubtargetInfo &ST = MF.getSubtarget();
  bool UseAA = EnableAASchedMI.getNumOccurrences() > 0 ? EnableAASchedMI
                                                       : ST.useAA();
  AAForDep = UseAA ? AA : nullptr;

  BarrierChain = nullptr;

  this->TrackLaneMasks = TrackLaneMasks;
  MISUnitMap.clear();
  ScheduleDAG::clearDAG();

  // Create an SUnit for each real instruction.
  initSUnits();

  if (PDiffs)
    PDiffs->init(SUnits.size());

  // We build scheduling units by walking a block's instruction list
  // from bottom to top.

  // Each MI's memory operand(s) is analyzed to produce a list of underlying
  // objects. The SU is then inserted in the SUList(s) mapped from the
  // Value(s). Each Value thus gets mapped to lists of SUs depending
  // on it, stores and loads kept separately. Two SUs are trivially
  // non-aliasing if they both depend on only identified Values and do
  // not share any common Value.
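  //
  // As a rough illustration (pseudo-IR, not a specific test case): for
  //   store %a ; load %b ; store %a
  // both stores of %a land in the same SUList and typically get a chain
  // dependency between them, while the load of a distinct identified object
  // %b stays in its own list and gets no chain edge to them.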
  Value2SUsMap Stores, Loads(1 /*TrueMemOrderLatency*/);

  // Certain memory accesses are known to not alias any SU in Stores
  // or Loads, and therefore have their own 'NonAlias'
  // domain. E.g. spill / reload instructions never alias LLVM IR
  // values. It would be nice to assume that such memory accesses
  // always have proper memory operand modelling, and are therefore
  // never unanalyzable, but this is conservatively not done.
  Value2SUsMap NonAliasStores, NonAliasLoads(1 /*TrueMemOrderLatency*/);

  // Track all instructions that may raise floating-point exceptions.
  // These do not depend on one another (or on normal loads or stores), but
  // must not be rescheduled across global barriers.  Note that we don't
  // really need a "map" here since we don't track those MIs by value;
  // using the same Value2SUsMap data type here is simply a matter of
  // convenience.
  Value2SUsMap FPExceptions;

  // Remove any stale debug info; sometimes BuildSchedGraph is called again
  // without emitting the info from the previous call.
  DbgValues.clear();
  FirstDbgValue = nullptr;

  assert(Defs.empty() && Uses.empty() &&
         "Only BuildGraph should update Defs/Uses");
  Defs.setUniverse(TRI->getNumRegs());
  Uses.setUniverse(TRI->getNumRegs());

  assert(CurrentVRegDefs.empty() && "nobody else should use CurrentVRegDefs");
  assert(CurrentVRegUses.empty() && "nobody else should use CurrentVRegUses");
  unsigned NumVirtRegs = MRI.getNumVirtRegs();
  CurrentVRegDefs.setUniverse(NumVirtRegs);
  CurrentVRegUses.setUniverse(NumVirtRegs);

  // Model data dependencies between instructions being scheduled and the
  // ExitSU.
  addSchedBarrierDeps();

  // Walk the list of instructions, from bottom moving up.
  MachineInstr *DbgMI = nullptr;
  for (MachineBasicBlock::iterator MII = RegionEnd, MIE = RegionBegin;
       MII != MIE; --MII) {
    MachineInstr &MI = *std::prev(MII);
    if (DbgMI) {
      DbgValues.emplace_back(DbgMI, &MI);
      DbgMI = nullptr;
    }

    if (MI.isDebugValue() || MI.isDebugPHI()) {
      DbgMI = &MI;
      continue;
    }

    if (MI.isDebugLabel() || MI.isDebugRef() || MI.isPseudoProbe())
      continue;

    SUnit *SU = MISUnitMap[&MI];
    assert(SU && "No SUnit mapped to this MI");

    if (RPTracker) {
      RegisterOperands RegOpers;
      RegOpers.collect(MI, *TRI, MRI, TrackLaneMasks, false);
      if (TrackLaneMasks) {
        SlotIndex SlotIdx = LIS->getInstructionIndex(MI);
        RegOpers.adjustLaneLiveness(*LIS, MRI, SlotIdx);
      }
      if (PDiffs != nullptr)
        PDiffs->addInstruction(SU->NodeNum, RegOpers, MRI);

      if (RPTracker->getPos() == RegionEnd || &*RPTracker->getPos() != &MI)
        RPTracker->recedeSkipDebugValues();
      assert(&*RPTracker->getPos() == &MI && "RPTracker in sync");
      RPTracker->recede(RegOpers);
    }

    assert(
        (CanHandleTerminators || (!MI.isTerminator() && !MI.isPosition())) &&
        "Cannot schedule terminators or labels!");

    // Add register-based dependencies (data, anti, and output).
    // For some instructions (calls, returns, inline-asm, etc.) there can
    // be explicit uses and implicit defs, in which case the use will appear
    // on the operand list before the def. Do two passes over the operand
    // list to make sure that defs are processed before any uses.
    bool HasVRegDef = false;
    for (unsigned j = 0, n = MI.getNumOperands(); j != n; ++j) {
      const MachineOperand &MO = MI.getOperand(j);
      if (!MO.isReg() || !MO.isDef())
        continue;
      Register Reg = MO.getReg();
      if (Reg.isPhysical()) {
        addPhysRegDeps(SU, j);
      } else if (Reg.isVirtual()) {
        HasVRegDef = true;
        addVRegDefDeps(SU, j);
      }
    }
    // Now process all uses.
    for (unsigned j = 0, n = MI.getNumOperands(); j != n; ++j) {
      const MachineOperand &MO = MI.getOperand(j);
      // Only look at use operands.
      // We do not need to check for MO.readsReg() here because subsequent
      // subregister defs will get output dependence edges and need no
      // additional use dependencies.
      if (!MO.isReg() || !MO.isUse())
        continue;
      Register Reg = MO.getReg();
      if (Reg.isPhysical()) {
        addPhysRegDeps(SU, j);
      } else if (Reg.isVirtual() && MO.readsReg()) {
        addVRegUseDeps(SU, j);
      }
    }

    // If we haven't seen any uses in this scheduling region, create a
    // dependence edge to ExitSU to model the live-out latency. This is required
    // for vreg defs with no in-region use, and prefetches with no vreg def.
    //
    // FIXME: NumDataSuccs would be more precise than NumSuccs here. This
    // check currently relies on being called before adding chain deps.
    if (SU->NumSuccs == 0 && SU->Latency > 1 && (HasVRegDef || MI.mayLoad())) {
      SDep Dep(SU, SDep::Artificial);
      Dep.setLatency(SU->Latency - 1);
      ExitSU.addPred(Dep);
    }

    // Add memory dependencies (Note: isStoreToStackSlot and
    // isLoadFromStackSlot are not usable after stack slots are lowered to
    // actual addresses).

    // This is a barrier event that acts as a pivotal node in the DAG.
    if (isGlobalMemoryObject(&MI)) {

      // Become the barrier chain.
      if (BarrierChain)
        BarrierChain->addPredBarrier(SU);
      BarrierChain = SU;

      LLVM_DEBUG(dbgs() << "Global memory object and new barrier chain: SU("
                        << BarrierChain->NodeNum << ").\n";);

      // Add dependencies against everything below it and clear maps.
      addBarrierChain(Stores);
      addBarrierChain(Loads);
      addBarrierChain(NonAliasStores);
      addBarrierChain(NonAliasLoads);
      addBarrierChain(FPExceptions);

      continue;
    }

    // Instructions that may raise FP exceptions may not be moved
    // across any global barriers.
    if (MI.mayRaiseFPException()) {
      if (BarrierChain)
        BarrierChain->addPredBarrier(SU);

      FPExceptions.insert(SU, UnknownValue);

      if (FPExceptions.size() >= HugeRegion) {
        LLVM_DEBUG(dbgs() << "Reducing FPExceptions map.\n";);
        Value2SUsMap empty;
        reduceHugeMemNodeMaps(FPExceptions, empty, getReductionSize());
      }
    }

    // If it's not a store or a variant load, we're done.
    if (!MI.mayStore() &&
        !(MI.mayLoad() && !MI.isDereferenceableInvariantLoad()))
      continue;

    // Always add dependency edge to BarrierChain if present.
    if (BarrierChain)
      BarrierChain->addPredBarrier(SU);

    // Find the underlying objects for MI. The Objs vector is either
    // empty, or filled with the Values of memory locations which this
    // SU depends on.
    UnderlyingObjectsVector Objs;
    bool ObjsFound = getUnderlyingObjectsForInstr(&MI, MFI, Objs,
                                                  MF.getDataLayout());

    if (MI.mayStore()) {
      if (!ObjsFound) {
        // An unknown store depends on all stores and loads.
        addChainDependencies(SU, Stores);
        addChainDependencies(SU, NonAliasStores);
        addChainDependencies(SU, Loads);
        addChainDependencies(SU, NonAliasLoads);

        // Map this store to 'UnknownValue'.
        Stores.insert(SU, UnknownValue);
      } else {
        // Add precise dependencies against all previously seen memory
        // accesses mapped to the same Value(s).
        for (const UnderlyingObject &UnderlObj : Objs) {
          ValueType V = UnderlObj.getValue();
          bool ThisMayAlias = UnderlObj.mayAlias();

          // Add dependencies to previous stores and loads mapped to V.
          addChainDependencies(SU, (ThisMayAlias ? Stores : NonAliasStores), V);
          addChainDependencies(SU, (ThisMayAlias ? Loads : NonAliasLoads), V);
        }
        // Update the store map after all chains have been added to avoid adding
        // a self-loop edge if multiple underlying objects are present.
        for (const UnderlyingObject &UnderlObj : Objs) {
          ValueType V = UnderlObj.getValue();
          bool ThisMayAlias = UnderlObj.mayAlias();

          // Map this store to V.
          (ThisMayAlias ? Stores : NonAliasStores).insert(SU, V);
        }
        // The store may have dependencies to unanalyzable loads and
        // stores.
        addChainDependencies(SU, Loads, UnknownValue);
        addChainDependencies(SU, Stores, UnknownValue);
      }
    } else { // SU is a load.
      if (!ObjsFound) {
        // An unknown load depends on all stores.
        addChainDependencies(SU, Stores);
        addChainDependencies(SU, NonAliasStores);

        Loads.insert(SU, UnknownValue);
      } else {
        for (const UnderlyingObject &UnderlObj : Objs) {
          ValueType V = UnderlObj.getValue();
          bool ThisMayAlias = UnderlObj.mayAlias();

          // Add precise dependencies against all previously seen stores
          // mapping to the same Value(s).
          addChainDependencies(SU, (ThisMayAlias ? Stores : NonAliasStores), V);

          // Map this load to V.
          (ThisMayAlias ? Loads : NonAliasLoads).insert(SU, V);
        }
        // The load may have dependencies to unanalyzable stores.
        addChainDependencies(SU, Stores, UnknownValue);
      }
    }

    // Reduce maps if they grow huge.
    if (Stores.size() + Loads.size() >= HugeRegion) {
      LLVM_DEBUG(dbgs() << "Reducing Stores and Loads maps.\n";);
      reduceHugeMemNodeMaps(Stores, Loads, getReductionSize());
    }
    if (NonAliasStores.size() + NonAliasLoads.size() >= HugeRegion) {
      LLVM_DEBUG(
          dbgs() << "Reducing NonAliasStores and NonAliasLoads maps.\n";);
      reduceHugeMemNodeMaps(NonAliasStores, NonAliasLoads, getReductionSize());
    }
  }

  if (DbgMI)
    FirstDbgValue = DbgMI;

  Defs.clear();
  Uses.clear();
  CurrentVRegDefs.clear();
  CurrentVRegUses.clear();

  Topo.MarkDirty();
}

raw_ostream &llvm::operator<<(raw_ostream &OS, const PseudoSourceValue* PSV) {
  PSV->printCustom(OS);
  return OS;
}

void ScheduleDAGInstrs::Value2SUsMap::dump() {
  for (const auto &[ValType, SUs] : *this) {
    if (ValType.is<const Value*>()) {
      const Value *V = ValType.get<const Value*>();
      if (isa<UndefValue>(V))
        dbgs() << "Unknown";
      else
        V->printAsOperand(dbgs());
    }
    else if (ValType.is<const PseudoSourceValue*>())
      dbgs() << ValType.get<const PseudoSourceValue*>();
    else
      llvm_unreachable("Unknown Value type.");

    dbgs() << " : ";
    dumpSUList(SUs);
  }
}

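/// Reduces the maps when they grow huge: the N SUs with the highest NodeNums
/// currently in the maps are dropped, and the lowest-numbered among them
/// becomes (or is folded into) the BarrierChain, so that SUs processed later
/// still end up ordered against the removed ones through it.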
void ScheduleDAGInstrs::reduceHugeMemNodeMaps(Value2SUsMap &stores,
                                              Value2SUsMap &loads, unsigned N) {
  LLVM_DEBUG(dbgs() << "Before reduction:\nStoring SUnits:\n"; stores.dump();
             dbgs() << "Loading SUnits:\n"; loads.dump());

  // Insert all SU's NodeNums into a vector and sort it.
  std::vector<unsigned> NodeNums;
  NodeNums.reserve(stores.size() + loads.size());
  for (const auto &[V, SUs] : stores) {
    (void)V;
    for (const auto *SU : SUs)
      NodeNums.push_back(SU->NodeNum);
  }
  for (const auto &[V, SUs] : loads) {
    (void)V;
    for (const auto *SU : SUs)
      NodeNums.push_back(SU->NodeNum);
  }
  llvm::sort(NodeNums);

  // The last N elements in NodeNums will be removed, and the SU with
  // the lowest NodeNum among them will become the new BarrierChain to
  // let the not yet seen SUs have a dependency to the removed SUs.
  assert(N <= NodeNums.size());
  SUnit *newBarrierChain = &SUnits[*(NodeNums.end() - N)];
  if (BarrierChain) {
    // The aliasing and non-aliasing maps reduce independently of each
    // other, but share a common BarrierChain. Check if the
    // newBarrierChain is above the former one. If it is not, it may
    // introduce a loop to use newBarrierChain, so keep the old one.
    if (newBarrierChain->NodeNum < BarrierChain->NodeNum) {
      BarrierChain->addPredBarrier(newBarrierChain);
      BarrierChain = newBarrierChain;
      LLVM_DEBUG(dbgs() << "Inserting new barrier chain: SU("
                        << BarrierChain->NodeNum << ").\n";);
    }
    else
      LLVM_DEBUG(dbgs() << "Keeping old barrier chain: SU("
                        << BarrierChain->NodeNum << ").\n";);
  }
  else
    BarrierChain = newBarrierChain;

  insertBarrierChain(stores);
  insertBarrierChain(loads);

  LLVM_DEBUG(dbgs() << "After reduction:\nStoring SUnits:\n"; stores.dump();
             dbgs() << "Loading SUnits:\n"; loads.dump());
}

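// Sets the kill flag on each register read by MI: a register that is not live
// (i.e. available) after the instruction is killed by it. Optionally the read
// registers are then added to the live set.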
static void toggleKills(const MachineRegisterInfo &MRI, LivePhysRegs &LiveRegs,
                        MachineInstr &MI, bool addToLiveRegs) {
  for (MachineOperand &MO : MI.operands()) {
    if (!MO.isReg() || !MO.readsReg())
      continue;
    Register Reg = MO.getReg();
    if (!Reg)
      continue;

    // Things that are available after the instruction are killed by it.
    bool IsKill = LiveRegs.available(MRI, Reg);
    MO.setIsKill(IsKill);
    if (addToLiveRegs)
      LiveRegs.addReg(Reg);
  }
}

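/// Recomputes kill flags for all instructions in MBB by walking the block
/// bottom-up with a LivePhysRegs tracker seeded from the block's live-outs.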
void ScheduleDAGInstrs::fixupKills(MachineBasicBlock &MBB) {
  LLVM_DEBUG(dbgs() << "Fixup kills for " << printMBBReference(MBB) << '\n');

  LiveRegs.init(*TRI);
  LiveRegs.addLiveOuts(MBB);

  // Examine block from end to start...
  for (MachineInstr &MI : llvm::reverse(MBB)) {
    if (MI.isDebugOrPseudoInstr())
      continue;

    // Update liveness.  Registers that are defed but not used in this
    // instruction are now dead. Mark register and all subregs as they
    // are completely defined.
    for (ConstMIBundleOperands O(MI); O.isValid(); ++O) {
      const MachineOperand &MO = *O;
      if (MO.isReg()) {
        if (!MO.isDef())
          continue;
        Register Reg = MO.getReg();
        if (!Reg)
          continue;
        LiveRegs.removeReg(Reg);
      } else if (MO.isRegMask()) {
        LiveRegs.removeRegsInMask(MO);
      }
    }

    // If there is a bundle header, fix it up first.
    if (!MI.isBundled()) {
      toggleKills(MRI, LiveRegs, MI, true);
    } else {
      MachineBasicBlock::instr_iterator Bundle = MI.getIterator();
      if (MI.isBundle())
        toggleKills(MRI, LiveRegs, MI, false);

      // Some targets make the (questionable) assumption that the instructions
      // inside the bundle are ordered and consequently only the last use of
      // a register inside the bundle can kill it.
      MachineBasicBlock::instr_iterator I = std::next(Bundle);
      while (I->isBundledWithSucc())
        ++I;
      do {
        if (!I->isDebugOrPseudoInstr())
          toggleKills(MRI, LiveRegs, *I, true);
        --I;
      } while (I != Bundle);
    }
  }
}

void ScheduleDAGInstrs::dumpNode(const SUnit &SU) const {
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  dumpNodeName(SU);
  if (SchedPrintCycles)
    dbgs() << " [TopReadyCycle = " << SU.TopReadyCycle
           << ", BottomReadyCycle = " << SU.BotReadyCycle << "]";
  dbgs() << ": ";
  SU.getInstr()->dump();
#endif
}

void ScheduleDAGInstrs::dump() const {
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  if (EntrySU.getInstr() != nullptr)
    dumpNodeAll(EntrySU);
  for (const SUnit &SU : SUnits)
    dumpNodeAll(SU);
  if (ExitSU.getInstr() != nullptr)
    dumpNodeAll(ExitSU);
#endif
}

std::string ScheduleDAGInstrs::getGraphNodeLabel(const SUnit *SU) const {
  std::string s;
  raw_string_ostream oss(s);
  if (SU == &EntrySU)
    oss << "<entry>";
  else if (SU == &ExitSU)
    oss << "<exit>";
  else
    SU->getInstr()->print(oss, /*IsStandalone=*/true);
  return oss.str();
}

/// Return the basic block label. It is not necessarily unique because a block
/// contains multiple scheduling regions. But it is fine for visualization.
std::string ScheduleDAGInstrs::getDAGName() const {
  return "dag." + BB->getFullName();
}

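/// Returns true if an edge from PredSU to SuccSU can be added without creating
/// a cycle, i.e. SuccSU is the exit node or PredSU is not already reachable
/// from SuccSU in the current DAG.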
bool ScheduleDAGInstrs::canAddEdge(SUnit *SuccSU, SUnit *PredSU) {
  return SuccSU == &ExitSU || !Topo.IsReachable(PredSU, SuccSU);
}

bool ScheduleDAGInstrs::addEdge(SUnit *SuccSU, const SDep &PredDep) {
  if (SuccSU != &ExitSU) {
    // Do not use WillCreateCycle, it assumes SD scheduling.
    // If Pred is reachable from Succ, then the edge creates a cycle.
    if (Topo.IsReachable(PredDep.getSUnit(), SuccSU))
      return false;
    Topo.AddPredQueued(SuccSU, PredDep.getSUnit());
  }
  SuccSU->addPred(PredDep, /*Required=*/!PredDep.isArtificial());
  // Return true regardless of whether a new edge needed to be inserted.
  return true;
}

//===----------------------------------------------------------------------===//
// SchedDFSResult Implementation
//===----------------------------------------------------------------------===//

namespace llvm {

/// Internal state used to compute SchedDFSResult.
class SchedDFSImpl {
  SchedDFSResult &R;

  /// Join DAG nodes into equivalence classes by their subtree.
  IntEqClasses SubtreeClasses;
  /// List PredSU, SuccSU pairs that represent data edges between subtrees.
  std::vector<std::pair<const SUnit *, const SUnit*>> ConnectionPairs;

  struct RootData {
    unsigned NodeID;
    unsigned ParentNodeID;  ///< Parent node (member of the parent subtree).
    unsigned SubInstrCount = 0; ///< Instr count in this tree only, not
                                /// children.

    RootData(unsigned id): NodeID(id),
                           ParentNodeID(SchedDFSResult::InvalidSubtreeID) {}

    unsigned getSparseSetIndex() const { return NodeID; }
  };

  SparseSet<RootData> RootSet;

public:
  SchedDFSImpl(SchedDFSResult &r): R(r), SubtreeClasses(R.DFSNodeData.size()) {
    RootSet.setUniverse(R.DFSNodeData.size());
  }

  /// Returns true if this node has been visited by the DFS traversal.
  ///
  /// During visitPostorderNode the Node's SubtreeID is assigned to the Node
  /// ID. Later, SubtreeID is updated but remains valid.
  bool isVisited(const SUnit *SU) const {
    return R.DFSNodeData[SU->NodeNum].SubtreeID
      != SchedDFSResult::InvalidSubtreeID;
  }

  /// Initializes this node's instruction count. We don't need to flag the node
  /// visited until visitPostorder because the DAG cannot have cycles.
  void visitPreorder(const SUnit *SU) {
    R.DFSNodeData[SU->NodeNum].InstrCount =
      SU->getInstr()->isTransient() ? 0 : 1;
  }

  /// Called once for each node after all predecessors are visited. Revisit this
  /// node's predecessors and potentially join them now that we know the ILP of
  /// the other predecessors.
  void visitPostorderNode(const SUnit *SU) {
    // Mark this node as the root of a subtree. It may be joined with its
    // successors later.
    R.DFSNodeData[SU->NodeNum].SubtreeID = SU->NodeNum;
    RootData RData(SU->NodeNum);
    RData.SubInstrCount = SU->getInstr()->isTransient() ? 0 : 1;

    // If any predecessors are still in their own subtree, they either cannot be
    // joined or are large enough to remain separate. If this parent node's
    // total instruction count is not greater than a child subtree by at least
    // the subtree limit, then try to join it now since splitting subtrees is
    // only useful if multiple high-pressure paths are possible.
    unsigned InstrCount = R.DFSNodeData[SU->NodeNum].InstrCount;
    for (const SDep &PredDep : SU->Preds) {
      if (PredDep.getKind() != SDep::Data)
        continue;
      unsigned PredNum = PredDep.getSUnit()->NodeNum;
      if ((InstrCount - R.DFSNodeData[PredNum].InstrCount) < R.SubtreeLimit)
        joinPredSubtree(PredDep, SU, /*CheckLimit=*/false);

      // Either link or merge the TreeData entry from the child to the parent.
      if (R.DFSNodeData[PredNum].SubtreeID == PredNum) {
        // If the predecessor's parent is invalid, this is a tree edge and the
        // current node is the parent.
        if (RootSet[PredNum].ParentNodeID == SchedDFSResult::InvalidSubtreeID)
          RootSet[PredNum].ParentNodeID = SU->NodeNum;
      }
      else if (RootSet.count(PredNum)) {
        // The predecessor is not a root, but is still in the root set. This
        // must be the new parent that it was just joined to. Note that
        // RootSet[PredNum].ParentNodeID may either be invalid or may still be
        // set to the original parent.
        RData.SubInstrCount += RootSet[PredNum].SubInstrCount;
        RootSet.erase(PredNum);
      }
    }
    RootSet[SU->NodeNum] = RData;
  }

  /// Called once for each tree edge after calling visitPostorderNode on
  /// the predecessor. Increment the parent node's instruction count and
  /// preemptively join this subtree to its parent's if it is small enough.
  void visitPostorderEdge(const SDep &PredDep, const SUnit *Succ) {
    R.DFSNodeData[Succ->NodeNum].InstrCount
      += R.DFSNodeData[PredDep.getSUnit()->NodeNum].InstrCount;
    joinPredSubtree(PredDep, Succ);
  }

  /// Adds a connection for cross edges.
  void visitCrossEdge(const SDep &PredDep, const SUnit *Succ) {
    ConnectionPairs.emplace_back(PredDep.getSUnit(), Succ);
  }

  /// Sets each node's subtree ID to the representative ID and record
  /// connections between trees.
  void finalize() {
    SubtreeClasses.compress();
    R.DFSTreeData.resize(SubtreeClasses.getNumClasses());
    assert(SubtreeClasses.getNumClasses() == RootSet.size()
           && "number of roots should match trees");
    for (const RootData &Root : RootSet) {
      unsigned TreeID = SubtreeClasses[Root.NodeID];
      if (Root.ParentNodeID != SchedDFSResult::InvalidSubtreeID)
        R.DFSTreeData[TreeID].ParentTreeID = SubtreeClasses[Root.ParentNodeID];
      R.DFSTreeData[TreeID].SubInstrCount = Root.SubInstrCount;
      // Note that SubInstrCount may be greater than InstrCount if we joined
      // subtrees across a cross edge. InstrCount will be attributed to the
      // original parent, while SubInstrCount will be attributed to the joined
      // parent.
    }
    R.SubtreeConnections.resize(SubtreeClasses.getNumClasses());
    R.SubtreeConnectLevels.resize(SubtreeClasses.getNumClasses());
    LLVM_DEBUG(dbgs() << R.getNumSubtrees() << " subtrees:\n");
    for (unsigned Idx = 0, End = R.DFSNodeData.size(); Idx != End; ++Idx) {
      R.DFSNodeData[Idx].SubtreeID = SubtreeClasses[Idx];
      LLVM_DEBUG(dbgs() << "  SU(" << Idx << ") in tree "
                        << R.DFSNodeData[Idx].SubtreeID << '\n');
    }
    for (const auto &[Pred, Succ] : ConnectionPairs) {
      unsigned PredTree = SubtreeClasses[Pred->NodeNum];
      unsigned SuccTree = SubtreeClasses[Succ->NodeNum];
      if (PredTree == SuccTree)
        continue;
      unsigned Depth = Pred->getDepth();
      addConnection(PredTree, SuccTree, Depth);
      addConnection(SuccTree, PredTree, Depth);
    }
  }

protected:
  /// Joins the predecessor subtree with the successor that is its DFS parent.
  /// Applies some heuristics before joining.
  bool joinPredSubtree(const SDep &PredDep, const SUnit *Succ,
                       bool CheckLimit = true) {
    assert(PredDep.getKind() == SDep::Data && "Subtrees are for data edges");

    // Check if the predecessor is already joined.
    const SUnit *PredSU = PredDep.getSUnit();
    unsigned PredNum = PredSU->NodeNum;
    if (R.DFSNodeData[PredNum].SubtreeID != PredNum)
      return false;

    // Four is the magic number of successors before a node is considered a
    // pinch point.
    unsigned NumDataSucs = 0;
    for (const SDep &SuccDep : PredSU->Succs) {
      if (SuccDep.getKind() == SDep::Data) {
        if (++NumDataSucs >= 4)
          return false;
      }
    }
    if (CheckLimit && R.DFSNodeData[PredNum].InstrCount > R.SubtreeLimit)
      return false;
    R.DFSNodeData[PredNum].SubtreeID = Succ->NodeNum;
    SubtreeClasses.join(Succ->NodeNum, PredNum);
    return true;
  }

  /// Called by finalize() to record a connection between trees.
  void addConnection(unsigned FromTree, unsigned ToTree, unsigned Depth) {
    if (!Depth)
      return;

    do {
      SmallVectorImpl<SchedDFSResult::Connection> &Connections =
        R.SubtreeConnections[FromTree];
      for (SchedDFSResult::Connection &C : Connections) {
        if (C.TreeID == ToTree) {
          C.Level = std::max(C.Level, Depth);
          return;
        }
      }
      Connections.push_back(SchedDFSResult::Connection(ToTree, Depth));
      FromTree = R.DFSTreeData[FromTree].ParentTreeID;
    } while (FromTree != SchedDFSResult::InvalidSubtreeID);
  }
};

} // end namespace llvm

namespace {

/// Manage the stack used by a reverse depth-first search over the DAG.
class SchedDAGReverseDFS {
  std::vector<std::pair<const SUnit *, SUnit::const_pred_iterator>> DFSStack;

public:
  bool isComplete() const { return DFSStack.empty(); }

  void follow(const SUnit *SU) {
    DFSStack.emplace_back(SU, SU->Preds.begin());
  }
  void advance() { ++DFSStack.back().second; }

  const SDep *backtrack() {
    DFSStack.pop_back();
    return DFSStack.empty() ? nullptr : std::prev(DFSStack.back().second);
  }

  const SUnit *getCurr() const { return DFSStack.back().first; }

  SUnit::const_pred_iterator getPred() const { return DFSStack.back().second; }

  SUnit::const_pred_iterator getPredEnd() const {
    return getCurr()->Preds.end();
  }
};

} // end anonymous namespace

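/// Returns true if SU has at least one data successor inside the region
/// (boundary nodes excluded); such nodes are not used as roots for the
/// bottom-up DFS in SchedDFSResult::compute() below.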
static bool hasDataSucc(const SUnit *SU) {
  for (const SDep &SuccDep : SU->Succs) {
    if (SuccDep.getKind() == SDep::Data &&
        !SuccDep.getSUnit()->isBoundaryNode())
      return true;
  }
  return false;
}

/// Computes an ILP metric for all nodes in the subDAG reachable via depth-first
/// search from this root.
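///
/// The resulting ILPValue is printed as InstrCount / Length; e.g. a subtree of
/// 8 non-transient instructions whose longest path has length 4 yields an ILP
/// of 2.0 (illustrative numbers only).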
void SchedDFSResult::compute(ArrayRef<SUnit> SUnits) {
  if (!IsBottomUp)
    llvm_unreachable("Top-down ILP metric is unimplemented");

  SchedDFSImpl Impl(*this);
  for (const SUnit &SU : SUnits) {
    if (Impl.isVisited(&SU) || hasDataSucc(&SU))
      continue;

    SchedDAGReverseDFS DFS;
    Impl.visitPreorder(&SU);
    DFS.follow(&SU);
    while (true) {
      // Traverse the leftmost path as far as possible.
      while (DFS.getPred() != DFS.getPredEnd()) {
        const SDep &PredDep = *DFS.getPred();
        DFS.advance();
        // Ignore non-data edges.
        if (PredDep.getKind() != SDep::Data
            || PredDep.getSUnit()->isBoundaryNode()) {
          continue;
        }
        // An already visited edge is a cross edge, assuming an acyclic DAG.
        if (Impl.isVisited(PredDep.getSUnit())) {
          Impl.visitCrossEdge(PredDep, DFS.getCurr());
          continue;
        }
        Impl.visitPreorder(PredDep.getSUnit());
        DFS.follow(PredDep.getSUnit());
      }
      // Visit the top of the stack in postorder and backtrack.
      const SUnit *Child = DFS.getCurr();
      const SDep *PredDep = DFS.backtrack();
      Impl.visitPostorderNode(Child);
      if (PredDep)
        Impl.visitPostorderEdge(*PredDep, DFS.getCurr());
      if (DFS.isComplete())
        break;
    }
  }
  Impl.finalize();
}

/// The root of the given SubtreeID was just scheduled. For all subtrees
/// connected to this tree, record the depth of the connection so that the
/// nearest connected subtrees can be prioritized.
void SchedDFSResult::scheduleTree(unsigned SubtreeID) {
  for (const Connection &C : SubtreeConnections[SubtreeID]) {
    SubtreeConnectLevels[C.TreeID] =
      std::max(SubtreeConnectLevels[C.TreeID], C.Level);
    LLVM_DEBUG(dbgs() << "  Tree: " << C.TreeID << " @"
                      << SubtreeConnectLevels[C.TreeID] << '\n');
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void ILPValue::print(raw_ostream &OS) const {
  OS << InstrCount << " / " << Length << " = ";
  if (!Length)
    OS << "BADILP";
  else
    OS << format("%g", ((double)InstrCount / Length));
}

LLVM_DUMP_METHOD void ILPValue::dump() const {
  dbgs() << *this << '\n';
}

namespace llvm {

LLVM_DUMP_METHOD
raw_ostream &operator<<(raw_ostream &OS, const ILPValue &Val) {
  Val.print(OS);
  return OS;
}

} // end namespace llvm

#endif
