//===-- SIMachineScheduler.cpp - SI Scheduler Interface -------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// SI Machine Scheduler interface
//
//===----------------------------------------------------------------------===//

#include "SIMachineScheduler.h"
#include "AMDGPU.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/LiveInterval.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/RegisterPressure.h"
#include "llvm/CodeGen/SlotIndexes.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <map>
#include <set>
#include <utility>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "machine-scheduler"

// This scheduler implements a different scheduling algorithm than
// GenericScheduler.
//
// There are several specific architecture behaviours that can't be modelled
// for GenericScheduler:
// . When accessing the result of an SGPR load instruction, you have to wait
// for all the SGPR load instructions issued before your current instruction
// to have finished.
// . When accessing the result of a VGPR load instruction, you have to wait
// for all the VGPR load instructions previous to the VGPR load instruction
// you are interested in to finish.
// . The lower the register pressure, the better load latencies are hidden.
//
// Moreover some specificities (like the fact that a lot of instructions in the
// shader have few dependencies) make the generic scheduler have some
// unpredictable behaviours. For example when register pressure becomes high,
// it can either manage to prevent register pressure from going too high, or it
// can increase register pressure even more than if it hadn't taken register
// pressure into account.
//
// Also some other bad behaviours are generated, like loading at the beginning
// of the shader a constant in a VGPR that won't be needed until the end of
// the shader.
//
// The scheduling problem for SI can be split into three main parts:
// . Hiding high latencies (texture sampling, etc)
// . Hiding low latencies (SGPR constant loading, etc)
// . Keeping register usage low for better latency hiding and general
//   performance
//
// Some other things can also affect performance, but are hard to predict
// (cache usage, the fact that the HW can issue several instructions from
// different wavefronts if they are of different types, etc)
//
// This scheduler tries to solve the scheduling problem by dividing it into
// simpler sub-problems. It divides the instructions into blocks, schedules
// locally inside the blocks where it takes care of low latencies, and then
// chooses the order of the blocks by taking care of high latencies.
// Dividing the instructions into blocks helps keep register usage under
// control.
//
// First the instructions are put into blocks.
//   We want the blocks to help control register usage and hide high latencies
//   later. To help control register usage, we typically want all local
//   computations, when for example you create a result that can be consumed
//   right away, to be contained in a block. Block inputs and outputs would
//   typically be important results that are needed in several locations of
//   the shader. Since we do want blocks to help hide high latencies, we want
//   the instructions inside the block to have a minimal set of dependencies
//   on high latencies. That makes it easy to pick blocks to hide specific
//   high latencies.
//   The block creation algorithm is divided into several steps, and several
//   variants can be tried during the scheduling process.
//
// Second the order of the instructions inside the blocks is chosen.
//   At that step we only take into account register usage and hiding
//   low latency instructions.
//
// Third the block order is chosen; there we try to hide high latencies
// and keep register usage low.
//
// After the third step, a pass is done to improve the hiding of low
// latencies.
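//
// As a rough illustration of these three steps (the sequence below is
// hypothetical and heavily simplified, not taken from a real shader):
//
//   %addr = S_LOAD_DWORDX4 ...        ; low latency SGPR load
//   %tex  = IMAGE_SAMPLE ... %addr    ; high latency texture sample
//   %out  = V_ADD_F32 %tex, %c
//
// Block creation would tend to put the V_ADD_F32 and the other consumers of
// %tex into a block whose only high latency dependency is the IMAGE_SAMPLE,
// the intra-block ordering would try to issue the S_LOAD_DWORDX4 early, and
// the block ordering would try to place unrelated blocks between the
// IMAGE_SAMPLE and the block consuming %tex, so that the sampling latency is
// hidden.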
//
// Actually when talking about 'low latency' or 'high latency' it includes
// both the latency for the cache (or global mem) data to reach the register,
// and the bandwidth limitations.
// Increasing the number of active wavefronts helps hide the former, but it
// doesn't solve the latter, which is why even if the wavefront count is high,
// we have to try to have as many instructions hiding high latencies as
// possible.
// The OpenCL doc gives for example a latency of 400 cycles for a global mem
// access, which is hidden by 10 instructions if the wavefront count is 10.
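// As a back-of-the-envelope check (the figures are the rough assumptions
// above, not measured values): with one instruction issued every 4 cycles per
// wavefront and 10 resident wavefronts, roughly 400 / (4 * 10) = 10
// independent instructions per wavefront are enough to cover a 400 cycle
// access, provided none of them depend on the pending load.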

// Some figures taken from AMD docs:
// Both texture and constant L1 caches are 4-way associative with 64-byte
// lines.
// The constant cache is shared between 4 CUs.
// For texture sampling, the address generation unit receives 4 texture
// addresses per cycle, thus we could expect texture sampling latency to be
// equivalent to 4 instructions in the very best case (a VGPR is 64 work items,
// instructions in a wavefront group are executed every 4 cycles),
// or 16 instructions if the other wavefronts associated with the 3 other VALUs
// of the CU do texture sampling too. (Don't take these figures too seriously,
// as I'm not 100% sure of the computation.)
// Data exports should get similar latency.
// For constant loading, the cache is shared between 4 CUs.
// The doc says "a throughput of 16B/cycle for each of the 4 Compute Unit"
// I guess if the other CUs don't read the cache, it can go up to 64B/cycle.
// It means a simple s_buffer_load should take one instruction to hide, as
// well as an s_buffer_loadx2 and potentially an s_buffer_loadx8 if on the same
// cache line.
//
// As of today the driver doesn't preload the constants in cache, thus the
// first loads get extra latency. The doc says global memory accesses can take
// 300-600 cycles. We do not specially take that into account when scheduling,
// as we expect the driver to be able to preload the constants soon.

// common code //

#ifndef NDEBUG

static const char *getReasonStr(SIScheduleCandReason Reason) {
  switch (Reason) {
  case NoCand:         return "NOCAND";
  case RegUsage:       return "REGUSAGE";
  case Latency:        return "LATENCY";
  case Successor:      return "SUCCESSOR";
  case Depth:          return "DEPTH";
  case NodeOrder:      return "ORDER";
  }
  llvm_unreachable("Unknown reason!");
}

#endif

namespace llvm {
namespace SISched {
static bool tryLess(int TryVal, int CandVal,
                    SISchedulerCandidate &TryCand,
                    SISchedulerCandidate &Cand,
                    SIScheduleCandReason Reason) {
  if (TryVal < CandVal) {
    TryCand.Reason = Reason;
    return true;
  }
  if (TryVal > CandVal) {
    if (Cand.Reason > Reason)
      Cand.Reason = Reason;
    return true;
  }
  Cand.setRepeat(Reason);
  return false;
}

static bool tryGreater(int TryVal, int CandVal,
                       SISchedulerCandidate &TryCand,
                       SISchedulerCandidate &Cand,
                       SIScheduleCandReason Reason) {
  if (TryVal > CandVal) {
    TryCand.Reason = Reason;
    return true;
  }
  if (TryVal < CandVal) {
    if (Cand.Reason > Reason)
      Cand.Reason = Reason;
    return true;
  }
  Cand.setRepeat(Reason);
  return false;
}
} // end namespace SISched
} // end namespace llvm

// SIScheduleBlock //

void SIScheduleBlock::addUnit(SUnit *SU) {
  NodeNum2Index[SU->NodeNum] = SUnits.size();
  SUnits.push_back(SU);
}

#ifndef NDEBUG
void SIScheduleBlock::traceCandidate(const SISchedCandidate &Cand) {

  dbgs() << "  SU(" << Cand.SU->NodeNum << ") " << getReasonStr(Cand.Reason);
  dbgs() << '\n';
}
#endif

void SIScheduleBlock::tryCandidateTopDown(SISchedCandidate &Cand,
                                          SISchedCandidate &TryCand) {
  // Initialize the candidate if needed.
  if (!Cand.isValid()) {
    TryCand.Reason = NodeOrder;
    return;
  }

  if (Cand.SGPRUsage > 60 &&
      SISched::tryLess(TryCand.SGPRUsage, Cand.SGPRUsage,
                       TryCand, Cand, RegUsage))
    return;

  // Schedule low latency instructions as close to the top as possible.
  // Order of priority is:
  // . Low latency instructions which do not depend on other low latency
  //   instructions we haven't waited for
  // . Other instructions which do not depend on low latency instructions
  //   we haven't waited for
  // . Low latencies
  // . All other instructions
  // Goal is to get: low latency instructions - independent instructions
  //     - (eventually some more low latency instructions)
  //     - instructions that depend on the first low latency instructions.
  // If there are a lot of constant loads in the block, the SGPR usage
  // could go quite high; thus the arbitrary limit of 60 above encourages
  // using the already loaded constants (in order to release some SGPRs) before
  // loading more.
  if (SISched::tryLess(TryCand.HasLowLatencyNonWaitedParent,
                       Cand.HasLowLatencyNonWaitedParent,
                       TryCand, Cand, SIScheduleCandReason::Depth))
    return;

  if (SISched::tryGreater(TryCand.IsLowLatency, Cand.IsLowLatency,
                          TryCand, Cand, SIScheduleCandReason::Depth))
    return;

  if (TryCand.IsLowLatency &&
      SISched::tryLess(TryCand.LowLatencyOffset, Cand.LowLatencyOffset,
                       TryCand, Cand, SIScheduleCandReason::Depth))
    return;

  if (SISched::tryLess(TryCand.VGPRUsage, Cand.VGPRUsage,
                       TryCand, Cand, RegUsage))
    return;

  // Fall through to original instruction order.
  if (TryCand.SU->NodeNum < Cand.SU->NodeNum) {
    TryCand.Reason = NodeOrder;
  }
}

SUnit* SIScheduleBlock::pickNode() {
  SISchedCandidate TopCand;

  for (SUnit* SU : TopReadySUs) {
    SISchedCandidate TryCand;
    std::vector<unsigned> pressure;
    std::vector<unsigned> MaxPressure;
    // Predict register usage after this instruction.
    TryCand.SU = SU;
    TopRPTracker.getDownwardPressure(SU->getInstr(), pressure, MaxPressure);
    TryCand.SGPRUsage = pressure[DAG->getSGPRSetID()];
    TryCand.VGPRUsage = pressure[DAG->getVGPRSetID()];
    TryCand.IsLowLatency = DAG->IsLowLatencySU[SU->NodeNum];
    TryCand.LowLatencyOffset = DAG->LowLatencyOffset[SU->NodeNum];
    TryCand.HasLowLatencyNonWaitedParent =
      HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]];
    tryCandidateTopDown(TopCand, TryCand);
    if (TryCand.Reason != NoCand)
      TopCand.setBest(TryCand);
  }

  return TopCand.SU;
}


// Schedule something valid.
void SIScheduleBlock::fastSchedule() {
  TopReadySUs.clear();
  if (Scheduled)
    undoSchedule();

  for (SUnit* SU : SUnits) {
    if (!SU->NumPredsLeft)
      TopReadySUs.push_back(SU);
  }

  while (!TopReadySUs.empty()) {
    SUnit *SU = TopReadySUs[0];
    ScheduledSUnits.push_back(SU);
    nodeScheduled(SU);
  }

  Scheduled = true;
}

// Returns whether the register was set between First and Last.
static bool isDefBetween(unsigned Reg,
                           SlotIndex First, SlotIndex Last,
                           const MachineRegisterInfo *MRI,
                           const LiveIntervals *LIS) {
  for (MachineRegisterInfo::def_instr_iterator
       UI = MRI->def_instr_begin(Reg),
       UE = MRI->def_instr_end(); UI != UE; ++UI) {
    const MachineInstr* MI = &*UI;
    if (MI->isDebugValue())
      continue;
    SlotIndex InstSlot = LIS->getInstructionIndex(*MI).getRegSlot();
    if (InstSlot >= First && InstSlot <= Last)
      return true;
  }
  return false;
}

void SIScheduleBlock::initRegPressure(MachineBasicBlock::iterator BeginBlock,
                                      MachineBasicBlock::iterator EndBlock) {
  IntervalPressure Pressure, BotPressure;
  RegPressureTracker RPTracker(Pressure), BotRPTracker(BotPressure);
  LiveIntervals *LIS = DAG->getLIS();
  MachineRegisterInfo *MRI = DAG->getMRI();
  DAG->initRPTracker(TopRPTracker);
  DAG->initRPTracker(BotRPTracker);
  DAG->initRPTracker(RPTracker);

  // Goes through all SUs. RPTracker captures what had to be alive for the SUs
  // to execute, and what is still alive at the end.
  for (SUnit* SU : ScheduledSUnits) {
    RPTracker.setPos(SU->getInstr());
    RPTracker.advance();
  }

  // Close the RPTracker to finalize live ins/outs.
  RPTracker.closeRegion();

  // Initialize the live ins and live outs.
  TopRPTracker.addLiveRegs(RPTracker.getPressure().LiveInRegs);
  BotRPTracker.addLiveRegs(RPTracker.getPressure().LiveOutRegs);

  // Do not track physical registers, because it messes up.
  for (const auto &RegMaskPair : RPTracker.getPressure().LiveInRegs) {
    if (Register::isVirtualRegister(RegMaskPair.RegUnit))
      LiveInRegs.insert(RegMaskPair.RegUnit);
  }
  LiveOutRegs.clear();
  // There are several possibilities to distinguish:
  // 1) Reg is not input to any instruction in the block, but is output of one
  // 2) 1) + read in the block and not needed after it
  // 3) 1) + read in the block but needed in another block
  // 4) Reg is input of an instruction but another block will read it too
  // 5) Reg is input of an instruction and then rewritten in the block.
  //    result is not read in the block (implies used in another block)
  // 6) Reg is input of an instruction and then rewritten in the block.
  //    result is read in the block and not needed in another block
  // 7) Reg is input of an instruction and then rewritten in the block.
  //    result is read in the block but also needed in another block
  // LiveInRegs will contain all the regs in situations 4, 5, 6, 7.
  // We want LiveOutRegs to contain only Regs whose content will be read after
  // in another block, and whose content was written in the current block,
  // that is we want it to get 1, 3, 5, 7.
  // Since we made the MIs of a block be packed all together before
  // scheduling, the LiveIntervals were correct, and the RPTracker was
  // able to correctly handle 5 vs 6, 2 vs 3.
  // (Note: This is not sufficient for the RPTracker not to make mistakes for
  // case 4.)
  // The RPTracker's LiveOutRegs has 1, 3, (some correct or incorrect) 4, 5, 7.
  // Comparing to LiveInRegs is not sufficient to differentiate 4 vs 5, 7.
  // The use of isDefBetween removes case 4.
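  // For illustration (register names hypothetical): a vreg %a that is live
  // into the block and rewritten by an instruction inside it falls into case
  // 5, 6 or 7; its last def lies inside [BeginBlock, EndBlock], so the
  // isDefBetween() check below keeps it when the RPTracker reports it live
  // out (cases 5 and 7). A vreg that is only read by the block (case 4) has
  // no def in that range and is filtered out.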
  for (const auto &RegMaskPair : RPTracker.getPressure().LiveOutRegs) {
    unsigned Reg = RegMaskPair.RegUnit;
    if (Register::isVirtualRegister(Reg) &&
        isDefBetween(Reg, LIS->getInstructionIndex(*BeginBlock).getRegSlot(),
                     LIS->getInstructionIndex(*EndBlock).getRegSlot(), MRI,
                     LIS)) {
      LiveOutRegs.insert(Reg);
    }
  }

  // Pressure = sum_alive_registers register size
  // Internally llvm will represent some registers as big 128 bits registers
  // for example, but they actually correspond to 4 actual 32 bits registers.
  // Thus Pressure is not equal to num_alive_registers * constant.
  LiveInPressure = TopPressure.MaxSetPressure;
  LiveOutPressure = BotPressure.MaxSetPressure;

  // Prepares TopRPTracker for top down scheduling.
  TopRPTracker.closeTop();
}

void SIScheduleBlock::schedule(MachineBasicBlock::iterator BeginBlock,
                               MachineBasicBlock::iterator EndBlock) {
  if (!Scheduled)
    fastSchedule();

  // PreScheduling phase to set LiveIn and LiveOut.
  initRegPressure(BeginBlock, EndBlock);
  undoSchedule();

  // Schedule for real now.

  TopReadySUs.clear();

  for (SUnit* SU : SUnits) {
    if (!SU->NumPredsLeft)
      TopReadySUs.push_back(SU);
  }

  while (!TopReadySUs.empty()) {
    SUnit *SU = pickNode();
    ScheduledSUnits.push_back(SU);
    TopRPTracker.setPos(SU->getInstr());
    TopRPTracker.advance();
    nodeScheduled(SU);
  }

  // TODO: compute InternalAdditionnalPressure.
  InternalAdditionnalPressure.resize(TopPressure.MaxSetPressure.size());

  // Check everything is right.
#ifndef NDEBUG
  assert(SUnits.size() == ScheduledSUnits.size() &&
            TopReadySUs.empty());
  for (SUnit* SU : SUnits) {
    assert(SU->isScheduled &&
              SU->NumPredsLeft == 0);
  }
#endif

  Scheduled = true;
}

void SIScheduleBlock::undoSchedule() {
  for (SUnit* SU : SUnits) {
    SU->isScheduled = false;
    for (SDep& Succ : SU->Succs) {
      if (BC->isSUInBlock(Succ.getSUnit(), ID))
        undoReleaseSucc(SU, &Succ);
    }
  }
  HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0);
  ScheduledSUnits.clear();
  Scheduled = false;
}

void SIScheduleBlock::undoReleaseSucc(SUnit *SU, SDep *SuccEdge) {
  SUnit *SuccSU = SuccEdge->getSUnit();

  if (SuccEdge->isWeak()) {
    ++SuccSU->WeakPredsLeft;
    return;
  }
  ++SuccSU->NumPredsLeft;
}

void SIScheduleBlock::releaseSucc(SUnit *SU, SDep *SuccEdge) {
  SUnit *SuccSU = SuccEdge->getSUnit();

  if (SuccEdge->isWeak()) {
    --SuccSU->WeakPredsLeft;
    return;
  }
#ifndef NDEBUG
  if (SuccSU->NumPredsLeft == 0) {
    dbgs() << "*** Scheduling failed! ***\n";
    DAG->dumpNode(*SuccSU);
    dbgs() << " has been released too many times!\n";
    llvm_unreachable(nullptr);
  }
#endif

  --SuccSU->NumPredsLeft;
}

/// Release the successors of SU that are either inside the block
/// (InOrOutBlock true) or outside of it (InOrOutBlock false).
void SIScheduleBlock::releaseSuccessors(SUnit *SU, bool InOrOutBlock) {
  for (SDep& Succ : SU->Succs) {
    SUnit *SuccSU = Succ.getSUnit();

    if (SuccSU->NodeNum >= DAG->SUnits.size())
      continue;

    if (BC->isSUInBlock(SuccSU, ID) != InOrOutBlock)
      continue;

    releaseSucc(SU, &Succ);
    if (SuccSU->NumPredsLeft == 0 && InOrOutBlock)
      TopReadySUs.push_back(SuccSU);
  }
}

void SIScheduleBlock::nodeScheduled(SUnit *SU) {
  // Is in TopReadySUs
  assert(!SU->NumPredsLeft);
  std::vector<SUnit *>::iterator I = llvm::find(TopReadySUs, SU);
  if (I == TopReadySUs.end()) {
    dbgs() << "Data Structure Bug in SI Scheduler\n";
    llvm_unreachable(nullptr);
  }
  TopReadySUs.erase(I);

  releaseSuccessors(SU, true);
  // Scheduling this node will trigger a wait,
  // thus propagate to other instructions that they do not need to wait either.
  if (HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]])
    HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0);

  if (DAG->IsLowLatencySU[SU->NodeNum]) {
    for (SDep& Succ : SU->Succs) {
      std::map<unsigned, unsigned>::iterator I =
        NodeNum2Index.find(Succ.getSUnit()->NodeNum);
      if (I != NodeNum2Index.end())
        HasLowLatencyNonWaitedParent[I->second] = 1;
    }
  }
  SU->isScheduled = true;
}

void SIScheduleBlock::finalizeUnits() {
  // We remove links from outside blocks to enable scheduling inside the block.
  for (SUnit* SU : SUnits) {
    releaseSuccessors(SU, false);
    if (DAG->IsHighLatencySU[SU->NodeNum])
      HighLatencyBlock = true;
  }
  HasLowLatencyNonWaitedParent.resize(SUnits.size(), 0);
}

// We maintain ascending order of IDs.
void SIScheduleBlock::addPred(SIScheduleBlock *Pred) {
  unsigned PredID = Pred->getID();

  // Check if not already predecessor.
  for (SIScheduleBlock* P : Preds) {
    if (PredID == P->getID())
      return;
  }
  Preds.push_back(Pred);

  assert(none_of(Succs,
                 [=](std::pair<SIScheduleBlock*,
                     SIScheduleBlockLinkKind> S) {
                   return PredID == S.first->getID();
                 }) &&
         "Loop in the Block Graph!");
}

void SIScheduleBlock::addSucc(SIScheduleBlock *Succ,
                              SIScheduleBlockLinkKind Kind) {
  unsigned SuccID = Succ->getID();

  // Check if not already successor.
  for (std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> &S : Succs) {
    if (SuccID == S.first->getID()) {
      if (S.second == SIScheduleBlockLinkKind::NoData &&
          Kind == SIScheduleBlockLinkKind::Data)
        S.second = Kind;
      return;
    }
  }
  if (Succ->isHighLatencyBlock())
    ++NumHighLatencySuccessors;
  Succs.push_back(std::make_pair(Succ, Kind));

  assert(none_of(Preds,
                 [=](SIScheduleBlock *P) { return SuccID == P->getID(); }) &&
         "Loop in the Block Graph!");
}

#ifndef NDEBUG
void SIScheduleBlock::printDebug(bool full) {
  dbgs() << "Block (" << ID << ")\n";
  if (!full)
    return;

  dbgs() << "\nContains High Latency Instruction: "
         << HighLatencyBlock << '\n';
  dbgs() << "\nDepends On:\n";
  for (SIScheduleBlock* P : Preds) {
    P->printDebug(false);
  }

  dbgs() << "\nSuccessors:\n";
  for (std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> S : Succs) {
    if (S.second == SIScheduleBlockLinkKind::Data)
      dbgs() << "(Data Dep) ";
    S.first->printDebug(false);
  }

  if (Scheduled) {
    dbgs() << "LiveInPressure " << LiveInPressure[DAG->getSGPRSetID()] << ' '
           << LiveInPressure[DAG->getVGPRSetID()] << '\n';
    dbgs() << "LiveOutPressure " << LiveOutPressure[DAG->getSGPRSetID()] << ' '
           << LiveOutPressure[DAG->getVGPRSetID()] << "\n\n";
    dbgs() << "LiveIns:\n";
    for (unsigned Reg : LiveInRegs)
      dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' ';

    dbgs() << "\nLiveOuts:\n";
    for (unsigned Reg : LiveOutRegs)
      dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' ';
  }

  dbgs() << "\nInstructions:\n";
  for (const SUnit* SU : SUnits)
    DAG->dumpNode(*SU);

  dbgs() << "///////////////////////\n";
}
#endif

// SIScheduleBlockCreator //

SIScheduleBlockCreator::SIScheduleBlockCreator(SIScheduleDAGMI *DAG)
    : DAG(DAG) {}

SIScheduleBlocks
SIScheduleBlockCreator::getBlocks(SISchedulerBlockCreatorVariant BlockVariant) {
  std::map<SISchedulerBlockCreatorVariant, SIScheduleBlocks>::iterator B =
    Blocks.find(BlockVariant);
  if (B == Blocks.end()) {
    SIScheduleBlocks Res;
    createBlocksForVariant(BlockVariant);
    topologicalSort();
    scheduleInsideBlocks();
    fillStats();
    Res.Blocks = CurrentBlocks;
    Res.TopDownIndex2Block = TopDownIndex2Block;
    Res.TopDownBlock2Index = TopDownBlock2Index;
    Blocks[BlockVariant] = Res;
    return Res;
  } else {
    return B->second;
  }
}

bool SIScheduleBlockCreator::isSUInBlock(SUnit *SU, unsigned ID) {
  if (SU->NodeNum >= DAG->SUnits.size())
    return false;
  return CurrentBlocks[Node2CurrentBlock[SU->NodeNum]]->getID() == ID;
}

void SIScheduleBlockCreator::colorHighLatenciesAlone() {
  unsigned DAGSize = DAG->SUnits.size();

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    if (DAG->IsHighLatencySU[SU->NodeNum]) {
      CurrentColoring[SU->NodeNum] = NextReservedID++;
    }
  }
}

static bool
hasDataDependencyPred(const SUnit &SU, const SUnit &FromSU) {
  for (const auto &PredDep : SU.Preds) {
    if (PredDep.getSUnit() == &FromSU &&
        PredDep.getKind() == llvm::SDep::Data)
      return true;
  }
  return false;
}

void SIScheduleBlockCreator::colorHighLatenciesGroups() {
  unsigned DAGSize = DAG->SUnits.size();
  unsigned NumHighLatencies = 0;
  unsigned GroupSize;
  int Color = NextReservedID;
  unsigned Count = 0;
  std::set<unsigned> FormingGroup;

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    if (DAG->IsHighLatencySU[SU->NodeNum])
      ++NumHighLatencies;
  }

  if (NumHighLatencies == 0)
    return;

  if (NumHighLatencies <= 6)
    GroupSize = 2;
  else if (NumHighLatencies <= 12)
    GroupSize = 3;
  else
    GroupSize = 4;

  for (unsigned SUNum : DAG->TopDownIndex2SU) {
    const SUnit &SU = DAG->SUnits[SUNum];
    if (DAG->IsHighLatencySU[SU.NodeNum]) {
      bool CompatibleGroup = true;
      int ProposedColor = Color;
      std::vector<int> AdditionalElements;

      // We don't want to put in the same block
      // two high latency instructions that depend
      // on each other.
      // One way would be to check canAddEdge
      // in both directions, but that currently is not
      // enough because the high latency order is
      // enforced (via links).
      // Instead, look at the dependencies between the
      // high latency instructions and deduce if it is
      // a data dependency or not.
      for (unsigned j : FormingGroup) {
        bool HasSubGraph;
        std::vector<int> SubGraph;
        // By construction (topological order), if SU and
        // DAG->SUnits[j] are linked, DAG->SUnits[j] is necessarily
        // in the parent graph of SU.
#ifndef NDEBUG
        SubGraph = DAG->GetTopo()->GetSubGraph(SU, DAG->SUnits[j],
                                               HasSubGraph);
        assert(!HasSubGraph);
#endif
        SubGraph = DAG->GetTopo()->GetSubGraph(DAG->SUnits[j], SU,
                                               HasSubGraph);
        if (!HasSubGraph)
          continue; // No dependencies between each other
        else if (SubGraph.size() > 5) {
          // Too many elements would be required to be added to the block.
          CompatibleGroup = false;
          break;
        } else {
          // Check the type of dependency
          for (unsigned k : SubGraph) {
            // If in the path to join the two instructions
            // there is another high latency instruction,
            // or instructions colored for another block,
            // abort the merge.
            if (DAG->IsHighLatencySU[k] ||
                (CurrentColoring[k] != ProposedColor &&
                 CurrentColoring[k] != 0)) {
              CompatibleGroup = false;
              break;
            }
            // If one of the SUs in the subgraph depends on the result of SU j,
            // there'll be a data dependency.
            if (hasDataDependencyPred(DAG->SUnits[k], DAG->SUnits[j])) {
              CompatibleGroup = false;
              break;
            }
          }
          if (!CompatibleGroup)
            break;
          // Same check for the SU.
          if (hasDataDependencyPred(SU, DAG->SUnits[j])) {
            CompatibleGroup = false;
            break;
          }
          // Add all the required instructions to the block.
          // These cannot live in another block (because they
          // depend (order dependency) on one of the
          // instructions in the block, and are required for the
          // high latency instruction we add).
          AdditionalElements.insert(AdditionalElements.end(),
                                    SubGraph.begin(), SubGraph.end());
        }
      }
      if (CompatibleGroup) {
        FormingGroup.insert(SU.NodeNum);
        for (unsigned j : AdditionalElements)
          CurrentColoring[j] = ProposedColor;
        CurrentColoring[SU.NodeNum] = ProposedColor;
        ++Count;
      }
      // Found one incompatible instruction,
      // or has filled a big enough group.
      // -> start a new one.
      if (!CompatibleGroup) {
        FormingGroup.clear();
        Color = ++NextReservedID;
        ProposedColor = Color;
        FormingGroup.insert(SU.NodeNum);
        CurrentColoring[SU.NodeNum] = ProposedColor;
        Count = 0;
      } else if (Count == GroupSize) {
        FormingGroup.clear();
        Color = ++NextReservedID;
        ProposedColor = Color;
        Count = 0;
      }
    }
  }
}

void SIScheduleBlockCreator::colorComputeReservedDependencies() {
  unsigned DAGSize = DAG->SUnits.size();
  std::map<std::set<unsigned>, unsigned> ColorCombinations;

  CurrentTopDownReservedDependencyColoring.clear();
  CurrentBottomUpReservedDependencyColoring.clear();

  CurrentTopDownReservedDependencyColoring.resize(DAGSize, 0);
  CurrentBottomUpReservedDependencyColoring.resize(DAGSize, 0);

  // Traverse TopDown, and give different colors to SUs depending
  // on which combination of High Latencies they depend on.

  for (unsigned SUNum : DAG->TopDownIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    // Already given.
    if (CurrentColoring[SU->NodeNum]) {
      CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
        CurrentColoring[SU->NodeNum];
      continue;
    }

    for (SDep& PredDep : SU->Preds) {
      SUnit *Pred = PredDep.getSUnit();
      if (PredDep.isWeak() || Pred->NodeNum >= DAGSize)
        continue;
      if (CurrentTopDownReservedDependencyColoring[Pred->NodeNum] > 0)
        SUColors.insert(CurrentTopDownReservedDependencyColoring[Pred->NodeNum]);
    }
    // Color 0 by default.
    if (SUColors.empty())
      continue;
    // Same color as parents.
    if (SUColors.size() == 1 && *SUColors.begin() > DAGSize)
      CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
        *SUColors.begin();
    else {
      std::map<std::set<unsigned>, unsigned>::iterator Pos =
        ColorCombinations.find(SUColors);
      if (Pos != ColorCombinations.end()) {
        CurrentTopDownReservedDependencyColoring[SU->NodeNum] = Pos->second;
      } else {
        CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
          NextNonReservedID;
        ColorCombinations[SUColors] = NextNonReservedID++;
      }
    }
  }

  ColorCombinations.clear();

  // Same as before, but BottomUp.

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    // Already given.
    if (CurrentColoring[SU->NodeNum]) {
      CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
        CurrentColoring[SU->NodeNum];
      continue;
    }

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0)
        SUColors.insert(CurrentBottomUpReservedDependencyColoring[Succ->NodeNum]);
    }
    // Keep color 0.
    if (SUColors.empty())
      continue;
    // Same color as parents.
    if (SUColors.size() == 1 && *SUColors.begin() > DAGSize)
      CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
        *SUColors.begin();
    else {
      std::map<std::set<unsigned>, unsigned>::iterator Pos =
        ColorCombinations.find(SUColors);
      if (Pos != ColorCombinations.end()) {
        CurrentBottomUpReservedDependencyColoring[SU->NodeNum] = Pos->second;
      } else {
        CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
          NextNonReservedID;
        ColorCombinations[SUColors] = NextNonReservedID++;
      }
    }
  }
}

void SIScheduleBlockCreator::colorAccordingToReservedDependencies() {
  unsigned DAGSize = DAG->SUnits.size();
  std::map<std::pair<unsigned, unsigned>, unsigned> ColorCombinations;

  // Every combination of colors given by the top down
  // and bottom up Reserved node dependency colorings gets its own color.

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    std::pair<unsigned, unsigned> SUColors;

    // High latency instructions: already given.
    if (CurrentColoring[SU->NodeNum])
      continue;

    SUColors.first = CurrentTopDownReservedDependencyColoring[SU->NodeNum];
    SUColors.second = CurrentBottomUpReservedDependencyColoring[SU->NodeNum];

    std::map<std::pair<unsigned, unsigned>, unsigned>::iterator Pos =
      ColorCombinations.find(SUColors);
    if (Pos != ColorCombinations.end()) {
      CurrentColoring[SU->NodeNum] = Pos->second;
    } else {
      CurrentColoring[SU->NodeNum] = NextNonReservedID;
      ColorCombinations[SUColors] = NextNonReservedID++;
    }
  }
}

void SIScheduleBlockCreator::colorEndsAccordingToDependencies() {
  unsigned DAGSize = DAG->SUnits.size();
  std::vector<int> PendingColoring = CurrentColoring;

  assert(DAGSize >= 1 &&
         CurrentBottomUpReservedDependencyColoring.size() == DAGSize &&
         CurrentTopDownReservedDependencyColoring.size() == DAGSize);
  // If there is no reserved block at all, do nothing. We don't want
  // everything in one block.
  if (*std::max_element(CurrentBottomUpReservedDependencyColoring.begin(),
                        CurrentBottomUpReservedDependencyColoring.end()) == 0 &&
      *std::max_element(CurrentTopDownReservedDependencyColoring.begin(),
                        CurrentTopDownReservedDependencyColoring.end()) == 0)
    return;

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;
    std::set<unsigned> SUColorsPending;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    if (CurrentBottomUpReservedDependencyColoring[SU->NodeNum] > 0 ||
        CurrentTopDownReservedDependencyColoring[SU->NodeNum] > 0)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0 ||
          CurrentTopDownReservedDependencyColoring[Succ->NodeNum] > 0)
        SUColors.insert(CurrentColoring[Succ->NodeNum]);
      SUColorsPending.insert(PendingColoring[Succ->NodeNum]);
    }
    // If there is only one child/parent block, and that block
    // is not among the ones we are removing in this path, then
    // merge the instruction to that block
    if (SUColors.size() == 1 && SUColorsPending.size() == 1)
      PendingColoring[SU->NodeNum] = *SUColors.begin();
    else // TODO: Attribute new colors depending on color
         // combination of children.
      PendingColoring[SU->NodeNum] = NextNonReservedID++;
  }
  CurrentColoring = PendingColoring;
}


void SIScheduleBlockCreator::colorForceConsecutiveOrderInGroup() {
  unsigned DAGSize = DAG->SUnits.size();
  unsigned PreviousColor;
  std::set<unsigned> SeenColors;

  if (DAGSize <= 1)
    return;

  PreviousColor = CurrentColoring[0];

  for (unsigned i = 1, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    unsigned CurrentColor = CurrentColoring[i];
    unsigned PreviousColorSave = PreviousColor;
    assert(i == SU->NodeNum);

    if (CurrentColor != PreviousColor)
      SeenColors.insert(PreviousColor);
    PreviousColor = CurrentColor;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    if (SeenColors.find(CurrentColor) == SeenColors.end())
      continue;

    if (PreviousColorSave != CurrentColor)
      CurrentColoring[i] = NextNonReservedID++;
    else
      CurrentColoring[i] = CurrentColoring[i-1];
  }
}

void SIScheduleBlockCreator::colorMergeConstantLoadsNextGroup() {
  unsigned DAGSize = DAG->SUnits.size();

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    // No predecessor: VGPR constant loading.
    // Low latency instructions usually have a predecessor (the address).
    if (SU->Preds.size() > 0 && !DAG->IsLowLatencySU[SU->NodeNum])
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      SUColors.insert(CurrentColoring[Succ->NodeNum]);
    }
    if (SUColors.size() == 1)
      CurrentColoring[SU->NodeNum] = *SUColors.begin();
  }
}

void SIScheduleBlockCreator::colorMergeIfPossibleNextGroup() {
  unsigned DAGSize = DAG->SUnits.size();

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      SUColors.insert(CurrentColoring[Succ->NodeNum]);
    }
    if (SUColors.size() == 1)
      CurrentColoring[SU->NodeNum] = *SUColors.begin();
  }
}

void SIScheduleBlockCreator::colorMergeIfPossibleNextGroupOnlyForReserved() {
  unsigned DAGSize = DAG->SUnits.size();

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      SUColors.insert(CurrentColoring[Succ->NodeNum]);
    }
    if (SUColors.size() == 1 && *SUColors.begin() <= DAGSize)
      CurrentColoring[SU->NodeNum] = *SUColors.begin();
  }
}

void SIScheduleBlockCreator::colorMergeIfPossibleSmallGroupsToNextGroup() {
  unsigned DAGSize = DAG->SUnits.size();
  std::map<unsigned, unsigned> ColorCount;

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    unsigned color = CurrentColoring[SU->NodeNum];
    ++ColorCount[color];
  }

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    unsigned color = CurrentColoring[SU->NodeNum];
    std::set<unsigned> SUColors;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    if (ColorCount[color] > 1)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      SUColors.insert(CurrentColoring[Succ->NodeNum]);
    }
    if (SUColors.size() == 1 && *SUColors.begin() != color) {
      --ColorCount[color];
      CurrentColoring[SU->NodeNum] = *SUColors.begin();
      ++ColorCount[*SUColors.begin()];
    }
  }
}

void SIScheduleBlockCreator::cutHugeBlocks() {
  // TODO
}

void SIScheduleBlockCreator::regroupNoUserInstructions() {
  unsigned DAGSize = DAG->SUnits.size();
  int GroupID = NextNonReservedID++;

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    bool hasSuccessor = false;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      hasSuccessor = true;
    }
    if (!hasSuccessor)
      CurrentColoring[SU->NodeNum] = GroupID;
  }
}

void SIScheduleBlockCreator::colorExports() {
  unsigned ExportColor = NextNonReservedID++;
  SmallVector<unsigned, 8> ExpGroup;

  // Put all exports together in a block.
  // The block will naturally end up being scheduled last,
  // thus putting exports at the end of the schedule, which
  // is better for performance.
  // However we must ensure, for safety, that the exports can be put
  // together in the same block without any other instruction.
  // This could happen, for example, when scheduling after regalloc
  // if reloading a spilled register from memory using the same
  // register as the one used in a previous export.
  // If that happens, do not regroup the exports.
  for (unsigned SUNum : DAG->TopDownIndex2SU) {
    const SUnit &SU = DAG->SUnits[SUNum];
    if (SIInstrInfo::isEXP(*SU.getInstr())) {
      // Check the EXP can be added to the group safely,
      // ie without needing any other instruction.
      // The EXP is allowed to depend on other EXPs
      // (they will be in the same group).
      for (unsigned j : ExpGroup) {
        bool HasSubGraph;
        std::vector<int> SubGraph;
        // By construction (topological order), if SU and
        // DAG->SUnits[j] are linked, DAG->SUnits[j] is necessarily
        // in the parent graph of SU.
#ifndef NDEBUG
        SubGraph = DAG->GetTopo()->GetSubGraph(SU, DAG->SUnits[j],
                                               HasSubGraph);
        assert(!HasSubGraph);
#endif
        SubGraph = DAG->GetTopo()->GetSubGraph(DAG->SUnits[j], SU,
                                               HasSubGraph);
        if (!HasSubGraph)
          continue; // No dependencies between each other

        // SubGraph contains all the instructions required
        // between EXP SUnits[j] and EXP SU.
        for (unsigned k : SubGraph) {
          if (!SIInstrInfo::isEXP(*DAG->SUnits[k].getInstr()))
            // Other instructions than EXP would be required in the group.
            // Abort the grouping.
            return;
        }
      }

      ExpGroup.push_back(SUNum);
    }
  }

  // The group can be formed. Give the color.
  for (unsigned j : ExpGroup)
    CurrentColoring[j] = ExportColor;
}
1184327952Sdim
1185293838Sdimvoid SIScheduleBlockCreator::createBlocksForVariant(SISchedulerBlockCreatorVariant BlockVariant) {
1186293838Sdim  unsigned DAGSize = DAG->SUnits.size();
1187293838Sdim  std::map<unsigned,unsigned> RealID;
1188293838Sdim
1189293838Sdim  CurrentBlocks.clear();
1190293838Sdim  CurrentColoring.clear();
1191293838Sdim  CurrentColoring.resize(DAGSize, 0);
1192293838Sdim  Node2CurrentBlock.clear();
1193293838Sdim
1194293838Sdim  // Restore links previous scheduling variant has overridden.
1195293838Sdim  DAG->restoreSULinksLeft();
1196293838Sdim
1197293838Sdim  NextReservedID = 1;
1198293838Sdim  NextNonReservedID = DAGSize + 1;
1199293838Sdim
1200341825Sdim  LLVM_DEBUG(dbgs() << "Coloring the graph\n");
1201293838Sdim
1202293838Sdim  if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesGrouped)
1203293838Sdim    colorHighLatenciesGroups();
1204293838Sdim  else
1205293838Sdim    colorHighLatenciesAlone();
1206293838Sdim  colorComputeReservedDependencies();
1207293838Sdim  colorAccordingToReservedDependencies();
1208293838Sdim  colorEndsAccordingToDependencies();
1209293838Sdim  if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesAlonePlusConsecutive)
1210293838Sdim    colorForceConsecutiveOrderInGroup();
1211293838Sdim  regroupNoUserInstructions();
1212293838Sdim  colorMergeConstantLoadsNextGroup();
1213293838Sdim  colorMergeIfPossibleNextGroupOnlyForReserved();
1214327952Sdim  colorExports();
1215293838Sdim
1216293838Sdim  // Put SUs of same color into same block
1217293838Sdim  Node2CurrentBlock.resize(DAGSize, -1);
1218293838Sdim  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
1219293838Sdim    SUnit *SU = &DAG->SUnits[i];
1220293838Sdim    unsigned Color = CurrentColoring[SU->NodeNum];
1221293838Sdim    if (RealID.find(Color) == RealID.end()) {
1222293838Sdim      int ID = CurrentBlocks.size();
1223360784Sdim      BlockPtrs.push_back(std::make_unique<SIScheduleBlock>(DAG, this, ID));
1224293838Sdim      CurrentBlocks.push_back(BlockPtrs.rbegin()->get());
1225293838Sdim      RealID[Color] = ID;
1226293838Sdim    }
1227293838Sdim    CurrentBlocks[RealID[Color]]->addUnit(SU);
1228293838Sdim    Node2CurrentBlock[SU->NodeNum] = RealID[Color];
1229293838Sdim  }
1230293838Sdim
1231293838Sdim  // Build dependencies between blocks.
1232293838Sdim  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
1233293838Sdim    SUnit *SU = &DAG->SUnits[i];
1234293838Sdim    int SUID = Node2CurrentBlock[i];
1235293838Sdim    for (SDep& SuccDep : SU->Succs) {
1236293838Sdim      SUnit *Succ = SuccDep.getSUnit();
1237293838Sdim      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
1238293838Sdim        continue;
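      // Control-only dependencies become NoData block links; data
      // dependencies become Data links, which the block scheduler later
      // uses to track high-latency parents.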
1239293838Sdim      if (Node2CurrentBlock[Succ->NodeNum] != SUID)
1240321369Sdim        CurrentBlocks[SUID]->addSucc(CurrentBlocks[Node2CurrentBlock[Succ->NodeNum]],
1241321369Sdim                                     SuccDep.isCtrl() ? NoData : Data);
1242293838Sdim    }
1243293838Sdim    for (SDep& PredDep : SU->Preds) {
1244293838Sdim      SUnit *Pred = PredDep.getSUnit();
1245293838Sdim      if (PredDep.isWeak() || Pred->NodeNum >= DAGSize)
1246293838Sdim        continue;
1247293838Sdim      if (Node2CurrentBlock[Pred->NodeNum] != SUID)
1248293838Sdim        CurrentBlocks[SUID]->addPred(CurrentBlocks[Node2CurrentBlock[Pred->NodeNum]]);
1249293838Sdim    }
1250293838Sdim  }
1251293838Sdim
1252293838Sdim  // Free root and leafs of all blocks to enable scheduling inside them.
1253293838Sdim  for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
1254293838Sdim    SIScheduleBlock *Block = CurrentBlocks[i];
1255293838Sdim    Block->finalizeUnits();
1256293838Sdim  }
1257341825Sdim  LLVM_DEBUG(dbgs() << "Blocks created:\n\n";
1258341825Sdim             for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
1259341825Sdim               SIScheduleBlock *Block = CurrentBlocks[i];
1260341825Sdim               Block->printDebug(true);
1261341825Sdim             });
1262293838Sdim}
1263293838Sdim
1264293838Sdim// Helper function taken from CodeGen/MachineScheduler.cpp.
1265293838Sdim
1266314564Sdim/// Skip debug instructions; return the first non-debug instruction in [I, End), or End.
1267314564Sdimstatic MachineBasicBlock::iterator
1268314564SdimnextIfDebug(MachineBasicBlock::iterator I,
1269293838Sdim            MachineBasicBlock::const_iterator End) {
1270314564Sdim  for (; I != End; ++I) {
1271341825Sdim    if (!I->isDebugInstr())
1272293838Sdim      break;
1273293838Sdim  }
1274293838Sdim  return I;
1275293838Sdim}
1276293838Sdim
1277293838Sdimvoid SIScheduleBlockCreator::topologicalSort() {
1278293838Sdim  unsigned DAGSize = CurrentBlocks.size();
1279293838Sdim  std::vector<int> WorkList;
1280293838Sdim
1281341825Sdim  LLVM_DEBUG(dbgs() << "Topological Sort\n");
1282293838Sdim
1283293838Sdim  WorkList.reserve(DAGSize);
1284293838Sdim  TopDownIndex2Block.resize(DAGSize);
1285293838Sdim  TopDownBlock2Index.resize(DAGSize);
1286293838Sdim  BottomUpIndex2Block.resize(DAGSize);
1287293838Sdim
1288293838Sdim  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
1289293838Sdim    SIScheduleBlock *Block = CurrentBlocks[i];
1290293838Sdim    unsigned Degree = Block->getSuccs().size();
1291293838Sdim    TopDownBlock2Index[i] = Degree;
1292293838Sdim    if (Degree == 0) {
1293293838Sdim      WorkList.push_back(i);
1294293838Sdim    }
1295293838Sdim  }
1296293838Sdim
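  // Kahn-style topological sort done from the leaves: blocks without
  // successors are processed first and indices are assigned from DAGSize
  // downwards, so each block gets a strictly higher top-down index than
  // any of its predecessors.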
1297293838Sdim  int Id = DAGSize;
1298293838Sdim  while (!WorkList.empty()) {
1299293838Sdim    int i = WorkList.back();
1300293838Sdim    SIScheduleBlock *Block = CurrentBlocks[i];
1301293838Sdim    WorkList.pop_back();
1302293838Sdim    TopDownBlock2Index[i] = --Id;
1303293838Sdim    TopDownIndex2Block[Id] = i;
1304293838Sdim    for (SIScheduleBlock* Pred : Block->getPreds()) {
1305293838Sdim      if (!--TopDownBlock2Index[Pred->getID()])
1306293838Sdim        WorkList.push_back(Pred->getID());
1307293838Sdim    }
1308293838Sdim  }
1309293838Sdim
1310293838Sdim#ifndef NDEBUG
1311293838Sdim  // Check correctness of the ordering.
1312293838Sdim  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
1313293838Sdim    SIScheduleBlock *Block = CurrentBlocks[i];
1314293838Sdim    for (SIScheduleBlock* Pred : Block->getPreds()) {
1315293838Sdim      assert(TopDownBlock2Index[i] > TopDownBlock2Index[Pred->getID()] &&
1316293838Sdim      "Wrong Top Down topological sorting");
1317293838Sdim    }
1318293838Sdim  }
1319293838Sdim#endif
1320293838Sdim
1321293838Sdim  BottomUpIndex2Block = std::vector<int>(TopDownIndex2Block.rbegin(),
1322293838Sdim                                         TopDownIndex2Block.rend());
1323293838Sdim}
1324293838Sdim
1325293838Sdimvoid SIScheduleBlockCreator::scheduleInsideBlocks() {
1326293838Sdim  unsigned DAGSize = CurrentBlocks.size();
1327293838Sdim
1328341825Sdim  LLVM_DEBUG(dbgs() << "\nScheduling Blocks\n\n");
1329293838Sdim
1330293838Sdim  // First produce a valid schedule in which every Block corresponds
1331293838Sdim  // to a contiguous range of instructions.
1332341825Sdim  LLVM_DEBUG(dbgs() << "First phase: Fast scheduling for Reg Liveness\n");
1333293838Sdim  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
1334293838Sdim    SIScheduleBlock *Block = CurrentBlocks[i];
1335293838Sdim    Block->fastSchedule();
1336293838Sdim  }
1337293838Sdim
1338293838Sdim  // Note: the following code, and the part restoring the previous positions,
1339293838Sdim  // are by far the most expensive operations of the scheduler.
1340293838Sdim
1341293838Sdim  // Do not update CurrentTop.
1342293838Sdim  MachineBasicBlock::iterator CurrentTopFastSched = DAG->getCurrentTop();
1343293838Sdim  std::vector<MachineBasicBlock::iterator> PosOld;
1344293838Sdim  std::vector<MachineBasicBlock::iterator> PosNew;
1345293838Sdim  PosOld.reserve(DAG->SUnits.size());
1346293838Sdim  PosNew.reserve(DAG->SUnits.size());
1347293838Sdim
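  // First pass: splice the instructions so that the MIs of each block become
  // contiguous in the instruction stream, following the top-down block order.
  // PosOld/PosNew record every move so that the original order can be
  // restored after the final intra-block scheduling (see below).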
1348293838Sdim  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
1349293838Sdim    int BlockIndice = TopDownIndex2Block[i];
1350293838Sdim    SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
1351293838Sdim    std::vector<SUnit*> SUs = Block->getScheduledUnits();
1352293838Sdim
1353293838Sdim    for (SUnit* SU : SUs) {
1354293838Sdim      MachineInstr *MI = SU->getInstr();
1355293838Sdim      MachineBasicBlock::iterator Pos = MI;
1356293838Sdim      PosOld.push_back(Pos);
1357293838Sdim      if (&*CurrentTopFastSched == MI) {
1358293838Sdim        PosNew.push_back(Pos);
1359293838Sdim        CurrentTopFastSched = nextIfDebug(++CurrentTopFastSched,
1360293838Sdim                                          DAG->getCurrentBottom());
1361293838Sdim      } else {
1362293838Sdim        // Update the instruction stream.
1363293838Sdim        DAG->getBB()->splice(CurrentTopFastSched, DAG->getBB(), MI);
1364293838Sdim
1365293838Sdim        // Update LiveIntervals.
1366314564Sdim        // Note: Moving all instructions and calling handleMove every time
1367293838Sdim        // is the most CPU-intensive operation of the scheduler.
1368293838Sdim        // It would gain a lot if there were a way to recompute the
1369293838Sdim        // LiveIntervals for the entire scheduling region at once.
1370309124Sdim        DAG->getLIS()->handleMove(*MI, /*UpdateFlags=*/true);
1371293838Sdim        PosNew.push_back(CurrentTopFastSched);
1372293838Sdim      }
1373293838Sdim    }
1374293838Sdim  }
1375293838Sdim
1376293838Sdim  // Now each Block of SUs corresponds to a contiguous block of MIs.
1377293838Sdim  // Do the final schedule for the instructions inside each block.
1378293838Sdim  // The property that all the SUs of a Block are contiguous as MIs
1379293838Sdim  // is required for correct register usage tracking.
1380293838Sdim  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
1381293838Sdim    SIScheduleBlock *Block = CurrentBlocks[i];
1382293838Sdim    std::vector<SUnit*> SUs = Block->getScheduledUnits();
1383293838Sdim    Block->schedule((*SUs.begin())->getInstr(), (*SUs.rbegin())->getInstr());
1384293838Sdim  }
1385293838Sdim
1386341825Sdim  LLVM_DEBUG(dbgs() << "Restoring MI Pos\n");
1387293838Sdim  // Restore old ordering (which prevents a LIS->handleMove bug).
1388293838Sdim  for (unsigned i = PosOld.size(), e = 0; i != e; --i) {
1389293838Sdim    MachineBasicBlock::iterator POld = PosOld[i-1];
1390293838Sdim    MachineBasicBlock::iterator PNew = PosNew[i-1];
1391293838Sdim    if (PNew != POld) {
1392293838Sdim      // Update the instruction stream.
1393293838Sdim      DAG->getBB()->splice(POld, DAG->getBB(), PNew);
1394293838Sdim
1395293838Sdim      // Update LiveIntervals.
1396309124Sdim      DAG->getLIS()->handleMove(*POld, /*UpdateFlags=*/true);
1397293838Sdim    }
1398293838Sdim  }
1399293838Sdim
1400341825Sdim  LLVM_DEBUG(for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
1401341825Sdim    SIScheduleBlock *Block = CurrentBlocks[i];
1402341825Sdim    Block->printDebug(true);
1403341825Sdim  });
1404293838Sdim}
1405293838Sdim
1406293838Sdimvoid SIScheduleBlockCreator::fillStats() {
1407293838Sdim  unsigned DAGSize = CurrentBlocks.size();
1408293838Sdim
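  // A block's Depth is the cumulative cost of the longest chain of
  // predecessor blocks, and Height the symmetric value over successor
  // blocks. Depth is computed in top-down order and Height in bottom-up
  // order, so the values read from neighbouring blocks are already final.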
1409293838Sdim  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
1410293838Sdim    int BlockIndice = TopDownIndex2Block[i];
1411293838Sdim    SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
1412314564Sdim    if (Block->getPreds().empty())
1413293838Sdim      Block->Depth = 0;
1414293838Sdim    else {
1415293838Sdim      unsigned Depth = 0;
1416293838Sdim      for (SIScheduleBlock *Pred : Block->getPreds()) {
1417327952Sdim        if (Depth < Pred->Depth + Pred->getCost())
1418327952Sdim          Depth = Pred->Depth + Pred->getCost();
1419293838Sdim      }
1420293838Sdim      Block->Depth = Depth;
1421293838Sdim    }
1422293838Sdim  }
1423293838Sdim
1424293838Sdim  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
1425293838Sdim    int BlockIndice = BottomUpIndex2Block[i];
1426293838Sdim    SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
1427314564Sdim    if (Block->getSuccs().empty())
1428293838Sdim      Block->Height = 0;
1429293838Sdim    else {
1430293838Sdim      unsigned Height = 0;
1431321369Sdim      for (const auto &Succ : Block->getSuccs())
1432327952Sdim        Height = std::max(Height, Succ.first->Height + Succ.first->getCost());
1433293838Sdim      Block->Height = Height;
1434293838Sdim    }
1435293838Sdim  }
1436293838Sdim}
1437293838Sdim
1438293838Sdim// SIScheduleBlockScheduler //
1439293838Sdim
1440293838SdimSIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG,
1441293838Sdim                                                   SISchedulerBlockSchedulerVariant Variant,
1442293838Sdim                                                   SIScheduleBlocks  BlocksStruct) :
1443293838Sdim  DAG(DAG), Variant(Variant), Blocks(BlocksStruct.Blocks),
1444293838Sdim  LastPosWaitedHighLatency(0), NumBlockScheduled(0), VregCurrentUsage(0),
1445293838Sdim  SregCurrentUsage(0), maxVregUsage(0), maxSregUsage(0) {
1446293838Sdim
1447293838Sdim  // Fill the usage count of every output.
1448293838Sdim  // Warning: while by construction we always have a link between two blocks
1449293838Sdim  // when one needs a result from the other, the number of users of an output
1450293838Sdim  // is not simply the number of child blocks having the same virtual register
1451293838Sdim  // as input. Here is an example: A produces x and y. B consumes x and
1452293838Sdim  // produces x'. C consumes x' and y. The register coalescer may have
1453293838Sdim  // assigned the same virtual register to x and x'.
1454293838Sdim  // To count accurately, we do a topological sort. When the register is
1455293838Sdim  // found in several parents, we increment the usage count of the one with
1456293838Sdim  // the highest topological index.
1457293838Sdim  LiveOutRegsNumUsages.resize(Blocks.size());
1458293838Sdim  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
1459293838Sdim    SIScheduleBlock *Block = Blocks[i];
1460293838Sdim    for (unsigned Reg : Block->getInRegs()) {
1461293838Sdim      bool Found = false;
1462293838Sdim      int topoInd = -1;
1463293838Sdim      for (SIScheduleBlock* Pred: Block->getPreds()) {
1464293838Sdim        const std::set<unsigned> &PredOutRegs = Pred->getOutRegs();
1465293838Sdim        std::set<unsigned>::const_iterator RegPos = PredOutRegs.find(Reg);
1466293838Sdim
1467293838Sdim        if (RegPos != PredOutRegs.end()) {
1468293838Sdim          Found = true;
1469293838Sdim          if (topoInd < BlocksStruct.TopDownBlock2Index[Pred->getID()]) {
1470293838Sdim            topoInd = BlocksStruct.TopDownBlock2Index[Pred->getID()];
1471293838Sdim          }
1472293838Sdim        }
1473293838Sdim      }
1474293838Sdim
1475293838Sdim      if (!Found)
1476293838Sdim        continue;
1477293838Sdim
1478293838Sdim      int PredID = BlocksStruct.TopDownIndex2Block[topoInd];
1479321369Sdim      ++LiveOutRegsNumUsages[PredID][Reg];
1480293838Sdim    }
1481293838Sdim  }
1482293838Sdim
1483293838Sdim  LastPosHighLatencyParentScheduled.resize(Blocks.size(), 0);
1484293838Sdim  BlockNumPredsLeft.resize(Blocks.size());
1485293838Sdim  BlockNumSuccsLeft.resize(Blocks.size());
1486293838Sdim
1487293838Sdim  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
1488293838Sdim    SIScheduleBlock *Block = Blocks[i];
1489293838Sdim    BlockNumPredsLeft[i] = Block->getPreds().size();
1490293838Sdim    BlockNumSuccsLeft[i] = Block->getSuccs().size();
1491293838Sdim  }
1492293838Sdim
1493293838Sdim#ifndef NDEBUG
1494293838Sdim  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
1495293838Sdim    SIScheduleBlock *Block = Blocks[i];
1496293838Sdim    assert(Block->getID() == i);
1497293838Sdim  }
1498293838Sdim#endif
1499293838Sdim
1500293838Sdim  std::set<unsigned> InRegs = DAG->getInRegs();
1501293838Sdim  addLiveRegs(InRegs);
1502293838Sdim
1503321369Sdim  // Increase LiveOutRegsNumUsages for blocks
1504321369Sdim  // producing registers consumed in another
1505321369Sdim  // scheduling region.
1506321369Sdim  for (unsigned Reg : DAG->getOutRegs()) {
1507321369Sdim    for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
1508321369Sdim      // Do reverse traversal
1509321369Sdim      int ID = BlocksStruct.TopDownIndex2Block[Blocks.size()-1-i];
1510321369Sdim      SIScheduleBlock *Block = Blocks[ID];
1511321369Sdim      const std::set<unsigned> &OutRegs = Block->getOutRegs();
1512321369Sdim
1513321369Sdim      if (OutRegs.find(Reg) == OutRegs.end())
1514321369Sdim        continue;
1515321369Sdim
1516321369Sdim      ++LiveOutRegsNumUsages[ID][Reg];
1517321369Sdim      break;
1518321369Sdim    }
1519321369Sdim  }
1520321369Sdim
1521293838Sdim  // Fill LiveRegsConsumers for regs that were already
1522293838Sdim  // defined before scheduling.
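  // Registers read by a block but produced by none of its predecessors must
  // have been defined before this scheduling region; count one consumer per
  // such reading block so the register stays live until all of those blocks
  // have been scheduled.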
1523293838Sdim  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
1524293838Sdim    SIScheduleBlock *Block = Blocks[i];
1525293838Sdim    for (unsigned Reg : Block->getInRegs()) {
1526293838Sdim      bool Found = false;
1527293838Sdim      for (SIScheduleBlock* Pred: Block->getPreds()) {
1528293838Sdim        const std::set<unsigned> &PredOutRegs = Pred->getOutRegs();
1529293838Sdim        std::set<unsigned>::const_iterator RegPos = PredOutRegs.find(Reg);
1530293838Sdim
1531293838Sdim        if (RegPos != PredOutRegs.end()) {
1532293838Sdim          Found = true;
1533293838Sdim          break;
1534293838Sdim        }
1535293838Sdim      }
1536293838Sdim
1537321369Sdim      if (!Found)
1538321369Sdim        ++LiveRegsConsumers[Reg];
1539293838Sdim    }
1540293838Sdim  }
1541293838Sdim
1542293838Sdim  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
1543293838Sdim    SIScheduleBlock *Block = Blocks[i];
1544293838Sdim    if (BlockNumPredsLeft[i] == 0) {
1545293838Sdim      ReadyBlocks.push_back(Block);
1546293838Sdim    }
1547293838Sdim  }
1548293838Sdim
1549293838Sdim  while (SIScheduleBlock *Block = pickBlock()) {
1550293838Sdim    BlocksScheduled.push_back(Block);
1551293838Sdim    blockScheduled(Block);
1552293838Sdim  }
1553293838Sdim
1554341825Sdim  LLVM_DEBUG(dbgs() << "Block Order:"; for (SIScheduleBlock *Block
1555341825Sdim                                            : BlocksScheduled) {
1556341825Sdim    dbgs() << ' ' << Block->getID();
1557341825Sdim  } dbgs() << '\n';);
1558293838Sdim}
1559293838Sdim
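// Latency-oriented comparison used by pickBlock. In order of priority:
// prefer the block whose last scheduled high-latency parent is the furthest
// away (its latency has had more time to be hidden), then high-latency
// blocks themselves, then (among those) the greatest height, and finally
// the greatest number of high-latency successors.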
1560293838Sdimbool SIScheduleBlockScheduler::tryCandidateLatency(SIBlockSchedCandidate &Cand,
1561293838Sdim                                                   SIBlockSchedCandidate &TryCand) {
1562293838Sdim  if (!Cand.isValid()) {
1563293838Sdim    TryCand.Reason = NodeOrder;
1564293838Sdim    return true;
1565293838Sdim  }
1566293838Sdim
1567293838Sdim  // Try to hide high latencies.
1568341825Sdim  if (SISched::tryLess(TryCand.LastPosHighLatParentScheduled,
1569341825Sdim                 Cand.LastPosHighLatParentScheduled, TryCand, Cand, Latency))
1570293838Sdim    return true;
1571293838Sdim  // Schedule high latencies early so you can hide them better.
1572341825Sdim  if (SISched::tryGreater(TryCand.IsHighLatency, Cand.IsHighLatency,
1573341825Sdim                          TryCand, Cand, Latency))
1574293838Sdim    return true;
1575341825Sdim  if (TryCand.IsHighLatency && SISched::tryGreater(TryCand.Height, Cand.Height,
1576341825Sdim                                                   TryCand, Cand, Depth))
1577293838Sdim    return true;
1578341825Sdim  if (SISched::tryGreater(TryCand.NumHighLatencySuccessors,
1579341825Sdim                          Cand.NumHighLatencySuccessors,
1580341825Sdim                          TryCand, Cand, Successor))
1581293838Sdim    return true;
1582293838Sdim  return false;
1583293838Sdim}
1584293838Sdim
1585293838Sdimbool SIScheduleBlockScheduler::tryCandidateRegUsage(SIBlockSchedCandidate &Cand,
1586293838Sdim                                                    SIBlockSchedCandidate &TryCand) {
1587293838Sdim  if (!Cand.isValid()) {
1588293838Sdim    TryCand.Reason = NodeOrder;
1589293838Sdim    return true;
1590293838Sdim  }
1591293838Sdim
1592341825Sdim  if (SISched::tryLess(TryCand.VGPRUsageDiff > 0, Cand.VGPRUsageDiff > 0,
1593341825Sdim                       TryCand, Cand, RegUsage))
1594293838Sdim    return true;
1595341825Sdim  if (SISched::tryGreater(TryCand.NumSuccessors > 0,
1596341825Sdim                          Cand.NumSuccessors > 0,
1597341825Sdim                          TryCand, Cand, Successor))
1598293838Sdim    return true;
1599341825Sdim  if (SISched::tryGreater(TryCand.Height, Cand.Height, TryCand, Cand, Depth))
1600293838Sdim    return true;
1601341825Sdim  if (SISched::tryLess(TryCand.VGPRUsageDiff, Cand.VGPRUsageDiff,
1602341825Sdim                       TryCand, Cand, RegUsage))
1603293838Sdim    return true;
1604293838Sdim  return false;
1605293838Sdim}
1606293838Sdim
1607293838SdimSIScheduleBlock *SIScheduleBlockScheduler::pickBlock() {
1608293838Sdim  SIBlockSchedCandidate Cand;
1609293838Sdim  std::vector<SIScheduleBlock*>::iterator Best;
1610293838Sdim  SIScheduleBlock *Block;
1611293838Sdim  if (ReadyBlocks.empty())
1612293838Sdim    return nullptr;
1613293838Sdim
1614293838Sdim  DAG->fillVgprSgprCost(LiveRegs.begin(), LiveRegs.end(),
1615293838Sdim                        VregCurrentUsage, SregCurrentUsage);
1616293838Sdim  if (VregCurrentUsage > maxVregUsage)
1617293838Sdim    maxVregUsage = VregCurrentUsage;
1618321369Sdim  if (SregCurrentUsage > maxSregUsage)
1619321369Sdim    maxSregUsage = SregCurrentUsage;
1620341825Sdim  LLVM_DEBUG(dbgs() << "Picking New Blocks\n"; dbgs() << "Available: ";
1621341825Sdim             for (SIScheduleBlock *Block
1622341825Sdim                  : ReadyBlocks) dbgs()
1623341825Sdim             << Block->getID() << ' ';
1624341825Sdim             dbgs() << "\nCurrent Live:\n";
1625341825Sdim             for (unsigned Reg
1626341825Sdim                  : LiveRegs) dbgs()
1627341825Sdim             << printVRegOrUnit(Reg, DAG->getTRI()) << ' ';
1628341825Sdim             dbgs() << '\n';
1629341825Sdim             dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n';
1630341825Sdim             dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n';);
1631293838Sdim
1632293838Sdim  Cand.Block = nullptr;
1633293838Sdim  for (std::vector<SIScheduleBlock*>::iterator I = ReadyBlocks.begin(),
1634293838Sdim       E = ReadyBlocks.end(); I != E; ++I) {
1635293838Sdim    SIBlockSchedCandidate TryCand;
1636293838Sdim    TryCand.Block = *I;
1637293838Sdim    TryCand.IsHighLatency = TryCand.Block->isHighLatencyBlock();
1638293838Sdim    TryCand.VGPRUsageDiff =
1639293838Sdim      checkRegUsageImpact(TryCand.Block->getInRegs(),
1640293838Sdim                          TryCand.Block->getOutRegs())[DAG->getVGPRSetID()];
1641293838Sdim    TryCand.NumSuccessors = TryCand.Block->getSuccs().size();
1642293838Sdim    TryCand.NumHighLatencySuccessors =
1643293838Sdim      TryCand.Block->getNumHighLatencySuccessors();
1644293838Sdim    TryCand.LastPosHighLatParentScheduled =
1645293838Sdim      (unsigned int) std::max<int> (0,
1646293838Sdim         LastPosHighLatencyParentScheduled[TryCand.Block->getID()] -
1647293838Sdim           LastPosWaitedHighLatency);
1648293838Sdim    TryCand.Height = TryCand.Block->Height;
1649293838Sdim    // Try not to increase VGPR usage too much, else we may spill.
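    // Heuristic: above ~120 live VGPRs, or for the variants that favor
    // register usage, the register-pressure criteria are applied first and
    // latency is only a tie-breaker (and is ignored entirely for
    // BlockRegUsage); otherwise latency criteria are tried first.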
1650293838Sdim    if (VregCurrentUsage > 120 ||
1651293838Sdim        Variant != SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage) {
1652293838Sdim      if (!tryCandidateRegUsage(Cand, TryCand) &&
1653293838Sdim          Variant != SISchedulerBlockSchedulerVariant::BlockRegUsage)
1654293838Sdim        tryCandidateLatency(Cand, TryCand);
1655293838Sdim    } else {
1656293838Sdim      if (!tryCandidateLatency(Cand, TryCand))
1657293838Sdim        tryCandidateRegUsage(Cand, TryCand);
1658293838Sdim    }
1659293838Sdim    if (TryCand.Reason != NoCand) {
1660293838Sdim      Cand.setBest(TryCand);
1661293838Sdim      Best = I;
1662341825Sdim      LLVM_DEBUG(dbgs() << "Best Current Choice: " << Cand.Block->getID() << ' '
1663341825Sdim                        << getReasonStr(Cand.Reason) << '\n');
1664293838Sdim    }
1665293838Sdim  }
1666293838Sdim
1667341825Sdim  LLVM_DEBUG(dbgs() << "Picking: " << Cand.Block->getID() << '\n';
1668341825Sdim             dbgs() << "Is a block with high latency instruction: "
1669341825Sdim                    << (Cand.IsHighLatency ? "yes\n" : "no\n");
1670341825Sdim             dbgs() << "Position of last high latency dependency: "
1671341825Sdim                    << Cand.LastPosHighLatParentScheduled << '\n';
1672341825Sdim             dbgs() << "VGPRUsageDiff: " << Cand.VGPRUsageDiff << '\n';
1673341825Sdim             dbgs() << '\n';);
1674293838Sdim
1675293838Sdim  Block = Cand.Block;
1676293838Sdim  ReadyBlocks.erase(Best);
1677293838Sdim  return Block;
1678293838Sdim}
1679293838Sdim
1680293838Sdim// Tracking of the currently live registers to determine VGPR usage.
1681293838Sdim
1682293838Sdimvoid SIScheduleBlockScheduler::addLiveRegs(std::set<unsigned> &Regs) {
1683293838Sdim  for (unsigned Reg : Regs) {
1684293838Sdim    // For now only track virtual registers.
1685360784Sdim    if (!Register::isVirtualRegister(Reg))
1686293838Sdim      continue;
1687293838Sdim    // If not already in the live set, then add it.
1688293838Sdim    (void) LiveRegs.insert(Reg);
1689293838Sdim  }
1690293838Sdim}
1691293838Sdim
1692293838Sdimvoid SIScheduleBlockScheduler::decreaseLiveRegs(SIScheduleBlock *Block,
1693293838Sdim                                       std::set<unsigned> &Regs) {
1694293838Sdim  for (unsigned Reg : Regs) {
1695293838Sdim    // For now only track virtual registers.
1696293838Sdim    std::set<unsigned>::iterator Pos = LiveRegs.find(Reg);
1697293838Sdim    assert (Pos != LiveRegs.end() && // Reg must be live.
1698293838Sdim               LiveRegsConsumers.find(Reg) != LiveRegsConsumers.end() &&
1699293838Sdim               LiveRegsConsumers[Reg] >= 1);
1700293838Sdim    --LiveRegsConsumers[Reg];
1701293838Sdim    if (LiveRegsConsumers[Reg] == 0)
1702293838Sdim      LiveRegs.erase(Pos);
1703293838Sdim  }
1704293838Sdim}
1705293838Sdim
1706293838Sdimvoid SIScheduleBlockScheduler::releaseBlockSuccs(SIScheduleBlock *Parent) {
1707321369Sdim  for (const auto &Block : Parent->getSuccs()) {
1708321369Sdim    if (--BlockNumPredsLeft[Block.first->getID()] == 0)
1709321369Sdim      ReadyBlocks.push_back(Block.first);
1710321369Sdim
1711321369Sdim    if (Parent->isHighLatencyBlock() &&
1712321369Sdim        Block.second == SIScheduleBlockLinkKind::Data)
1713321369Sdim      LastPosHighLatencyParentScheduled[Block.first->getID()] = NumBlockScheduled;
1714293838Sdim  }
1715293838Sdim}
1716293838Sdim
1717293838Sdimvoid SIScheduleBlockScheduler::blockScheduled(SIScheduleBlock *Block) {
1718293838Sdim  decreaseLiveRegs(Block, Block->getInRegs());
1719293838Sdim  addLiveRegs(Block->getOutRegs());
1720293838Sdim  releaseBlockSuccs(Block);
1721293838Sdim  for (std::map<unsigned, unsigned>::iterator RegI =
1722293838Sdim       LiveOutRegsNumUsages[Block->getID()].begin(),
1723293838Sdim       E = LiveOutRegsNumUsages[Block->getID()].end(); RegI != E; ++RegI) {
1724293838Sdim    std::pair<unsigned, unsigned> RegP = *RegI;
1725321369Sdim    // This block produces the register, so it must not already be live.
1726321369Sdim    assert(LiveRegsConsumers.find(RegP.first) == LiveRegsConsumers.end() ||
1727321369Sdim           LiveRegsConsumers[RegP.first] == 0);
1728321369Sdim    LiveRegsConsumers[RegP.first] += RegP.second;
1729293838Sdim  }
1730293838Sdim  if (LastPosHighLatencyParentScheduled[Block->getID()] >
1731293838Sdim        (unsigned)LastPosWaitedHighLatency)
1732293838Sdim    LastPosWaitedHighLatency =
1733293838Sdim      LastPosHighLatencyParentScheduled[Block->getID()];
1734293838Sdim  ++NumBlockScheduled;
1735293838Sdim}
1736293838Sdim
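// Estimate, for every register pressure set, how the pressure would change
// if the candidate block were scheduled now: input registers with at most
// one remaining consumer are assumed to die (pressure decrease), while
// output registers become live (pressure increase). Only virtual registers
// are considered.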
1737293838Sdimstd::vector<int>
1738293838SdimSIScheduleBlockScheduler::checkRegUsageImpact(std::set<unsigned> &InRegs,
1739293838Sdim                                     std::set<unsigned> &OutRegs) {
1740293838Sdim  std::vector<int> DiffSetPressure;
1741293838Sdim  DiffSetPressure.assign(DAG->getTRI()->getNumRegPressureSets(), 0);
1742293838Sdim
1743293838Sdim  for (unsigned Reg : InRegs) {
1744293838Sdim    // For now only track virtual registers.
1745360784Sdim    if (!Register::isVirtualRegister(Reg))
1746293838Sdim      continue;
1747293838Sdim    if (LiveRegsConsumers[Reg] > 1)
1748293838Sdim      continue;
1749293838Sdim    PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg);
1750293838Sdim    for (; PSetI.isValid(); ++PSetI) {
1751293838Sdim      DiffSetPressure[*PSetI] -= PSetI.getWeight();
1752293838Sdim    }
1753293838Sdim  }
1754293838Sdim
1755293838Sdim  for (unsigned Reg : OutRegs) {
1756293838Sdim    // For now only track virtual registers.
1757360784Sdim    if (!Register::isVirtualRegister(Reg))
1758293838Sdim      continue;
1759293838Sdim    PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg);
1760293838Sdim    for (; PSetI.isValid(); ++PSetI) {
1761293838Sdim      DiffSetPressure[*PSetI] += PSetI.getWeight();
1762293838Sdim    }
1763293838Sdim  }
1764293838Sdim
1765293838Sdim  return DiffSetPressure;
1766293838Sdim}
1767293838Sdim
1768293838Sdim// SIScheduler //
1769293838Sdim
1770293838Sdimstruct SIScheduleBlockResult
1771293838SdimSIScheduler::scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant,
1772293838Sdim                             SISchedulerBlockSchedulerVariant ScheduleVariant) {
1773293838Sdim  SIScheduleBlocks Blocks = BlockCreator.getBlocks(BlockVariant);
1774293838Sdim  SIScheduleBlockScheduler Scheduler(DAG, ScheduleVariant, Blocks);
1775293838Sdim  std::vector<SIScheduleBlock*> ScheduledBlocks;
1776293838Sdim  struct SIScheduleBlockResult Res;
1777293838Sdim
1778293838Sdim  ScheduledBlocks = Scheduler.getBlocks();
1779293838Sdim
1780293838Sdim  for (unsigned b = 0; b < ScheduledBlocks.size(); ++b) {
1781293838Sdim    SIScheduleBlock *Block = ScheduledBlocks[b];
1782293838Sdim    std::vector<SUnit*> SUs = Block->getScheduledUnits();
1783293838Sdim
1784293838Sdim    for (SUnit* SU : SUs)
1785293838Sdim      Res.SUs.push_back(SU->NodeNum);
1786293838Sdim  }
1787293838Sdim
1788293838Sdim  Res.MaxSGPRUsage = Scheduler.getSGPRUsage();
1789293838Sdim  Res.MaxVGPRUsage = Scheduler.getVGPRUsage();
1790293838Sdim  return Res;
1791293838Sdim}
1792293838Sdim
1793293838Sdim// SIScheduleDAGMI //
1794293838Sdim
1795293838SdimSIScheduleDAGMI::SIScheduleDAGMI(MachineSchedContext *C) :
1796360784Sdim  ScheduleDAGMILive(C, std::make_unique<GenericScheduler>(C)) {
1797293838Sdim  SITII = static_cast<const SIInstrInfo*>(TII);
1798293838Sdim  SITRI = static_cast<const SIRegisterInfo*>(TRI);
1799293838Sdim
1800314564Sdim  VGPRSetID = SITRI->getVGPRPressureSet();
1801314564Sdim  SGPRSetID = SITRI->getSGPRPressureSet();
1802293838Sdim}
1803293838Sdim
1804314564SdimSIScheduleDAGMI::~SIScheduleDAGMI() = default;
1805293838Sdim
1806293838Sdim// Code adapted from ScheduleDAG.cpp.
1807293838Sdim// Does a topological sort over the SUs and records both the
1808293838Sdim// top-down and the bottom-up orderings.
1809293838Sdimvoid SIScheduleDAGMI::topologicalSort() {
1810309124Sdim  Topo.InitDAGTopologicalSorting();
1811293838Sdim
1812309124Sdim  TopDownIndex2SU = std::vector<int>(Topo.begin(), Topo.end());
1813309124Sdim  BottomUpIndex2SU = std::vector<int>(Topo.rbegin(), Topo.rend());
1814293838Sdim}
1815293838Sdim
1816293838Sdim// Move low latency instructions further from their users without
1817293838Sdim// (in general) increasing SGPR usage.
1818293838Sdim// This is meant to be replaced by a better pass that would take
1819293838Sdim// SGPR usage into account (based on VGPR usage and the corresponding
1820293838Sdim// wavefront count), and that would try to merge groups of loads
1821293838Sdim// when it makes sense, etc.
1822293838Sdimvoid SIScheduleDAGMI::moveLowLatencies() {
1823293838Sdim  unsigned DAGSize = SUnits.size();
1824293838Sdim  int LastLowLatencyUser = -1;
1825293838Sdim  int LastLowLatencyPos = -1;
1826293838Sdim
1827293838Sdim  for (unsigned i = 0, e = ScheduledSUnits.size(); i != e; ++i) {
1828293838Sdim    SUnit *SU = &SUnits[ScheduledSUnits[i]];
1829293838Sdim    bool IsLowLatencyUser = false;
1830293838Sdim    unsigned MinPos = 0;
1831293838Sdim
1832293838Sdim    for (SDep& PredDep : SU->Preds) {
1833293838Sdim      SUnit *Pred = PredDep.getSUnit();
1834309124Sdim      if (SITII->isLowLatencyInstruction(*Pred->getInstr())) {
1835293838Sdim        IsLowLatencyUser = true;
1836293838Sdim      }
1837293838Sdim      if (Pred->NodeNum >= DAGSize)
1838293838Sdim        continue;
1839293838Sdim      unsigned PredPos = ScheduledSUnitsInv[Pred->NodeNum];
1840293838Sdim      if (PredPos >= MinPos)
1841293838Sdim        MinPos = PredPos + 1;
1842293838Sdim    }
1843293838Sdim
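    // A low latency instruction is hoisted to the earliest position that is
    // after its last scheduled predecessor (MinPos), after the last user of
    // a low latency result, and after the previously placed low latency
    // instruction; it is never moved later than its current position.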
1844309124Sdim    if (SITII->isLowLatencyInstruction(*SU->getInstr())) {
1845293838Sdim      unsigned BestPos = LastLowLatencyUser + 1;
1846293838Sdim      if ((int)BestPos <= LastLowLatencyPos)
1847293838Sdim        BestPos = LastLowLatencyPos + 1;
1848293838Sdim      if (BestPos < MinPos)
1849293838Sdim        BestPos = MinPos;
1850293838Sdim      if (BestPos < i) {
1851293838Sdim        for (unsigned u = i; u > BestPos; --u) {
1852293838Sdim          ++ScheduledSUnitsInv[ScheduledSUnits[u-1]];
1853293838Sdim          ScheduledSUnits[u] = ScheduledSUnits[u-1];
1854293838Sdim        }
1855293838Sdim        ScheduledSUnits[BestPos] = SU->NodeNum;
1856293838Sdim        ScheduledSUnitsInv[SU->NodeNum] = BestPos;
1857293838Sdim      }
1858293838Sdim      LastLowLatencyPos = BestPos;
1859293838Sdim      if (IsLowLatencyUser)
1860293838Sdim        LastLowLatencyUser = BestPos;
1861293838Sdim    } else if (IsLowLatencyUser) {
1862293838Sdim      LastLowLatencyUser = i;
1863293838Sdim    // Also move the COPY instructions on which the
1864293838Sdim    // low latency instructions depend.
1865293838Sdim    } else if (SU->getInstr()->getOpcode() == AMDGPU::COPY) {
1866293838Sdim      bool CopyForLowLat = false;
1867293838Sdim      for (SDep& SuccDep : SU->Succs) {
1868293838Sdim        SUnit *Succ = SuccDep.getSUnit();
1869353358Sdim        if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
1870353358Sdim          continue;
1871309124Sdim        if (SITII->isLowLatencyInstruction(*Succ->getInstr())) {
1872293838Sdim          CopyForLowLat = true;
1873293838Sdim        }
1874293838Sdim      }
1875293838Sdim      if (!CopyForLowLat)
1876293838Sdim        continue;
1877293838Sdim      if (MinPos < i) {
1878293838Sdim        for (unsigned u = i; u > MinPos; --u) {
1879293838Sdim          ++ScheduledSUnitsInv[ScheduledSUnits[u-1]];
1880293838Sdim          ScheduledSUnits[u] = ScheduledSUnits[u-1];
1881293838Sdim        }
1882293838Sdim        ScheduledSUnits[MinPos] = SU->NodeNum;
1883293838Sdim        ScheduledSUnitsInv[SU->NodeNum] = MinPos;
1884293838Sdim      }
1885293838Sdim    }
1886293838Sdim  }
1887293838Sdim}
1888293838Sdim
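// The NumPredsLeft/NumSuccsLeft counters and the isScheduled flags of the
// SUnits are modified while creating and scheduling blocks; restore them
// from the backup taken in schedule() so that another block creation
// variant can be evaluated on a clean DAG state.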
1889293838Sdimvoid SIScheduleDAGMI::restoreSULinksLeft() {
1890293838Sdim  for (unsigned i = 0, e = SUnits.size(); i != e; ++i) {
1891293838Sdim    SUnits[i].isScheduled = false;
1892293838Sdim    SUnits[i].WeakPredsLeft = SUnitsLinksBackup[i].WeakPredsLeft;
1893293838Sdim    SUnits[i].NumPredsLeft = SUnitsLinksBackup[i].NumPredsLeft;
1894293838Sdim    SUnits[i].WeakSuccsLeft = SUnitsLinksBackup[i].WeakSuccsLeft;
1895293838Sdim    SUnits[i].NumSuccsLeft = SUnitsLinksBackup[i].NumSuccsLeft;
1896293838Sdim  }
1897293838Sdim}
1898293838Sdim
1899293838Sdim// Return the VGPR and SGPR usage corresponding to some virtual registers.
1900293838Sdimtemplate<typename Iterator> void
1901293838SdimSIScheduleDAGMI::fillVgprSgprCost(Iterator First, Iterator End,
1902293838Sdim                                  unsigned &VgprUsage, unsigned &SgprUsage) {
1903293838Sdim  VgprUsage = 0;
1904293838Sdim  SgprUsage = 0;
1905293838Sdim  for (Iterator RegI = First; RegI != End; ++RegI) {
1906293838Sdim    unsigned Reg = *RegI;
1907293838Sdim    // For now only track virtual registers
1908360784Sdim    if (!Register::isVirtualRegister(Reg))
1909293838Sdim      continue;
1910293838Sdim    PSetIterator PSetI = MRI.getPressureSets(Reg);
1911293838Sdim    for (; PSetI.isValid(); ++PSetI) {
1912293838Sdim      if (*PSetI == VGPRSetID)
1913293838Sdim        VgprUsage += PSetI.getWeight();
1914293838Sdim      else if (*PSetI == SGPRSetID)
1915293838Sdim        SgprUsage += PSetI.getWeight();
1916293838Sdim    }
1917293838Sdim  }
1918293838Sdim}
1919293838Sdim
1920293838Sdimvoid SIScheduleDAGMI::schedule() {
1922293838Sdim  SmallVector<SUnit*, 8> TopRoots, BotRoots;
1923293838Sdim  SIScheduleBlockResult Best, Temp;
1924341825Sdim  LLVM_DEBUG(dbgs() << "Preparing Scheduling\n");
1925293838Sdim
1926293838Sdim  buildDAGWithRegPressure();
1927344779Sdim  LLVM_DEBUG(dump());
1928293838Sdim
1929293838Sdim  topologicalSort();
1930293838Sdim  findRootsAndBiasEdges(TopRoots, BotRoots);
1931293838Sdim  // We reuse several ScheduleDAGMI and ScheduleDAGMILive
1932293838Sdim  // functions, but to make them happy we must initialize
1933293838Sdim  // the default Scheduler implementation (even if we do not
1934293838Sdim  // run it)
1935293838Sdim  SchedImpl->initialize(this);
1936293838Sdim  initQueues(TopRoots, BotRoots);
1937293838Sdim
1938293838Sdim  // Fill some stats to help scheduling.
1939293838Sdim
1940293838Sdim  SUnitsLinksBackup = SUnits;
1941293838Sdim  IsLowLatencySU.clear();
1942293838Sdim  LowLatencyOffset.clear();
1943293838Sdim  IsHighLatencySU.clear();
1944293838Sdim
1945293838Sdim  IsLowLatencySU.resize(SUnits.size(), 0);
1946293838Sdim  LowLatencyOffset.resize(SUnits.size(), 0);
1947293838Sdim  IsHighLatencySU.resize(SUnits.size(), 0);
1948293838Sdim
1949293838Sdim  for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) {
1950293838Sdim    SUnit *SU = &SUnits[i];
1951353358Sdim    const MachineOperand *BaseLatOp;
1952309124Sdim    int64_t OffLatReg;
1953309124Sdim    if (SITII->isLowLatencyInstruction(*SU->getInstr())) {
1954293838Sdim      IsLowLatencySU[i] = 1;
1955344779Sdim      if (SITII->getMemOperandWithOffset(*SU->getInstr(), BaseLatOp, OffLatReg,
1956344779Sdim                                         TRI))
1957293838Sdim        LowLatencyOffset[i] = OffLatReg;
1958309124Sdim    } else if (SITII->isHighLatencyInstruction(*SU->getInstr()))
1959293838Sdim      IsHighLatencySU[i] = 1;
1960293838Sdim  }
1961293838Sdim
1962293838Sdim  SIScheduler Scheduler(this);
1963293838Sdim  Best = Scheduler.scheduleVariant(SISchedulerBlockCreatorVariant::LatenciesAlone,
1964293838Sdim                                   SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage);
1965309124Sdim
1966293838Sdim  // If VGPR usage is extremely high, try other well-performing variants
1967293838Sdim  // that could lead to lower VGPR usage.
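  // The 180 and 200 limits below are heuristic thresholds: past them the
  // schedule is considered register bound, so variants trading latency
  // hiding for lower register usage are also evaluated.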
1968293838Sdim  if (Best.MaxVGPRUsage > 180) {
1969321369Sdim    static const std::pair<SISchedulerBlockCreatorVariant,
1970321369Sdim                           SISchedulerBlockSchedulerVariant>
1971321369Sdim        Variants[] = {
1972293838Sdim      { LatenciesAlone, BlockRegUsageLatency },
1973293838Sdim//      { LatenciesAlone, BlockRegUsage },
1974293838Sdim      { LatenciesGrouped, BlockLatencyRegUsage },
1975293838Sdim//      { LatenciesGrouped, BlockRegUsageLatency },
1976293838Sdim//      { LatenciesGrouped, BlockRegUsage },
1977293838Sdim      { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage },
1978293838Sdim//      { LatenciesAlonePlusConsecutive, BlockRegUsageLatency },
1979293838Sdim//      { LatenciesAlonePlusConsecutive, BlockRegUsage }
1980293838Sdim    };
1981293838Sdim    for (std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant> v : Variants) {
1982293838Sdim      Temp = Scheduler.scheduleVariant(v.first, v.second);
1983293838Sdim      if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage)
1984293838Sdim        Best = Temp;
1985293838Sdim    }
1986293838Sdim  }
1987293838Sdim  // If VGPR usage is still extremely high, we may spill. Try other variants
1988293838Sdim  // that perform less well but could lead to lower VGPR usage.
1989293838Sdim  if (Best.MaxVGPRUsage > 200) {
1990321369Sdim    static const std::pair<SISchedulerBlockCreatorVariant,
1991321369Sdim                           SISchedulerBlockSchedulerVariant>
1992321369Sdim        Variants[] = {
1993293838Sdim//      { LatenciesAlone, BlockRegUsageLatency },
1994293838Sdim      { LatenciesAlone, BlockRegUsage },
1995293838Sdim//      { LatenciesGrouped, BlockLatencyRegUsage },
1996293838Sdim      { LatenciesGrouped, BlockRegUsageLatency },
1997293838Sdim      { LatenciesGrouped, BlockRegUsage },
1998293838Sdim//      { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage },
1999293838Sdim      { LatenciesAlonePlusConsecutive, BlockRegUsageLatency },
2000293838Sdim      { LatenciesAlonePlusConsecutive, BlockRegUsage }
2001293838Sdim    };
2002293838Sdim    for (std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant> v : Variants) {
2003293838Sdim      Temp = Scheduler.scheduleVariant(v.first, v.second);
2004293838Sdim      if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage)
2005293838Sdim        Best = Temp;
2006293838Sdim    }
2007293838Sdim  }
2008309124Sdim
2009293838Sdim  ScheduledSUnits = Best.SUs;
2010293838Sdim  ScheduledSUnitsInv.resize(SUnits.size());
2011293838Sdim
2012293838Sdim  for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) {
2013293838Sdim    ScheduledSUnitsInv[ScheduledSUnits[i]] = i;
2014293838Sdim  }
2015293838Sdim
2016293838Sdim  moveLowLatencies();
2017293838Sdim
2018293838Sdim  // Tell the outside world about the result of the scheduling.
2019293838Sdim
2020293838Sdim  assert(TopRPTracker.getPos() == RegionBegin && "bad initial Top tracker");
2021293838Sdim  TopRPTracker.setPos(CurrentTop);
2022293838Sdim
2023293838Sdim  for (std::vector<unsigned>::iterator I = ScheduledSUnits.begin(),
2024293838Sdim       E = ScheduledSUnits.end(); I != E; ++I) {
2025293838Sdim    SUnit *SU = &SUnits[*I];
2026293838Sdim
2027293838Sdim    scheduleMI(SU, true);
2028293838Sdim
2029341825Sdim    LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
2030341825Sdim                      << *SU->getInstr());
2031293838Sdim  }
2032293838Sdim
2033293838Sdim  assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone.");
2034293838Sdim
2035293838Sdim  placeDebugValues();
2036293838Sdim
2037341825Sdim  LLVM_DEBUG({
2038327952Sdim    dbgs() << "*** Final schedule for "
2039327952Sdim           << printMBBReference(*begin()->getParent()) << " ***\n";
2040327952Sdim    dumpSchedule();
2041327952Sdim    dbgs() << '\n';
2042327952Sdim  });
2043293838Sdim}