//===-- SIMachineScheduler.cpp - SI Scheduler Interface -------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// SI Machine Scheduler interface
//
//===----------------------------------------------------------------------===//

#include "SIMachineScheduler.h"
#include "AMDGPU.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/LiveInterval.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/RegisterPressure.h"
#include "llvm/CodeGen/SlotIndexes.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <map>
#include <set>
#include <utility>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "machine-scheduler"

// This scheduler implements a different scheduling algorithm than
// GenericScheduler.
//
// There are several specific architecture behaviours that can't be modelled
// for GenericScheduler:
// . When accessing the result of an SGPR load instruction, you have to wait
// for all the SGPR load instructions before your current instruction to
// have finished.
// . When accessing the result of a VGPR load instruction, you have to wait
// for all the VGPR load instructions previous to the VGPR load instruction
// you are interested in to finish.
// . The lower the register pressure, the better load latencies are hidden.
//
// Moreover some specificities (like the fact a lot of instructions in the
// shader have few dependencies) make the generic scheduler have some
// unpredictable behaviours. For example when register pressure becomes high,
// it can either manage to prevent register pressure from going too high, or
// it can increase register pressure even more than if it hadn't taken
// register pressure into account.
//
// Also some other bad behaviours are generated, like loading at the beginning
// of the shader a constant in VGPR you won't need until the end of the shader.
//
// The scheduling problem for SI can distinguish three main parts:
// . Hiding high latencies (texture sampling, etc)
// . Hiding low latencies (SGPR constant loading, etc)
// . Keeping register usage low for better latency hiding and general
//   performance
//
// Some other things can also affect performance, but are hard to predict
// (cache usage, the fact the HW can issue several instructions from different
// wavefronts of different types, etc)
//
// This scheduler tries to solve the scheduling problem by dividing it into
// simpler sub-problems. It divides the instructions into blocks, schedules
// locally inside the blocks where it takes care of low latencies, and then
// chooses the order of the blocks by taking care of high latencies.
// Dividing the instructions into blocks helps control keeping register
// usage low.
//
// First the instructions are put into blocks.
// We want the blocks to help control register usage and hide high latencies
// later. To help control register usage, we typically want all local
// computations, when for example you create a result that can be consumed
// right away, to be contained in a block. Block inputs and outputs would
// typically be important results that are needed in several locations of
// the shader. Since we do want blocks to help hide high latencies, we want
// the instructions inside the block to have a minimal set of dependencies
// on high latencies. It will make it easy to pick blocks to hide specific
// high latencies.
// The block creation algorithm is divided into several steps, and several
// variants can be tried during the scheduling process.
//
// Second the order of the instructions inside the blocks is chosen.
// At that step we do take into account only register usage and hiding
// low latency instructions.
//
// Third the block order is chosen, there we try to hide high latencies
// and keep register usage low.
//
// After the third step, a pass is done to improve the hiding of low
// latencies.
//
// Actually when talking about 'low latency' or 'high latency' it includes
// both the latency to get the cache (or global mem) data go to the register,
// and the bandwidth limitations.
// Increasing the number of active wavefronts helps hide the former, but it
// doesn't solve the latter, thus why even if wavefront count is high, we have
// to try have as many instructions hiding high latencies as possible.
// The OpenCL doc says for example latency of 400 cycles for a global mem
// access, which is hidden by 10 instructions if the wavefront count is 10.

// Some figures taken from AMD docs:
// Both texture and constant L1 caches are 4-way associative with 64 bytes
// lines.
// Constant cache is shared with 4 CUs.
// For texture sampling, the address generation unit receives 4 texture
// addresses per cycle, thus we could expect texture sampling latency to be
// equivalent to 4 instructions in the very best case (a VGPR is 64 work items,
// instructions in a wavefront group are executed every 4 cycles),
// or 16 instructions if the other wavefronts associated to the 3 other VALUs
// of the CU do texture sampling too. (Don't take these figures too seriously,
// as I'm not 100% sure of the computation)
// Data exports should get similar latency.
127293838Sdim// For constant loading, the cache is shader with 4 CUs. 128293838Sdim// The doc says "a throughput of 16B/cycle for each of the 4 Compute Unit" 129293838Sdim// I guess if the other CU don't read the cache, it can go up to 64B/cycle. 130293838Sdim// It means a simple s_buffer_load should take one instruction to hide, as 131293838Sdim// well as a s_buffer_loadx2 and potentially a s_buffer_loadx8 if on the same 132293838Sdim// cache line. 133293838Sdim// 134293838Sdim// As of today the driver doesn't preload the constants in cache, thus the 135293838Sdim// first loads get extra latency. The doc says global memory access can be 136293838Sdim// 300-600 cycles. We do not specially take that into account when scheduling 137293838Sdim// As we expect the driver to be able to preload the constants soon. 138293838Sdim 139293838Sdim// common code // 140293838Sdim 141293838Sdim#ifndef NDEBUG 142293838Sdim 143293838Sdimstatic const char *getReasonStr(SIScheduleCandReason Reason) { 144293838Sdim switch (Reason) { 145293838Sdim case NoCand: return "NOCAND"; 146293838Sdim case RegUsage: return "REGUSAGE"; 147293838Sdim case Latency: return "LATENCY"; 148293838Sdim case Successor: return "SUCCESSOR"; 149293838Sdim case Depth: return "DEPTH"; 150293838Sdim case NodeOrder: return "ORDER"; 151293838Sdim } 152293838Sdim llvm_unreachable("Unknown reason!"); 153293838Sdim} 154293838Sdim 155293838Sdim#endif 156293838Sdim 157341825Sdimnamespace llvm { 158341825Sdimnamespace SISched { 159293838Sdimstatic bool tryLess(int TryVal, int CandVal, 160293838Sdim SISchedulerCandidate &TryCand, 161293838Sdim SISchedulerCandidate &Cand, 162293838Sdim SIScheduleCandReason Reason) { 163293838Sdim if (TryVal < CandVal) { 164293838Sdim TryCand.Reason = Reason; 165293838Sdim return true; 166293838Sdim } 167293838Sdim if (TryVal > CandVal) { 168293838Sdim if (Cand.Reason > Reason) 169293838Sdim Cand.Reason = Reason; 170293838Sdim return true; 171293838Sdim } 172293838Sdim 
Cand.setRepeat(Reason); 173293838Sdim return false; 174293838Sdim} 175293838Sdim 176293838Sdimstatic bool tryGreater(int TryVal, int CandVal, 177293838Sdim SISchedulerCandidate &TryCand, 178293838Sdim SISchedulerCandidate &Cand, 179293838Sdim SIScheduleCandReason Reason) { 180293838Sdim if (TryVal > CandVal) { 181293838Sdim TryCand.Reason = Reason; 182293838Sdim return true; 183293838Sdim } 184293838Sdim if (TryVal < CandVal) { 185293838Sdim if (Cand.Reason > Reason) 186293838Sdim Cand.Reason = Reason; 187293838Sdim return true; 188293838Sdim } 189293838Sdim Cand.setRepeat(Reason); 190293838Sdim return false; 191293838Sdim} 192341825Sdim} // end namespace SISched 193341825Sdim} // end namespace llvm 194293838Sdim 195293838Sdim// SIScheduleBlock // 196293838Sdim 197293838Sdimvoid SIScheduleBlock::addUnit(SUnit *SU) { 198293838Sdim NodeNum2Index[SU->NodeNum] = SUnits.size(); 199293838Sdim SUnits.push_back(SU); 200293838Sdim} 201293838Sdim 202293838Sdim#ifndef NDEBUG 203293838Sdimvoid SIScheduleBlock::traceCandidate(const SISchedCandidate &Cand) { 204293838Sdim 205293838Sdim dbgs() << " SU(" << Cand.SU->NodeNum << ") " << getReasonStr(Cand.Reason); 206293838Sdim dbgs() << '\n'; 207293838Sdim} 208293838Sdim#endif 209293838Sdim 210293838Sdimvoid SIScheduleBlock::tryCandidateTopDown(SISchedCandidate &Cand, 211293838Sdim SISchedCandidate &TryCand) { 212293838Sdim // Initialize the candidate if needed. 213293838Sdim if (!Cand.isValid()) { 214293838Sdim TryCand.Reason = NodeOrder; 215293838Sdim return; 216293838Sdim } 217293838Sdim 218293838Sdim if (Cand.SGPRUsage > 60 && 219341825Sdim SISched::tryLess(TryCand.SGPRUsage, Cand.SGPRUsage, 220341825Sdim TryCand, Cand, RegUsage)) 221293838Sdim return; 222293838Sdim 223293838Sdim // Schedule low latency instructions as top as possible. 224293838Sdim // Order of priority is: 225293838Sdim // . Low latency instructions which do not depend on other low latency 226293838Sdim // instructions we haven't waited for 227293838Sdim // . 
Other instructions which do not depend on low latency instructions 228293838Sdim // we haven't waited for 229293838Sdim // . Low latencies 230293838Sdim // . All other instructions 231314564Sdim // Goal is to get: low latency instructions - independent instructions 232293838Sdim // - (eventually some more low latency instructions) 233293838Sdim // - instructions that depend on the first low latency instructions. 234293838Sdim // If in the block there is a lot of constant loads, the SGPR usage 235293838Sdim // could go quite high, thus above the arbitrary limit of 60 will encourage 236293838Sdim // use the already loaded constants (in order to release some SGPRs) before 237293838Sdim // loading more. 238341825Sdim if (SISched::tryLess(TryCand.HasLowLatencyNonWaitedParent, 239341825Sdim Cand.HasLowLatencyNonWaitedParent, 240341825Sdim TryCand, Cand, SIScheduleCandReason::Depth)) 241293838Sdim return; 242293838Sdim 243341825Sdim if (SISched::tryGreater(TryCand.IsLowLatency, Cand.IsLowLatency, 244341825Sdim TryCand, Cand, SIScheduleCandReason::Depth)) 245293838Sdim return; 246293838Sdim 247293838Sdim if (TryCand.IsLowLatency && 248341825Sdim SISched::tryLess(TryCand.LowLatencyOffset, Cand.LowLatencyOffset, 249341825Sdim TryCand, Cand, SIScheduleCandReason::Depth)) 250293838Sdim return; 251293838Sdim 252341825Sdim if (SISched::tryLess(TryCand.VGPRUsage, Cand.VGPRUsage, 253341825Sdim TryCand, Cand, RegUsage)) 254293838Sdim return; 255293838Sdim 256293838Sdim // Fall through to original instruction order. 
257293838Sdim if (TryCand.SU->NodeNum < Cand.SU->NodeNum) { 258293838Sdim TryCand.Reason = NodeOrder; 259293838Sdim } 260293838Sdim} 261293838Sdim 262293838SdimSUnit* SIScheduleBlock::pickNode() { 263293838Sdim SISchedCandidate TopCand; 264293838Sdim 265293838Sdim for (SUnit* SU : TopReadySUs) { 266293838Sdim SISchedCandidate TryCand; 267293838Sdim std::vector<unsigned> pressure; 268293838Sdim std::vector<unsigned> MaxPressure; 269293838Sdim // Predict register usage after this instruction. 270293838Sdim TryCand.SU = SU; 271293838Sdim TopRPTracker.getDownwardPressure(SU->getInstr(), pressure, MaxPressure); 272293838Sdim TryCand.SGPRUsage = pressure[DAG->getSGPRSetID()]; 273293838Sdim TryCand.VGPRUsage = pressure[DAG->getVGPRSetID()]; 274293838Sdim TryCand.IsLowLatency = DAG->IsLowLatencySU[SU->NodeNum]; 275293838Sdim TryCand.LowLatencyOffset = DAG->LowLatencyOffset[SU->NodeNum]; 276293838Sdim TryCand.HasLowLatencyNonWaitedParent = 277293838Sdim HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]]; 278293838Sdim tryCandidateTopDown(TopCand, TryCand); 279293838Sdim if (TryCand.Reason != NoCand) 280293838Sdim TopCand.setBest(TryCand); 281293838Sdim } 282293838Sdim 283293838Sdim return TopCand.SU; 284293838Sdim} 285293838Sdim 286293838Sdim 287293838Sdim// Schedule something valid. 288293838Sdimvoid SIScheduleBlock::fastSchedule() { 289293838Sdim TopReadySUs.clear(); 290293838Sdim if (Scheduled) 291293838Sdim undoSchedule(); 292293838Sdim 293293838Sdim for (SUnit* SU : SUnits) { 294293838Sdim if (!SU->NumPredsLeft) 295293838Sdim TopReadySUs.push_back(SU); 296293838Sdim } 297293838Sdim 298293838Sdim while (!TopReadySUs.empty()) { 299293838Sdim SUnit *SU = TopReadySUs[0]; 300293838Sdim ScheduledSUnits.push_back(SU); 301293838Sdim nodeScheduled(SU); 302293838Sdim } 303293838Sdim 304293838Sdim Scheduled = true; 305293838Sdim} 306293838Sdim 307293838Sdim// Returns if the register was set between first and last. 
308293838Sdimstatic bool isDefBetween(unsigned Reg, 309293838Sdim SlotIndex First, SlotIndex Last, 310293838Sdim const MachineRegisterInfo *MRI, 311293838Sdim const LiveIntervals *LIS) { 312293838Sdim for (MachineRegisterInfo::def_instr_iterator 313293838Sdim UI = MRI->def_instr_begin(Reg), 314293838Sdim UE = MRI->def_instr_end(); UI != UE; ++UI) { 315293838Sdim const MachineInstr* MI = &*UI; 316293838Sdim if (MI->isDebugValue()) 317293838Sdim continue; 318309124Sdim SlotIndex InstSlot = LIS->getInstructionIndex(*MI).getRegSlot(); 319293838Sdim if (InstSlot >= First && InstSlot <= Last) 320293838Sdim return true; 321293838Sdim } 322293838Sdim return false; 323293838Sdim} 324293838Sdim 325293838Sdimvoid SIScheduleBlock::initRegPressure(MachineBasicBlock::iterator BeginBlock, 326293838Sdim MachineBasicBlock::iterator EndBlock) { 327293838Sdim IntervalPressure Pressure, BotPressure; 328293838Sdim RegPressureTracker RPTracker(Pressure), BotRPTracker(BotPressure); 329293838Sdim LiveIntervals *LIS = DAG->getLIS(); 330293838Sdim MachineRegisterInfo *MRI = DAG->getMRI(); 331293838Sdim DAG->initRPTracker(TopRPTracker); 332293838Sdim DAG->initRPTracker(BotRPTracker); 333293838Sdim DAG->initRPTracker(RPTracker); 334293838Sdim 335293838Sdim // Goes though all SU. RPTracker captures what had to be alive for the SUs 336293838Sdim // to execute, and what is still alive at the end. 337293838Sdim for (SUnit* SU : ScheduledSUnits) { 338293838Sdim RPTracker.setPos(SU->getInstr()); 339293838Sdim RPTracker.advance(); 340293838Sdim } 341293838Sdim 342293838Sdim // Close the RPTracker to finalize live ins/outs. 343293838Sdim RPTracker.closeRegion(); 344293838Sdim 345293838Sdim // Initialize the live ins and live outs. 346293838Sdim TopRPTracker.addLiveRegs(RPTracker.getPressure().LiveInRegs); 347293838Sdim BotRPTracker.addLiveRegs(RPTracker.getPressure().LiveOutRegs); 348293838Sdim 349293838Sdim // Do not Track Physical Registers, because it messes up. 
350309124Sdim for (const auto &RegMaskPair : RPTracker.getPressure().LiveInRegs) { 351360784Sdim if (Register::isVirtualRegister(RegMaskPair.RegUnit)) 352309124Sdim LiveInRegs.insert(RegMaskPair.RegUnit); 353293838Sdim } 354293838Sdim LiveOutRegs.clear(); 355293838Sdim // There is several possibilities to distinguish: 356293838Sdim // 1) Reg is not input to any instruction in the block, but is output of one 357293838Sdim // 2) 1) + read in the block and not needed after it 358293838Sdim // 3) 1) + read in the block but needed in another block 359293838Sdim // 4) Reg is input of an instruction but another block will read it too 360293838Sdim // 5) Reg is input of an instruction and then rewritten in the block. 361293838Sdim // result is not read in the block (implies used in another block) 362293838Sdim // 6) Reg is input of an instruction and then rewritten in the block. 363293838Sdim // result is read in the block and not needed in another block 364293838Sdim // 7) Reg is input of an instruction and then rewritten in the block. 365293838Sdim // result is read in the block but also needed in another block 366293838Sdim // LiveInRegs will contains all the regs in situation 4, 5, 6, 7 367293838Sdim // We want LiveOutRegs to contain only Regs whose content will be read after 368293838Sdim // in another block, and whose content was written in the current block, 369293838Sdim // that is we want it to get 1, 3, 5, 7 370293838Sdim // Since we made the MIs of a block to be packed all together before 371293838Sdim // scheduling, then the LiveIntervals were correct, and the RPTracker was 372293838Sdim // able to correctly handle 5 vs 6, 2 vs 3. 373293838Sdim // (Note: This is not sufficient for RPTracker to not do mistakes for case 4) 374293838Sdim // The RPTracker's LiveOutRegs has 1, 3, (some correct or incorrect)4, 5, 7 375293838Sdim // Comparing to LiveInRegs is not sufficient to differenciate 4 vs 5, 7 376293838Sdim // The use of findDefBetween removes the case 4. 
377309124Sdim for (const auto &RegMaskPair : RPTracker.getPressure().LiveOutRegs) { 378309124Sdim unsigned Reg = RegMaskPair.RegUnit; 379360784Sdim if (Register::isVirtualRegister(Reg) && 380309124Sdim isDefBetween(Reg, LIS->getInstructionIndex(*BeginBlock).getRegSlot(), 381309124Sdim LIS->getInstructionIndex(*EndBlock).getRegSlot(), MRI, 382309124Sdim LIS)) { 383293838Sdim LiveOutRegs.insert(Reg); 384293838Sdim } 385293838Sdim } 386293838Sdim 387293838Sdim // Pressure = sum_alive_registers register size 388293838Sdim // Internally llvm will represent some registers as big 128 bits registers 389293838Sdim // for example, but they actually correspond to 4 actual 32 bits registers. 390293838Sdim // Thus Pressure is not equal to num_alive_registers * constant. 391293838Sdim LiveInPressure = TopPressure.MaxSetPressure; 392293838Sdim LiveOutPressure = BotPressure.MaxSetPressure; 393293838Sdim 394293838Sdim // Prepares TopRPTracker for top down scheduling. 395293838Sdim TopRPTracker.closeTop(); 396293838Sdim} 397293838Sdim 398293838Sdimvoid SIScheduleBlock::schedule(MachineBasicBlock::iterator BeginBlock, 399293838Sdim MachineBasicBlock::iterator EndBlock) { 400293838Sdim if (!Scheduled) 401293838Sdim fastSchedule(); 402293838Sdim 403293838Sdim // PreScheduling phase to set LiveIn and LiveOut. 404293838Sdim initRegPressure(BeginBlock, EndBlock); 405293838Sdim undoSchedule(); 406293838Sdim 407293838Sdim // Schedule for real now. 408293838Sdim 409293838Sdim TopReadySUs.clear(); 410293838Sdim 411293838Sdim for (SUnit* SU : SUnits) { 412293838Sdim if (!SU->NumPredsLeft) 413293838Sdim TopReadySUs.push_back(SU); 414293838Sdim } 415293838Sdim 416293838Sdim while (!TopReadySUs.empty()) { 417293838Sdim SUnit *SU = pickNode(); 418293838Sdim ScheduledSUnits.push_back(SU); 419293838Sdim TopRPTracker.setPos(SU->getInstr()); 420293838Sdim TopRPTracker.advance(); 421293838Sdim nodeScheduled(SU); 422293838Sdim } 423293838Sdim 424293838Sdim // TODO: compute InternalAdditionnalPressure. 
425293838Sdim InternalAdditionnalPressure.resize(TopPressure.MaxSetPressure.size()); 426293838Sdim 427293838Sdim // Check everything is right. 428293838Sdim#ifndef NDEBUG 429293838Sdim assert(SUnits.size() == ScheduledSUnits.size() && 430293838Sdim TopReadySUs.empty()); 431293838Sdim for (SUnit* SU : SUnits) { 432293838Sdim assert(SU->isScheduled && 433293838Sdim SU->NumPredsLeft == 0); 434293838Sdim } 435293838Sdim#endif 436293838Sdim 437293838Sdim Scheduled = true; 438293838Sdim} 439293838Sdim 440293838Sdimvoid SIScheduleBlock::undoSchedule() { 441293838Sdim for (SUnit* SU : SUnits) { 442293838Sdim SU->isScheduled = false; 443293838Sdim for (SDep& Succ : SU->Succs) { 444293838Sdim if (BC->isSUInBlock(Succ.getSUnit(), ID)) 445293838Sdim undoReleaseSucc(SU, &Succ); 446293838Sdim } 447293838Sdim } 448293838Sdim HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0); 449293838Sdim ScheduledSUnits.clear(); 450293838Sdim Scheduled = false; 451293838Sdim} 452293838Sdim 453293838Sdimvoid SIScheduleBlock::undoReleaseSucc(SUnit *SU, SDep *SuccEdge) { 454293838Sdim SUnit *SuccSU = SuccEdge->getSUnit(); 455293838Sdim 456293838Sdim if (SuccEdge->isWeak()) { 457293838Sdim ++SuccSU->WeakPredsLeft; 458293838Sdim return; 459293838Sdim } 460293838Sdim ++SuccSU->NumPredsLeft; 461293838Sdim} 462293838Sdim 463293838Sdimvoid SIScheduleBlock::releaseSucc(SUnit *SU, SDep *SuccEdge) { 464293838Sdim SUnit *SuccSU = SuccEdge->getSUnit(); 465293838Sdim 466293838Sdim if (SuccEdge->isWeak()) { 467293838Sdim --SuccSU->WeakPredsLeft; 468293838Sdim return; 469293838Sdim } 470293838Sdim#ifndef NDEBUG 471293838Sdim if (SuccSU->NumPredsLeft == 0) { 472293838Sdim dbgs() << "*** Scheduling failed! 
***\n"; 473344779Sdim DAG->dumpNode(*SuccSU); 474293838Sdim dbgs() << " has been released too many times!\n"; 475293838Sdim llvm_unreachable(nullptr); 476293838Sdim } 477293838Sdim#endif 478293838Sdim 479293838Sdim --SuccSU->NumPredsLeft; 480293838Sdim} 481293838Sdim 482293838Sdim/// Release Successors of the SU that are in the block or not. 483293838Sdimvoid SIScheduleBlock::releaseSuccessors(SUnit *SU, bool InOrOutBlock) { 484293838Sdim for (SDep& Succ : SU->Succs) { 485293838Sdim SUnit *SuccSU = Succ.getSUnit(); 486293838Sdim 487309124Sdim if (SuccSU->NodeNum >= DAG->SUnits.size()) 488309124Sdim continue; 489309124Sdim 490293838Sdim if (BC->isSUInBlock(SuccSU, ID) != InOrOutBlock) 491293838Sdim continue; 492293838Sdim 493293838Sdim releaseSucc(SU, &Succ); 494293838Sdim if (SuccSU->NumPredsLeft == 0 && InOrOutBlock) 495293838Sdim TopReadySUs.push_back(SuccSU); 496293838Sdim } 497293838Sdim} 498293838Sdim 499293838Sdimvoid SIScheduleBlock::nodeScheduled(SUnit *SU) { 500293838Sdim // Is in TopReadySUs 501293838Sdim assert (!SU->NumPredsLeft); 502314564Sdim std::vector<SUnit *>::iterator I = llvm::find(TopReadySUs, SU); 503293838Sdim if (I == TopReadySUs.end()) { 504293838Sdim dbgs() << "Data Structure Bug in SI Scheduler\n"; 505293838Sdim llvm_unreachable(nullptr); 506293838Sdim } 507293838Sdim TopReadySUs.erase(I); 508293838Sdim 509293838Sdim releaseSuccessors(SU, true); 510293838Sdim // Scheduling this node will trigger a wait, 511293838Sdim // thus propagate to other instructions that they do not need to wait either. 
512293838Sdim if (HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]]) 513293838Sdim HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0); 514293838Sdim 515293838Sdim if (DAG->IsLowLatencySU[SU->NodeNum]) { 516293838Sdim for (SDep& Succ : SU->Succs) { 517293838Sdim std::map<unsigned, unsigned>::iterator I = 518293838Sdim NodeNum2Index.find(Succ.getSUnit()->NodeNum); 519293838Sdim if (I != NodeNum2Index.end()) 520293838Sdim HasLowLatencyNonWaitedParent[I->second] = 1; 521293838Sdim } 522293838Sdim } 523293838Sdim SU->isScheduled = true; 524293838Sdim} 525293838Sdim 526293838Sdimvoid SIScheduleBlock::finalizeUnits() { 527293838Sdim // We remove links from outside blocks to enable scheduling inside the block. 528293838Sdim for (SUnit* SU : SUnits) { 529293838Sdim releaseSuccessors(SU, false); 530293838Sdim if (DAG->IsHighLatencySU[SU->NodeNum]) 531293838Sdim HighLatencyBlock = true; 532293838Sdim } 533293838Sdim HasLowLatencyNonWaitedParent.resize(SUnits.size(), 0); 534293838Sdim} 535293838Sdim 536293838Sdim// we maintain ascending order of IDs 537293838Sdimvoid SIScheduleBlock::addPred(SIScheduleBlock *Pred) { 538293838Sdim unsigned PredID = Pred->getID(); 539293838Sdim 540293838Sdim // Check if not already predecessor. 541293838Sdim for (SIScheduleBlock* P : Preds) { 542293838Sdim if (PredID == P->getID()) 543293838Sdim return; 544293838Sdim } 545293838Sdim Preds.push_back(Pred); 546293838Sdim 547309124Sdim assert(none_of(Succs, 548321369Sdim [=](std::pair<SIScheduleBlock*, 549321369Sdim SIScheduleBlockLinkKind> S) { 550321369Sdim return PredID == S.first->getID(); 551321369Sdim }) && 552309124Sdim "Loop in the Block Graph!"); 553293838Sdim} 554293838Sdim 555321369Sdimvoid SIScheduleBlock::addSucc(SIScheduleBlock *Succ, 556321369Sdim SIScheduleBlockLinkKind Kind) { 557293838Sdim unsigned SuccID = Succ->getID(); 558293838Sdim 559293838Sdim // Check if not already predecessor. 
560321369Sdim for (std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> &S : Succs) { 561321369Sdim if (SuccID == S.first->getID()) { 562321369Sdim if (S.second == SIScheduleBlockLinkKind::NoData && 563321369Sdim Kind == SIScheduleBlockLinkKind::Data) 564321369Sdim S.second = Kind; 565293838Sdim return; 566321369Sdim } 567293838Sdim } 568293838Sdim if (Succ->isHighLatencyBlock()) 569293838Sdim ++NumHighLatencySuccessors; 570321369Sdim Succs.push_back(std::make_pair(Succ, Kind)); 571321369Sdim 572309124Sdim assert(none_of(Preds, 573309124Sdim [=](SIScheduleBlock *P) { return SuccID == P->getID(); }) && 574309124Sdim "Loop in the Block Graph!"); 575293838Sdim} 576293838Sdim 577293838Sdim#ifndef NDEBUG 578293838Sdimvoid SIScheduleBlock::printDebug(bool full) { 579293838Sdim dbgs() << "Block (" << ID << ")\n"; 580293838Sdim if (!full) 581293838Sdim return; 582293838Sdim 583293838Sdim dbgs() << "\nContains High Latency Instruction: " 584293838Sdim << HighLatencyBlock << '\n'; 585293838Sdim dbgs() << "\nDepends On:\n"; 586293838Sdim for (SIScheduleBlock* P : Preds) { 587293838Sdim P->printDebug(false); 588293838Sdim } 589293838Sdim 590293838Sdim dbgs() << "\nSuccessors:\n"; 591321369Sdim for (std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> S : Succs) { 592321369Sdim if (S.second == SIScheduleBlockLinkKind::Data) 593321369Sdim dbgs() << "(Data Dep) "; 594321369Sdim S.first->printDebug(false); 595293838Sdim } 596293838Sdim 597293838Sdim if (Scheduled) { 598293838Sdim dbgs() << "LiveInPressure " << LiveInPressure[DAG->getSGPRSetID()] << ' ' 599293838Sdim << LiveInPressure[DAG->getVGPRSetID()] << '\n'; 600293838Sdim dbgs() << "LiveOutPressure " << LiveOutPressure[DAG->getSGPRSetID()] << ' ' 601293838Sdim << LiveOutPressure[DAG->getVGPRSetID()] << "\n\n"; 602293838Sdim dbgs() << "LiveIns:\n"; 603293838Sdim for (unsigned Reg : LiveInRegs) 604327952Sdim dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' '; 605293838Sdim 606293838Sdim dbgs() << "\nLiveOuts:\n"; 
607293838Sdim for (unsigned Reg : LiveOutRegs) 608327952Sdim dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' '; 609293838Sdim } 610293838Sdim 611293838Sdim dbgs() << "\nInstructions:\n"; 612360784Sdim for (const SUnit* SU : SUnits) 613344779Sdim DAG->dumpNode(*SU); 614293838Sdim 615314564Sdim dbgs() << "///////////////////////\n"; 616293838Sdim} 617293838Sdim#endif 618293838Sdim 619293838Sdim// SIScheduleBlockCreator // 620293838Sdim 621360784SdimSIScheduleBlockCreator::SIScheduleBlockCreator(SIScheduleDAGMI *DAG) 622360784Sdim : DAG(DAG) {} 623293838Sdim 624293838SdimSIScheduleBlocks 625293838SdimSIScheduleBlockCreator::getBlocks(SISchedulerBlockCreatorVariant BlockVariant) { 626293838Sdim std::map<SISchedulerBlockCreatorVariant, SIScheduleBlocks>::iterator B = 627293838Sdim Blocks.find(BlockVariant); 628293838Sdim if (B == Blocks.end()) { 629293838Sdim SIScheduleBlocks Res; 630293838Sdim createBlocksForVariant(BlockVariant); 631293838Sdim topologicalSort(); 632293838Sdim scheduleInsideBlocks(); 633293838Sdim fillStats(); 634293838Sdim Res.Blocks = CurrentBlocks; 635293838Sdim Res.TopDownIndex2Block = TopDownIndex2Block; 636293838Sdim Res.TopDownBlock2Index = TopDownBlock2Index; 637293838Sdim Blocks[BlockVariant] = Res; 638293838Sdim return Res; 639293838Sdim } else { 640293838Sdim return B->second; 641293838Sdim } 642293838Sdim} 643293838Sdim 644293838Sdimbool SIScheduleBlockCreator::isSUInBlock(SUnit *SU, unsigned ID) { 645293838Sdim if (SU->NodeNum >= DAG->SUnits.size()) 646293838Sdim return false; 647293838Sdim return CurrentBlocks[Node2CurrentBlock[SU->NodeNum]]->getID() == ID; 648293838Sdim} 649293838Sdim 650293838Sdimvoid SIScheduleBlockCreator::colorHighLatenciesAlone() { 651293838Sdim unsigned DAGSize = DAG->SUnits.size(); 652293838Sdim 653293838Sdim for (unsigned i = 0, e = DAGSize; i != e; ++i) { 654293838Sdim SUnit *SU = &DAG->SUnits[i]; 655293838Sdim if (DAG->IsHighLatencySU[SU->NodeNum]) { 656293838Sdim CurrentColoring[SU->NodeNum] = 
NextReservedID++; 657293838Sdim } 658293838Sdim } 659293838Sdim} 660293838Sdim 661321369Sdimstatic bool 662321369SdimhasDataDependencyPred(const SUnit &SU, const SUnit &FromSU) { 663321369Sdim for (const auto &PredDep : SU.Preds) { 664321369Sdim if (PredDep.getSUnit() == &FromSU && 665321369Sdim PredDep.getKind() == llvm::SDep::Data) 666321369Sdim return true; 667321369Sdim } 668321369Sdim return false; 669321369Sdim} 670321369Sdim 671293838Sdimvoid SIScheduleBlockCreator::colorHighLatenciesGroups() { 672293838Sdim unsigned DAGSize = DAG->SUnits.size(); 673293838Sdim unsigned NumHighLatencies = 0; 674293838Sdim unsigned GroupSize; 675321369Sdim int Color = NextReservedID; 676293838Sdim unsigned Count = 0; 677293838Sdim std::set<unsigned> FormingGroup; 678293838Sdim 679293838Sdim for (unsigned i = 0, e = DAGSize; i != e; ++i) { 680293838Sdim SUnit *SU = &DAG->SUnits[i]; 681293838Sdim if (DAG->IsHighLatencySU[SU->NodeNum]) 682293838Sdim ++NumHighLatencies; 683293838Sdim } 684293838Sdim 685293838Sdim if (NumHighLatencies == 0) 686293838Sdim return; 687293838Sdim 688293838Sdim if (NumHighLatencies <= 6) 689293838Sdim GroupSize = 2; 690293838Sdim else if (NumHighLatencies <= 12) 691293838Sdim GroupSize = 3; 692293838Sdim else 693293838Sdim GroupSize = 4; 694293838Sdim 695321369Sdim for (unsigned SUNum : DAG->TopDownIndex2SU) { 696321369Sdim const SUnit &SU = DAG->SUnits[SUNum]; 697321369Sdim if (DAG->IsHighLatencySU[SU.NodeNum]) { 698293838Sdim unsigned CompatibleGroup = true; 699321369Sdim int ProposedColor = Color; 700321369Sdim std::vector<int> AdditionalElements; 701321369Sdim 702321369Sdim // We don't want to put in the same block 703321369Sdim // two high latency instructions that depend 704321369Sdim // on each other. 705321369Sdim // One way would be to check canAddEdge 706321369Sdim // in both directions, but that currently is not 707321369Sdim // enough because there the high latency order is 708321369Sdim // enforced (via links). 
709321369Sdim // Instead, look at the dependencies between the 710321369Sdim // high latency instructions and deduce if it is 711321369Sdim // a data dependency or not. 712293838Sdim for (unsigned j : FormingGroup) { 713321369Sdim bool HasSubGraph; 714321369Sdim std::vector<int> SubGraph; 715321369Sdim // By construction (topological order), if SU and 716321369Sdim // DAG->SUnits[j] are linked, DAG->SUnits[j] is neccessary 717321369Sdim // in the parent graph of SU. 718321369Sdim#ifndef NDEBUG 719321369Sdim SubGraph = DAG->GetTopo()->GetSubGraph(SU, DAG->SUnits[j], 720321369Sdim HasSubGraph); 721321369Sdim assert(!HasSubGraph); 722321369Sdim#endif 723321369Sdim SubGraph = DAG->GetTopo()->GetSubGraph(DAG->SUnits[j], SU, 724321369Sdim HasSubGraph); 725321369Sdim if (!HasSubGraph) 726321369Sdim continue; // No dependencies between each other 727321369Sdim else if (SubGraph.size() > 5) { 728321369Sdim // Too many elements would be required to be added to the block. 729293838Sdim CompatibleGroup = false; 730321369Sdim break; 731321369Sdim } 732321369Sdim else { 733321369Sdim // Check the type of dependency 734321369Sdim for (unsigned k : SubGraph) { 735321369Sdim // If in the path to join the two instructions, 736321369Sdim // there is another high latency instruction, 737321369Sdim // or instructions colored for another block 738321369Sdim // abort the merge. 739321369Sdim if (DAG->IsHighLatencySU[k] || 740321369Sdim (CurrentColoring[k] != ProposedColor && 741321369Sdim CurrentColoring[k] != 0)) { 742321369Sdim CompatibleGroup = false; 743321369Sdim break; 744321369Sdim } 745321369Sdim // If one of the SU in the subgraph depends on the result of SU j, 746321369Sdim // there'll be a data dependency. 
747321369Sdim if (hasDataDependencyPred(DAG->SUnits[k], DAG->SUnits[j])) { 748321369Sdim CompatibleGroup = false; 749321369Sdim break; 750321369Sdim } 751321369Sdim } 752321369Sdim if (!CompatibleGroup) 753321369Sdim break; 754321369Sdim // Same check for the SU 755321369Sdim if (hasDataDependencyPred(SU, DAG->SUnits[j])) { 756321369Sdim CompatibleGroup = false; 757321369Sdim break; 758321369Sdim } 759321369Sdim // Add all the required instructions to the block 760321369Sdim // These cannot live in another block (because they 761321369Sdim // depend (order dependency) on one of the 762321369Sdim // instruction in the block, and are required for the 763321369Sdim // high latency instruction we add. 764321369Sdim AdditionalElements.insert(AdditionalElements.end(), 765321369Sdim SubGraph.begin(), SubGraph.end()); 766321369Sdim } 767293838Sdim } 768321369Sdim if (CompatibleGroup) { 769321369Sdim FormingGroup.insert(SU.NodeNum); 770321369Sdim for (unsigned j : AdditionalElements) 771321369Sdim CurrentColoring[j] = ProposedColor; 772321369Sdim CurrentColoring[SU.NodeNum] = ProposedColor; 773321369Sdim ++Count; 774321369Sdim } 775321369Sdim // Found one incompatible instruction, 776321369Sdim // or has filled a big enough group. 777321369Sdim // -> start a new one. 
778321369Sdim if (!CompatibleGroup) { 779293838Sdim FormingGroup.clear(); 780293838Sdim Color = ++NextReservedID; 781321369Sdim ProposedColor = Color; 782321369Sdim FormingGroup.insert(SU.NodeNum); 783321369Sdim CurrentColoring[SU.NodeNum] = ProposedColor; 784293838Sdim Count = 0; 785321369Sdim } else if (Count == GroupSize) { 786321369Sdim FormingGroup.clear(); 787321369Sdim Color = ++NextReservedID; 788321369Sdim ProposedColor = Color; 789321369Sdim Count = 0; 790293838Sdim } 791293838Sdim } 792293838Sdim } 793293838Sdim} 794293838Sdim 795293838Sdimvoid SIScheduleBlockCreator::colorComputeReservedDependencies() { 796293838Sdim unsigned DAGSize = DAG->SUnits.size(); 797293838Sdim std::map<std::set<unsigned>, unsigned> ColorCombinations; 798293838Sdim 799293838Sdim CurrentTopDownReservedDependencyColoring.clear(); 800293838Sdim CurrentBottomUpReservedDependencyColoring.clear(); 801293838Sdim 802293838Sdim CurrentTopDownReservedDependencyColoring.resize(DAGSize, 0); 803293838Sdim CurrentBottomUpReservedDependencyColoring.resize(DAGSize, 0); 804293838Sdim 805293838Sdim // Traverse TopDown, and give different colors to SUs depending 806293838Sdim // on which combination of High Latencies they depend on. 807293838Sdim 808309124Sdim for (unsigned SUNum : DAG->TopDownIndex2SU) { 809309124Sdim SUnit *SU = &DAG->SUnits[SUNum]; 810293838Sdim std::set<unsigned> SUColors; 811293838Sdim 812293838Sdim // Already given. 
    if (CurrentColoring[SU->NodeNum]) {
      CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
        CurrentColoring[SU->NodeNum];
      continue;
    }

    // Collect the distinct top-down colors of all strong predecessors.
    // Boundary nodes (NodeNum >= DAGSize) and weak edges are ignored.
    for (SDep& PredDep : SU->Preds) {
      SUnit *Pred = PredDep.getSUnit();
      if (PredDep.isWeak() || Pred->NodeNum >= DAGSize)
        continue;
      if (CurrentTopDownReservedDependencyColoring[Pred->NodeNum] > 0)
        SUColors.insert(CurrentTopDownReservedDependencyColoring[Pred->NodeNum]);
    }
    // Color 0 by default.
    if (SUColors.empty())
      continue;
    // Same color than parents.
    if (SUColors.size() == 1 && *SUColors.begin() > DAGSize)
      CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
        *SUColors.begin();
    else {
      // New combination of parent colors: reuse the color previously
      // attributed to this exact combination, or mint a fresh one.
      std::map<std::set<unsigned>, unsigned>::iterator Pos =
        ColorCombinations.find(SUColors);
      if (Pos != ColorCombinations.end()) {
        CurrentTopDownReservedDependencyColoring[SU->NodeNum] = Pos->second;
      } else {
        CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
          NextNonReservedID;
        ColorCombinations[SUColors] = NextNonReservedID++;
      }
    }
  }

  ColorCombinations.clear();

  // Same as before, but BottomUp.

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    // Already given.
    if (CurrentColoring[SU->NodeNum]) {
      CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
        CurrentColoring[SU->NodeNum];
      continue;
    }

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0)
        SUColors.insert(CurrentBottomUpReservedDependencyColoring[Succ->NodeNum]);
    }
    // Keep color 0.
    if (SUColors.empty())
      continue;
    // Same color than parents.
    if (SUColors.size() == 1 && *SUColors.begin() > DAGSize)
      CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
        *SUColors.begin();
    else {
      std::map<std::set<unsigned>, unsigned>::iterator Pos =
        ColorCombinations.find(SUColors);
      if (Pos != ColorCombinations.end()) {
        CurrentBottomUpReservedDependencyColoring[SU->NodeNum] = Pos->second;
      } else {
        CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
          NextNonReservedID;
        ColorCombinations[SUColors] = NextNonReservedID++;
      }
    }
  }
}

// Assign each not-yet-colored SU a color derived from the pair
// (top-down color, bottom-up color) computed above: SUs with the same
// pair of reserved-dependency combinations end up with the same color,
// and thus in the same block.
void SIScheduleBlockCreator::colorAccordingToReservedDependencies() {
  unsigned DAGSize = DAG->SUnits.size();
  std::map<std::pair<unsigned, unsigned>, unsigned> ColorCombinations;

  // Every combination of colors given by the top down
  // and bottom up Reserved node dependency

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    std::pair<unsigned, unsigned> SUColors;

    // High latency instructions: already given.
    if (CurrentColoring[SU->NodeNum])
      continue;

    SUColors.first = CurrentTopDownReservedDependencyColoring[SU->NodeNum];
    SUColors.second = CurrentBottomUpReservedDependencyColoring[SU->NodeNum];

    std::map<std::pair<unsigned, unsigned>, unsigned>::iterator Pos =
      ColorCombinations.find(SUColors);
    if (Pos != ColorCombinations.end()) {
      CurrentColoring[SU->NodeNum] = Pos->second;
    } else {
      CurrentColoring[SU->NodeNum] = NextNonReservedID;
      ColorCombinations[SUColors] = NextNonReservedID++;
    }
  }
}

// For SUs with no reserved dependency in either direction (typically at
// the ends of the dependency graph), try to merge them into the single
// neighbouring block they feed, to avoid many tiny blocks.
void SIScheduleBlockCreator::colorEndsAccordingToDependencies() {
  unsigned DAGSize = DAG->SUnits.size();
  std::vector<int> PendingColoring = CurrentColoring;

  assert(DAGSize >= 1 &&
         CurrentBottomUpReservedDependencyColoring.size() == DAGSize &&
         CurrentTopDownReservedDependencyColoring.size() == DAGSize);
  // If there is no reserved block at all, do nothing. We don't want
  // everything in one block.
  if (*std::max_element(CurrentBottomUpReservedDependencyColoring.begin(),
                        CurrentBottomUpReservedDependencyColoring.end()) == 0 &&
      *std::max_element(CurrentTopDownReservedDependencyColoring.begin(),
                        CurrentTopDownReservedDependencyColoring.end()) == 0)
    return;

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;
    std::set<unsigned> SUColorsPending;

    // Colors <= DAGSize are reserved (high latency groups): skip.
    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    if (CurrentBottomUpReservedDependencyColoring[SU->NodeNum] > 0 ||
        CurrentTopDownReservedDependencyColoring[SU->NodeNum] > 0)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0 ||
          CurrentTopDownReservedDependencyColoring[Succ->NodeNum] > 0)
        SUColors.insert(CurrentColoring[Succ->NodeNum]);
      SUColorsPending.insert(PendingColoring[Succ->NodeNum]);
    }
    // If there is only one child/parent block, and that block
    // is not among the ones we are removing in this path, then
    // merge the instruction to that block
    if (SUColors.size() == 1 && SUColorsPending.size() == 1)
      PendingColoring[SU->NodeNum] = *SUColors.begin();
    else // TODO: Attribute new colors depending on color
         // combination of children.
      PendingColoring[SU->NodeNum] = NextNonReservedID++;
  }
  CurrentColoring = PendingColoring;
}


// Walk SUs in NodeNum order; whenever a (non-reserved) color reappears
// after other colors were seen in-between, re-color the SU so that each
// final color covers a consecutive range of SUs.
void SIScheduleBlockCreator::colorForceConsecutiveOrderInGroup() {
  unsigned DAGSize = DAG->SUnits.size();
  unsigned PreviousColor;
  std::set<unsigned> SeenColors;

  if (DAGSize <= 1)
    return;

  PreviousColor = CurrentColoring[0];

  for (unsigned i = 1, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    unsigned CurrentColor = CurrentColoring[i];
    unsigned PreviousColorSave = PreviousColor;
    assert(i == SU->NodeNum);

    if (CurrentColor != PreviousColor)
      SeenColors.insert(PreviousColor);
    PreviousColor = CurrentColor;

    // Reserved colors (<= DAGSize) are never re-attributed.
    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    if (SeenColors.find(CurrentColor) == SeenColors.end())
      continue;

    // The color was already closed earlier: either start a fresh color,
    // or continue the color of the immediately preceding SU.
    if (PreviousColorSave != CurrentColor)
      CurrentColoring[i] = NextNonReservedID++;
    else
      CurrentColoring[i] = CurrentColoring[i-1];
  }
}

// Merge constant-load-like SUs (no predecessor, or low latency loads)
// into the single block that consumes their result, so a constant is not
// loaded long before its unique user block.
void SIScheduleBlockCreator::colorMergeConstantLoadsNextGroup() {
  unsigned DAGSize = DAG->SUnits.size();

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    // No predecessor: Vgpr constant loading.
    // Low latency instructions usually have a predecessor (the address)
    if (SU->Preds.size() > 0 && !DAG->IsLowLatencySU[SU->NodeNum])
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      SUColors.insert(CurrentColoring[Succ->NodeNum]);
    }
    // Merge only when all strong successors live in one single block.
    if (SUColors.size() == 1)
      CurrentColoring[SU->NodeNum] = *SUColors.begin();
  }
}

// Merge any non-reserved SU whose strong successors all share one color
// into that color. NOTE(review): not called by createBlocksForVariant in
// this file — apparently kept as an alternative merging pass.
void SIScheduleBlockCreator::colorMergeIfPossibleNextGroup() {
  unsigned DAGSize = DAG->SUnits.size();

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      SUColors.insert(CurrentColoring[Succ->NodeNum]);
    }
    if (SUColors.size() == 1)
      CurrentColoring[SU->NodeNum] = *SUColors.begin();
  }
}

// Same as colorMergeIfPossibleNextGroup, but only merge into reserved
// (high latency) blocks, i.e. when the unique successor color <= DAGSize.
void SIScheduleBlockCreator::colorMergeIfPossibleNextGroupOnlyForReserved() {
  unsigned DAGSize = DAG->SUnits.size();

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      SUColors.insert(CurrentColoring[Succ->NodeNum]);
    }
    if (SUColors.size() == 1 && *SUColors.begin() <= DAGSize)
      CurrentColoring[SU->NodeNum] = *SUColors.begin();
  }
}

// Merge single-SU colors into the unique successor color, maintaining a
// per-color population count. NOTE(review): not called by
// createBlocksForVariant in this file.
void SIScheduleBlockCreator::colorMergeIfPossibleSmallGroupsToNextGroup() {
  unsigned DAGSize = DAG->SUnits.size();
  std::map<unsigned, unsigned> ColorCount;

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    unsigned color = CurrentColoring[SU->NodeNum];
      ++ColorCount[color];
  }

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    unsigned color = CurrentColoring[SU->NodeNum];
    std::set<unsigned> SUColors;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    // Only colors holding exactly one SU are candidates for merging.
    if (ColorCount[color] > 1)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      SUColors.insert(CurrentColoring[Succ->NodeNum]);
    }
    if (SUColors.size() == 1 && *SUColors.begin() != color) {
      --ColorCount[color];
      CurrentColoring[SU->NodeNum] = *SUColors.begin();
      ++ColorCount[*SUColors.begin()];
    }
  }
}

void SIScheduleBlockCreator::cutHugeBlocks() {
  // TODO
}

// Put all SUs without any strong successor into one common group, so the
// "dead end" instructions do not each form their own block.
void SIScheduleBlockCreator::regroupNoUserInstructions() {
1108293838Sdim unsigned DAGSize = DAG->SUnits.size(); 1109293838Sdim int GroupID = NextNonReservedID++; 1110293838Sdim 1111309124Sdim for (unsigned SUNum : DAG->BottomUpIndex2SU) { 1112309124Sdim SUnit *SU = &DAG->SUnits[SUNum]; 1113293838Sdim bool hasSuccessor = false; 1114293838Sdim 1115293838Sdim if (CurrentColoring[SU->NodeNum] <= (int)DAGSize) 1116293838Sdim continue; 1117293838Sdim 1118293838Sdim for (SDep& SuccDep : SU->Succs) { 1119293838Sdim SUnit *Succ = SuccDep.getSUnit(); 1120293838Sdim if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize) 1121293838Sdim continue; 1122293838Sdim hasSuccessor = true; 1123293838Sdim } 1124293838Sdim if (!hasSuccessor) 1125293838Sdim CurrentColoring[SU->NodeNum] = GroupID; 1126293838Sdim } 1127293838Sdim} 1128293838Sdim 1129327952Sdimvoid SIScheduleBlockCreator::colorExports() { 1130327952Sdim unsigned ExportColor = NextNonReservedID++; 1131327952Sdim SmallVector<unsigned, 8> ExpGroup; 1132327952Sdim 1133327952Sdim // Put all exports together in a block. 1134327952Sdim // The block will naturally end up being scheduled last, 1135327952Sdim // thus putting exports at the end of the schedule, which 1136327952Sdim // is better for performance. 1137327952Sdim // However we must ensure, for safety, the exports can be put 1138327952Sdim // together in the same block without any other instruction. 1139327952Sdim // This could happen, for example, when scheduling after regalloc 1140327952Sdim // if reloading a spilled register from memory using the same 1141327952Sdim // register than used in a previous export. 1142327952Sdim // If that happens, do not regroup the exports. 1143327952Sdim for (unsigned SUNum : DAG->TopDownIndex2SU) { 1144327952Sdim const SUnit &SU = DAG->SUnits[SUNum]; 1145327952Sdim if (SIInstrInfo::isEXP(*SU.getInstr())) { 1146327952Sdim // Check the EXP can be added to the group safely, 1147327952Sdim // ie without needing any other instruction. 
1148327952Sdim // The EXP is allowed to depend on other EXP 1149327952Sdim // (they will be in the same group). 1150327952Sdim for (unsigned j : ExpGroup) { 1151327952Sdim bool HasSubGraph; 1152327952Sdim std::vector<int> SubGraph; 1153327952Sdim // By construction (topological order), if SU and 1154327952Sdim // DAG->SUnits[j] are linked, DAG->SUnits[j] is neccessary 1155327952Sdim // in the parent graph of SU. 1156327952Sdim#ifndef NDEBUG 1157327952Sdim SubGraph = DAG->GetTopo()->GetSubGraph(SU, DAG->SUnits[j], 1158327952Sdim HasSubGraph); 1159327952Sdim assert(!HasSubGraph); 1160327952Sdim#endif 1161327952Sdim SubGraph = DAG->GetTopo()->GetSubGraph(DAG->SUnits[j], SU, 1162327952Sdim HasSubGraph); 1163327952Sdim if (!HasSubGraph) 1164327952Sdim continue; // No dependencies between each other 1165327952Sdim 1166327952Sdim // SubGraph contains all the instructions required 1167327952Sdim // between EXP SUnits[j] and EXP SU. 1168327952Sdim for (unsigned k : SubGraph) { 1169327952Sdim if (!SIInstrInfo::isEXP(*DAG->SUnits[k].getInstr())) 1170327952Sdim // Other instructions than EXP would be required in the group. 1171327952Sdim // Abort the groupping. 1172327952Sdim return; 1173327952Sdim } 1174327952Sdim } 1175327952Sdim 1176327952Sdim ExpGroup.push_back(SUNum); 1177327952Sdim } 1178327952Sdim } 1179327952Sdim 1180327952Sdim // The group can be formed. Give the color. 
1181327952Sdim for (unsigned j : ExpGroup) 1182327952Sdim CurrentColoring[j] = ExportColor; 1183327952Sdim} 1184327952Sdim 1185293838Sdimvoid SIScheduleBlockCreator::createBlocksForVariant(SISchedulerBlockCreatorVariant BlockVariant) { 1186293838Sdim unsigned DAGSize = DAG->SUnits.size(); 1187293838Sdim std::map<unsigned,unsigned> RealID; 1188293838Sdim 1189293838Sdim CurrentBlocks.clear(); 1190293838Sdim CurrentColoring.clear(); 1191293838Sdim CurrentColoring.resize(DAGSize, 0); 1192293838Sdim Node2CurrentBlock.clear(); 1193293838Sdim 1194293838Sdim // Restore links previous scheduling variant has overridden. 1195293838Sdim DAG->restoreSULinksLeft(); 1196293838Sdim 1197293838Sdim NextReservedID = 1; 1198293838Sdim NextNonReservedID = DAGSize + 1; 1199293838Sdim 1200341825Sdim LLVM_DEBUG(dbgs() << "Coloring the graph\n"); 1201293838Sdim 1202293838Sdim if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesGrouped) 1203293838Sdim colorHighLatenciesGroups(); 1204293838Sdim else 1205293838Sdim colorHighLatenciesAlone(); 1206293838Sdim colorComputeReservedDependencies(); 1207293838Sdim colorAccordingToReservedDependencies(); 1208293838Sdim colorEndsAccordingToDependencies(); 1209293838Sdim if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesAlonePlusConsecutive) 1210293838Sdim colorForceConsecutiveOrderInGroup(); 1211293838Sdim regroupNoUserInstructions(); 1212293838Sdim colorMergeConstantLoadsNextGroup(); 1213293838Sdim colorMergeIfPossibleNextGroupOnlyForReserved(); 1214327952Sdim colorExports(); 1215293838Sdim 1216293838Sdim // Put SUs of same color into same block 1217293838Sdim Node2CurrentBlock.resize(DAGSize, -1); 1218293838Sdim for (unsigned i = 0, e = DAGSize; i != e; ++i) { 1219293838Sdim SUnit *SU = &DAG->SUnits[i]; 1220293838Sdim unsigned Color = CurrentColoring[SU->NodeNum]; 1221293838Sdim if (RealID.find(Color) == RealID.end()) { 1222293838Sdim int ID = CurrentBlocks.size(); 1223360784Sdim 
      BlockPtrs.push_back(std::make_unique<SIScheduleBlock>(DAG, this, ID));
      CurrentBlocks.push_back(BlockPtrs.rbegin()->get());
      RealID[Color] = ID;
    }
    CurrentBlocks[RealID[Color]]->addUnit(SU);
    Node2CurrentBlock[SU->NodeNum] = RealID[Color];
  }

  // Build dependencies between blocks.
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    int SUID = Node2CurrentBlock[i];
    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      // Control edges carry no value: tag the block link as NoData.
      if (Node2CurrentBlock[Succ->NodeNum] != SUID)
        CurrentBlocks[SUID]->addSucc(CurrentBlocks[Node2CurrentBlock[Succ->NodeNum]],
                                     SuccDep.isCtrl() ? NoData : Data);
    }
    for (SDep& PredDep : SU->Preds) {
      SUnit *Pred = PredDep.getSUnit();
      if (PredDep.isWeak() || Pred->NodeNum >= DAGSize)
        continue;
      if (Node2CurrentBlock[Pred->NodeNum] != SUID)
        CurrentBlocks[SUID]->addPred(CurrentBlocks[Node2CurrentBlock[Pred->NodeNum]]);
    }
  }

  // Free root and leafs of all blocks to enable scheduling inside them.
  for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    Block->finalizeUnits();
  }
  LLVM_DEBUG(dbgs() << "Blocks created:\n\n";
             for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
               SIScheduleBlock *Block = CurrentBlocks[i];
               Block->printDebug(true);
             });
}

// Two functions taken from Codegen/MachineScheduler.cpp

/// Non-const version.
/// Skips over debug instructions starting at I, returning the first
/// non-debug instruction (or End).
static MachineBasicBlock::iterator
nextIfDebug(MachineBasicBlock::iterator I,
            MachineBasicBlock::const_iterator End) {
  for (; I != End; ++I) {
    if (!I->isDebugInstr())
      break;
  }
  return I;
}

// Kahn-style topological sort of the blocks, filling TopDownIndex2Block,
// TopDownBlock2Index and BottomUpIndex2Block.
void SIScheduleBlockCreator::topologicalSort() {
  unsigned DAGSize = CurrentBlocks.size();
  std::vector<int> WorkList;

  LLVM_DEBUG(dbgs() << "Topological Sort\n");

  WorkList.reserve(DAGSize);
  TopDownIndex2Block.resize(DAGSize);
  TopDownBlock2Index.resize(DAGSize);
  BottomUpIndex2Block.resize(DAGSize);

  // TopDownBlock2Index temporarily holds the remaining out-degree of each
  // block; blocks with no successors seed the worklist.
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    unsigned Degree = Block->getSuccs().size();
    TopDownBlock2Index[i] = Degree;
    if (Degree == 0) {
      WorkList.push_back(i);
    }
  }

  // Assign indices from the end (leaves get the highest indices).
  int Id = DAGSize;
  while (!WorkList.empty()) {
    int i = WorkList.back();
    SIScheduleBlock *Block = CurrentBlocks[i];
    WorkList.pop_back();
    TopDownBlock2Index[i] = --Id;
    TopDownIndex2Block[Id] = i;
    for (SIScheduleBlock* Pred : Block->getPreds()) {
      if (!--TopDownBlock2Index[Pred->getID()])
        WorkList.push_back(Pred->getID());
    }
  }

#ifndef NDEBUG
  // Check correctness of the ordering.
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    for (SIScheduleBlock* Pred : Block->getPreds()) {
      assert(TopDownBlock2Index[i] > TopDownBlock2Index[Pred->getID()] &&
             "Wrong Top Down topological sorting");
    }
  }
#endif

  // The bottom-up order is simply the reverse of the top-down order.
  BottomUpIndex2Block = std::vector<int>(TopDownIndex2Block.rbegin(),
                                         TopDownIndex2Block.rend());
}

// Schedule the instructions inside every block: first a fast schedule to
// get liveness info, then physically reorder the MIs so each block is a
// contiguous instruction range, run the real intra-block schedule, and
// finally restore the original MI order.
void SIScheduleBlockCreator::scheduleInsideBlocks() {
  unsigned DAGSize = CurrentBlocks.size();

  LLVM_DEBUG(dbgs() << "\nScheduling Blocks\n\n");

  // We do schedule a valid scheduling such that a Block corresponds
  // to a range of instructions.
  LLVM_DEBUG(dbgs() << "First phase: Fast scheduling for Reg Liveness\n");
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    Block->fastSchedule();
  }

  // Note: the following code, and the part restoring previous position
  // is by far the most expensive operation of the Scheduler.

  // Do not update CurrentTop.
  MachineBasicBlock::iterator CurrentTopFastSched = DAG->getCurrentTop();
  std::vector<MachineBasicBlock::iterator> PosOld;
  std::vector<MachineBasicBlock::iterator> PosNew;
  PosOld.reserve(DAG->SUnits.size());
  PosNew.reserve(DAG->SUnits.size());

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    int BlockIndice = TopDownIndex2Block[i];
    SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
    std::vector<SUnit*> SUs = Block->getScheduledUnits();

    for (SUnit* SU : SUs) {
      MachineInstr *MI = SU->getInstr();
      MachineBasicBlock::iterator Pos = MI;
      PosOld.push_back(Pos);
      if (&*CurrentTopFastSched == MI) {
        // Already in place: just advance past it (and any debug instrs).
        PosNew.push_back(Pos);
        CurrentTopFastSched = nextIfDebug(++CurrentTopFastSched,
                                          DAG->getCurrentBottom());
      } else {
        // Update the instruction stream.
        DAG->getBB()->splice(CurrentTopFastSched, DAG->getBB(), MI);

        // Update LiveIntervals.
        // Note: Moving all instructions and calling handleMove every time
        // is the most cpu intensive operation of the scheduler.
        // It would gain a lot if there was a way to recompute the
        // LiveIntervals for the entire scheduling region.
        DAG->getLIS()->handleMove(*MI, /*UpdateFlags=*/true);
        PosNew.push_back(CurrentTopFastSched);
      }
    }
  }

  // Now we have Block of SUs == Block of MI.
  // We do the final schedule for the instructions inside the block.
  // The property that all the SUs of the Block are grouped together as MI
  // is used for correct reg usage tracking.
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    std::vector<SUnit*> SUs = Block->getScheduledUnits();
    Block->schedule((*SUs.begin())->getInstr(), (*SUs.rbegin())->getInstr());
  }

  LLVM_DEBUG(dbgs() << "Restoring MI Pos\n");
  // Restore old ordering (which prevents a LIS->handleMove bug).
  for (unsigned i = PosOld.size(), e = 0; i != e; --i) {
    MachineBasicBlock::iterator POld = PosOld[i-1];
    MachineBasicBlock::iterator PNew = PosNew[i-1];
    if (PNew != POld) {
      // Update the instruction stream.
      DAG->getBB()->splice(POld, DAG->getBB(), PNew);

      // Update LiveIntervals.
      DAG->getLIS()->handleMove(*POld, /*UpdateFlags=*/true);
    }
  }

  LLVM_DEBUG(for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    Block->printDebug(true);
  });
}

// Compute Depth (longest cost-weighted path from a root) and Height
// (longest cost-weighted path to a leaf) for every block.
void SIScheduleBlockCreator::fillStats() {
  unsigned DAGSize = CurrentBlocks.size();

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    int BlockIndice = TopDownIndex2Block[i];
    SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
    if (Block->getPreds().empty())
      Block->Depth = 0;
    else {
      unsigned Depth = 0;
      for (SIScheduleBlock *Pred : Block->getPreds()) {
        if (Depth < Pred->Depth + Pred->getCost())
          Depth = Pred->Depth + Pred->getCost();
      }
      Block->Depth = Depth;
    }
  }

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    int BlockIndice = BottomUpIndex2Block[i];
    SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
    if (Block->getSuccs().empty())
      Block->Height = 0;
    else {
      unsigned Height = 0;
      for (const auto &Succ : Block->getSuccs())
        Height = std::max(Height, Succ.first->Height + Succ.first->getCost());
      Block->Height = Height;
    }
  }
}

// SIScheduleBlockScheduler //

// Constructs the block scheduler: computes per-block live-out usage
// counts and predecessor/successor counters, then immediately schedules
// all blocks by repeatedly picking the best ready block.
SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG,
                                                   SISchedulerBlockSchedulerVariant Variant,
                                                   SIScheduleBlocks  BlocksStruct) :
  DAG(DAG), Variant(Variant), Blocks(BlocksStruct.Blocks),
  LastPosWaitedHighLatency(0), NumBlockScheduled(0), VregCurrentUsage(0),
  SregCurrentUsage(0), maxVregUsage(0), maxSregUsage(0) {

  // Fill the usage of every output
  // Warning: while by construction we always have a link between two blocks
  // when one needs a result from the other, the number of users of an output
  // is not the sum of child blocks having as input the same virtual register.
  // Here is an example. A produces x and y. B eats x and produces x'.
  // C eats x' and y. The register coalescer may have attributed the same
  // virtual register to x and x'.
  // To count accurately, we do a topological sort. In case the register is
  // found for several parents, we increment the usage of the one with the
  // highest topological index.
  LiveOutRegsNumUsages.resize(Blocks.size());
  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    for (unsigned Reg : Block->getInRegs()) {
      bool Found = false;
      int topoInd = -1;
      for (SIScheduleBlock* Pred: Block->getPreds()) {
        std::set<unsigned> PredOutRegs = Pred->getOutRegs();
        std::set<unsigned>::iterator RegPos = PredOutRegs.find(Reg);

        if (RegPos != PredOutRegs.end()) {
          Found = true;
          // Keep the producer with the highest topological index.
          if (topoInd < BlocksStruct.TopDownBlock2Index[Pred->getID()]) {
            topoInd = BlocksStruct.TopDownBlock2Index[Pred->getID()];
          }
        }
      }

      if (!Found)
        continue;

      int PredID = BlocksStruct.TopDownIndex2Block[topoInd];
      ++LiveOutRegsNumUsages[PredID][Reg];
    }
  }

  LastPosHighLatencyParentScheduled.resize(Blocks.size(), 0);
  BlockNumPredsLeft.resize(Blocks.size());
  BlockNumSuccsLeft.resize(Blocks.size());

  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    BlockNumPredsLeft[i] = Block->getPreds().size();
    BlockNumSuccsLeft[i] = Block->getSuccs().size();
  }

#ifndef NDEBUG
  // Blocks are expected to be stored at the index equal to their ID.
  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    assert(Block->getID() == i);
  }
#endif

  std::set<unsigned> InRegs = DAG->getInRegs();
  addLiveRegs(InRegs);

  // Increase LiveOutRegsNumUsages for blocks
  // producing registers consumed in another
  // scheduling region.
  for (unsigned Reg : DAG->getOutRegs()) {
    for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
      // Do reverse traversal
      int ID = BlocksStruct.TopDownIndex2Block[Blocks.size()-1-i];
      SIScheduleBlock *Block = Blocks[ID];
      const std::set<unsigned> &OutRegs = Block->getOutRegs();

      if (OutRegs.find(Reg) == OutRegs.end())
        continue;

      // Credit the last (topologically latest) producer of Reg.
      ++LiveOutRegsNumUsages[ID][Reg];
      break;
    }
  }

  // Fill LiveRegsConsumers for regs that were already
  // defined before scheduling.
  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    for (unsigned Reg : Block->getInRegs()) {
      bool Found = false;
      for (SIScheduleBlock* Pred: Block->getPreds()) {
        std::set<unsigned> PredOutRegs = Pred->getOutRegs();
        std::set<unsigned>::iterator RegPos = PredOutRegs.find(Reg);

        if (RegPos != PredOutRegs.end()) {
          Found = true;
          break;
        }
      }

      // No predecessor produces Reg: it comes from outside the region.
      if (!Found)
        ++LiveRegsConsumers[Reg];
    }
  }

  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    if (BlockNumPredsLeft[i] == 0) {
      ReadyBlocks.push_back(Block);
    }
  }

  // Schedule all blocks now; the result is left in BlocksScheduled.
  while (SIScheduleBlock *Block = pickBlock()) {
    BlocksScheduled.push_back(Block);
    blockScheduled(Block);
  }

  LLVM_DEBUG(dbgs() << "Block Order:"; for (SIScheduleBlock *Block
                                            : BlocksScheduled) {
    dbgs() << ' ' << Block->getID();
  } dbgs() << '\n';);
}

// Candidate comparison favouring latency hiding. Returns true if TryCand
// is preferred over Cand (and sets TryCand.Reason accordingly).
bool SIScheduleBlockScheduler::tryCandidateLatency(SIBlockSchedCandidate &Cand,
                                                   SIBlockSchedCandidate &TryCand) {
  if (!Cand.isValid()) {
    TryCand.Reason = NodeOrder;
    return true;
  }

  // Try to hide high latencies.
  if (SISched::tryLess(TryCand.LastPosHighLatParentScheduled,
                       Cand.LastPosHighLatParentScheduled, TryCand, Cand, Latency))
    return true;
  // Schedule high latencies early so you can hide them better.
  if (SISched::tryGreater(TryCand.IsHighLatency, Cand.IsHighLatency,
                          TryCand, Cand, Latency))
    return true;
  if (TryCand.IsHighLatency && SISched::tryGreater(TryCand.Height, Cand.Height,
                                                   TryCand, Cand, Depth))
    return true;
  if (SISched::tryGreater(TryCand.NumHighLatencySuccessors,
                          Cand.NumHighLatencySuccessors,
                          TryCand, Cand, Successor))
    return true;
  return false;
}

// Candidate comparison favouring low register usage. Returns true if
// TryCand is preferred over Cand.
bool SIScheduleBlockScheduler::tryCandidateRegUsage(SIBlockSchedCandidate &Cand,
                                                    SIBlockSchedCandidate &TryCand) {
  if (!Cand.isValid()) {
    TryCand.Reason = NodeOrder;
    return true;
  }

  if (SISched::tryLess(TryCand.VGPRUsageDiff > 0, Cand.VGPRUsageDiff > 0,
                       TryCand, Cand, RegUsage))
    return true;
  if (SISched::tryGreater(TryCand.NumSuccessors > 0,
                          Cand.NumSuccessors > 0,
                          TryCand, Cand, Successor))
    return true;
  if (SISched::tryGreater(TryCand.Height, Cand.Height, TryCand, Cand,
Depth)) 1600293838Sdim return true; 1601341825Sdim if (SISched::tryLess(TryCand.VGPRUsageDiff, Cand.VGPRUsageDiff, 1602341825Sdim TryCand, Cand, RegUsage)) 1603293838Sdim return true; 1604293838Sdim return false; 1605293838Sdim} 1606293838Sdim 1607293838SdimSIScheduleBlock *SIScheduleBlockScheduler::pickBlock() { 1608293838Sdim SIBlockSchedCandidate Cand; 1609293838Sdim std::vector<SIScheduleBlock*>::iterator Best; 1610293838Sdim SIScheduleBlock *Block; 1611293838Sdim if (ReadyBlocks.empty()) 1612293838Sdim return nullptr; 1613293838Sdim 1614293838Sdim DAG->fillVgprSgprCost(LiveRegs.begin(), LiveRegs.end(), 1615293838Sdim VregCurrentUsage, SregCurrentUsage); 1616293838Sdim if (VregCurrentUsage > maxVregUsage) 1617293838Sdim maxVregUsage = VregCurrentUsage; 1618321369Sdim if (SregCurrentUsage > maxSregUsage) 1619321369Sdim maxSregUsage = SregCurrentUsage; 1620341825Sdim LLVM_DEBUG(dbgs() << "Picking New Blocks\n"; dbgs() << "Available: "; 1621341825Sdim for (SIScheduleBlock *Block 1622341825Sdim : ReadyBlocks) dbgs() 1623341825Sdim << Block->getID() << ' '; 1624341825Sdim dbgs() << "\nCurrent Live:\n"; 1625341825Sdim for (unsigned Reg 1626341825Sdim : LiveRegs) dbgs() 1627341825Sdim << printVRegOrUnit(Reg, DAG->getTRI()) << ' '; 1628341825Sdim dbgs() << '\n'; 1629341825Sdim dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n'; 1630341825Sdim dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n';); 1631293838Sdim 1632293838Sdim Cand.Block = nullptr; 1633293838Sdim for (std::vector<SIScheduleBlock*>::iterator I = ReadyBlocks.begin(), 1634293838Sdim E = ReadyBlocks.end(); I != E; ++I) { 1635293838Sdim SIBlockSchedCandidate TryCand; 1636293838Sdim TryCand.Block = *I; 1637293838Sdim TryCand.IsHighLatency = TryCand.Block->isHighLatencyBlock(); 1638293838Sdim TryCand.VGPRUsageDiff = 1639293838Sdim checkRegUsageImpact(TryCand.Block->getInRegs(), 1640293838Sdim TryCand.Block->getOutRegs())[DAG->getVGPRSetID()]; 1641293838Sdim TryCand.NumSuccessors = 
TryCand.Block->getSuccs().size(); 1642293838Sdim TryCand.NumHighLatencySuccessors = 1643293838Sdim TryCand.Block->getNumHighLatencySuccessors(); 1644293838Sdim TryCand.LastPosHighLatParentScheduled = 1645293838Sdim (unsigned int) std::max<int> (0, 1646293838Sdim LastPosHighLatencyParentScheduled[TryCand.Block->getID()] - 1647293838Sdim LastPosWaitedHighLatency); 1648293838Sdim TryCand.Height = TryCand.Block->Height; 1649293838Sdim // Try not to increase VGPR usage too much, else we may spill. 1650293838Sdim if (VregCurrentUsage > 120 || 1651293838Sdim Variant != SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage) { 1652293838Sdim if (!tryCandidateRegUsage(Cand, TryCand) && 1653293838Sdim Variant != SISchedulerBlockSchedulerVariant::BlockRegUsage) 1654293838Sdim tryCandidateLatency(Cand, TryCand); 1655293838Sdim } else { 1656293838Sdim if (!tryCandidateLatency(Cand, TryCand)) 1657293838Sdim tryCandidateRegUsage(Cand, TryCand); 1658293838Sdim } 1659293838Sdim if (TryCand.Reason != NoCand) { 1660293838Sdim Cand.setBest(TryCand); 1661293838Sdim Best = I; 1662341825Sdim LLVM_DEBUG(dbgs() << "Best Current Choice: " << Cand.Block->getID() << ' ' 1663341825Sdim << getReasonStr(Cand.Reason) << '\n'); 1664293838Sdim } 1665293838Sdim } 1666293838Sdim 1667341825Sdim LLVM_DEBUG(dbgs() << "Picking: " << Cand.Block->getID() << '\n'; 1668341825Sdim dbgs() << "Is a block with high latency instruction: " 1669341825Sdim << (Cand.IsHighLatency ? "yes\n" : "no\n"); 1670341825Sdim dbgs() << "Position of last high latency dependency: " 1671341825Sdim << Cand.LastPosHighLatParentScheduled << '\n'; 1672341825Sdim dbgs() << "VGPRUsageDiff: " << Cand.VGPRUsageDiff << '\n'; 1673341825Sdim dbgs() << '\n';); 1674293838Sdim 1675293838Sdim Block = Cand.Block; 1676293838Sdim ReadyBlocks.erase(Best); 1677293838Sdim return Block; 1678293838Sdim} 1679293838Sdim 1680293838Sdim// Tracking of currently alive registers to determine VGPR Usage. 
1681293838Sdim 1682293838Sdimvoid SIScheduleBlockScheduler::addLiveRegs(std::set<unsigned> &Regs) { 1683293838Sdim for (unsigned Reg : Regs) { 1684293838Sdim // For now only track virtual registers. 1685360784Sdim if (!Register::isVirtualRegister(Reg)) 1686293838Sdim continue; 1687293838Sdim // If not already in the live set, then add it. 1688293838Sdim (void) LiveRegs.insert(Reg); 1689293838Sdim } 1690293838Sdim} 1691293838Sdim 1692293838Sdimvoid SIScheduleBlockScheduler::decreaseLiveRegs(SIScheduleBlock *Block, 1693293838Sdim std::set<unsigned> &Regs) { 1694293838Sdim for (unsigned Reg : Regs) { 1695293838Sdim // For now only track virtual registers. 1696293838Sdim std::set<unsigned>::iterator Pos = LiveRegs.find(Reg); 1697293838Sdim assert (Pos != LiveRegs.end() && // Reg must be live. 1698293838Sdim LiveRegsConsumers.find(Reg) != LiveRegsConsumers.end() && 1699293838Sdim LiveRegsConsumers[Reg] >= 1); 1700293838Sdim --LiveRegsConsumers[Reg]; 1701293838Sdim if (LiveRegsConsumers[Reg] == 0) 1702293838Sdim LiveRegs.erase(Pos); 1703293838Sdim } 1704293838Sdim} 1705293838Sdim 1706293838Sdimvoid SIScheduleBlockScheduler::releaseBlockSuccs(SIScheduleBlock *Parent) { 1707321369Sdim for (const auto &Block : Parent->getSuccs()) { 1708321369Sdim if (--BlockNumPredsLeft[Block.first->getID()] == 0) 1709321369Sdim ReadyBlocks.push_back(Block.first); 1710321369Sdim 1711321369Sdim if (Parent->isHighLatencyBlock() && 1712321369Sdim Block.second == SIScheduleBlockLinkKind::Data) 1713321369Sdim LastPosHighLatencyParentScheduled[Block.first->getID()] = NumBlockScheduled; 1714293838Sdim } 1715293838Sdim} 1716293838Sdim 1717293838Sdimvoid SIScheduleBlockScheduler::blockScheduled(SIScheduleBlock *Block) { 1718293838Sdim decreaseLiveRegs(Block, Block->getInRegs()); 1719293838Sdim addLiveRegs(Block->getOutRegs()); 1720293838Sdim releaseBlockSuccs(Block); 1721293838Sdim for (std::map<unsigned, unsigned>::iterator RegI = 1722293838Sdim LiveOutRegsNumUsages[Block->getID()].begin(), 
1723293838Sdim E = LiveOutRegsNumUsages[Block->getID()].end(); RegI != E; ++RegI) { 1724293838Sdim std::pair<unsigned, unsigned> RegP = *RegI; 1725321369Sdim // We produce this register, thus it must not be previously alive. 1726321369Sdim assert(LiveRegsConsumers.find(RegP.first) == LiveRegsConsumers.end() || 1727321369Sdim LiveRegsConsumers[RegP.first] == 0); 1728321369Sdim LiveRegsConsumers[RegP.first] += RegP.second; 1729293838Sdim } 1730293838Sdim if (LastPosHighLatencyParentScheduled[Block->getID()] > 1731293838Sdim (unsigned)LastPosWaitedHighLatency) 1732293838Sdim LastPosWaitedHighLatency = 1733293838Sdim LastPosHighLatencyParentScheduled[Block->getID()]; 1734293838Sdim ++NumBlockScheduled; 1735293838Sdim} 1736293838Sdim 1737293838Sdimstd::vector<int> 1738293838SdimSIScheduleBlockScheduler::checkRegUsageImpact(std::set<unsigned> &InRegs, 1739293838Sdim std::set<unsigned> &OutRegs) { 1740293838Sdim std::vector<int> DiffSetPressure; 1741293838Sdim DiffSetPressure.assign(DAG->getTRI()->getNumRegPressureSets(), 0); 1742293838Sdim 1743293838Sdim for (unsigned Reg : InRegs) { 1744293838Sdim // For now only track virtual registers. 1745360784Sdim if (!Register::isVirtualRegister(Reg)) 1746293838Sdim continue; 1747293838Sdim if (LiveRegsConsumers[Reg] > 1) 1748293838Sdim continue; 1749293838Sdim PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg); 1750293838Sdim for (; PSetI.isValid(); ++PSetI) { 1751293838Sdim DiffSetPressure[*PSetI] -= PSetI.getWeight(); 1752293838Sdim } 1753293838Sdim } 1754293838Sdim 1755293838Sdim for (unsigned Reg : OutRegs) { 1756293838Sdim // For now only track virtual registers. 
1757360784Sdim if (!Register::isVirtualRegister(Reg)) 1758293838Sdim continue; 1759293838Sdim PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg); 1760293838Sdim for (; PSetI.isValid(); ++PSetI) { 1761293838Sdim DiffSetPressure[*PSetI] += PSetI.getWeight(); 1762293838Sdim } 1763293838Sdim } 1764293838Sdim 1765293838Sdim return DiffSetPressure; 1766293838Sdim} 1767293838Sdim 1768293838Sdim// SIScheduler // 1769293838Sdim 1770293838Sdimstruct SIScheduleBlockResult 1771293838SdimSIScheduler::scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant, 1772293838Sdim SISchedulerBlockSchedulerVariant ScheduleVariant) { 1773293838Sdim SIScheduleBlocks Blocks = BlockCreator.getBlocks(BlockVariant); 1774293838Sdim SIScheduleBlockScheduler Scheduler(DAG, ScheduleVariant, Blocks); 1775293838Sdim std::vector<SIScheduleBlock*> ScheduledBlocks; 1776293838Sdim struct SIScheduleBlockResult Res; 1777293838Sdim 1778293838Sdim ScheduledBlocks = Scheduler.getBlocks(); 1779293838Sdim 1780293838Sdim for (unsigned b = 0; b < ScheduledBlocks.size(); ++b) { 1781293838Sdim SIScheduleBlock *Block = ScheduledBlocks[b]; 1782293838Sdim std::vector<SUnit*> SUs = Block->getScheduledUnits(); 1783293838Sdim 1784293838Sdim for (SUnit* SU : SUs) 1785293838Sdim Res.SUs.push_back(SU->NodeNum); 1786293838Sdim } 1787293838Sdim 1788293838Sdim Res.MaxSGPRUsage = Scheduler.getSGPRUsage(); 1789293838Sdim Res.MaxVGPRUsage = Scheduler.getVGPRUsage(); 1790293838Sdim return Res; 1791293838Sdim} 1792293838Sdim 1793293838Sdim// SIScheduleDAGMI // 1794293838Sdim 1795293838SdimSIScheduleDAGMI::SIScheduleDAGMI(MachineSchedContext *C) : 1796360784Sdim ScheduleDAGMILive(C, std::make_unique<GenericScheduler>(C)) { 1797293838Sdim SITII = static_cast<const SIInstrInfo*>(TII); 1798293838Sdim SITRI = static_cast<const SIRegisterInfo*>(TRI); 1799293838Sdim 1800314564Sdim VGPRSetID = SITRI->getVGPRPressureSet(); 1801314564Sdim SGPRSetID = SITRI->getSGPRPressureSet(); 1802293838Sdim} 1803293838Sdim 
1804314564SdimSIScheduleDAGMI::~SIScheduleDAGMI() = default; 1805293838Sdim 1806293838Sdim// Code adapted from scheduleDAG.cpp 1807293838Sdim// Does a topological sort over the SUs. 1808293838Sdim// Both TopDown and BottomUp 1809293838Sdimvoid SIScheduleDAGMI::topologicalSort() { 1810309124Sdim Topo.InitDAGTopologicalSorting(); 1811293838Sdim 1812309124Sdim TopDownIndex2SU = std::vector<int>(Topo.begin(), Topo.end()); 1813309124Sdim BottomUpIndex2SU = std::vector<int>(Topo.rbegin(), Topo.rend()); 1814293838Sdim} 1815293838Sdim 1816293838Sdim// Move low latencies further from their user without 1817293838Sdim// increasing SGPR usage (in general) 1818293838Sdim// This is to be replaced by a better pass that would 1819293838Sdim// take into account SGPR usage (based on VGPR Usage 1820293838Sdim// and the corresponding wavefront count), that would 1821293838Sdim// try to merge groups of loads if it make sense, etc 1822293838Sdimvoid SIScheduleDAGMI::moveLowLatencies() { 1823293838Sdim unsigned DAGSize = SUnits.size(); 1824293838Sdim int LastLowLatencyUser = -1; 1825293838Sdim int LastLowLatencyPos = -1; 1826293838Sdim 1827293838Sdim for (unsigned i = 0, e = ScheduledSUnits.size(); i != e; ++i) { 1828293838Sdim SUnit *SU = &SUnits[ScheduledSUnits[i]]; 1829293838Sdim bool IsLowLatencyUser = false; 1830293838Sdim unsigned MinPos = 0; 1831293838Sdim 1832293838Sdim for (SDep& PredDep : SU->Preds) { 1833293838Sdim SUnit *Pred = PredDep.getSUnit(); 1834309124Sdim if (SITII->isLowLatencyInstruction(*Pred->getInstr())) { 1835293838Sdim IsLowLatencyUser = true; 1836293838Sdim } 1837293838Sdim if (Pred->NodeNum >= DAGSize) 1838293838Sdim continue; 1839293838Sdim unsigned PredPos = ScheduledSUnitsInv[Pred->NodeNum]; 1840293838Sdim if (PredPos >= MinPos) 1841293838Sdim MinPos = PredPos + 1; 1842293838Sdim } 1843293838Sdim 1844309124Sdim if (SITII->isLowLatencyInstruction(*SU->getInstr())) { 1845293838Sdim unsigned BestPos = LastLowLatencyUser + 1; 1846293838Sdim if ((int)BestPos <= 
LastLowLatencyPos) 1847293838Sdim BestPos = LastLowLatencyPos + 1; 1848293838Sdim if (BestPos < MinPos) 1849293838Sdim BestPos = MinPos; 1850293838Sdim if (BestPos < i) { 1851293838Sdim for (unsigned u = i; u > BestPos; --u) { 1852293838Sdim ++ScheduledSUnitsInv[ScheduledSUnits[u-1]]; 1853293838Sdim ScheduledSUnits[u] = ScheduledSUnits[u-1]; 1854293838Sdim } 1855293838Sdim ScheduledSUnits[BestPos] = SU->NodeNum; 1856293838Sdim ScheduledSUnitsInv[SU->NodeNum] = BestPos; 1857293838Sdim } 1858293838Sdim LastLowLatencyPos = BestPos; 1859293838Sdim if (IsLowLatencyUser) 1860293838Sdim LastLowLatencyUser = BestPos; 1861293838Sdim } else if (IsLowLatencyUser) { 1862293838Sdim LastLowLatencyUser = i; 1863293838Sdim // Moves COPY instructions on which depends 1864293838Sdim // the low latency instructions too. 1865293838Sdim } else if (SU->getInstr()->getOpcode() == AMDGPU::COPY) { 1866293838Sdim bool CopyForLowLat = false; 1867293838Sdim for (SDep& SuccDep : SU->Succs) { 1868293838Sdim SUnit *Succ = SuccDep.getSUnit(); 1869353358Sdim if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize) 1870353358Sdim continue; 1871309124Sdim if (SITII->isLowLatencyInstruction(*Succ->getInstr())) { 1872293838Sdim CopyForLowLat = true; 1873293838Sdim } 1874293838Sdim } 1875293838Sdim if (!CopyForLowLat) 1876293838Sdim continue; 1877293838Sdim if (MinPos < i) { 1878293838Sdim for (unsigned u = i; u > MinPos; --u) { 1879293838Sdim ++ScheduledSUnitsInv[ScheduledSUnits[u-1]]; 1880293838Sdim ScheduledSUnits[u] = ScheduledSUnits[u-1]; 1881293838Sdim } 1882293838Sdim ScheduledSUnits[MinPos] = SU->NodeNum; 1883293838Sdim ScheduledSUnitsInv[SU->NodeNum] = MinPos; 1884293838Sdim } 1885293838Sdim } 1886293838Sdim } 1887293838Sdim} 1888293838Sdim 1889293838Sdimvoid SIScheduleDAGMI::restoreSULinksLeft() { 1890293838Sdim for (unsigned i = 0, e = SUnits.size(); i != e; ++i) { 1891293838Sdim SUnits[i].isScheduled = false; 1892293838Sdim SUnits[i].WeakPredsLeft = SUnitsLinksBackup[i].WeakPredsLeft; 
1893293838Sdim SUnits[i].NumPredsLeft = SUnitsLinksBackup[i].NumPredsLeft; 1894293838Sdim SUnits[i].WeakSuccsLeft = SUnitsLinksBackup[i].WeakSuccsLeft; 1895293838Sdim SUnits[i].NumSuccsLeft = SUnitsLinksBackup[i].NumSuccsLeft; 1896293838Sdim } 1897293838Sdim} 1898293838Sdim 1899293838Sdim// Return the Vgpr and Sgpr usage corresponding to some virtual registers. 1900293838Sdimtemplate<typename _Iterator> void 1901293838SdimSIScheduleDAGMI::fillVgprSgprCost(_Iterator First, _Iterator End, 1902293838Sdim unsigned &VgprUsage, unsigned &SgprUsage) { 1903293838Sdim VgprUsage = 0; 1904293838Sdim SgprUsage = 0; 1905293838Sdim for (_Iterator RegI = First; RegI != End; ++RegI) { 1906293838Sdim unsigned Reg = *RegI; 1907293838Sdim // For now only track virtual registers 1908360784Sdim if (!Register::isVirtualRegister(Reg)) 1909293838Sdim continue; 1910293838Sdim PSetIterator PSetI = MRI.getPressureSets(Reg); 1911293838Sdim for (; PSetI.isValid(); ++PSetI) { 1912293838Sdim if (*PSetI == VGPRSetID) 1913293838Sdim VgprUsage += PSetI.getWeight(); 1914293838Sdim else if (*PSetI == SGPRSetID) 1915293838Sdim SgprUsage += PSetI.getWeight(); 1916293838Sdim } 1917293838Sdim } 1918293838Sdim} 1919293838Sdim 1920293838Sdimvoid SIScheduleDAGMI::schedule() 1921293838Sdim{ 1922293838Sdim SmallVector<SUnit*, 8> TopRoots, BotRoots; 1923293838Sdim SIScheduleBlockResult Best, Temp; 1924341825Sdim LLVM_DEBUG(dbgs() << "Preparing Scheduling\n"); 1925293838Sdim 1926293838Sdim buildDAGWithRegPressure(); 1927344779Sdim LLVM_DEBUG(dump()); 1928293838Sdim 1929293838Sdim topologicalSort(); 1930293838Sdim findRootsAndBiasEdges(TopRoots, BotRoots); 1931293838Sdim // We reuse several ScheduleDAGMI and ScheduleDAGMILive 1932293838Sdim // functions, but to make them happy we must initialize 1933293838Sdim // the default Scheduler implementation (even if we do not 1934293838Sdim // run it) 1935293838Sdim SchedImpl->initialize(this); 1936293838Sdim initQueues(TopRoots, BotRoots); 1937293838Sdim 1938293838Sdim 
// Fill some stats to help scheduling. 1939293838Sdim 1940293838Sdim SUnitsLinksBackup = SUnits; 1941293838Sdim IsLowLatencySU.clear(); 1942293838Sdim LowLatencyOffset.clear(); 1943293838Sdim IsHighLatencySU.clear(); 1944293838Sdim 1945293838Sdim IsLowLatencySU.resize(SUnits.size(), 0); 1946293838Sdim LowLatencyOffset.resize(SUnits.size(), 0); 1947293838Sdim IsHighLatencySU.resize(SUnits.size(), 0); 1948293838Sdim 1949293838Sdim for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) { 1950293838Sdim SUnit *SU = &SUnits[i]; 1951353358Sdim const MachineOperand *BaseLatOp; 1952309124Sdim int64_t OffLatReg; 1953309124Sdim if (SITII->isLowLatencyInstruction(*SU->getInstr())) { 1954293838Sdim IsLowLatencySU[i] = 1; 1955344779Sdim if (SITII->getMemOperandWithOffset(*SU->getInstr(), BaseLatOp, OffLatReg, 1956344779Sdim TRI)) 1957293838Sdim LowLatencyOffset[i] = OffLatReg; 1958309124Sdim } else if (SITII->isHighLatencyInstruction(*SU->getInstr())) 1959293838Sdim IsHighLatencySU[i] = 1; 1960293838Sdim } 1961293838Sdim 1962293838Sdim SIScheduler Scheduler(this); 1963293838Sdim Best = Scheduler.scheduleVariant(SISchedulerBlockCreatorVariant::LatenciesAlone, 1964293838Sdim SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage); 1965309124Sdim 1966293838Sdim // if VGPR usage is extremely high, try other good performing variants 1967293838Sdim // which could lead to lower VGPR usage 1968293838Sdim if (Best.MaxVGPRUsage > 180) { 1969321369Sdim static const std::pair<SISchedulerBlockCreatorVariant, 1970321369Sdim SISchedulerBlockSchedulerVariant> 1971321369Sdim Variants[] = { 1972293838Sdim { LatenciesAlone, BlockRegUsageLatency }, 1973293838Sdim// { LatenciesAlone, BlockRegUsage }, 1974293838Sdim { LatenciesGrouped, BlockLatencyRegUsage }, 1975293838Sdim// { LatenciesGrouped, BlockRegUsageLatency }, 1976293838Sdim// { LatenciesGrouped, BlockRegUsage }, 1977293838Sdim { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage }, 1978293838Sdim// { LatenciesAlonePlusConsecutive, 
BlockRegUsageLatency }, 1979293838Sdim// { LatenciesAlonePlusConsecutive, BlockRegUsage } 1980293838Sdim }; 1981293838Sdim for (std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant> v : Variants) { 1982293838Sdim Temp = Scheduler.scheduleVariant(v.first, v.second); 1983293838Sdim if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage) 1984293838Sdim Best = Temp; 1985293838Sdim } 1986293838Sdim } 1987293838Sdim // if VGPR usage is still extremely high, we may spill. Try other variants 1988293838Sdim // which are less performing, but that could lead to lower VGPR usage. 1989293838Sdim if (Best.MaxVGPRUsage > 200) { 1990321369Sdim static const std::pair<SISchedulerBlockCreatorVariant, 1991321369Sdim SISchedulerBlockSchedulerVariant> 1992321369Sdim Variants[] = { 1993293838Sdim// { LatenciesAlone, BlockRegUsageLatency }, 1994293838Sdim { LatenciesAlone, BlockRegUsage }, 1995293838Sdim// { LatenciesGrouped, BlockLatencyRegUsage }, 1996293838Sdim { LatenciesGrouped, BlockRegUsageLatency }, 1997293838Sdim { LatenciesGrouped, BlockRegUsage }, 1998293838Sdim// { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage }, 1999293838Sdim { LatenciesAlonePlusConsecutive, BlockRegUsageLatency }, 2000293838Sdim { LatenciesAlonePlusConsecutive, BlockRegUsage } 2001293838Sdim }; 2002293838Sdim for (std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant> v : Variants) { 2003293838Sdim Temp = Scheduler.scheduleVariant(v.first, v.second); 2004293838Sdim if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage) 2005293838Sdim Best = Temp; 2006293838Sdim } 2007293838Sdim } 2008309124Sdim 2009293838Sdim ScheduledSUnits = Best.SUs; 2010293838Sdim ScheduledSUnitsInv.resize(SUnits.size()); 2011293838Sdim 2012293838Sdim for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) { 2013293838Sdim ScheduledSUnitsInv[ScheduledSUnits[i]] = i; 2014293838Sdim } 2015293838Sdim 2016293838Sdim moveLowLatencies(); 2017293838Sdim 2018293838Sdim // Tell the outside world about the result of 
the scheduling. 2019293838Sdim 2020293838Sdim assert(TopRPTracker.getPos() == RegionBegin && "bad initial Top tracker"); 2021293838Sdim TopRPTracker.setPos(CurrentTop); 2022293838Sdim 2023293838Sdim for (std::vector<unsigned>::iterator I = ScheduledSUnits.begin(), 2024293838Sdim E = ScheduledSUnits.end(); I != E; ++I) { 2025293838Sdim SUnit *SU = &SUnits[*I]; 2026293838Sdim 2027293838Sdim scheduleMI(SU, true); 2028293838Sdim 2029341825Sdim LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") " 2030341825Sdim << *SU->getInstr()); 2031293838Sdim } 2032293838Sdim 2033293838Sdim assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone."); 2034293838Sdim 2035293838Sdim placeDebugValues(); 2036293838Sdim 2037341825Sdim LLVM_DEBUG({ 2038327952Sdim dbgs() << "*** Final schedule for " 2039327952Sdim << printMBBReference(*begin()->getParent()) << " ***\n"; 2040327952Sdim dumpSchedule(); 2041327952Sdim dbgs() << '\n'; 2042327952Sdim }); 2043293838Sdim} 2044