1//===-- GCNRegBankReassign.cpp - Reassign registers after regalloc --------===// 2// 3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4// See https://llvm.org/LICENSE.txt for license information. 5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6// 7//===----------------------------------------------------------------------===// 8// 9/// \file 10/// \brief Try to reassign registers on GFX10+ to reduce register bank 11/// conflicts. 12/// 13/// On GFX10 registers are organized in banks. VGPRs have 4 banks assigned in 14/// a round-robin fashion: v0, v4, v8... belong to bank 0. v1, v5, v9... to 15/// bank 1, etc. SGPRs have 8 banks and allocated in pairs, so that s0:s1, 16/// s16:s17, s32:s33 are at bank 0. s2:s3, s18:s19, s34:s35 are at bank 1 etc. 17/// 18/// The shader can read one dword from each of these banks once per cycle. 19/// If an instruction has to read more register operands from the same bank 20/// an additional cycle is needed. HW attempts to pre-load registers through 21/// input operand gathering, but a stall cycle may occur if that fails. For 22/// example V_FMA_F32 V111 = V0 + V4 * V8 will need 3 cycles to read operands, 23/// potentially incuring 2 stall cycles. 24/// 25/// The pass tries to reassign registers to reduce bank conflicts. 26/// 27/// In this pass bank numbers 0-3 are VGPR banks and 4-11 are SGPR banks, so 28/// that 4 has to be subtracted from an SGPR bank number to get the real value. 29/// This also corresponds to bit numbers in bank masks used in the pass. 30/// 31//===----------------------------------------------------------------------===// 32 33#include "AMDGPU.h" 34#include "AMDGPUSubtarget.h" 35#include "MCTargetDesc/AMDGPUMCTargetDesc.h" 36#include "SIInstrInfo.h" 37#include "SIMachineFunctionInfo.h" 38#include "llvm/ADT/SmallSet.h" 39#include "llvm/ADT/Statistic.h" 40#include "llvm/CodeGen/LiveInterval.h" 41#include "llvm/CodeGen/LiveIntervals.h" 42#include "llvm/CodeGen/LiveRegMatrix.h" 43#include "llvm/CodeGen/MachineFunctionPass.h" 44#include "llvm/CodeGen/MachineLoopInfo.h" 45#include "llvm/CodeGen/VirtRegMap.h" 46#include "llvm/InitializePasses.h" 47#include "llvm/Support/MathExtras.h" 48 49using namespace llvm; 50 51static cl::opt<unsigned> VerifyStallCycles("amdgpu-verify-regbanks-reassign", 52 cl::desc("Verify stall cycles in the regbanks reassign pass"), 53 cl::value_desc("0|1|2"), 54 cl::init(0), cl::Hidden); 55 56#define DEBUG_TYPE "amdgpu-regbanks-reassign" 57 58#define NUM_VGPR_BANKS 4 59#define NUM_SGPR_BANKS 8 60#define NUM_BANKS (NUM_VGPR_BANKS + NUM_SGPR_BANKS) 61#define SGPR_BANK_OFFSET NUM_VGPR_BANKS 62#define VGPR_BANK_MASK 0xf 63#define SGPR_BANK_MASK 0xff0 64#define SGPR_BANK_SHIFTED_MASK (SGPR_BANK_MASK >> SGPR_BANK_OFFSET) 65 66STATISTIC(NumStallsDetected, 67 "Number of operand read stalls detected"); 68STATISTIC(NumStallsRecovered, 69 "Number of operand read stalls recovered"); 70 71namespace { 72 73class GCNRegBankReassign : public MachineFunctionPass { 74 75 class OperandMask { 76 public: 77 OperandMask(unsigned r, unsigned s, unsigned m) 78 : Reg(r), SubReg(s), Mask(m) {} 79 unsigned Reg; 80 unsigned SubReg; 81 unsigned Mask; 82 }; 83 84 class Candidate { 85 public: 86 Candidate(MachineInstr *mi, unsigned reg, unsigned freebanks, 87 unsigned weight) 88 : MI(mi), Reg(reg), FreeBanks(freebanks), Weight(weight) {} 89 90 bool operator< (const Candidate& RHS) const { return Weight < RHS.Weight; } 91 92#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 93 void dump(const GCNRegBankReassign *P) const { 94 MI->dump(); 95 dbgs() << P->printReg(Reg) << " to banks "; 96 dumpFreeBanks(FreeBanks); 97 dbgs() << " weight " << Weight << '\n'; 98 } 99#endif 100 101 MachineInstr *MI; 102 unsigned Reg; 103 unsigned FreeBanks; 104 unsigned Weight; 105 }; 106 107 class CandidateList : public std::list<Candidate> { 108 public: 109 // Speedup subsequent sort. 110 void push(const Candidate&& C) { 111 if (C.Weight) push_back(C); 112 else push_front(C); 113 } 114 }; 115 116public: 117 static char ID; 118 119public: 120 GCNRegBankReassign() : MachineFunctionPass(ID) { 121 initializeGCNRegBankReassignPass(*PassRegistry::getPassRegistry()); 122 } 123 124 bool runOnMachineFunction(MachineFunction &MF) override; 125 126 StringRef getPassName() const override { return "GCN RegBank Reassign"; } 127 128 void getAnalysisUsage(AnalysisUsage &AU) const override { 129 AU.addRequired<MachineLoopInfo>(); 130 AU.addRequired<LiveIntervals>(); 131 AU.addRequired<VirtRegMap>(); 132 AU.addRequired<LiveRegMatrix>(); 133 AU.setPreservesAll(); 134 MachineFunctionPass::getAnalysisUsage(AU); 135 } 136 137private: 138 const GCNSubtarget *ST; 139 140 const MachineRegisterInfo *MRI; 141 142 const SIRegisterInfo *TRI; 143 144 MachineLoopInfo *MLI; 145 146 VirtRegMap *VRM; 147 148 LiveRegMatrix *LRM; 149 150 LiveIntervals *LIS; 151 152 unsigned MaxNumVGPRs; 153 154 unsigned MaxNumSGPRs; 155 156 BitVector RegsUsed; 157 158 SmallVector<OperandMask, 8> OperandMasks; 159 160 CandidateList Candidates; 161 162 const MCPhysReg *CSRegs; 163 164 // Returns bank for a phys reg. 165 unsigned getPhysRegBank(unsigned Reg) const; 166 167 // Return a bit set for each register bank used. 4 banks for VGPRs and 168 // 8 banks for SGPRs. 169 // Registers already processed and recorded in RegsUsed are excluded. 170 // If Bank is not -1 assume Reg:SubReg to belong to that Bank. 171 uint32_t getRegBankMask(unsigned Reg, unsigned SubReg, int Bank); 172 173 // Analyze one instruction returning the number of stalls and a mask of the 174 // banks used by all operands. 175 // If Reg and Bank are provided, assume all uses of Reg will be replaced with 176 // a register chosen from Bank. 177 std::pair<unsigned, unsigned> analyzeInst(const MachineInstr &MI, 178 unsigned Reg = AMDGPU::NoRegister, 179 int Bank = -1); 180 181 // Return true if register is regular VGPR or SGPR or their tuples. 182 // Returns false for special registers like m0, vcc etc. 183 bool isReassignable(unsigned Reg) const; 184 185 // Check if registers' defs are old and may be pre-loaded. 186 // Returns 0 if both registers are old enough, 1 or 2 if one or both 187 // registers will not likely be pre-loaded. 188 unsigned getOperandGatherWeight(const MachineInstr& MI, 189 unsigned Reg1, 190 unsigned Reg2, 191 unsigned StallCycles) const; 192 193 194 // Find all bank bits in UsedBanks where Mask can be relocated to. 195 unsigned getFreeBanks(unsigned Mask, unsigned UsedBanks) const; 196 197 // Find all bank bits in UsedBanks where Mask can be relocated to. 198 // Bank is relative to the register and not its subregister component. 199 // Returns 0 is a register is not reassignable. 200 unsigned getFreeBanks(unsigned Reg, unsigned SubReg, unsigned Mask, 201 unsigned UsedBanks) const; 202 203 // Add cadidate instruction to the work list. 204 void collectCandidates(MachineInstr& MI, unsigned UsedBanks, 205 unsigned StallCycles); 206 207 // Collect cadidate instructions across function. Returns a number stall 208 // cycles detected. Only counts stalls if Collect is false. 209 unsigned collectCandidates(MachineFunction &MF, bool Collect = true); 210 211 // Remove all candidates that read specified register. 212 void removeCandidates(unsigned Reg); 213 214 // Compute stalls within the uses of SrcReg replaced by a register from 215 // Bank. If Bank is -1 does not perform substitution. If Collect is set 216 // candidates are collected and added to work list. 217 unsigned computeStallCycles(unsigned SrcReg, 218 unsigned Reg = AMDGPU::NoRegister, 219 int Bank = -1, bool Collect = false); 220 221 // Search for a register in Bank unused within LI. 222 // Returns phys reg or NoRegister. 223 unsigned scavengeReg(LiveInterval& LI, unsigned Bank) const; 224 225 // Try to reassign candidate. Returns number or stall cycles saved. 226 unsigned tryReassign(Candidate &C); 227 228 bool verifyCycles(MachineFunction &MF, 229 unsigned OriginalCycles, unsigned CyclesSaved); 230 231 232#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 233public: 234 Printable printReg(unsigned Reg, unsigned SubReg = 0) const { 235 return Printable([Reg, SubReg, this](raw_ostream &OS) { 236 if (Register::isPhysicalRegister(Reg)) { 237 OS << llvm::printReg(Reg, TRI); 238 return; 239 } 240 if (!VRM->isAssignedReg(Reg)) 241 OS << "<unassigned> " << llvm::printReg(Reg, TRI); 242 else 243 OS << llvm::printReg(Reg, TRI) << '(' 244 << llvm::printReg(VRM->getPhys(Reg), TRI) << ')'; 245 if (SubReg) 246 OS << ':' << TRI->getSubRegIndexName(SubReg); 247 }); 248 } 249 250 static Printable printBank(unsigned Bank) { 251 return Printable([Bank](raw_ostream &OS) { 252 OS << ((Bank >= SGPR_BANK_OFFSET) ? Bank - SGPR_BANK_OFFSET : Bank); 253 }); 254 } 255 256 static void dumpFreeBanks(unsigned FreeBanks) { 257 for (unsigned L = 0; L < NUM_BANKS; ++L) 258 if (FreeBanks & (1 << L)) 259 dbgs() << printBank(L) << ' '; 260 } 261#endif 262}; 263 264} // End anonymous namespace. 265 266INITIALIZE_PASS_BEGIN(GCNRegBankReassign, DEBUG_TYPE, "GCN RegBank Reassign", 267 false, false) 268INITIALIZE_PASS_DEPENDENCY(LiveIntervals) 269INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) 270INITIALIZE_PASS_DEPENDENCY(VirtRegMap) 271INITIALIZE_PASS_DEPENDENCY(LiveRegMatrix) 272INITIALIZE_PASS_END(GCNRegBankReassign, DEBUG_TYPE, "GCN RegBank Reassign", 273 false, false) 274 275 276char GCNRegBankReassign::ID = 0; 277 278char &llvm::GCNRegBankReassignID = GCNRegBankReassign::ID; 279 280unsigned GCNRegBankReassign::getPhysRegBank(unsigned Reg) const { 281 assert(Register::isPhysicalRegister(Reg)); 282 283 const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); 284 unsigned Size = TRI->getRegSizeInBits(*RC); 285 if (Size == 16) 286 Reg = TRI->get32BitRegister(Reg); 287 else if (Size > 32) 288 Reg = TRI->getSubReg(Reg, AMDGPU::sub0); 289 290 if (TRI->hasVGPRs(RC)) { 291 Reg -= AMDGPU::VGPR0; 292 return Reg % NUM_VGPR_BANKS; 293 } 294 295 Reg = TRI->getEncodingValue(Reg) / 2; 296 return Reg % NUM_SGPR_BANKS + SGPR_BANK_OFFSET; 297} 298 299uint32_t GCNRegBankReassign::getRegBankMask(unsigned Reg, unsigned SubReg, 300 int Bank) { 301 if (Register::isVirtualRegister(Reg)) { 302 if (!VRM->isAssignedReg(Reg)) 303 return 0; 304 305 Reg = VRM->getPhys(Reg); 306 if (!Reg) 307 return 0; 308 if (SubReg) 309 Reg = TRI->getSubReg(Reg, SubReg); 310 } 311 312 const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); 313 unsigned Size = TRI->getRegSizeInBits(*RC); 314 315 if (Size == 16) { 316 Reg = TRI->get32BitRegister(Reg); 317 Size = 1; 318 } else { 319 Size /= 32; 320 if (Size > 1) 321 Reg = TRI->getSubReg(Reg, AMDGPU::sub0); 322 } 323 324 if (TRI->hasVGPRs(RC)) { 325 // VGPRs have 4 banks assigned in a round-robin fashion. 326 Reg -= AMDGPU::VGPR0; 327 uint32_t Mask = maskTrailingOnes<uint32_t>(Size); 328 unsigned Used = 0; 329 // Bitmask lacks an extract method 330 for (unsigned I = 0; I < Size; ++I) 331 if (RegsUsed.test(Reg + I)) 332 Used |= 1 << I; 333 RegsUsed.set(Reg, Reg + Size); 334 Mask &= ~Used; 335 Mask <<= (Bank == -1) ? Reg % NUM_VGPR_BANKS : uint32_t(Bank); 336 return (Mask | (Mask >> NUM_VGPR_BANKS)) & VGPR_BANK_MASK; 337 } 338 339 // SGPRs have 8 banks holding 2 consequitive registers each. 340 Reg = TRI->getEncodingValue(Reg) / 2; 341 unsigned StartBit = AMDGPU::VGPR_32RegClass.getNumRegs(); 342 if (Reg + StartBit >= RegsUsed.size()) 343 return 0; 344 345 if (Size > 1) 346 Size /= 2; 347 unsigned Mask = (1 << Size) - 1; 348 unsigned Used = 0; 349 for (unsigned I = 0; I < Size; ++I) 350 if (RegsUsed.test(StartBit + Reg + I)) 351 Used |= 1 << I; 352 RegsUsed.set(StartBit + Reg, StartBit + Reg + Size); 353 Mask &= ~Used; 354 Mask <<= (Bank == -1) ? Reg % NUM_SGPR_BANKS 355 : unsigned(Bank - SGPR_BANK_OFFSET); 356 Mask = (Mask | (Mask >> NUM_SGPR_BANKS)) & SGPR_BANK_SHIFTED_MASK; 357 // Reserve 4 bank ids for VGPRs. 358 return Mask << SGPR_BANK_OFFSET; 359} 360 361std::pair<unsigned, unsigned> 362GCNRegBankReassign::analyzeInst(const MachineInstr &MI, unsigned Reg, 363 int Bank) { 364 unsigned StallCycles = 0; 365 unsigned UsedBanks = 0; 366 367 if (MI.isDebugValue()) 368 return std::make_pair(StallCycles, UsedBanks); 369 370 RegsUsed.reset(); 371 OperandMasks.clear(); 372 for (const auto& Op : MI.explicit_uses()) { 373 // Undef can be assigned to any register, so two vregs can be assigned 374 // the same phys reg within the same instruction. 375 if (!Op.isReg() || Op.isUndef()) 376 continue; 377 378 Register R = Op.getReg(); 379 if (TRI->hasAGPRs(TRI->getRegClassForReg(*MRI, R))) 380 continue; 381 382 unsigned ShiftedBank = Bank; 383 384 if (Bank != -1 && R == Reg && Op.getSubReg()) { 385 unsigned Offset = TRI->getChannelFromSubReg(Op.getSubReg()); 386 LaneBitmask LM = TRI->getSubRegIndexLaneMask(Op.getSubReg()); 387 if (Offset && Bank < NUM_VGPR_BANKS) { 388 // If a register spans all banks we cannot shift it to avoid conflict. 389 if (TRI->getNumCoveredRegs(LM) >= NUM_VGPR_BANKS) 390 continue; 391 ShiftedBank = (Bank + Offset) % NUM_VGPR_BANKS; 392 } else if (Offset > 1 && Bank >= SGPR_BANK_OFFSET) { 393 // If a register spans all banks we cannot shift it to avoid conflict. 394 if (TRI->getNumCoveredRegs(LM) / 2 >= NUM_SGPR_BANKS) 395 continue; 396 ShiftedBank = SGPR_BANK_OFFSET + 397 (Bank - SGPR_BANK_OFFSET + (Offset >> 1)) % NUM_SGPR_BANKS; 398 } 399 } 400 401 uint32_t Mask = getRegBankMask(R, Op.getSubReg(), 402 (Reg == R) ? ShiftedBank : -1); 403 StallCycles += countPopulation(UsedBanks & Mask); 404 UsedBanks |= Mask; 405 OperandMasks.push_back(OperandMask(Op.getReg(), Op.getSubReg(), Mask)); 406 } 407 408 return std::make_pair(StallCycles, UsedBanks); 409} 410 411unsigned GCNRegBankReassign::getOperandGatherWeight(const MachineInstr& MI, 412 unsigned Reg1, 413 unsigned Reg2, 414 unsigned StallCycles) const 415{ 416 unsigned Defs = 0; 417 MachineBasicBlock::const_instr_iterator Def(MI.getIterator()); 418 MachineBasicBlock::const_instr_iterator B(MI.getParent()->instr_begin()); 419 for (unsigned S = StallCycles; S && Def != B && Defs != 3; --S) { 420 if (MI.isDebugInstr()) 421 continue; 422 --Def; 423 if (Def->getOpcode() == TargetOpcode::IMPLICIT_DEF) 424 continue; 425 if (Def->modifiesRegister(Reg1, TRI)) 426 Defs |= 1; 427 if (Def->modifiesRegister(Reg2, TRI)) 428 Defs |= 2; 429 } 430 return countPopulation(Defs); 431} 432 433bool GCNRegBankReassign::isReassignable(unsigned Reg) const { 434 if (Register::isPhysicalRegister(Reg) || !VRM->isAssignedReg(Reg)) 435 return false; 436 437 const MachineInstr *Def = MRI->getUniqueVRegDef(Reg); 438 439 Register PhysReg = VRM->getPhys(Reg); 440 441 if (Def && Def->isCopy() && Def->getOperand(1).getReg() == PhysReg) 442 return false; 443 444 for (auto U : MRI->use_nodbg_operands(Reg)) { 445 if (U.isImplicit()) 446 return false; 447 const MachineInstr *UseInst = U.getParent(); 448 if (UseInst->isCopy() && UseInst->getOperand(0).getReg() == PhysReg) 449 return false; 450 } 451 452 const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(PhysReg); 453 unsigned Size = TRI->getRegSizeInBits(*RC); 454 455 // TODO: Support 16 bit registers. Those needs to be moved with their 456 // parent VGPR_32 and potentially a sibling 16 bit sub-register. 457 if (Size < 32) 458 return false; 459 460 if (TRI->hasVGPRs(RC)) 461 return true; 462 463 if (Size == 16) 464 return AMDGPU::SGPR_LO16RegClass.contains(PhysReg); 465 466 if (Size > 32) 467 PhysReg = TRI->getSubReg(PhysReg, AMDGPU::sub0); 468 469 return AMDGPU::SGPR_32RegClass.contains(PhysReg); 470} 471 472unsigned GCNRegBankReassign::getFreeBanks(unsigned Mask, 473 unsigned UsedBanks) const { 474 unsigned Size = countPopulation(Mask); 475 unsigned FreeBanks = 0; 476 unsigned Bank = findFirstSet(Mask); 477 478 UsedBanks &= ~Mask; 479 480 // Find free VGPR banks 481 if ((Mask & VGPR_BANK_MASK) && (Size < NUM_VGPR_BANKS)) { 482 for (unsigned I = 0; I < NUM_VGPR_BANKS; ++I) { 483 if (Bank == I) 484 continue; 485 unsigned NewMask = ((1 << Size) - 1) << I; 486 NewMask = (NewMask | (NewMask >> NUM_VGPR_BANKS)) & VGPR_BANK_MASK; 487 if (!(UsedBanks & NewMask)) 488 FreeBanks |= 1 << I; 489 } 490 return FreeBanks; 491 } 492 493 // Find free SGPR banks 494 // SGPR tuples must be aligned, so step is size in banks it 495 // crosses. 496 Bank -= SGPR_BANK_OFFSET; 497 for (unsigned I = 0; I < NUM_SGPR_BANKS; I += Size) { 498 if (Bank == I) 499 continue; 500 unsigned NewMask = ((1 << Size) - 1) << I; 501 NewMask = (NewMask | (NewMask >> NUM_SGPR_BANKS)) & SGPR_BANK_SHIFTED_MASK; 502 if (!(UsedBanks & (NewMask << SGPR_BANK_OFFSET))) 503 FreeBanks |= (1 << SGPR_BANK_OFFSET) << I; 504 } 505 506 return FreeBanks; 507} 508 509unsigned GCNRegBankReassign::getFreeBanks(unsigned Reg, 510 unsigned SubReg, 511 unsigned Mask, 512 unsigned UsedBanks) const { 513 if (!isReassignable(Reg)) 514 return 0; 515 516 unsigned FreeBanks = getFreeBanks(Mask, UsedBanks); 517 518 unsigned Offset = TRI->getChannelFromSubReg(SubReg); 519 if (Offset && (Mask & VGPR_BANK_MASK)) { 520 unsigned Shift = Offset; 521 if (Shift >= NUM_VGPR_BANKS) 522 return 0; 523 unsigned VB = FreeBanks & VGPR_BANK_MASK; 524 FreeBanks = ((VB >> Shift) | (VB << (NUM_VGPR_BANKS - Shift))) & 525 VGPR_BANK_MASK; 526 } else if (Offset > 1 && (Mask & SGPR_BANK_MASK)) { 527 unsigned Shift = Offset >> 1; 528 if (Shift >= NUM_SGPR_BANKS) 529 return 0; 530 unsigned SB = FreeBanks >> SGPR_BANK_OFFSET; 531 FreeBanks = ((SB >> Shift) | (SB << (NUM_SGPR_BANKS - Shift))) & 532 SGPR_BANK_SHIFTED_MASK; 533 FreeBanks <<= SGPR_BANK_OFFSET; 534 } 535 536 LLVM_DEBUG(if (FreeBanks) { 537 dbgs() << "Potential reassignments of " << printReg(Reg, SubReg) 538 << " to banks: "; dumpFreeBanks(FreeBanks); 539 dbgs() << '\n'; }); 540 541 return FreeBanks; 542} 543 544void GCNRegBankReassign::collectCandidates(MachineInstr& MI, 545 unsigned UsedBanks, 546 unsigned StallCycles) { 547 LLVM_DEBUG(MI.dump()); 548 549 if (!StallCycles) 550 return; 551 552 LLVM_DEBUG(dbgs() << "Stall cycles = " << StallCycles << '\n'); 553 554 for (unsigned I = 0, E = OperandMasks.size(); I + 1 < E; ++I) { 555 for (unsigned J = I + 1; J != E; ++J) { 556 if (!(OperandMasks[I].Mask & OperandMasks[J].Mask)) 557 continue; 558 559 unsigned Reg1 = OperandMasks[I].Reg; 560 unsigned Reg2 = OperandMasks[J].Reg; 561 unsigned SubReg1 = OperandMasks[I].SubReg; 562 unsigned SubReg2 = OperandMasks[J].SubReg; 563 unsigned Mask1 = OperandMasks[I].Mask; 564 unsigned Mask2 = OperandMasks[J].Mask; 565 unsigned Size1 = countPopulation(Mask1); 566 unsigned Size2 = countPopulation(Mask2); 567 568 LLVM_DEBUG(dbgs() << "Conflicting operands: " << printReg(Reg1, SubReg1) << 569 " and " << printReg(Reg2, SubReg2) << '\n'); 570 571 unsigned Weight = getOperandGatherWeight(MI, Reg1, Reg2, StallCycles); 572 Weight += MLI->getLoopDepth(MI.getParent()) * 10; 573 574 LLVM_DEBUG(dbgs() << "Stall weight = " << Weight << '\n'); 575 576 unsigned FreeBanks1 = getFreeBanks(Reg1, SubReg1, Mask1, UsedBanks); 577 unsigned FreeBanks2 = getFreeBanks(Reg2, SubReg2, Mask2, UsedBanks); 578 if (FreeBanks1) 579 Candidates.push(Candidate(&MI, Reg1, FreeBanks1, Weight 580 + ((Size2 > Size1) ? 1 : 0))); 581 if (FreeBanks2) 582 Candidates.push(Candidate(&MI, Reg2, FreeBanks2, Weight 583 + ((Size1 > Size2) ? 1 : 0))); 584 } 585 } 586} 587 588unsigned GCNRegBankReassign::computeStallCycles(unsigned SrcReg, 589 unsigned Reg, int Bank, 590 bool Collect) { 591 unsigned TotalStallCycles = 0; 592 SmallSet<const MachineInstr *, 16> Visited; 593 594 for (auto &MI : MRI->use_nodbg_instructions(SrcReg)) { 595 if (MI.isBundle()) 596 continue; 597 if (!Visited.insert(&MI).second) 598 continue; 599 unsigned StallCycles; 600 unsigned UsedBanks; 601 std::tie(StallCycles, UsedBanks) = analyzeInst(MI, Reg, Bank); 602 TotalStallCycles += StallCycles; 603 if (Collect) 604 collectCandidates(MI, UsedBanks, StallCycles); 605 } 606 607 return TotalStallCycles; 608} 609 610unsigned GCNRegBankReassign::scavengeReg(LiveInterval& LI, 611 unsigned Bank) const { 612 const TargetRegisterClass *RC = MRI->getRegClass(LI.reg); 613 unsigned MaxNumRegs = (Bank < NUM_VGPR_BANKS) ? MaxNumVGPRs 614 : MaxNumSGPRs; 615 unsigned MaxReg = MaxNumRegs + (Bank < NUM_VGPR_BANKS ? AMDGPU::VGPR0 616 : AMDGPU::SGPR0); 617 618 for (unsigned Reg : RC->getRegisters()) { 619 // Check occupancy limit. 620 if (TRI->isSubRegisterEq(Reg, MaxReg)) 621 break; 622 623 if (!MRI->isAllocatable(Reg) || getPhysRegBank(Reg) != Bank) 624 continue; 625 626 for (unsigned I = 0; CSRegs[I]; ++I) 627 if (TRI->isSubRegisterEq(Reg, CSRegs[I]) && 628 !LRM->isPhysRegUsed(CSRegs[I])) 629 return AMDGPU::NoRegister; 630 631 LLVM_DEBUG(dbgs() << "Trying register " << printReg(Reg) << '\n'); 632 633 if (!LRM->checkInterference(LI, Reg)) 634 return Reg; 635 } 636 637 return AMDGPU::NoRegister; 638} 639 640unsigned GCNRegBankReassign::tryReassign(Candidate &C) { 641 if (!LIS->hasInterval(C.Reg)) 642 return 0; 643 644 LiveInterval &LI = LIS->getInterval(C.Reg); 645 LLVM_DEBUG(dbgs() << "Try reassign " << printReg(C.Reg) << " in "; C.MI->dump(); 646 LI.dump()); 647 648 // For each candidate bank walk all instructions in the range of live 649 // interval and check if replacing the register with one belonging to 650 // the candidate bank reduces conflicts. 651 652 unsigned OrigStalls = computeStallCycles(C.Reg); 653 LLVM_DEBUG(dbgs() << "--- Stall cycles in range = " << OrigStalls << '\n'); 654 if (!OrigStalls) 655 return 0; 656 657 struct BankStall { 658 BankStall(unsigned b, unsigned s) : Bank(b), Stalls(s) {}; 659 bool operator<(const BankStall &RHS) const { 660 if (Stalls == RHS.Stalls) 661 return Bank < RHS.Bank; 662 return Stalls > RHS.Stalls; 663 } 664 unsigned Bank; 665 unsigned Stalls; 666 }; 667 SmallVector<BankStall, 8> BankStalls; 668 669 for (int Bank = 0; Bank < NUM_BANKS; ++Bank) { 670 if (C.FreeBanks & (1 << Bank)) { 671 LLVM_DEBUG(dbgs() << "Trying bank " << printBank(Bank) << '\n'); 672 unsigned Stalls = computeStallCycles(C.Reg, C.Reg, Bank); 673 if (Stalls < OrigStalls) { 674 LLVM_DEBUG(dbgs() << "With bank " << printBank(Bank) << " -> " 675 << Stalls << '\n'); 676 BankStalls.push_back(BankStall((unsigned)Bank, Stalls)); 677 } 678 } 679 } 680 llvm::sort(BankStalls); 681 682 Register OrigReg = VRM->getPhys(C.Reg); 683 LRM->unassign(LI); 684 while (!BankStalls.empty()) { 685 BankStall BS = BankStalls.pop_back_val(); 686 unsigned Reg = scavengeReg(LI, BS.Bank); 687 if (Reg == AMDGPU::NoRegister) { 688 LLVM_DEBUG(dbgs() << "No free registers in bank " << printBank(BS.Bank) 689 << '\n'); 690 continue; 691 } 692 LLVM_DEBUG(dbgs() << "Found free register " << printReg(Reg) 693 << (LRM->isPhysRegUsed(Reg) ? "" : " (new)") 694 << " in bank " << printBank(BS.Bank) << '\n'); 695 696 LRM->assign(LI, Reg); 697 698 LLVM_DEBUG(dbgs() << "--- Cycles saved: " << OrigStalls - BS.Stalls << '\n'); 699 700 return OrigStalls - BS.Stalls; 701 } 702 LRM->assign(LI, OrigReg); 703 704 return 0; 705} 706 707unsigned GCNRegBankReassign::collectCandidates(MachineFunction &MF, 708 bool Collect) { 709 unsigned TotalStallCycles = 0; 710 711 for (MachineBasicBlock &MBB : MF) { 712 713 LLVM_DEBUG(if (Collect) { 714 if (MBB.getName().empty()) dbgs() << "bb." << MBB.getNumber(); 715 else dbgs() << MBB.getName(); dbgs() << ":\n"; 716 }); 717 718 for (MachineInstr &MI : MBB.instrs()) { 719 if (MI.isBundle()) 720 continue; // we analyze the instructions inside the bundle individually 721 722 unsigned StallCycles; 723 unsigned UsedBanks; 724 std::tie(StallCycles, UsedBanks) = analyzeInst(MI); 725 726 if (Collect) 727 collectCandidates(MI, UsedBanks, StallCycles); 728 729 TotalStallCycles += StallCycles; 730 } 731 732 LLVM_DEBUG(if (Collect) { dbgs() << '\n'; }); 733 } 734 735 return TotalStallCycles; 736} 737 738void GCNRegBankReassign::removeCandidates(unsigned Reg) { 739 Candidates.remove_if([Reg, this](const Candidate& C) { 740 return C.MI->readsRegister(Reg, TRI); 741 }); 742} 743 744bool GCNRegBankReassign::verifyCycles(MachineFunction &MF, 745 unsigned OriginalCycles, 746 unsigned CyclesSaved) { 747 unsigned StallCycles = collectCandidates(MF, false); 748 LLVM_DEBUG(dbgs() << "=== After the pass " << StallCycles 749 << " stall cycles left\n"); 750 return StallCycles + CyclesSaved == OriginalCycles; 751} 752 753bool GCNRegBankReassign::runOnMachineFunction(MachineFunction &MF) { 754 ST = &MF.getSubtarget<GCNSubtarget>(); 755 if (!ST->hasRegisterBanking() || skipFunction(MF.getFunction())) 756 return false; 757 758 MRI = &MF.getRegInfo(); 759 TRI = ST->getRegisterInfo(); 760 MLI = &getAnalysis<MachineLoopInfo>(); 761 VRM = &getAnalysis<VirtRegMap>(); 762 LRM = &getAnalysis<LiveRegMatrix>(); 763 LIS = &getAnalysis<LiveIntervals>(); 764 765 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 766 unsigned Occupancy = MFI->getOccupancy(); 767 MaxNumVGPRs = ST->getMaxNumVGPRs(MF); 768 MaxNumSGPRs = ST->getMaxNumSGPRs(MF); 769 MaxNumVGPRs = std::min(ST->getMaxNumVGPRs(Occupancy), MaxNumVGPRs); 770 MaxNumSGPRs = std::min(ST->getMaxNumSGPRs(Occupancy, true), MaxNumSGPRs); 771 772 CSRegs = MRI->getCalleeSavedRegs(); 773 774 RegsUsed.resize(AMDGPU::VGPR_32RegClass.getNumRegs() + 775 TRI->getEncodingValue(AMDGPU::SGPR_NULL) / 2 + 1); 776 777 LLVM_DEBUG(dbgs() << "=== RegBanks reassign analysis on function " << MF.getName() 778 << '\n'); 779 780 unsigned StallCycles = collectCandidates(MF); 781 NumStallsDetected += StallCycles; 782 783 LLVM_DEBUG(dbgs() << "=== " << StallCycles << " stall cycles detected in " 784 "function " << MF.getName() << '\n'); 785 786 Candidates.sort(); 787 788 LLVM_DEBUG(dbgs() << "\nCandidates:\n\n"; 789 for (auto C : Candidates) C.dump(this); 790 dbgs() << "\n\n"); 791 792 unsigned CyclesSaved = 0; 793 while (!Candidates.empty()) { 794 Candidate C = Candidates.back(); 795 unsigned LocalCyclesSaved = tryReassign(C); 796 CyclesSaved += LocalCyclesSaved; 797 798 if (VerifyStallCycles > 1 && !verifyCycles(MF, StallCycles, CyclesSaved)) 799 report_fatal_error("RegBank reassign stall cycles verification failed."); 800 801 Candidates.pop_back(); 802 if (LocalCyclesSaved) { 803 removeCandidates(C.Reg); 804 computeStallCycles(C.Reg, AMDGPU::NoRegister, -1, true); 805 Candidates.sort(); 806 807 LLVM_DEBUG(dbgs() << "\nCandidates:\n\n"; 808 for (auto C : Candidates) 809 C.dump(this); 810 dbgs() << "\n\n"); 811 } 812 } 813 NumStallsRecovered += CyclesSaved; 814 815 LLVM_DEBUG(dbgs() << "=== After the pass " << CyclesSaved 816 << " cycles saved in function " << MF.getName() << '\n'); 817 818 Candidates.clear(); 819 820 if (VerifyStallCycles == 1 && !verifyCycles(MF, StallCycles, CyclesSaved)) 821 report_fatal_error("RegBank reassign stall cycles verification failed."); 822 823 RegsUsed.clear(); 824 825 return CyclesSaved > 0; 826} 827