1//===-- GCNRegBankReassign.cpp - Reassign registers after regalloc --------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// \brief Try to reassign registers on GFX10+ to reduce register bank
11/// conflicts.
12///
13/// On GFX10 registers are organized in banks. VGPRs have 4 banks assigned in
14/// a round-robin fashion: v0, v4, v8... belong to bank 0. v1, v5, v9... to
15/// bank 1, etc. SGPRs have 8 banks and allocated in pairs, so that s0:s1,
16/// s16:s17, s32:s33 are at bank 0. s2:s3, s18:s19, s34:s35 are at bank 1 etc.
17///
18/// The shader can read one dword from each of these banks once per cycle.
19/// If an instruction has to read more register operands from the same bank
20/// an additional cycle is needed. HW attempts to pre-load registers through
21/// input operand gathering, but a stall cycle may occur if that fails. For
22/// example V_FMA_F32 V111 = V0 + V4 * V8 will need 3 cycles to read operands,
23/// potentially incuring 2 stall cycles.
24///
25/// The pass tries to reassign registers to reduce bank conflicts.
26///
27/// In this pass bank numbers 0-3 are VGPR banks and 4-11 are SGPR banks, so
28/// that 4 has to be subtracted from an SGPR bank number to get the real value.
29/// This also corresponds to bit numbers in bank masks used in the pass.
30///
31//===----------------------------------------------------------------------===//
32
33#include "AMDGPU.h"
34#include "AMDGPUSubtarget.h"
35#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
36#include "SIInstrInfo.h"
37#include "SIMachineFunctionInfo.h"
38#include "llvm/ADT/SmallSet.h"
39#include "llvm/ADT/Statistic.h"
40#include "llvm/CodeGen/LiveInterval.h"
41#include "llvm/CodeGen/LiveIntervals.h"
42#include "llvm/CodeGen/LiveRegMatrix.h"
43#include "llvm/CodeGen/MachineFunctionPass.h"
44#include "llvm/CodeGen/MachineLoopInfo.h"
45#include "llvm/CodeGen/VirtRegMap.h"
46#include "llvm/InitializePasses.h"
47#include "llvm/Support/MathExtras.h"
48
49using namespace llvm;
50
51static cl::opt<unsigned> VerifyStallCycles("amdgpu-verify-regbanks-reassign",
52  cl::desc("Verify stall cycles in the regbanks reassign pass"),
53  cl::value_desc("0|1|2"),
54  cl::init(0), cl::Hidden);
55
56#define DEBUG_TYPE "amdgpu-regbanks-reassign"
57
58#define NUM_VGPR_BANKS 4
59#define NUM_SGPR_BANKS 8
60#define NUM_BANKS (NUM_VGPR_BANKS + NUM_SGPR_BANKS)
61#define SGPR_BANK_OFFSET NUM_VGPR_BANKS
62#define VGPR_BANK_MASK 0xf
63#define SGPR_BANK_MASK 0xff0
64#define SGPR_BANK_SHIFTED_MASK (SGPR_BANK_MASK >> SGPR_BANK_OFFSET)
65
66STATISTIC(NumStallsDetected,
67          "Number of operand read stalls detected");
68STATISTIC(NumStallsRecovered,
69          "Number of operand read stalls recovered");
70
71namespace {
72
73class GCNRegBankReassign : public MachineFunctionPass {
74
75  class OperandMask {
76  public:
77    OperandMask(unsigned r, unsigned s, unsigned m)
78      : Reg(r), SubReg(s), Mask(m) {}
79    unsigned Reg;
80    unsigned SubReg;
81    unsigned Mask;
82  };
83
84  class Candidate {
85  public:
86    Candidate(MachineInstr *mi, unsigned reg, unsigned freebanks,
87              unsigned weight)
88      : MI(mi), Reg(reg), FreeBanks(freebanks), Weight(weight) {}
89
90    bool operator< (const Candidate& RHS) const { return Weight < RHS.Weight; }
91
92#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
93    void dump(const GCNRegBankReassign *P) const {
94      MI->dump();
95      dbgs() << P->printReg(Reg) << " to banks ";
96      dumpFreeBanks(FreeBanks);
97      dbgs() << " weight " << Weight << '\n';
98    }
99#endif
100
101    MachineInstr *MI;
102    unsigned Reg;
103    unsigned FreeBanks;
104    unsigned Weight;
105  };
106
107  class CandidateList : public std::list<Candidate> {
108  public:
109    // Speedup subsequent sort.
110    void push(const Candidate&& C) {
111      if (C.Weight) push_back(C);
112      else push_front(C);
113    }
114  };
115
116public:
117  static char ID;
118
119public:
120  GCNRegBankReassign() : MachineFunctionPass(ID) {
121    initializeGCNRegBankReassignPass(*PassRegistry::getPassRegistry());
122  }
123
124  bool runOnMachineFunction(MachineFunction &MF) override;
125
126  StringRef getPassName() const override { return "GCN RegBank Reassign"; }
127
128  void getAnalysisUsage(AnalysisUsage &AU) const override {
129    AU.addRequired<MachineLoopInfo>();
130    AU.addRequired<LiveIntervals>();
131    AU.addRequired<VirtRegMap>();
132    AU.addRequired<LiveRegMatrix>();
133    AU.setPreservesAll();
134    MachineFunctionPass::getAnalysisUsage(AU);
135  }
136
137private:
138  const GCNSubtarget *ST;
139
140  const MachineRegisterInfo *MRI;
141
142  const SIRegisterInfo *TRI;
143
144  MachineLoopInfo *MLI;
145
146  VirtRegMap *VRM;
147
148  LiveRegMatrix *LRM;
149
150  LiveIntervals *LIS;
151
152  unsigned MaxNumVGPRs;
153
154  unsigned MaxNumSGPRs;
155
156  BitVector RegsUsed;
157
158  SmallVector<OperandMask, 8> OperandMasks;
159
160  CandidateList Candidates;
161
162  const MCPhysReg *CSRegs;
163
164  // Returns bank for a phys reg.
165  unsigned getPhysRegBank(unsigned Reg) const;
166
167  // Return a bit set for each register bank used. 4 banks for VGPRs and
168  // 8 banks for SGPRs.
169  // Registers already processed and recorded in RegsUsed are excluded.
170  // If Bank is not -1 assume Reg:SubReg to belong to that Bank.
171  uint32_t getRegBankMask(unsigned Reg, unsigned SubReg, int Bank);
172
173  // Analyze one instruction returning the number of stalls and a mask of the
174  // banks used by all operands.
175  // If Reg and Bank are provided, assume all uses of Reg will be replaced with
176  // a register chosen from Bank.
177  std::pair<unsigned, unsigned> analyzeInst(const MachineInstr &MI,
178                                            unsigned Reg = AMDGPU::NoRegister,
179                                            int Bank = -1);
180
181  // Return true if register is regular VGPR or SGPR or their tuples.
182  // Returns false for special registers like m0, vcc etc.
183  bool isReassignable(unsigned Reg) const;
184
185  // Check if registers' defs are old and may be pre-loaded.
186  // Returns 0 if both registers are old enough, 1 or 2 if one or both
187  // registers will not likely be pre-loaded.
188  unsigned getOperandGatherWeight(const MachineInstr& MI,
189                                  unsigned Reg1,
190                                  unsigned Reg2,
191                                  unsigned StallCycles) const;
192
193
194  // Find all bank bits in UsedBanks where Mask can be relocated to.
195  unsigned getFreeBanks(unsigned Mask, unsigned UsedBanks) const;
196
197  // Find all bank bits in UsedBanks where Mask can be relocated to.
198  // Bank is relative to the register and not its subregister component.
199  // Returns 0 is a register is not reassignable.
200  unsigned getFreeBanks(unsigned Reg, unsigned SubReg, unsigned Mask,
201                        unsigned UsedBanks) const;
202
203  // Add cadidate instruction to the work list.
204  void collectCandidates(MachineInstr& MI, unsigned UsedBanks,
205                         unsigned StallCycles);
206
207  // Collect cadidate instructions across function. Returns a number stall
208  // cycles detected. Only counts stalls if Collect is false.
209  unsigned collectCandidates(MachineFunction &MF, bool Collect = true);
210
211  // Remove all candidates that read specified register.
212  void removeCandidates(unsigned Reg);
213
214  // Compute stalls within the uses of SrcReg replaced by a register from
215  // Bank. If Bank is -1 does not perform substitution. If Collect is set
216  // candidates are collected and added to work list.
217  unsigned computeStallCycles(unsigned SrcReg,
218                              unsigned Reg = AMDGPU::NoRegister,
219                              int Bank = -1, bool Collect = false);
220
221  // Search for a register in Bank unused within LI.
222  // Returns phys reg or NoRegister.
223  unsigned scavengeReg(LiveInterval& LI, unsigned Bank) const;
224
225  // Try to reassign candidate. Returns number or stall cycles saved.
226  unsigned tryReassign(Candidate &C);
227
228  bool verifyCycles(MachineFunction &MF,
229                    unsigned OriginalCycles, unsigned CyclesSaved);
230
231
232#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
233public:
234  Printable printReg(unsigned Reg, unsigned SubReg = 0) const {
235    return Printable([Reg, SubReg, this](raw_ostream &OS) {
236      if (Register::isPhysicalRegister(Reg)) {
237        OS << llvm::printReg(Reg, TRI);
238        return;
239      }
240      if (!VRM->isAssignedReg(Reg))
241        OS << "<unassigned> " << llvm::printReg(Reg, TRI);
242      else
243        OS << llvm::printReg(Reg, TRI) << '('
244           << llvm::printReg(VRM->getPhys(Reg), TRI) << ')';
245      if (SubReg)
246        OS << ':' << TRI->getSubRegIndexName(SubReg);
247    });
248  }
249
250  static Printable printBank(unsigned Bank) {
251    return Printable([Bank](raw_ostream &OS) {
252      OS << ((Bank >= SGPR_BANK_OFFSET) ? Bank - SGPR_BANK_OFFSET : Bank);
253    });
254  }
255
256  static void dumpFreeBanks(unsigned FreeBanks) {
257    for (unsigned L = 0; L < NUM_BANKS; ++L)
258      if (FreeBanks & (1 << L))
259        dbgs() << printBank(L) << ' ';
260  }
261#endif
262};
263
264} // End anonymous namespace.
265
266INITIALIZE_PASS_BEGIN(GCNRegBankReassign, DEBUG_TYPE, "GCN RegBank Reassign",
267                      false, false)
268INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
269INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
270INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
271INITIALIZE_PASS_DEPENDENCY(LiveRegMatrix)
272INITIALIZE_PASS_END(GCNRegBankReassign, DEBUG_TYPE, "GCN RegBank Reassign",
273                    false, false)
274
275
276char GCNRegBankReassign::ID = 0;
277
278char &llvm::GCNRegBankReassignID = GCNRegBankReassign::ID;
279
280unsigned GCNRegBankReassign::getPhysRegBank(unsigned Reg) const {
281  assert(Register::isPhysicalRegister(Reg));
282
283  const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
284  unsigned Size = TRI->getRegSizeInBits(*RC);
285  if (Size == 16)
286    Reg = TRI->get32BitRegister(Reg);
287  else if (Size > 32)
288    Reg = TRI->getSubReg(Reg, AMDGPU::sub0);
289
290  if (TRI->hasVGPRs(RC)) {
291    Reg -= AMDGPU::VGPR0;
292    return Reg % NUM_VGPR_BANKS;
293  }
294
295  Reg = TRI->getEncodingValue(Reg) / 2;
296  return Reg % NUM_SGPR_BANKS + SGPR_BANK_OFFSET;
297}
298
299uint32_t GCNRegBankReassign::getRegBankMask(unsigned Reg, unsigned SubReg,
300                                            int Bank) {
301  if (Register::isVirtualRegister(Reg)) {
302    if (!VRM->isAssignedReg(Reg))
303      return 0;
304
305    Reg = VRM->getPhys(Reg);
306    if (!Reg)
307      return 0;
308    if (SubReg)
309      Reg = TRI->getSubReg(Reg, SubReg);
310  }
311
312  const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
313  unsigned Size = TRI->getRegSizeInBits(*RC);
314
315  if (Size == 16) {
316    Reg = TRI->get32BitRegister(Reg);
317    Size = 1;
318  } else {
319    Size /= 32;
320    if (Size > 1)
321      Reg = TRI->getSubReg(Reg, AMDGPU::sub0);
322  }
323
324  if (TRI->hasVGPRs(RC)) {
325    // VGPRs have 4 banks assigned in a round-robin fashion.
326    Reg -= AMDGPU::VGPR0;
327    uint32_t Mask = maskTrailingOnes<uint32_t>(Size);
328    unsigned Used = 0;
329    // Bitmask lacks an extract method
330    for (unsigned I = 0; I < Size; ++I)
331      if (RegsUsed.test(Reg + I))
332        Used |= 1 << I;
333    RegsUsed.set(Reg, Reg + Size);
334    Mask &= ~Used;
335    Mask <<= (Bank == -1) ? Reg % NUM_VGPR_BANKS : uint32_t(Bank);
336    return (Mask | (Mask >> NUM_VGPR_BANKS)) & VGPR_BANK_MASK;
337  }
338
339  // SGPRs have 8 banks holding 2 consequitive registers each.
340  Reg = TRI->getEncodingValue(Reg) / 2;
341  unsigned StartBit = AMDGPU::VGPR_32RegClass.getNumRegs();
342  if (Reg + StartBit >= RegsUsed.size())
343    return 0;
344
345  if (Size > 1)
346    Size /= 2;
347  unsigned Mask = (1 << Size) - 1;
348  unsigned Used = 0;
349  for (unsigned I = 0; I < Size; ++I)
350    if (RegsUsed.test(StartBit + Reg + I))
351      Used |= 1 << I;
352  RegsUsed.set(StartBit + Reg, StartBit + Reg + Size);
353  Mask &= ~Used;
354  Mask <<= (Bank == -1) ? Reg % NUM_SGPR_BANKS
355                        : unsigned(Bank - SGPR_BANK_OFFSET);
356  Mask = (Mask | (Mask >> NUM_SGPR_BANKS)) & SGPR_BANK_SHIFTED_MASK;
357  // Reserve 4 bank ids for VGPRs.
358  return Mask << SGPR_BANK_OFFSET;
359}
360
361std::pair<unsigned, unsigned>
362GCNRegBankReassign::analyzeInst(const MachineInstr &MI, unsigned Reg,
363                                int Bank) {
364  unsigned StallCycles = 0;
365  unsigned UsedBanks = 0;
366
367  if (MI.isDebugValue())
368    return std::make_pair(StallCycles, UsedBanks);
369
370  RegsUsed.reset();
371  OperandMasks.clear();
372  for (const auto& Op : MI.explicit_uses()) {
373    // Undef can be assigned to any register, so two vregs can be assigned
374    // the same phys reg within the same instruction.
375    if (!Op.isReg() || Op.isUndef())
376      continue;
377
378    Register R = Op.getReg();
379    if (TRI->hasAGPRs(TRI->getRegClassForReg(*MRI, R)))
380      continue;
381
382    unsigned ShiftedBank = Bank;
383
384    if (Bank != -1 && R == Reg && Op.getSubReg()) {
385      unsigned Offset = TRI->getChannelFromSubReg(Op.getSubReg());
386      LaneBitmask LM = TRI->getSubRegIndexLaneMask(Op.getSubReg());
387      if (Offset && Bank < NUM_VGPR_BANKS) {
388        // If a register spans all banks we cannot shift it to avoid conflict.
389        if (TRI->getNumCoveredRegs(LM) >= NUM_VGPR_BANKS)
390          continue;
391        ShiftedBank = (Bank + Offset) % NUM_VGPR_BANKS;
392      } else if (Offset > 1 && Bank >= SGPR_BANK_OFFSET) {
393        // If a register spans all banks we cannot shift it to avoid conflict.
394        if (TRI->getNumCoveredRegs(LM) / 2 >= NUM_SGPR_BANKS)
395          continue;
396        ShiftedBank = SGPR_BANK_OFFSET +
397          (Bank - SGPR_BANK_OFFSET + (Offset >> 1)) % NUM_SGPR_BANKS;
398      }
399    }
400
401    uint32_t Mask = getRegBankMask(R, Op.getSubReg(),
402                                   (Reg == R) ? ShiftedBank : -1);
403    StallCycles += countPopulation(UsedBanks & Mask);
404    UsedBanks |= Mask;
405    OperandMasks.push_back(OperandMask(Op.getReg(), Op.getSubReg(), Mask));
406  }
407
408  return std::make_pair(StallCycles, UsedBanks);
409}
410
411unsigned GCNRegBankReassign::getOperandGatherWeight(const MachineInstr& MI,
412                                                    unsigned Reg1,
413                                                    unsigned Reg2,
414                                                    unsigned StallCycles) const
415{
416  unsigned Defs = 0;
417  MachineBasicBlock::const_instr_iterator Def(MI.getIterator());
418  MachineBasicBlock::const_instr_iterator B(MI.getParent()->instr_begin());
419  for (unsigned S = StallCycles; S && Def != B && Defs != 3; --S) {
420    if (MI.isDebugInstr())
421      continue;
422    --Def;
423    if (Def->getOpcode() == TargetOpcode::IMPLICIT_DEF)
424      continue;
425    if (Def->modifiesRegister(Reg1, TRI))
426      Defs |= 1;
427    if (Def->modifiesRegister(Reg2, TRI))
428      Defs |= 2;
429  }
430  return countPopulation(Defs);
431}
432
433bool GCNRegBankReassign::isReassignable(unsigned Reg) const {
434  if (Register::isPhysicalRegister(Reg) || !VRM->isAssignedReg(Reg))
435    return false;
436
437  const MachineInstr *Def = MRI->getUniqueVRegDef(Reg);
438
439  Register PhysReg = VRM->getPhys(Reg);
440
441  if (Def && Def->isCopy() && Def->getOperand(1).getReg() == PhysReg)
442    return false;
443
444  for (auto U : MRI->use_nodbg_operands(Reg)) {
445    if (U.isImplicit())
446      return false;
447    const MachineInstr *UseInst = U.getParent();
448    if (UseInst->isCopy() && UseInst->getOperand(0).getReg() == PhysReg)
449      return false;
450  }
451
452  const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(PhysReg);
453  unsigned Size = TRI->getRegSizeInBits(*RC);
454
455  // TODO: Support 16 bit registers. Those needs to be moved with their
456  //       parent VGPR_32 and potentially a sibling 16 bit sub-register.
457  if (Size < 32)
458    return false;
459
460  if (TRI->hasVGPRs(RC))
461    return true;
462
463  if (Size == 16)
464    return AMDGPU::SGPR_LO16RegClass.contains(PhysReg);
465
466  if (Size > 32)
467    PhysReg = TRI->getSubReg(PhysReg, AMDGPU::sub0);
468
469  return AMDGPU::SGPR_32RegClass.contains(PhysReg);
470}
471
472unsigned GCNRegBankReassign::getFreeBanks(unsigned Mask,
473                                          unsigned UsedBanks) const {
474  unsigned Size = countPopulation(Mask);
475  unsigned FreeBanks = 0;
476  unsigned Bank = findFirstSet(Mask);
477
478  UsedBanks &= ~Mask;
479
480  // Find free VGPR banks
481  if ((Mask & VGPR_BANK_MASK) && (Size < NUM_VGPR_BANKS)) {
482    for (unsigned I = 0; I < NUM_VGPR_BANKS; ++I) {
483      if (Bank == I)
484        continue;
485      unsigned NewMask = ((1 << Size) - 1) << I;
486      NewMask = (NewMask | (NewMask >> NUM_VGPR_BANKS)) & VGPR_BANK_MASK;
487      if (!(UsedBanks & NewMask))
488        FreeBanks |= 1 << I;
489    }
490    return FreeBanks;
491  }
492
493  // Find free SGPR banks
494  // SGPR tuples must be aligned, so step is size in banks it
495  // crosses.
496  Bank -= SGPR_BANK_OFFSET;
497  for (unsigned I = 0; I < NUM_SGPR_BANKS; I += Size) {
498    if (Bank == I)
499      continue;
500    unsigned NewMask = ((1 << Size) - 1) << I;
501    NewMask = (NewMask | (NewMask >> NUM_SGPR_BANKS)) & SGPR_BANK_SHIFTED_MASK;
502    if (!(UsedBanks & (NewMask << SGPR_BANK_OFFSET)))
503      FreeBanks |= (1 << SGPR_BANK_OFFSET) << I;
504  }
505
506  return FreeBanks;
507}
508
509unsigned GCNRegBankReassign::getFreeBanks(unsigned Reg,
510                                          unsigned SubReg,
511                                          unsigned Mask,
512                                          unsigned UsedBanks) const {
513  if (!isReassignable(Reg))
514    return 0;
515
516  unsigned FreeBanks = getFreeBanks(Mask, UsedBanks);
517
518  unsigned Offset = TRI->getChannelFromSubReg(SubReg);
519  if (Offset && (Mask & VGPR_BANK_MASK)) {
520    unsigned Shift = Offset;
521    if (Shift >= NUM_VGPR_BANKS)
522      return 0;
523    unsigned VB = FreeBanks & VGPR_BANK_MASK;
524    FreeBanks = ((VB >> Shift) | (VB << (NUM_VGPR_BANKS - Shift))) &
525                VGPR_BANK_MASK;
526  } else if (Offset > 1 && (Mask & SGPR_BANK_MASK)) {
527    unsigned Shift = Offset >> 1;
528    if (Shift >= NUM_SGPR_BANKS)
529      return 0;
530    unsigned SB = FreeBanks >> SGPR_BANK_OFFSET;
531    FreeBanks = ((SB >> Shift) | (SB << (NUM_SGPR_BANKS - Shift))) &
532                SGPR_BANK_SHIFTED_MASK;
533    FreeBanks <<= SGPR_BANK_OFFSET;
534  }
535
536  LLVM_DEBUG(if (FreeBanks) {
537          dbgs() << "Potential reassignments of " << printReg(Reg, SubReg)
538                 << " to banks: "; dumpFreeBanks(FreeBanks);
539          dbgs() << '\n'; });
540
541  return FreeBanks;
542}
543
544void GCNRegBankReassign::collectCandidates(MachineInstr& MI,
545                                           unsigned UsedBanks,
546                                           unsigned StallCycles) {
547  LLVM_DEBUG(MI.dump());
548
549  if (!StallCycles)
550    return;
551
552  LLVM_DEBUG(dbgs() << "Stall cycles = " << StallCycles << '\n');
553
554  for (unsigned I = 0, E = OperandMasks.size(); I + 1 < E; ++I) {
555    for (unsigned J = I + 1; J != E; ++J) {
556      if (!(OperandMasks[I].Mask & OperandMasks[J].Mask))
557        continue;
558
559      unsigned Reg1 = OperandMasks[I].Reg;
560      unsigned Reg2 = OperandMasks[J].Reg;
561      unsigned SubReg1 = OperandMasks[I].SubReg;
562      unsigned SubReg2 = OperandMasks[J].SubReg;
563      unsigned Mask1 = OperandMasks[I].Mask;
564      unsigned Mask2 = OperandMasks[J].Mask;
565      unsigned Size1 = countPopulation(Mask1);
566      unsigned Size2 = countPopulation(Mask2);
567
568      LLVM_DEBUG(dbgs() << "Conflicting operands: " << printReg(Reg1, SubReg1) <<
569                      " and " << printReg(Reg2, SubReg2) << '\n');
570
571      unsigned Weight = getOperandGatherWeight(MI, Reg1, Reg2, StallCycles);
572      Weight += MLI->getLoopDepth(MI.getParent()) * 10;
573
574      LLVM_DEBUG(dbgs() << "Stall weight = " << Weight << '\n');
575
576      unsigned FreeBanks1 = getFreeBanks(Reg1, SubReg1, Mask1, UsedBanks);
577      unsigned FreeBanks2 = getFreeBanks(Reg2, SubReg2, Mask2, UsedBanks);
578      if (FreeBanks1)
579        Candidates.push(Candidate(&MI, Reg1, FreeBanks1, Weight
580                                    + ((Size2 > Size1) ? 1 : 0)));
581      if (FreeBanks2)
582        Candidates.push(Candidate(&MI, Reg2, FreeBanks2, Weight
583                                    + ((Size1 > Size2) ? 1 : 0)));
584    }
585  }
586}
587
588unsigned GCNRegBankReassign::computeStallCycles(unsigned SrcReg,
589                                                unsigned Reg, int Bank,
590                                                bool Collect) {
591  unsigned TotalStallCycles = 0;
592  SmallSet<const MachineInstr *, 16> Visited;
593
594  for (auto &MI : MRI->use_nodbg_instructions(SrcReg)) {
595    if (MI.isBundle())
596      continue;
597    if (!Visited.insert(&MI).second)
598      continue;
599    unsigned StallCycles;
600    unsigned UsedBanks;
601    std::tie(StallCycles, UsedBanks) = analyzeInst(MI, Reg, Bank);
602    TotalStallCycles += StallCycles;
603    if (Collect)
604      collectCandidates(MI, UsedBanks, StallCycles);
605  }
606
607  return TotalStallCycles;
608}
609
610unsigned GCNRegBankReassign::scavengeReg(LiveInterval& LI,
611                                         unsigned Bank) const {
612  const TargetRegisterClass *RC = MRI->getRegClass(LI.reg);
613  unsigned MaxNumRegs = (Bank < NUM_VGPR_BANKS) ? MaxNumVGPRs
614                                                : MaxNumSGPRs;
615  unsigned MaxReg = MaxNumRegs + (Bank < NUM_VGPR_BANKS ? AMDGPU::VGPR0
616                                                        : AMDGPU::SGPR0);
617
618  for (unsigned Reg : RC->getRegisters()) {
619    // Check occupancy limit.
620    if (TRI->isSubRegisterEq(Reg, MaxReg))
621      break;
622
623    if (!MRI->isAllocatable(Reg) || getPhysRegBank(Reg) != Bank)
624      continue;
625
626    for (unsigned I = 0; CSRegs[I]; ++I)
627      if (TRI->isSubRegisterEq(Reg, CSRegs[I]) &&
628          !LRM->isPhysRegUsed(CSRegs[I]))
629        return AMDGPU::NoRegister;
630
631    LLVM_DEBUG(dbgs() << "Trying register " << printReg(Reg) << '\n');
632
633    if (!LRM->checkInterference(LI, Reg))
634      return Reg;
635  }
636
637  return AMDGPU::NoRegister;
638}
639
640unsigned GCNRegBankReassign::tryReassign(Candidate &C) {
641  if (!LIS->hasInterval(C.Reg))
642    return 0;
643
644  LiveInterval &LI = LIS->getInterval(C.Reg);
645  LLVM_DEBUG(dbgs() << "Try reassign " << printReg(C.Reg) << " in "; C.MI->dump();
646             LI.dump());
647
648  // For each candidate bank walk all instructions in the range of live
649  // interval and check if replacing the register with one belonging to
650  // the candidate bank reduces conflicts.
651
652  unsigned OrigStalls = computeStallCycles(C.Reg);
653  LLVM_DEBUG(dbgs() << "--- Stall cycles in range = " << OrigStalls << '\n');
654  if (!OrigStalls)
655    return 0;
656
657  struct BankStall {
658    BankStall(unsigned b, unsigned s) : Bank(b), Stalls(s) {};
659    bool operator<(const BankStall &RHS) const {
660      if (Stalls == RHS.Stalls)
661        return Bank < RHS.Bank;
662      return Stalls > RHS.Stalls;
663    }
664    unsigned Bank;
665    unsigned Stalls;
666  };
667  SmallVector<BankStall, 8> BankStalls;
668
669  for (int Bank = 0; Bank < NUM_BANKS; ++Bank) {
670    if (C.FreeBanks & (1 << Bank)) {
671      LLVM_DEBUG(dbgs() << "Trying bank " << printBank(Bank) << '\n');
672      unsigned Stalls = computeStallCycles(C.Reg, C.Reg, Bank);
673      if (Stalls < OrigStalls) {
674        LLVM_DEBUG(dbgs() << "With bank " << printBank(Bank) << " -> "
675                     << Stalls << '\n');
676        BankStalls.push_back(BankStall((unsigned)Bank, Stalls));
677      }
678    }
679  }
680  llvm::sort(BankStalls);
681
682  Register OrigReg = VRM->getPhys(C.Reg);
683  LRM->unassign(LI);
684  while (!BankStalls.empty()) {
685    BankStall BS = BankStalls.pop_back_val();
686    unsigned Reg = scavengeReg(LI, BS.Bank);
687    if (Reg == AMDGPU::NoRegister) {
688      LLVM_DEBUG(dbgs() << "No free registers in bank " << printBank(BS.Bank)
689                   << '\n');
690      continue;
691    }
692    LLVM_DEBUG(dbgs() << "Found free register " << printReg(Reg)
693                 << (LRM->isPhysRegUsed(Reg) ? "" : " (new)")
694                 << " in bank " << printBank(BS.Bank) << '\n');
695
696    LRM->assign(LI, Reg);
697
698    LLVM_DEBUG(dbgs() << "--- Cycles saved: " << OrigStalls - BS.Stalls << '\n');
699
700    return OrigStalls - BS.Stalls;
701  }
702  LRM->assign(LI, OrigReg);
703
704  return 0;
705}
706
707unsigned GCNRegBankReassign::collectCandidates(MachineFunction &MF,
708                                               bool Collect) {
709  unsigned TotalStallCycles = 0;
710
711  for (MachineBasicBlock &MBB : MF) {
712
713    LLVM_DEBUG(if (Collect) {
714            if (MBB.getName().empty()) dbgs() << "bb." << MBB.getNumber();
715            else dbgs() << MBB.getName(); dbgs() << ":\n";
716          });
717
718    for (MachineInstr &MI : MBB.instrs()) {
719      if (MI.isBundle())
720          continue; // we analyze the instructions inside the bundle individually
721
722      unsigned StallCycles;
723      unsigned UsedBanks;
724      std::tie(StallCycles, UsedBanks) = analyzeInst(MI);
725
726      if (Collect)
727        collectCandidates(MI, UsedBanks, StallCycles);
728
729      TotalStallCycles += StallCycles;
730    }
731
732    LLVM_DEBUG(if (Collect) { dbgs() << '\n'; });
733  }
734
735  return TotalStallCycles;
736}
737
738void GCNRegBankReassign::removeCandidates(unsigned Reg) {
739  Candidates.remove_if([Reg, this](const Candidate& C) {
740    return C.MI->readsRegister(Reg, TRI);
741  });
742}
743
744bool GCNRegBankReassign::verifyCycles(MachineFunction &MF,
745                                      unsigned OriginalCycles,
746                                      unsigned CyclesSaved) {
747  unsigned StallCycles = collectCandidates(MF, false);
748  LLVM_DEBUG(dbgs() << "=== After the pass " << StallCycles
749               << " stall cycles left\n");
750  return StallCycles + CyclesSaved == OriginalCycles;
751}
752
753bool GCNRegBankReassign::runOnMachineFunction(MachineFunction &MF) {
754  ST = &MF.getSubtarget<GCNSubtarget>();
755  if (!ST->hasRegisterBanking() || skipFunction(MF.getFunction()))
756    return false;
757
758  MRI = &MF.getRegInfo();
759  TRI = ST->getRegisterInfo();
760  MLI = &getAnalysis<MachineLoopInfo>();
761  VRM = &getAnalysis<VirtRegMap>();
762  LRM = &getAnalysis<LiveRegMatrix>();
763  LIS = &getAnalysis<LiveIntervals>();
764
765  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
766  unsigned Occupancy = MFI->getOccupancy();
767  MaxNumVGPRs = ST->getMaxNumVGPRs(MF);
768  MaxNumSGPRs = ST->getMaxNumSGPRs(MF);
769  MaxNumVGPRs = std::min(ST->getMaxNumVGPRs(Occupancy), MaxNumVGPRs);
770  MaxNumSGPRs = std::min(ST->getMaxNumSGPRs(Occupancy, true), MaxNumSGPRs);
771
772  CSRegs = MRI->getCalleeSavedRegs();
773
774  RegsUsed.resize(AMDGPU::VGPR_32RegClass.getNumRegs() +
775                  TRI->getEncodingValue(AMDGPU::SGPR_NULL) / 2 + 1);
776
777  LLVM_DEBUG(dbgs() << "=== RegBanks reassign analysis on function " << MF.getName()
778               << '\n');
779
780  unsigned StallCycles = collectCandidates(MF);
781  NumStallsDetected += StallCycles;
782
783  LLVM_DEBUG(dbgs() << "=== " << StallCycles << " stall cycles detected in "
784                  "function " << MF.getName() << '\n');
785
786  Candidates.sort();
787
788  LLVM_DEBUG(dbgs() << "\nCandidates:\n\n";
789        for (auto C : Candidates) C.dump(this);
790        dbgs() << "\n\n");
791
792  unsigned CyclesSaved = 0;
793  while (!Candidates.empty()) {
794    Candidate C = Candidates.back();
795    unsigned LocalCyclesSaved = tryReassign(C);
796    CyclesSaved += LocalCyclesSaved;
797
798    if (VerifyStallCycles > 1 && !verifyCycles(MF, StallCycles, CyclesSaved))
799      report_fatal_error("RegBank reassign stall cycles verification failed.");
800
801    Candidates.pop_back();
802    if (LocalCyclesSaved) {
803      removeCandidates(C.Reg);
804      computeStallCycles(C.Reg, AMDGPU::NoRegister, -1, true);
805      Candidates.sort();
806
807      LLVM_DEBUG(dbgs() << "\nCandidates:\n\n";
808            for (auto C : Candidates)
809              C.dump(this);
810            dbgs() << "\n\n");
811    }
812  }
813  NumStallsRecovered += CyclesSaved;
814
815  LLVM_DEBUG(dbgs() << "=== After the pass " << CyclesSaved
816               << " cycles saved in function " << MF.getName() << '\n');
817
818  Candidates.clear();
819
820  if (VerifyStallCycles == 1 && !verifyCycles(MF, StallCycles, CyclesSaved))
821    report_fatal_error("RegBank reassign stall cycles verification failed.");
822
823  RegsUsed.clear();
824
825  return CyclesSaved > 0;
826}
827