GCNHazardRecognizer.cpp revision 360784
1//===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls ------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements hazard recognizers for scheduling on GCN processors.
10//
11//===----------------------------------------------------------------------===//
12
13#include "GCNHazardRecognizer.h"
14#include "AMDGPUSubtarget.h"
15#include "SIDefines.h"
16#include "SIInstrInfo.h"
17#include "SIRegisterInfo.h"
18#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
19#include "Utils/AMDGPUBaseInfo.h"
20#include "llvm/ADT/iterator_range.h"
21#include "llvm/CodeGen/MachineFunction.h"
22#include "llvm/CodeGen/MachineInstr.h"
23#include "llvm/CodeGen/MachineInstrBuilder.h"
24#include "llvm/CodeGen/MachineOperand.h"
25#include "llvm/CodeGen/ScheduleDAG.h"
26#include "llvm/MC/MCInstrDesc.h"
27#include "llvm/Support/ErrorHandling.h"
28#include <algorithm>
29#include <cassert>
30#include <limits>
31#include <set>
32#include <vector>
33
34using namespace llvm;
35
36//===----------------------------------------------------------------------===//
37// Hazard Recognizer Implementation
38//===----------------------------------------------------------------------===//
39
40GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) :
41  IsHazardRecognizerMode(false),
42  CurrCycleInstr(nullptr),
43  MF(MF),
44  ST(MF.getSubtarget<GCNSubtarget>()),
45  TII(*ST.getInstrInfo()),
46  TRI(TII.getRegisterInfo()),
47  ClauseUses(TRI.getNumRegUnits()),
48  ClauseDefs(TRI.getNumRegUnits()) {
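  // MFMA hazards (see checkMAIHazards) can require up to 18 wait states, hence
  // the deeper lookahead when AGPRs are in use; 5 covers the longest of the
  // other hazards tracked below.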
49  MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 18 : 5;
50  TSchedModel.init(&ST);
51}
52
53void GCNHazardRecognizer::EmitInstruction(SUnit *SU) {
54  EmitInstruction(SU->getInstr());
55}
56
57void GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) {
58  CurrCycleInstr = MI;
59}
60
61static bool isDivFMas(unsigned Opcode) {
62  return Opcode == AMDGPU::V_DIV_FMAS_F32 || Opcode == AMDGPU::V_DIV_FMAS_F64;
63}
64
65static bool isSGetReg(unsigned Opcode) {
66  return Opcode == AMDGPU::S_GETREG_B32;
67}
68
69static bool isSSetReg(unsigned Opcode) {
70  return Opcode == AMDGPU::S_SETREG_B32 || Opcode == AMDGPU::S_SETREG_IMM32_B32;
71}
72
73static bool isRWLane(unsigned Opcode) {
74  return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
75}
76
77static bool isRFE(unsigned Opcode) {
78  return Opcode == AMDGPU::S_RFE_B64;
79}
80
81static bool isSMovRel(unsigned Opcode) {
82  switch (Opcode) {
83  case AMDGPU::S_MOVRELS_B32:
84  case AMDGPU::S_MOVRELS_B64:
85  case AMDGPU::S_MOVRELD_B32:
86  case AMDGPU::S_MOVRELD_B64:
87    return true;
88  default:
89    return false;
90  }
91}
92
93static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII,
94                                    const MachineInstr &MI) {
95  if (TII.isAlwaysGDS(MI.getOpcode()))
96    return true;
97
98  switch (MI.getOpcode()) {
99  case AMDGPU::S_SENDMSG:
100  case AMDGPU::S_SENDMSGHALT:
101  case AMDGPU::S_TTRACEDATA:
102    return true;
103  // These DS opcodes don't support GDS.
104  case AMDGPU::DS_NOP:
105  case AMDGPU::DS_PERMUTE_B32:
106  case AMDGPU::DS_BPERMUTE_B32:
107    return false;
108  default:
109    if (TII.isDS(MI.getOpcode())) {
110      int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
111                                           AMDGPU::OpName::gds);
112      if (MI.getOperand(GDS).getImm())
113        return true;
114    }
115    return false;
116  }
117}
118
119static bool isPermlane(const MachineInstr &MI) {
120  unsigned Opcode = MI.getOpcode();
121  return Opcode == AMDGPU::V_PERMLANE16_B32 ||
122         Opcode == AMDGPU::V_PERMLANEX16_B32;
123}
124
125static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
126  const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
127                                                     AMDGPU::OpName::simm16);
128  return RegOp->getImm() & AMDGPU::Hwreg::ID_MASK_;
129}
130
131ScheduleHazardRecognizer::HazardType
132GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
133  MachineInstr *MI = SU->getInstr();
134  if (MI->isBundle())
135   return NoHazard;
136
137  if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0)
138    return NoopHazard;
139
140  // FIXME: Should flat be considered vmem?
141  if ((SIInstrInfo::isVMEM(*MI) ||
142       SIInstrInfo::isFLAT(*MI))
143      && checkVMEMHazards(MI) > 0)
144    return NoopHazard;
145
146  if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
147    return NoopHazard;
148
149  if (checkFPAtomicToDenormModeHazard(MI) > 0)
150    return NoopHazard;
151
152  if (ST.hasNoDataDepHazard())
153    return NoHazard;
154
155  if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
156    return NoopHazard;
157
158  if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0)
159    return NoopHazard;
160
161  if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0)
162    return NoopHazard;
163
164  if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0)
165    return NoopHazard;
166
167  if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)
168    return NoopHazard;
169
170  if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0)
171    return NoopHazard;
172
173  if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
174    return NoopHazard;
175
176  if (ST.hasReadM0MovRelInterpHazard() &&
177      (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode())) &&
178      checkReadM0Hazards(MI) > 0)
179    return NoopHazard;
180
181  if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI) &&
182      checkReadM0Hazards(MI) > 0)
183    return NoopHazard;
184
185  if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0)
186    return NoopHazard;
187
188  if (MI->mayLoadOrStore() && checkMAILdStHazards(MI) > 0)
189    return NoopHazard;
190
191  if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)
192    return NoopHazard;
193
194  if (checkAnyInstHazards(MI) > 0)
195    return NoopHazard;
196
197  return NoHazard;
198}
199
200static void insertNoopInBundle(MachineInstr *MI, const SIInstrInfo &TII) {
201  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP))
202      .addImm(0);
203}
204
205void GCNHazardRecognizer::processBundle() {
206  MachineBasicBlock::instr_iterator MI = std::next(CurrCycleInstr->getIterator());
207  MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end();
208  // Check bundled MachineInstr's for hazards.
209  for (; MI != E && MI->isInsideBundle(); ++MI) {
210    CurrCycleInstr = &*MI;
211    unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);
212
213    if (IsHazardRecognizerMode)
214      fixHazards(CurrCycleInstr);
215
216    for (unsigned i = 0; i < WaitStates; ++i)
217      insertNoopInBundle(CurrCycleInstr, TII);
218
219    // It's unnecessary to track more than MaxLookAhead instructions. Since we
220    // include the bundled MI directly after, only add a maximum of
221    // (MaxLookAhead - 1) noops to EmittedInstrs.
222    for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i)
223      EmittedInstrs.push_front(nullptr);
224
225    EmittedInstrs.push_front(CurrCycleInstr);
226    EmittedInstrs.resize(MaxLookAhead);
227  }
228  CurrCycleInstr = nullptr;
229}
230
231unsigned GCNHazardRecognizer::PreEmitNoops(SUnit *SU) {
232  IsHazardRecognizerMode = false;
233  return PreEmitNoopsCommon(SU->getInstr());
234}
235
236unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
237  IsHazardRecognizerMode = true;
238  CurrCycleInstr = MI;
239  unsigned W = PreEmitNoopsCommon(MI);
240  fixHazards(MI);
241  CurrCycleInstr = nullptr;
242  return W;
243}
244
245unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
246  if (MI->isBundle())
247    return 0;
248
249  int WaitStates = std::max(0, checkAnyInstHazards(MI));
250
251  if (SIInstrInfo::isSMRD(*MI))
252    return std::max(WaitStates, checkSMRDHazards(MI));
253
254  if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI))
255    WaitStates = std::max(WaitStates, checkVMEMHazards(MI));
256
257  if (ST.hasNSAtoVMEMBug())
258    WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI));
259
260  WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI));
261
262  if (ST.hasNoDataDepHazard())
263    return WaitStates;
264
265  if (SIInstrInfo::isVALU(*MI))
266    WaitStates = std::max(WaitStates, checkVALUHazards(MI));
267
268  if (SIInstrInfo::isDPP(*MI))
269    WaitStates = std::max(WaitStates, checkDPPHazards(MI));
270
271  if (isDivFMas(MI->getOpcode()))
272    WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));
273
274  if (isRWLane(MI->getOpcode()))
275    WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));
276
277  if (MI->isInlineAsm())
278    return std::max(WaitStates, checkInlineAsmHazards(MI));
279
280  if (isSGetReg(MI->getOpcode()))
281    return std::max(WaitStates, checkGetRegHazards(MI));
282
283  if (isSSetReg(MI->getOpcode()))
284    return std::max(WaitStates, checkSetRegHazards(MI));
285
286  if (isRFE(MI->getOpcode()))
287    return std::max(WaitStates, checkRFEHazards(MI));
288
289  if (ST.hasReadM0MovRelInterpHazard() && (TII.isVINTRP(*MI) ||
290                                           isSMovRel(MI->getOpcode())))
291    return std::max(WaitStates, checkReadM0Hazards(MI));
292
293  if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI))
294    return std::max(WaitStates, checkReadM0Hazards(MI));
295
296  if (SIInstrInfo::isMAI(*MI))
297    return std::max(WaitStates, checkMAIHazards(MI));
298
299  if (MI->mayLoadOrStore())
300    return std::max(WaitStates, checkMAILdStHazards(MI));
301
302  return WaitStates;
303}
304
305void GCNHazardRecognizer::EmitNoop() {
306  EmittedInstrs.push_front(nullptr);
307}
308
309void GCNHazardRecognizer::AdvanceCycle() {
310  // When the scheduler detects a stall, it will call AdvanceCycle() without
311  // emitting any instructions.
312  if (!CurrCycleInstr)
313    return;
314
315  // Do not track non-instructions which do not affect the wait states.
316  // If included, these instructions can lead to buffer overflow such that
317  // detectable hazards are missed.
318  if (CurrCycleInstr->isImplicitDef() || CurrCycleInstr->isDebugInstr() ||
319      CurrCycleInstr->isKill())
320    return;
321
322  if (CurrCycleInstr->isBundle()) {
323    processBundle();
324    return;
325  }
326
327  unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);
328
329  // Keep track of emitted instructions
330  EmittedInstrs.push_front(CurrCycleInstr);
331
332  // Add a nullptr for each additional wait state after the first.  Make sure
333  // not to add more than getMaxLookAhead() items to the list, since we
334  // truncate the list to that size right after this loop.
335  for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead());
336       i < e; ++i) {
337    EmittedInstrs.push_front(nullptr);
338  }
339
340  // getMaxLookAhead() is the largest number of wait states we will ever need
341  // to insert, so there is no point in keeping track of more than that many
342  // wait states.
343  EmittedInstrs.resize(getMaxLookAhead());
344
345  CurrCycleInstr = nullptr;
346}
347
348void GCNHazardRecognizer::RecedeCycle() {
349  llvm_unreachable("hazard recognizer does not support bottom-up scheduling.");
350}
351
352//===----------------------------------------------------------------------===//
353// Helper Functions
354//===----------------------------------------------------------------------===//
355
356typedef function_ref<bool(MachineInstr *, int WaitStates)> IsExpiredFn;
357
358// Returns the minimum number of wait states since \p I, walking all predecessors.
359// Only scans until \p IsExpired returns true.
360// Can only be run in hazard recognizer mode.
361static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
362                              MachineBasicBlock *MBB,
363                              MachineBasicBlock::reverse_instr_iterator I,
364                              int WaitStates,
365                              IsExpiredFn IsExpired,
366                              DenseSet<const MachineBasicBlock *> &Visited) {
367  for (auto E = MBB->instr_rend(); I != E; ++I) {
368    // Don't add WaitStates for parent BUNDLE instructions.
369    if (I->isBundle())
370      continue;
371
372    if (IsHazard(&*I))
373      return WaitStates;
374
375    if (I->isInlineAsm() || I->isImplicitDef() || I->isDebugInstr())
376      continue;
377
378    WaitStates += SIInstrInfo::getNumWaitStates(*I);
379
380    if (IsExpired(&*I, WaitStates))
381      return std::numeric_limits<int>::max();
382  }
383
384  int MinWaitStates = WaitStates;
385  bool Found = false;
386  for (MachineBasicBlock *Pred : MBB->predecessors()) {
387    if (!Visited.insert(Pred).second)
388      continue;
389
390    int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(),
391                               WaitStates, IsExpired, Visited);
392
393    if (W == std::numeric_limits<int>::max())
394      continue;
395
396    MinWaitStates = Found ? std::min(MinWaitStates, W) : W;
397    if (IsExpired(nullptr, MinWaitStates))
398      return MinWaitStates;
399
400    Found = true;
401  }
402
403  if (Found)
404    return MinWaitStates;
405
406  return std::numeric_limits<int>::max();
407}
408
409static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
410                              MachineInstr *MI,
411                              IsExpiredFn IsExpired) {
412  DenseSet<const MachineBasicBlock *> Visited;
413  return getWaitStatesSince(IsHazard, MI->getParent(),
414                            std::next(MI->getReverseIterator()),
415                            0, IsExpired, Visited);
416}
417
418int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
419  if (IsHazardRecognizerMode) {
420    auto IsExpiredFn = [Limit] (MachineInstr *, int WaitStates) {
421      return WaitStates >= Limit;
422    };
423    return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn);
424  }
425
426  int WaitStates = 0;
427  for (MachineInstr *MI : EmittedInstrs) {
428    if (MI) {
429      if (IsHazard(MI))
430        return WaitStates;
431
432      if (MI->isInlineAsm())
433        continue;
434    }
435    ++WaitStates;
436
437    if (WaitStates >= Limit)
438      break;
439  }
440  return std::numeric_limits<int>::max();
441}
442
443int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
444                                               IsHazardFn IsHazardDef,
445                                               int Limit) {
446  const SIRegisterInfo *TRI = ST.getRegisterInfo();
447
448  auto IsHazardFn = [IsHazardDef, TRI, Reg] (MachineInstr *MI) {
449    return IsHazardDef(MI) && MI->modifiesRegister(Reg, TRI);
450  };
451
452  return getWaitStatesSince(IsHazardFn, Limit);
453}
454
455int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
456                                                  int Limit) {
457  auto IsHazardFn = [IsHazard] (MachineInstr *MI) {
458    return isSSetReg(MI->getOpcode()) && IsHazard(MI);
459  };
460
461  return getWaitStatesSince(IsHazardFn, Limit);
462}
463
464//===----------------------------------------------------------------------===//
465// No-op Hazard Detection
466//===----------------------------------------------------------------------===//
467
468static void addRegUnits(const SIRegisterInfo &TRI,
469                        BitVector &BV, unsigned Reg) {
470  for (MCRegUnitIterator RUI(Reg, &TRI); RUI.isValid(); ++RUI)
471    BV.set(*RUI);
472}
473
474static void addRegsToSet(const SIRegisterInfo &TRI,
475                         iterator_range<MachineInstr::const_mop_iterator> Ops,
476                         BitVector &Set) {
477  for (const MachineOperand &Op : Ops) {
478    if (Op.isReg())
479      addRegUnits(TRI, Set, Op.getReg());
480  }
481}
482
483void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) {
484  // XXX: Do we need to worry about implicit operands?
485  addRegsToSet(TRI, MI.defs(), ClauseDefs);
486  addRegsToSet(TRI, MI.uses(), ClauseUses);
487}
488
489int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
490  // SMEM soft clauses are only present on VI+, and only matter if xnack is
491  // enabled.
492  if (!ST.isXNACKEnabled())
493    return 0;
494
495  bool IsSMRD = TII.isSMRD(*MEM);
496
497  resetClause();
498
499  // A soft-clause is any group of consecutive SMEM instructions.  The
500  // instructions in this group may return out of order and/or may be
501  // replayed (i.e. the same instruction issued more than once).
502  //
503  // In order to handle these situations correctly we need to make sure that
504  // when a clause has more than one instruction, no instruction in the clause
505  // writes to a register that is read by another instruction in the clause
506  // (including itself). If we encounter this situation, we need to break the
507  // clause by inserting a non-SMEM instruction.
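  //
  // For example (illustrative), in the pair
  //   s_load_dword s4, s[0:1], 0x0
  //   s_load_dword s0, s[4:5], 0x0
  // the second load reads s4, which the first one defines, so the clause must
  // be broken.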
508
509  for (MachineInstr *MI : EmittedInstrs) {
510    // When we hit a non-SMEM instruction then we have passed the start of the
511    // clause and we can stop.
512    if (!MI)
513      break;
514
515    if (IsSMRD != SIInstrInfo::isSMRD(*MI))
516      break;
517
518    addClauseInst(*MI);
519  }
520
521  if (ClauseDefs.none())
522    return 0;
523
524  // We need to make sure not to put loads and stores in the same clause if they
525  // use the same address. For now, just start a new clause whenever we see a
526  // store.
527  if (MEM->mayStore())
528    return 1;
529
530  addClauseInst(*MEM);
531
532  // If the set of defs and uses intersect then we cannot add this instruction
533  // to the clause, so we have a hazard.
534  return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;
535}
536
537int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
538  int WaitStatesNeeded = 0;
539
540  WaitStatesNeeded = checkSoftClauseHazards(SMRD);
541
542  // This SMRD hazard only affects SI.
543  if (!ST.hasSMRDReadVALUDefHazard())
544    return WaitStatesNeeded;
545
546  // A read of an SGPR by an SMRD instruction requires 4 wait states when the
547  // SGPR was written by a VALU instruction.
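  // For example (illustrative):
  //   v_readlane_b32 s4, v0, 0      ; VALU writing an SGPR
  //   s_load_dword s6, s[4:5], 0x0  ; SMRD reading that SGPR
  // needs 4 wait states between the two instructions.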
548  int SmrdSgprWaitStates = 4;
549  auto IsHazardDefFn = [this] (MachineInstr *MI) { return TII.isVALU(*MI); };
550  auto IsBufferHazardDefFn = [this] (MachineInstr *MI) { return TII.isSALU(*MI); };
551
552  bool IsBufferSMRD = TII.isBufferSMRD(*SMRD);
553
554  for (const MachineOperand &Use : SMRD->uses()) {
555    if (!Use.isReg())
556      continue;
557    int WaitStatesNeededForUse =
558        SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
559                                                   SmrdSgprWaitStates);
560    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
561
562    // This fixes what appears to be undocumented hardware behavior in SI where
563    // an s_mov writing a descriptor followed by an s_buffer_load_dword reading
564    // that descriptor needs some number of nops in between. We don't know how
565    // many are needed, but let's use 4. This probably wasn't discovered before
566    // because the only case in which it happens is when we expand a 64-bit
567    // pointer into a full descriptor and use s_buffer_load_dword instead of
568    // s_load_dword, which was probably never encountered in closed-source land.
569    if (IsBufferSMRD) {
570      int WaitStatesNeededForUse =
571        SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
572                                                   IsBufferHazardDefFn,
573                                                   SmrdSgprWaitStates);
574      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
575    }
576  }
577
578  return WaitStatesNeeded;
579}
580
581int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) {
582  if (!ST.hasVMEMReadSGPRVALUDefHazard())
583    return 0;
584
585  int WaitStatesNeeded = checkSoftClauseHazards(VMEM);
586
587  // A read of an SGPR by a VMEM instruction requires 5 wait states when the
588  // SGPR was written by a VALU instruction.
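  // For example (illustrative):
  //   v_readlane_b32 s4, v0, 0
  //   buffer_load_dword v1, off, s[8:11], s4
  // needs 5 wait states before the buffer load reads s4 as soffset.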
589  const int VmemSgprWaitStates = 5;
590  auto IsHazardDefFn = [this] (MachineInstr *MI) { return TII.isVALU(*MI); };
591  for (const MachineOperand &Use : VMEM->uses()) {
592    if (!Use.isReg() || TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
593      continue;
594
595    int WaitStatesNeededForUse =
596        VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
597                                                   VmemSgprWaitStates);
598    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
599  }
600  return WaitStatesNeeded;
601}
602
603int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) {
604  const SIRegisterInfo *TRI = ST.getRegisterInfo();
605  const SIInstrInfo *TII = ST.getInstrInfo();
606
607  // Check for DPP VGPR read after VALU VGPR write and EXEC write.
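  // For example (illustrative), a write to v1 immediately followed by
  //   v_mov_b32_dpp v2, v1 row_shr:1
  // needs 2 wait states, and a VALU write to exec (e.g. v_cmpx_*) followed by
  // any DPP instruction needs 5.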
608  int DppVgprWaitStates = 2;
609  int DppExecWaitStates = 5;
610  int WaitStatesNeeded = 0;
611  auto IsHazardDefFn = [TII] (MachineInstr *MI) { return TII->isVALU(*MI); };
612
613  for (const MachineOperand &Use : DPP->uses()) {
614    if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
615      continue;
616    int WaitStatesNeededForUse =
617        DppVgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
618                              [](MachineInstr *) { return true; },
619                              DppVgprWaitStates);
620    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
621  }
622
623  WaitStatesNeeded = std::max(
624      WaitStatesNeeded,
625      DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
626                                                DppExecWaitStates));
627
628  return WaitStatesNeeded;
629}
630
631int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) {
632  const SIInstrInfo *TII = ST.getInstrInfo();
633
634  // v_div_fmas requires 4 wait states after a write to vcc from a VALU
635  // instruction.
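  // For example (illustrative):
  //   v_div_scale_f32 v2, vcc, v0, v1, v0   ; VALU write to vcc
  //   v_div_fmas_f32  v3, v2, v1, v4        ; reads vcc implicitly
  // needs 4 wait states between the two.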
636  const int DivFMasWaitStates = 4;
637  auto IsHazardDefFn = [TII] (MachineInstr *MI) { return TII->isVALU(*MI); };
638  int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
639                                               DivFMasWaitStates);
640
641  return DivFMasWaitStates - WaitStatesNeeded;
642}
643
644int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
645  const SIInstrInfo *TII = ST.getInstrInfo();
646  unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);
647
648  const int GetRegWaitStates = 2;
649  auto IsHazardFn = [TII, GetRegHWReg] (MachineInstr *MI) {
650    return GetRegHWReg == getHWReg(TII, *MI);
651  };
652  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);
653
654  return GetRegWaitStates - WaitStatesNeeded;
655}
656
657int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
658  const SIInstrInfo *TII = ST.getInstrInfo();
659  unsigned HWReg = getHWReg(TII, *SetRegInstr);
660
661  const int SetRegWaitStates = ST.getSetRegWaitStates();
662  auto IsHazardFn = [TII, HWReg] (MachineInstr *MI) {
663    return HWReg == getHWReg(TII, *MI);
664  };
665  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
666  return SetRegWaitStates - WaitStatesNeeded;
667}
668
669int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
670  if (!MI.mayStore())
671    return -1;
672
673  const SIInstrInfo *TII = ST.getInstrInfo();
674  unsigned Opcode = MI.getOpcode();
675  const MCInstrDesc &Desc = MI.getDesc();
676
677  int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
678  int VDataRCID = -1;
679  if (VDataIdx != -1)
680    VDataRCID = Desc.OpInfo[VDataIdx].RegClass;
681
682  if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
683    // There is no hazard if the instruction does not use vector regs
684    // (like wbinvl1)
685    if (VDataIdx == -1)
686      return -1;
687    // For MUBUF/MTBUF instructions this hazard only exists if the
688    // instruction is not using a register in the soffset field.
689    const MachineOperand *SOffset =
690        TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
691    // If we have no soffset operand, then assume this field has been
692    // hardcoded to zero.
693    if (AMDGPU::getRegBitWidth(VDataRCID) > 64 &&
694        (!SOffset || !SOffset->isReg()))
695      return VDataIdx;
696  }
697
698  // MIMG instructions create a hazard if they don't use a 256-bit T# and
699  // the store size is greater than 8 bytes and they have more than two bits
700  // of their dmask set.
701  // All our MIMG definitions use a 256-bit T#, so we can skip checking for them.
702  if (TII->isMIMG(MI)) {
703    int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
704    assert(SRsrcIdx != -1 &&
705           AMDGPU::getRegBitWidth(Desc.OpInfo[SRsrcIdx].RegClass) == 256);
706    (void)SRsrcIdx;
707  }
708
709  if (TII->isFLAT(MI)) {
710    int DataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
711    if (AMDGPU::getRegBitWidth(Desc.OpInfo[DataIdx].RegClass) > 64)
712      return DataIdx;
713  }
714
715  return -1;
716}
717
718int GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
719                                                const MachineRegisterInfo &MRI) {
720  // Helper to check for the hazard where VMEM instructions that store more than
721  // 8 bytes can have their store data overwritten by the next instruction.
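  // For example (illustrative):
  //   buffer_store_dwordx4 v[0:3], off, s[8:11], 0
  //   v_mov_b32 v2, 0
  // needs one wait state so the v_mov does not clobber the store data.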
722  const SIRegisterInfo *TRI = ST.getRegisterInfo();
723
724  const int VALUWaitStates = 1;
725  int WaitStatesNeeded = 0;
726
727  if (!TRI->isVGPR(MRI, Def.getReg()))
728    return WaitStatesNeeded;
729  Register Reg = Def.getReg();
730  auto IsHazardFn = [this, Reg, TRI] (MachineInstr *MI) {
731    int DataIdx = createsVALUHazard(*MI);
732    return DataIdx >= 0 &&
733    TRI->regsOverlap(MI->getOperand(DataIdx).getReg(), Reg);
734  };
735  int WaitStatesNeededForDef =
736    VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
737  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
738
739  return WaitStatesNeeded;
740}
741
742int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
743  // This checks for the hazard where VMEM instructions that store more than
744  // 8 bytes can have their store data overwritten by the next instruction.
745  if (!ST.has12DWordStoreHazard())
746    return 0;
747
748  const MachineRegisterInfo &MRI = MF.getRegInfo();
749  int WaitStatesNeeded = 0;
750
751  for (const MachineOperand &Def : VALU->defs()) {
752    WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));
753  }
754
755  return WaitStatesNeeded;
756}
757
758int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
759  // This checks for hazards associated with inline asm statements.
760  // Since inline asms can contain just about anything, we use this
761  // to call/leverage other check*Hazard routines. Note that
762  // this function doesn't attempt to address all possible inline asm
763  // hazards (good luck), but is a collection of what has been
764  // problematic thus far.
765
766  // see checkVALUHazards()
767  if (!ST.has12DWordStoreHazard())
768    return 0;
769
770  const MachineRegisterInfo &MRI = MF.getRegInfo();
771  int WaitStatesNeeded = 0;
772
773  for (unsigned I = InlineAsm::MIOp_FirstOperand, E = IA->getNumOperands();
774       I != E; ++I) {
775    const MachineOperand &Op = IA->getOperand(I);
776    if (Op.isReg() && Op.isDef()) {
777      WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
778    }
779  }
780
781  return WaitStatesNeeded;
782}
783
784int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
785  const SIInstrInfo *TII = ST.getInstrInfo();
786  const SIRegisterInfo *TRI = ST.getRegisterInfo();
787  const MachineRegisterInfo &MRI = MF.getRegInfo();
788
789  const MachineOperand *LaneSelectOp =
790      TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);
791
792  if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg()))
793    return 0;
794
795  Register LaneSelectReg = LaneSelectOp->getReg();
796  auto IsHazardFn = [TII] (MachineInstr *MI) {
797    return TII->isVALU(*MI);
798  };
799
800  const int RWLaneWaitStates = 4;
801  int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn,
802                                              RWLaneWaitStates);
803  return RWLaneWaitStates - WaitStatesSince;
804}
805
806int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
807  if (!ST.hasRFEHazards())
808    return 0;
809
810  const SIInstrInfo *TII = ST.getInstrInfo();
811
812  const int RFEWaitStates = 1;
813
814  auto IsHazardFn = [TII] (MachineInstr *MI) {
815    return getHWReg(TII, *MI) == AMDGPU::Hwreg::ID_TRAPSTS;
816  };
817  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
818  return RFEWaitStates - WaitStatesNeeded;
819}
820
821int GCNHazardRecognizer::checkAnyInstHazards(MachineInstr *MI) {
822  if (MI->isDebugInstr())
823    return 0;
824
825  const SIRegisterInfo *TRI = ST.getRegisterInfo();
826  if (!ST.hasSMovFedHazard())
827    return 0;
828
829  // Check for any instruction reading an SGPR after a write from
830  // s_mov_fed_b32.
831  int MovFedWaitStates = 1;
832  int WaitStatesNeeded = 0;
833
834  for (const MachineOperand &Use : MI->uses()) {
835    if (!Use.isReg() || TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
836      continue;
837    auto IsHazardFn = [] (MachineInstr *MI) {
838      return MI->getOpcode() == AMDGPU::S_MOV_FED_B32;
839    };
840    int WaitStatesNeededForUse =
841        MovFedWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardFn,
842                                                 MovFedWaitStates);
843    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
844  }
845
846  return WaitStatesNeeded;
847}
848
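// Instructions such as v_interp_*, s_movrel*, s_sendmsg and GDS accesses read
// m0; on subtargets with the read-m0 hazards they need one wait state after an
// SALU write to m0.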
849int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
850  const SIInstrInfo *TII = ST.getInstrInfo();
851  const int SMovRelWaitStates = 1;
852  auto IsHazardFn = [TII] (MachineInstr *MI) {
853    return TII->isSALU(*MI);
854  };
855  return SMovRelWaitStates - getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn,
856                                                   SMovRelWaitStates);
857}
858
859void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
860  fixVMEMtoScalarWriteHazards(MI);
861  fixVcmpxPermlaneHazards(MI);
862  fixSMEMtoVectorWriteHazards(MI);
863  fixVcmpxExecWARHazard(MI);
864  fixLdsBranchVmemWARHazard(MI);
865}
866
867bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
868  if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(*MI))
869    return false;
870
871  const SIInstrInfo *TII = ST.getInstrInfo();
872  auto IsHazardFn = [TII] (MachineInstr *MI) {
873    return TII->isVOPC(*MI);
874  };
875
876  auto IsExpiredFn = [] (MachineInstr *MI, int) {
877    if (!MI)
878      return false;
879    unsigned Opc = MI->getOpcode();
880    return SIInstrInfo::isVALU(*MI) &&
881           Opc != AMDGPU::V_NOP_e32 &&
882           Opc != AMDGPU::V_NOP_e64 &&
883           Opc != AMDGPU::V_NOP_sdwa;
884  };
885
886  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
887      std::numeric_limits<int>::max())
888    return false;
889
890  // V_NOP will be discarded by SQ.
891  // Use V_MOV_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
892  // which is always a VGPR and available.
893  auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
894  Register Reg = Src0->getReg();
895  bool IsUndef = Src0->isUndef();
896  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
897          TII->get(AMDGPU::V_MOV_B32_e32))
898    .addReg(Reg, RegState::Define | (IsUndef ? RegState::Dead : 0))
899    .addReg(Reg, IsUndef ? RegState::Undef : RegState::Kill);
900
901  return true;
902}
903
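// On subtargets with the VMEM-to-scalar-write hazard, an SALU/SMEM write to a
// register that is still being read by an in-flight VMEM, DS or FLAT access
// must be separated from that access by a VALU instruction or 's_waitcnt 0';
// if neither is found, insert a v_nop.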
904bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
905  if (!ST.hasVMEMtoScalarWriteHazard())
906    return false;
907
908  if (!SIInstrInfo::isSALU(*MI) && !SIInstrInfo::isSMRD(*MI))
909    return false;
910
911  if (MI->getNumDefs() == 0)
912    return false;
913
914  const SIRegisterInfo *TRI = ST.getRegisterInfo();
915
916  auto IsHazardFn = [TRI, MI] (MachineInstr *I) {
917    if (!SIInstrInfo::isVMEM(*I) && !SIInstrInfo::isDS(*I) &&
918        !SIInstrInfo::isFLAT(*I))
919      return false;
920
921    for (const MachineOperand &Def : MI->defs()) {
922      MachineOperand *Op = I->findRegisterUseOperand(Def.getReg(), false, TRI);
923      if (!Op)
924        continue;
925      return true;
926    }
927    return false;
928  };
929
930  auto IsExpiredFn = [] (MachineInstr *MI, int) {
931    return MI && (SIInstrInfo::isVALU(*MI) ||
932                  (MI->getOpcode() == AMDGPU::S_WAITCNT &&
933                   !MI->getOperand(0).getImm()));
934  };
935
936  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
937      std::numeric_limits<int>::max())
938    return false;
939
940  const SIInstrInfo *TII = ST.getInstrInfo();
941  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
942  return true;
943}
944
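// On subtargets with the SMEM-to-vector-write hazard, a VALU instruction that
// writes an SGPR (e.g. via v_readfirstlane_b32) too soon after an SMEM read of
// the same SGPR needs the lgkmcnt drained or an intervening independent SALU;
// when neither is present, insert 's_mov_b32 null, 0'.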
945bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
946  if (!ST.hasSMEMtoVectorWriteHazard())
947    return false;
948
949  if (!SIInstrInfo::isVALU(*MI))
950    return false;
951
952  unsigned SDSTName;
953  switch (MI->getOpcode()) {
954  case AMDGPU::V_READLANE_B32:
955  case AMDGPU::V_READLANE_B32_gfx10:
956  case AMDGPU::V_READFIRSTLANE_B32:
957    SDSTName = AMDGPU::OpName::vdst;
958    break;
959  default:
960    SDSTName = AMDGPU::OpName::sdst;
961    break;
962  }
963
964  const SIInstrInfo *TII = ST.getInstrInfo();
965  const SIRegisterInfo *TRI = ST.getRegisterInfo();
966  const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
967  const MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName);
968  if (!SDST) {
969    for (const auto &MO : MI->implicit_operands()) {
970      if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegClass(MO.getReg()))) {
971        SDST = &MO;
972        break;
973      }
974    }
975  }
976
977  if (!SDST)
978    return false;
979
980  const Register SDSTReg = SDST->getReg();
981  auto IsHazardFn = [SDSTReg, TRI] (MachineInstr *I) {
982    return SIInstrInfo::isSMRD(*I) && I->readsRegister(SDSTReg, TRI);
983  };
984
985  auto IsExpiredFn = [TII, IV] (MachineInstr *MI, int) {
986    if (MI) {
987      if (TII->isSALU(*MI)) {
988        switch (MI->getOpcode()) {
989        case AMDGPU::S_SETVSKIP:
990        case AMDGPU::S_VERSION:
991        case AMDGPU::S_WAITCNT_VSCNT:
992        case AMDGPU::S_WAITCNT_VMCNT:
993        case AMDGPU::S_WAITCNT_EXPCNT:
994          // These instructions cannot mitigate the hazard.
995          return false;
996        case AMDGPU::S_WAITCNT_LGKMCNT:
997          // Reducing lgkmcnt count to 0 always mitigates the hazard.
998          return (MI->getOperand(1).getImm() == 0) &&
999                 (MI->getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1000        case AMDGPU::S_WAITCNT: {
1001          const int64_t Imm = MI->getOperand(0).getImm();
1002          AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm);
1003          return (Decoded.LgkmCnt == 0);
1004        }
1005        default:
1006          // SOPP instructions cannot mitigate the hazard.
1007          if (TII->isSOPP(*MI))
1008            return false;
1009          // At this point the SALU can be assumed to mitigate the hazard
1010          // because either:
1011          // (a) it is independent of the at risk SMEM (breaking chain),
1012          // or
1013          // (b) it is dependent on the SMEM, in which case an appropriate
1014          //     s_waitcnt lgkmcnt _must_ exist between it and the at risk
1015          //     SMEM instruction.
1016          return true;
1017        }
1018      }
1019    }
1020    return false;
1021  };
1022
1023  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1024      std::numeric_limits<int>::max())
1025    return false;
1026
1027  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1028          TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
1029      .addImm(0);
1030  return true;
1031}
1032
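// On subtargets with the vcmpx-exec-WAR hazard, a VALU write to exec (e.g.
// v_cmpx_*) too soon after a non-VALU read of exec needs either a VALU that
// writes an SGPR or 's_waitcnt_depctr 0xfffe' in between; insert the latter
// when required.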
1033bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
1034  if (!ST.hasVcmpxExecWARHazard() || !SIInstrInfo::isVALU(*MI))
1035    return false;
1036
1037  const SIRegisterInfo *TRI = ST.getRegisterInfo();
1038  if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))
1039    return false;
1040
1041  auto IsHazardFn = [TRI] (MachineInstr *I) {
1042    if (SIInstrInfo::isVALU(*I))
1043      return false;
1044    return I->readsRegister(AMDGPU::EXEC, TRI);
1045  };
1046
1047  const SIInstrInfo *TII = ST.getInstrInfo();
1048  auto IsExpiredFn = [TII, TRI] (MachineInstr *MI, int) {
1049    if (!MI)
1050      return false;
1051    if (SIInstrInfo::isVALU(*MI)) {
1052      if (TII->getNamedOperand(*MI, AMDGPU::OpName::sdst))
1053        return true;
1054      for (auto MO : MI->implicit_operands())
1055        if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegClass(MO.getReg())))
1056          return true;
1057    }
1058    if (MI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1059        (MI->getOperand(0).getImm() & 0xfffe) == 0xfffe)
1060      return true;
1061    return false;
1062  };
1063
1064  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1065      std::numeric_limits<int>::max())
1066    return false;
1067
1068  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1069          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
1070    .addImm(0xfffe);
1071  return true;
1072}
1073
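// On subtargets with the LDS-branch-VMEM WAR hazard, an LDS (DS) access and a
// VMEM access on opposite sides of a branch, in either order, must be
// separated by 's_waitcnt_vscnt null, 0'; insert one when it is missing.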
1074bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
1075  if (!ST.hasLdsBranchVmemWARHazard())
1076    return false;
1077
1078  auto IsHazardInst = [] (const MachineInstr *MI) {
1079    if (SIInstrInfo::isDS(*MI))
1080      return 1;
1081    if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isSegmentSpecificFLAT(*MI))
1082      return 2;
1083    return 0;
1084  };
1085
1086  auto InstType = IsHazardInst(MI);
1087  if (!InstType)
1088    return false;
1089
1090  auto IsExpiredFn = [&IsHazardInst] (MachineInstr *I, int) {
1091    return I && (IsHazardInst(I) ||
1092                 (I->getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1093                  I->getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
1094                  !I->getOperand(1).getImm()));
1095  };
1096
1097  auto IsHazardFn = [InstType, &IsHazardInst] (MachineInstr *I) {
1098    if (!I->isBranch())
1099      return false;
1100
1101    auto IsHazardFn = [InstType, IsHazardInst] (MachineInstr *I) {
1102      auto InstType2 = IsHazardInst(I);
1103      return InstType2 && InstType != InstType2;
1104    };
1105
1106    auto IsExpiredFn = [InstType, &IsHazardInst] (MachineInstr *I, int) {
1107      if (!I)
1108        return false;
1109
1110      auto InstType2 = IsHazardInst(I);
1111      if (InstType == InstType2)
1112        return true;
1113
1114      return I->getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1115             I->getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
1116             !I->getOperand(1).getImm();
1117    };
1118
1119    return ::getWaitStatesSince(IsHazardFn, I, IsExpiredFn) !=
1120           std::numeric_limits<int>::max();
1121  };
1122
1123  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1124      std::numeric_limits<int>::max())
1125    return false;
1126
1127  const SIInstrInfo *TII = ST.getInstrInfo();
1128  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1129          TII->get(AMDGPU::S_WAITCNT_VSCNT))
1130    .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1131    .addImm(0);
1132
1133  return true;
1134}
1135
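// NSA-to-VMEM bug: an NSA-encoded MIMG instruction (instruction size >= 16
// bytes) immediately followed by a MUBUF/MTBUF access whose immediate offset
// has bits 1-2 set requires one wait state.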
1136int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) {
1137  int NSAtoVMEMWaitStates = 1;
1138
1139  if (!ST.hasNSAtoVMEMBug())
1140    return 0;
1141
1142  if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isMTBUF(*MI))
1143    return 0;
1144
1145  const SIInstrInfo *TII = ST.getInstrInfo();
1146  const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
1147  if (!Offset || (Offset->getImm() & 6) == 0)
1148    return 0;
1149
1150  auto IsHazardFn = [TII] (MachineInstr *I) {
1151    if (!SIInstrInfo::isMIMG(*I))
1152      return false;
1153    const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I->getOpcode());
1154    return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
1155           TII->getInstSizeInBytes(*I) >= 16;
1156  };
1157
1158  return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1);
1159}
1160
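// An s_denorm_mode too soon after a floating-point atomic VMEM/FLAT access
// requires up to 3 wait states unless a VALU or s_waitcnt-style instruction
// intervenes.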
1161int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
1162  int FPAtomicToDenormModeWaitStates = 3;
1163
1164  if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
1165    return 0;
1166
1167  auto IsHazardFn = [] (MachineInstr *I) {
1168    if (!SIInstrInfo::isVMEM(*I) && !SIInstrInfo::isFLAT(*I))
1169      return false;
1170    return SIInstrInfo::isFPAtomic(*I);
1171  };
1172
1173  auto IsExpiredFn = [] (MachineInstr *MI, int WaitStates) {
1174    if (WaitStates >= 3 || SIInstrInfo::isVALU(*MI))
1175      return true;
1176
1177    switch (MI->getOpcode()) {
1178    case AMDGPU::S_WAITCNT:
1179    case AMDGPU::S_WAITCNT_VSCNT:
1180    case AMDGPU::S_WAITCNT_VMCNT:
1181    case AMDGPU::S_WAITCNT_EXPCNT:
1182    case AMDGPU::S_WAITCNT_LGKMCNT:
1183    case AMDGPU::S_WAITCNT_IDLE:
1184      return true;
1185    default:
1186      break;
1187    }
1188
1189    return false;
1190  };
1191
1192
1193  return FPAtomicToDenormModeWaitStates -
1194         ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn);
1195}
1196
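// MAI (MFMA and v_accvgpr_read/write) hazards. The wait-state constants below
// are keyed off the producing MFMA's latency: 2, 8 and 16 cycles correspond to
// the 4x4, 16x16 and 32x32 variants respectively.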
1197int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
1198  assert(SIInstrInfo::isMAI(*MI));
1199
1200  int WaitStatesNeeded = 0;
1201  unsigned Opc = MI->getOpcode();
1202
1203  auto IsVALUFn = [] (MachineInstr *MI) {
1204    return SIInstrInfo::isVALU(*MI);
1205  };
1206
1207  if (Opc != AMDGPU::V_ACCVGPR_READ_B32) { // MFMA or v_accvgpr_write
1208    const int LegacyVALUWritesVGPRWaitStates = 2;
1209    const int VALUWritesExecWaitStates = 4;
1210    const int MaxWaitStates = 4;
1211
1212    int WaitStatesNeededForUse = VALUWritesExecWaitStates -
1213      getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
1214    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1215
1216    if (WaitStatesNeeded < MaxWaitStates) {
1217      for (const MachineOperand &Use : MI->explicit_uses()) {
1218        const int MaxWaitStates = 2;
1219
1220        if (!Use.isReg() || !TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
1221          continue;
1222
1223        int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
1224          getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates);
1225        WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1226
1227        if (WaitStatesNeeded == MaxWaitStates)
1228          break;
1229      }
1230    }
1231  }
1232
1233  auto IsMFMAFn = [] (MachineInstr *MI) {
1234    return SIInstrInfo::isMAI(*MI) &&
1235           MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32 &&
1236           MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32;
1237  };
1238
1239  for (const MachineOperand &Op : MI->explicit_operands()) {
1240    if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg()))
1241      continue;
1242
1243    if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32)
1244      continue;
1245
1246    const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
1247    const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
1248    const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
1249    const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
1250    const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
1251    const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
1252    const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
1253    const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
1254    const int MaxWaitStates = 18;
1255    Register Reg = Op.getReg();
1256    unsigned HazardDefLatency = 0;
1257
1258    auto IsOverlappedMFMAFn = [Reg, &IsMFMAFn, &HazardDefLatency, this]
1259                              (MachineInstr *MI) {
1260      if (!IsMFMAFn(MI))
1261        return false;
1262      Register DstReg = MI->getOperand(0).getReg();
1263      if (DstReg == Reg)
1264        return false;
1265      HazardDefLatency = std::max(HazardDefLatency,
1266                                  TSchedModel.computeInstrLatency(MI));
1267      return TRI.regsOverlap(DstReg, Reg);
1268    };
1269
1270    int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,
1271                                                   MaxWaitStates);
1272    int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
1273    int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
1274    int OpNo = MI->getOperandNo(&Op);
1275    if (OpNo == SrcCIdx) {
1276      NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
1277    } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32) {
1278      switch (HazardDefLatency) {
1279      case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
1280               break;
1281      case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
1282               break;
1283      case 16: LLVM_FALLTHROUGH;
1284      default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
1285               break;
1286      }
1287    } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32) {
1288      switch (HazardDefLatency) {
1289      case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
1290               break;
1291      case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
1292               break;
1293      case 16: LLVM_FALLTHROUGH;
1294      default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
1295               break;
1296      }
1297    }
1298
1299    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
1300    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1301
1302    if (WaitStatesNeeded == MaxWaitStates)
1303      return WaitStatesNeeded; // Early exit.
1304
1305    auto IsAccVgprWriteFn = [Reg, this] (MachineInstr *MI) {
1306      if (MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32)
1307        return false;
1308      Register DstReg = MI->getOperand(0).getReg();
1309      return TRI.regsOverlap(Reg, DstReg);
1310    };
1311
1312    const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
1313    const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
1314    const int AccVGPRWriteAccVgprReadWaitStates = 3;
1315    NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
1316    if (OpNo == SrcCIdx)
1317      NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
1318    else if (Opc == AMDGPU::V_ACCVGPR_READ_B32)
1319      NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;
1320
1321    WaitStatesNeededForUse = NeedWaitStates -
1322      getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);
1323    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1324
1325    if (WaitStatesNeeded == MaxWaitStates)
1326      return WaitStatesNeeded; // Early exit.
1327  }
1328
1329  if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32) {
1330    const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
1331    const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
1332    const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
1333    const int MaxWaitStates = 13;
1334    Register DstReg = MI->getOperand(0).getReg();
1335    unsigned HazardDefLatency = 0;
1336
1337    auto IsSrcCMFMAFn = [DstReg, &IsMFMAFn, &HazardDefLatency, this]
1338                         (MachineInstr *MI) {
1339      if (!IsMFMAFn(MI))
1340        return false;
1341      Register Reg = TII.getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
1342      HazardDefLatency = std::max(HazardDefLatency,
1343                                  TSchedModel.computeInstrLatency(MI));
1344      return TRI.regsOverlap(Reg, DstReg);
1345    };
1346
1347    int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
1348    int NeedWaitStates;
1349    switch (HazardDefLatency) {
1350    case 2:  NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
1351             break;
1352    case 8:  NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
1353             break;
1354    case 16: LLVM_FALLTHROUGH;
1355    default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
1356             break;
1357    }
1358
1359    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
1360    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1361  }
1362
1363  return WaitStatesNeeded;
1364}
1365
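// Loads and stores consuming a VGPR recently written by v_accvgpr_read need
// up to 2 wait states; see the constants below for the exact counts.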
1366int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) {
1367  if (!ST.hasMAIInsts())
1368    return 0;
1369
1370  int WaitStatesNeeded = 0;
1371
1372  auto IsAccVgprReadFn = [] (MachineInstr *MI) {
1373    return MI->getOpcode() == AMDGPU::V_ACCVGPR_READ_B32;
1374  };
1375
1376  for (const MachineOperand &Op : MI->explicit_uses()) {
1377    if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg()))
1378      continue;
1379
1380    Register Reg = Op.getReg();
1381
1382    const int AccVgprReadLdStWaitStates = 2;
1383    const int VALUWriteAccVgprReadLdStDepVALUWaitStates = 1;
1384    const int MaxWaitStates = 2;
1385
1386    int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
1387      getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);
1388    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1389
1390    if (WaitStatesNeeded == MaxWaitStates)
1391      return WaitStatesNeeded; // Early exit.
1392
1393    auto IsVALUAccVgprReadCheckFn = [Reg, this] (MachineInstr *MI) {
1394      if (MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32)
1395        return false;
1396      auto IsVALUFn = [] (MachineInstr *MI) {
1397        return SIInstrInfo::isVALU(*MI) && !SIInstrInfo::isMAI(*MI);
1398      };
1399      return getWaitStatesSinceDef(Reg, IsVALUFn, 2 /*MaxWaitStates*/) <
1400             std::numeric_limits<int>::max();
1401    };
1402
1403    WaitStatesNeededForUse = VALUWriteAccVgprReadLdStDepVALUWaitStates -
1404      getWaitStatesSince(IsVALUAccVgprReadCheckFn, MaxWaitStates);
1405    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1406  }
1407
1408  return WaitStatesNeeded;
1409}
1410