1303231Sdim//===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls ------------===//
2303231Sdim//
3353358Sdim// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4353358Sdim// See https://llvm.org/LICENSE.txt for license information.
5353358Sdim// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6303231Sdim//
7303231Sdim//===----------------------------------------------------------------------===//
8303231Sdim//
9303231Sdim// This file implements hazard recognizers for scheduling on GCN processors.
10303231Sdim//
11303231Sdim//===----------------------------------------------------------------------===//
12303231Sdim
13303231Sdim#include "GCNHazardRecognizer.h"
14303231Sdim#include "AMDGPUSubtarget.h"
15321369Sdim#include "SIDefines.h"
16303231Sdim#include "SIInstrInfo.h"
17321369Sdim#include "SIRegisterInfo.h"
18341825Sdim#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
19321369Sdim#include "Utils/AMDGPUBaseInfo.h"
20321369Sdim#include "llvm/ADT/iterator_range.h"
21321369Sdim#include "llvm/CodeGen/MachineFunction.h"
22321369Sdim#include "llvm/CodeGen/MachineInstr.h"
23353358Sdim#include "llvm/CodeGen/MachineInstrBuilder.h"
24321369Sdim#include "llvm/CodeGen/MachineOperand.h"
25303231Sdim#include "llvm/CodeGen/ScheduleDAG.h"
26321369Sdim#include "llvm/MC/MCInstrDesc.h"
27321369Sdim#include "llvm/Support/ErrorHandling.h"
28321369Sdim#include <algorithm>
29321369Sdim#include <cassert>
30321369Sdim#include <limits>
31321369Sdim#include <set>
32321369Sdim#include <vector>
33303231Sdim
34303231Sdimusing namespace llvm;
35303231Sdim
36303231Sdim//===----------------------------------------------------------------------===//
// Hazard Recognizer Implementation
38303231Sdim//===----------------------------------------------------------------------===//
39303231Sdim
/// Construct a hazard recognizer for \p MF.
///
/// Captures the subtarget, instruction info and register info, and sizes the
/// clause use/def register-unit bitvectors. MaxLookAhead (the depth of the
/// emitted-instruction history) is 18 when the function uses AGPRs
/// (AMDGPU::AGPR0 is marked used) and 5 otherwise — presumably because
/// AGPR/MAI hazards need a deeper window; TODO confirm against the MAI
/// hazard checks.
GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) :
  IsHazardRecognizerMode(false),
  CurrCycleInstr(nullptr),
  MF(MF),
  ST(MF.getSubtarget<GCNSubtarget>()),
  TII(*ST.getInstrInfo()),
  TRI(TII.getRegisterInfo()),
  ClauseUses(TRI.getNumRegUnits()),
  ClauseDefs(TRI.getNumRegUnits()) {
  MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 18 : 5;
  TSchedModel.init(&ST);
}
52303231Sdim
53303231Sdimvoid GCNHazardRecognizer::EmitInstruction(SUnit *SU) {
54303231Sdim  EmitInstruction(SU->getInstr());
55303231Sdim}
56303231Sdim
/// Record \p MI as the instruction issued in the current cycle; it is
/// consumed (and cleared) later by AdvanceCycle() or PreEmitNoops().
void GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) {
  CurrCycleInstr = MI;
}
60303231Sdim
61314564Sdimstatic bool isDivFMas(unsigned Opcode) {
62314564Sdim  return Opcode == AMDGPU::V_DIV_FMAS_F32 || Opcode == AMDGPU::V_DIV_FMAS_F64;
63314564Sdim}
64314564Sdim
65314564Sdimstatic bool isSGetReg(unsigned Opcode) {
66314564Sdim  return Opcode == AMDGPU::S_GETREG_B32;
67314564Sdim}
68314564Sdim
69314564Sdimstatic bool isSSetReg(unsigned Opcode) {
70314564Sdim  return Opcode == AMDGPU::S_SETREG_B32 || Opcode == AMDGPU::S_SETREG_IMM32_B32;
71314564Sdim}
72314564Sdim
73314564Sdimstatic bool isRWLane(unsigned Opcode) {
74314564Sdim  return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
75314564Sdim}
76314564Sdim
77314564Sdimstatic bool isRFE(unsigned Opcode) {
78314564Sdim  return Opcode == AMDGPU::S_RFE_B64;
79314564Sdim}
80314564Sdim
81321369Sdimstatic bool isSMovRel(unsigned Opcode) {
82321369Sdim  switch (Opcode) {
83321369Sdim  case AMDGPU::S_MOVRELS_B32:
84321369Sdim  case AMDGPU::S_MOVRELS_B64:
85321369Sdim  case AMDGPU::S_MOVRELD_B32:
86321369Sdim  case AMDGPU::S_MOVRELD_B64:
87321369Sdim    return true;
88321369Sdim  default:
89321369Sdim    return false;
90321369Sdim  }
91321369Sdim}
92321369Sdim
93344779Sdimstatic bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII,
94344779Sdim                                    const MachineInstr &MI) {
95344779Sdim  if (TII.isAlwaysGDS(MI.getOpcode()))
96344779Sdim    return true;
97344779Sdim
98327952Sdim  switch (MI.getOpcode()) {
99327952Sdim  case AMDGPU::S_SENDMSG:
100327952Sdim  case AMDGPU::S_SENDMSGHALT:
101327952Sdim  case AMDGPU::S_TTRACEDATA:
102327952Sdim    return true;
103344779Sdim  // These DS opcodes don't support GDS.
104344779Sdim  case AMDGPU::DS_NOP:
105344779Sdim  case AMDGPU::DS_PERMUTE_B32:
106344779Sdim  case AMDGPU::DS_BPERMUTE_B32:
107344779Sdim    return false;
108327952Sdim  default:
109344779Sdim    if (TII.isDS(MI.getOpcode())) {
110344779Sdim      int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
111344779Sdim                                           AMDGPU::OpName::gds);
112344779Sdim      if (MI.getOperand(GDS).getImm())
113344779Sdim        return true;
114344779Sdim    }
115327952Sdim    return false;
116327952Sdim  }
117327952Sdim}
118327952Sdim
119353358Sdimstatic bool isPermlane(const MachineInstr &MI) {
120353358Sdim  unsigned Opcode = MI.getOpcode();
121353358Sdim  return Opcode == AMDGPU::V_PERMLANE16_B32 ||
122353358Sdim         Opcode == AMDGPU::V_PERMLANEX16_B32;
123353358Sdim}
124353358Sdim
125314564Sdimstatic unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
126314564Sdim  const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
127314564Sdim                                                     AMDGPU::OpName::simm16);
128314564Sdim  return RegOp->getImm() & AMDGPU::Hwreg::ID_MASK_;
129314564Sdim}
130314564Sdim
/// Top-down scheduler query: classify the hazard (if any) that would occur if
/// \p SU were issued now.
///
/// Each check below returns a positive wait-state count when the candidate
/// instruction conflicts with recently emitted instructions; any positive
/// count maps to NoopHazard (the scheduler must insert stalls). The order of
/// checks matters: subtargets with hasNoDataDepHazard() skip everything after
/// that early exit.
ScheduleHazardRecognizer::HazardType
GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
  MachineInstr *MI = SU->getInstr();
  // Bundle headers are containers, not real instructions; their contents are
  // handled individually elsewhere (see processBundle()).
  if (MI->isBundle())
   return NoHazard;

  if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0)
    return NoopHazard;

  // FIXME: Should flat be considered vmem?
  if ((SIInstrInfo::isVMEM(*MI) ||
       SIInstrInfo::isFLAT(*MI))
      && checkVMEMHazards(MI) > 0)
    return NoopHazard;

  if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
    return NoopHazard;

  if (checkFPAtomicToDenormModeHazard(MI) > 0)
    return NoopHazard;

  // Subtargets without data-dependence hazards are done after the memory
  // checks above.
  if (ST.hasNoDataDepHazard())
    return NoHazard;

  if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
    return NoopHazard;

  if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0)
    return NoopHazard;

  if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0)
    return NoopHazard;

  if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0)
    return NoopHazard;

  if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)
    return NoopHazard;

  if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0)
    return NoopHazard;

  if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
    return NoopHazard;

  // M0 readers (interp, s_movrel*) after an M0 write, on affected subtargets.
  if (ST.hasReadM0MovRelInterpHazard() &&
      (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode())) &&
      checkReadM0Hazards(MI) > 0)
    return NoopHazard;

  if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI) &&
      checkReadM0Hazards(MI) > 0)
    return NoopHazard;

  if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0)
    return NoopHazard;

  if (MI->mayLoadOrStore() && checkMAILdStHazards(MI) > 0)
    return NoopHazard;

  if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)
    return NoopHazard;

  if (checkAnyInstHazards(MI) > 0)
    return NoopHazard;

  return NoHazard;
}
199303231Sdim
200353358Sdimstatic void insertNoopInBundle(MachineInstr *MI, const SIInstrInfo &TII) {
201353358Sdim  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP))
202353358Sdim      .addImm(0);
203353358Sdim}
204353358Sdim
/// Walk the instructions inside the bundle headed by CurrCycleInstr, padding
/// each one with the s_nops its hazards require and recording everything in
/// the EmittedInstrs history.
void GCNHazardRecognizer::processBundle() {
  MachineBasicBlock::instr_iterator MI = std::next(CurrCycleInstr->getIterator());
  MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end();
  // Check bundled MachineInstr's for hazards.
  for (; MI != E && MI->isInsideBundle(); ++MI) {
    CurrCycleInstr = &*MI;
    unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);

    if (IsHazardRecognizerMode)
      fixHazards(CurrCycleInstr);

    // Insert real s_nops before the instruction to satisfy the hazard.
    for (unsigned i = 0; i < WaitStates; ++i)
      insertNoopInBundle(CurrCycleInstr, TII);

    // It's unnecessary to track more than MaxLookAhead instructions. Since we
    // include the bundled MI directly after, only add a maximum of
    // (MaxLookAhead - 1) noops to EmittedInstrs.
    for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i)
      EmittedInstrs.push_front(nullptr);

    EmittedInstrs.push_front(CurrCycleInstr);
    EmittedInstrs.resize(MaxLookAhead);
  }
  CurrCycleInstr = nullptr;
}
230353358Sdim
231303231Sdimunsigned GCNHazardRecognizer::PreEmitNoops(SUnit *SU) {
232353358Sdim  IsHazardRecognizerMode = false;
233353358Sdim  return PreEmitNoopsCommon(SU->getInstr());
234303231Sdim}
235303231Sdim
236303231Sdimunsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
237353358Sdim  IsHazardRecognizerMode = true;
238353358Sdim  CurrCycleInstr = MI;
239353358Sdim  unsigned W = PreEmitNoopsCommon(MI);
240353358Sdim  fixHazards(MI);
241353358Sdim  CurrCycleInstr = nullptr;
242353358Sdim  return W;
243353358Sdim}
244353358Sdim
/// Core wait-state computation shared by both PreEmitNoops entry points:
/// returns the number of wait states required before \p MI may issue.
///
/// The structure mirrors getHazardType(): some categories accumulate via
/// std::max and fall through, while mutually exclusive categories return
/// immediately. Subtargets with hasNoDataDepHazard() stop after the memory
/// checks.
unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
  // Bundle headers carry no hazards themselves; members are processed
  // individually in processBundle().
  if (MI->isBundle())
    return 0;

  int WaitStates = std::max(0, checkAnyInstHazards(MI));

  if (SIInstrInfo::isSMRD(*MI))
    return std::max(WaitStates, checkSMRDHazards(MI));

  if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI))
    WaitStates = std::max(WaitStates, checkVMEMHazards(MI));

  if (ST.hasNSAtoVMEMBug())
    WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI));

  WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI));

  if (ST.hasNoDataDepHazard())
    return WaitStates;

  if (SIInstrInfo::isVALU(*MI))
    WaitStates = std::max(WaitStates, checkVALUHazards(MI));

  if (SIInstrInfo::isDPP(*MI))
    WaitStates = std::max(WaitStates, checkDPPHazards(MI));

  if (isDivFMas(MI->getOpcode()))
    WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));

  if (isRWLane(MI->getOpcode()))
    WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));

  if (MI->isInlineAsm())
    return std::max(WaitStates, checkInlineAsmHazards(MI));

  if (isSGetReg(MI->getOpcode()))
    return std::max(WaitStates, checkGetRegHazards(MI));

  if (isSSetReg(MI->getOpcode()))
    return std::max(WaitStates, checkSetRegHazards(MI));

  if (isRFE(MI->getOpcode()))
    return std::max(WaitStates, checkRFEHazards(MI));

  // M0 readers (interp / s_movrel*) on affected subtargets.
  if (ST.hasReadM0MovRelInterpHazard() && (TII.isVINTRP(*MI) ||
                                           isSMovRel(MI->getOpcode())))
    return std::max(WaitStates, checkReadM0Hazards(MI));

  if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI))
    return std::max(WaitStates, checkReadM0Hazards(MI));

  if (SIInstrInfo::isMAI(*MI))
    return std::max(WaitStates, checkMAIHazards(MI));

  if (MI->mayLoadOrStore())
    return std::max(WaitStates, checkMAILdStHazards(MI));

  return WaitStates;
}
304303231Sdim
/// Record a scheduler-emitted noop: a nullptr slot in the history counts as
/// one wait state in the hazard scans.
void GCNHazardRecognizer::EmitNoop() {
  EmittedInstrs.push_front(nullptr);
}
308303231Sdim
/// Advance the recognizer by one cycle, folding the current-cycle instruction
/// (and the extra wait states it consumes) into the EmittedInstrs history.
void GCNHazardRecognizer::AdvanceCycle() {
  // When the scheduler detects a stall, it will call AdvanceCycle() without
  // emitting any instructions.
  if (!CurrCycleInstr)
    return;

  // Do not track non-instructions which do not affect the wait states.
  // If included, these instructions can lead to buffer overflow such that
  // detectable hazards are missed.
  if (CurrCycleInstr->isImplicitDef() || CurrCycleInstr->isDebugInstr() ||
      CurrCycleInstr->isKill())
    return;

  // Bundles are expanded member-by-member; processBundle() also clears
  // CurrCycleInstr.
  if (CurrCycleInstr->isBundle()) {
    processBundle();
    return;
  }

  unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);

  // Keep track of emitted instructions
  EmittedInstrs.push_front(CurrCycleInstr);

  // Add a nullptr for each additional wait state after the first.  Make sure
  // not to add more than getMaxLookAhead() items to the list, since we
  // truncate the list to that size right after this loop.
  for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead());
       i < e; ++i) {
    EmittedInstrs.push_front(nullptr);
  }

  // getMaxLookahead() is the largest number of wait states we will ever need
  // to insert, so there is no point in keeping track of more than that many
  // wait states.
  EmittedInstrs.resize(getMaxLookAhead());

  CurrCycleInstr = nullptr;
}
347303231Sdim
/// Bottom-up scheduling hook — intentionally unsupported by this recognizer.
void GCNHazardRecognizer::RecedeCycle() {
  llvm_unreachable("hazard recognizer does not support bottom-up scheduling.");
}
351303231Sdim
352303231Sdim//===----------------------------------------------------------------------===//
353303231Sdim// Helper Functions
354303231Sdim//===----------------------------------------------------------------------===//
355303231Sdim
// Predicate deciding when a backwards scan may stop; receives the instruction
// just accounted for (nullptr at block boundaries) and the wait states
// accumulated so far.
typedef function_ref<bool(MachineInstr *, int WaitStates)> IsExpiredFn;

// Returns a minimum wait states since \p I walking all predecessors.
// Only scans until \p IsExpired does not return true.
// Can only be run in a hazard recognizer mode.
static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
                              MachineBasicBlock *MBB,
                              MachineBasicBlock::reverse_instr_iterator I,
                              int WaitStates,
                              IsExpiredFn IsExpired,
                              DenseSet<const MachineBasicBlock *> &Visited) {
  // Scan backwards through the current block first.
  for (auto E = MBB->instr_rend(); I != E; ++I) {
    // Don't add WaitStates for parent BUNDLE instructions.
    if (I->isBundle())
      continue;

    if (IsHazard(&*I))
      return WaitStates;

    // These do not consume wait states.
    if (I->isInlineAsm() || I->isImplicitDef() || I->isDebugInstr())
      continue;

    WaitStates += SIInstrInfo::getNumWaitStates(*I);

    // Hazard window exhausted within this block: report "no hazard" (max).
    if (IsExpired(&*I, WaitStates))
      return std::numeric_limits<int>::max();
  }

  // Continue into all predecessors and take the minimum over the paths that
  // found a hazard. Visited guards against cycles in the CFG.
  int MinWaitStates = WaitStates;
  bool Found = false;
  for (MachineBasicBlock *Pred : MBB->predecessors()) {
    if (!Visited.insert(Pred).second)
      continue;

    int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(),
                               WaitStates, IsExpired, Visited);

    if (W == std::numeric_limits<int>::max())
      continue;

    MinWaitStates = Found ? std::min(MinWaitStates, W) : W;
    // Already past the window: no closer hazard can be found.
    if (IsExpired(nullptr, MinWaitStates))
      return MinWaitStates;

    Found = true;
  }

  if (Found)
    return MinWaitStates;

  return std::numeric_limits<int>::max();
}
408353358Sdim
409353358Sdimstatic int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
410353358Sdim                              MachineInstr *MI,
411353358Sdim                              IsExpiredFn IsExpired) {
412353358Sdim  DenseSet<const MachineBasicBlock *> Visited;
413353358Sdim  return getWaitStatesSince(IsHazard, MI->getParent(),
414353358Sdim                            std::next(MI->getReverseIterator()),
415353358Sdim                            0, IsExpired, Visited);
416353358Sdim}
417353358Sdim
/// Number of wait states elapsed since the most recent instruction matching
/// \p IsHazard, or INT_MAX if none is found within \p Limit wait states.
///
/// In hazard-recognizer mode this walks the actual MI stream (including
/// predecessors) from CurrCycleInstr; otherwise it consults the EmittedInstrs
/// history, where a nullptr entry represents one noop/extra wait state and
/// inline asm consumes none.
int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
  if (IsHazardRecognizerMode) {
    auto IsExpiredFn = [Limit] (MachineInstr *, int WaitStates) {
      return WaitStates >= Limit;
    };
    return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn);
  }

  int WaitStates = 0;
  for (MachineInstr *MI : EmittedInstrs) {
    if (MI) {
      if (IsHazard(MI))
        return WaitStates;

      // Inline asm does not consume a wait state slot.
      if (MI->isInlineAsm())
        continue;
    }
    ++WaitStates;

    if (WaitStates >= Limit)
      break;
  }
  return std::numeric_limits<int>::max();
}
442303231Sdim
443353358Sdimint GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
444353358Sdim                                               IsHazardFn IsHazardDef,
445353358Sdim                                               int Limit) {
446314564Sdim  const SIRegisterInfo *TRI = ST.getRegisterInfo();
447314564Sdim
448314564Sdim  auto IsHazardFn = [IsHazardDef, TRI, Reg] (MachineInstr *MI) {
449314564Sdim    return IsHazardDef(MI) && MI->modifiesRegister(Reg, TRI);
450314564Sdim  };
451314564Sdim
452353358Sdim  return getWaitStatesSince(IsHazardFn, Limit);
453314564Sdim}
454314564Sdim
455353358Sdimint GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
456353358Sdim                                                  int Limit) {
457314564Sdim  auto IsHazardFn = [IsHazard] (MachineInstr *MI) {
458314564Sdim    return isSSetReg(MI->getOpcode()) && IsHazard(MI);
459314564Sdim  };
460314564Sdim
461353358Sdim  return getWaitStatesSince(IsHazardFn, Limit);
462314564Sdim}
463314564Sdim
464303231Sdim//===----------------------------------------------------------------------===//
465303231Sdim// No-op Hazard Detection
466303231Sdim//===----------------------------------------------------------------------===//
467303231Sdim
// Set the bit for every register unit of \p Reg in \p BV.
static void addRegUnits(const SIRegisterInfo &TRI,
                        BitVector &BV, unsigned Reg) {
  for (MCRegUnitIterator RUI(Reg, &TRI); RUI.isValid(); ++RUI)
    BV.set(*RUI);
}
473327952Sdim
474327952Sdimstatic void addRegsToSet(const SIRegisterInfo &TRI,
475327952Sdim                         iterator_range<MachineInstr::const_mop_iterator> Ops,
476327952Sdim                         BitVector &Set) {
477303231Sdim  for (const MachineOperand &Op : Ops) {
478303231Sdim    if (Op.isReg())
479327952Sdim      addRegUnits(TRI, Set, Op.getReg());
480303231Sdim  }
481303231Sdim}
482303231Sdim
/// Fold \p MI's explicit defs and uses into the running clause def/use
/// register-unit sets.
void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) {
  // XXX: Do we need to worry about implicit operands
  addRegsToSet(TRI, MI.defs(), ClauseDefs);
  addRegsToSet(TRI, MI.uses(), ClauseUses);
}
488327952Sdim
/// Detect soft-clause hazards for the memory instruction \p MEM: returns 1
/// if a non-SMEM instruction must be inserted to break the current clause,
/// 0 otherwise. Only relevant when XNACK is enabled.
int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
  // SMEM soft clause are only present on VI+, and only matter if xnack is
  // enabled.
  if (!ST.isXNACKEnabled())
    return 0;

  bool IsSMRD = TII.isSMRD(*MEM);

  resetClause();

  // A soft-clause is any group of consecutive SMEM instructions.  The
  // instructions in this group may return out of order and/or may be
  // replayed (i.e. the same instruction issued more than once).
  //
  // In order to handle these situations correctly we need to make sure that
  // when a clause has more than one instruction, no instruction in the clause
  // writes to a register that is read by another instruction in the clause
  // (including itself). If we encounter this situation, we need to break the
  // clause by inserting a non SMEM instruction.

  for (MachineInstr *MI : EmittedInstrs) {
    // When we hit a non-SMEM instruction then we have passed the start of the
    // clause and we can stop.
    if (!MI)
      break;

    // Mixing SMRD and non-SMRD memory ops also terminates the clause.
    if (IsSMRD != SIInstrInfo::isSMRD(*MI))
      break;

    addClauseInst(*MI);
  }

  if (ClauseDefs.none())
    return 0;

  // We need to make sure not to put loads and stores in the same clause if they
  // use the same address. For now, just start a new clause whenever we see a
  // store.
  if (MEM->mayStore())
    return 1;

  addClauseInst(*MEM);

  // If the set of defs and uses intersect then we cannot add this instruction
  // to the clause, so we have a hazard.
  return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;
}
536303231Sdim
537303231Sdimint GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
538303231Sdim  int WaitStatesNeeded = 0;
539303231Sdim
540327952Sdim  WaitStatesNeeded = checkSoftClauseHazards(SMRD);
541303231Sdim
542303231Sdim  // This SMRD hazard only affects SI.
543353358Sdim  if (!ST.hasSMRDReadVALUDefHazard())
544303231Sdim    return WaitStatesNeeded;
545303231Sdim
546303231Sdim  // A read of an SGPR by SMRD instruction requires 4 wait states when the
547303231Sdim  // SGPR was written by a VALU instruction.
548303231Sdim  int SmrdSgprWaitStates = 4;
549321369Sdim  auto IsHazardDefFn = [this] (MachineInstr *MI) { return TII.isVALU(*MI); };
550327952Sdim  auto IsBufferHazardDefFn = [this] (MachineInstr *MI) { return TII.isSALU(*MI); };
551303231Sdim
552327952Sdim  bool IsBufferSMRD = TII.isBufferSMRD(*SMRD);
553327952Sdim
554303231Sdim  for (const MachineOperand &Use : SMRD->uses()) {
555303231Sdim    if (!Use.isReg())
556303231Sdim      continue;
557303231Sdim    int WaitStatesNeededForUse =
558353358Sdim        SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
559353358Sdim                                                   SmrdSgprWaitStates);
560303231Sdim    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
561327952Sdim
562327952Sdim    // This fixes what appears to be undocumented hardware behavior in SI where
563327952Sdim    // s_mov writing a descriptor and s_buffer_load_dword reading the descriptor
564327952Sdim    // needs some number of nops in between. We don't know how many we need, but
565327952Sdim    // let's use 4. This wasn't discovered before probably because the only
566327952Sdim    // case when this happens is when we expand a 64-bit pointer into a full
567327952Sdim    // descriptor and use s_buffer_load_dword instead of s_load_dword, which was
568327952Sdim    // probably never encountered in the closed-source land.
569327952Sdim    if (IsBufferSMRD) {
570327952Sdim      int WaitStatesNeededForUse =
571327952Sdim        SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
572353358Sdim                                                   IsBufferHazardDefFn,
573353358Sdim                                                   SmrdSgprWaitStates);
574327952Sdim      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
575327952Sdim    }
576303231Sdim  }
577327952Sdim
578303231Sdim  return WaitStatesNeeded;
579303231Sdim}
580303231Sdim
/// Compute the wait states required before issuing the VMEM/FLAT instruction
/// \p VMEM: the soft-clause check plus the hazard where a non-VGPR (SGPR)
/// operand was recently written by a VALU instruction.
int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) {
  if (!ST.hasVMEMReadSGPRVALUDefHazard())
    return 0;

  int WaitStatesNeeded = checkSoftClauseHazards(VMEM);

  // A read of an SGPR by a VMEM instruction requires 5 wait states when the
  // SGPR was written by a VALU Instruction.
  const int VmemSgprWaitStates = 5;
  auto IsHazardDefFn = [this] (MachineInstr *MI) { return TII.isVALU(*MI); };
  for (const MachineOperand &Use : VMEM->uses()) {
    // VGPR operands are not subject to this hazard; only scalar reads are.
    if (!Use.isReg() || TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
      continue;

    int WaitStatesNeededForUse =
        VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
                                                   VmemSgprWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }
  return WaitStatesNeeded;
}
602303231Sdim
/// Compute the wait states required before issuing the DPP instruction
/// \p DPP: 2 after any write of a VGPR it reads, and 5 after a VALU write of
/// EXEC.
int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) {
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();

  // Check for DPP VGPR read after VALU VGPR write and EXEC write.
  int DppVgprWaitStates = 2;
  int DppExecWaitStates = 5;
  int WaitStatesNeeded = 0;
  auto IsHazardDefFn = [TII] (MachineInstr *MI) { return TII->isVALU(*MI); };

  for (const MachineOperand &Use : DPP->uses()) {
    if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
      continue;
    // Any defining instruction counts for the VGPR hazard (unconditional
    // predicate).
    int WaitStatesNeededForUse =
        DppVgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
                              [](MachineInstr *) { return true; },
                              DppVgprWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  WaitStatesNeeded = std::max(
      WaitStatesNeeded,
      DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
                                                DppExecWaitStates));

  return WaitStatesNeeded;
}
630314564Sdim
631314564Sdimint GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) {
632314564Sdim  const SIInstrInfo *TII = ST.getInstrInfo();
633314564Sdim
634314564Sdim  // v_div_fmas requires 4 wait states after a write to vcc from a VALU
635314564Sdim  // instruction.
636314564Sdim  const int DivFMasWaitStates = 4;
637314564Sdim  auto IsHazardDefFn = [TII] (MachineInstr *MI) { return TII->isVALU(*MI); };
638353358Sdim  int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
639353358Sdim                                               DivFMasWaitStates);
640314564Sdim
641314564Sdim  return DivFMasWaitStates - WaitStatesNeeded;
642314564Sdim}
643314564Sdim
644314564Sdimint GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
645314564Sdim  const SIInstrInfo *TII = ST.getInstrInfo();
646314564Sdim  unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);
647314564Sdim
648314564Sdim  const int GetRegWaitStates = 2;
649314564Sdim  auto IsHazardFn = [TII, GetRegHWReg] (MachineInstr *MI) {
650314564Sdim    return GetRegHWReg == getHWReg(TII, *MI);
651314564Sdim  };
652353358Sdim  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);
653314564Sdim
654314564Sdim  return GetRegWaitStates - WaitStatesNeeded;
655314564Sdim}
656314564Sdim
657314564Sdimint GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
658314564Sdim  const SIInstrInfo *TII = ST.getInstrInfo();
659314564Sdim  unsigned HWReg = getHWReg(TII, *SetRegInstr);
660314564Sdim
661353358Sdim  const int SetRegWaitStates = ST.getSetRegWaitStates();
662314564Sdim  auto IsHazardFn = [TII, HWReg] (MachineInstr *MI) {
663314564Sdim    return HWReg == getHWReg(TII, *MI);
664314564Sdim  };
665353358Sdim  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
666314564Sdim  return SetRegWaitStates - WaitStatesNeeded;
667314564Sdim}
668314564Sdim
/// If \p MI is a store whose data operand is subject to the VALU
/// write-after-store hazard, return that operand's index; otherwise -1.
///
/// The hazard applies to MUBUF/MTBUF stores of >64-bit data with no soffset
/// register, and to FLAT stores of >64-bit data. MIMG is asserted to always
/// use a 256-bit resource descriptor and therefore never hazards here.
int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
  if (!MI.mayStore())
    return -1;

  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned Opcode = MI.getOpcode();
  const MCInstrDesc &Desc = MI.getDesc();

  int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
  int VDataRCID = -1;
  if (VDataIdx != -1)
    VDataRCID = Desc.OpInfo[VDataIdx].RegClass;

  if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
    // There is no hazard if the instruction does not use vector regs
    // (like wbinvl1)
    if (VDataIdx == -1)
      return -1;
    // For MUBUF/MTBUF instructions this hazard only exists if the
    // instruction is not using a register in the soffset field.
    const MachineOperand *SOffset =
        TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
    // If we have no soffset operand, then assume this field has been
    // hardcoded to zero.
    if (AMDGPU::getRegBitWidth(VDataRCID) > 64 &&
        (!SOffset || !SOffset->isReg()))
      return VDataIdx;
  }

  // MIMG instructions create a hazard if they don't use a 256-bit T# and
  // the store size is greater than 8 bytes and they have more than two bits
  // of their dmask set.
  // All our MIMG definitions use a 256-bit T#, so we can skip checking for them.
  if (TII->isMIMG(MI)) {
    int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
    assert(SRsrcIdx != -1 &&
           AMDGPU::getRegBitWidth(Desc.OpInfo[SRsrcIdx].RegClass) == 256);
    (void)SRsrcIdx;
  }

  if (TII->isFLAT(MI)) {
    int DataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
    if (AMDGPU::getRegBitWidth(Desc.OpInfo[DataIdx].RegClass) > 64)
      return DataIdx;
  }

  return -1;
}
717314564Sdim
718327952Sdimint GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
719327952Sdim						const MachineRegisterInfo &MRI) {
720327952Sdim  // Helper to check for the hazard where VMEM instructions that store more than
721327952Sdim  // 8 bytes can have there store data over written by the next instruction.
722327952Sdim  const SIRegisterInfo *TRI = ST.getRegisterInfo();
723327952Sdim
724327952Sdim  const int VALUWaitStates = 1;
725327952Sdim  int WaitStatesNeeded = 0;
726327952Sdim
727327952Sdim  if (!TRI->isVGPR(MRI, Def.getReg()))
728327952Sdim    return WaitStatesNeeded;
729360784Sdim  Register Reg = Def.getReg();
730327952Sdim  auto IsHazardFn = [this, Reg, TRI] (MachineInstr *MI) {
731327952Sdim    int DataIdx = createsVALUHazard(*MI);
732327952Sdim    return DataIdx >= 0 &&
733327952Sdim    TRI->regsOverlap(MI->getOperand(DataIdx).getReg(), Reg);
734327952Sdim  };
735327952Sdim  int WaitStatesNeededForDef =
736353358Sdim    VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
737327952Sdim  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
738327952Sdim
739327952Sdim  return WaitStatesNeeded;
740327952Sdim}
741327952Sdim
742314564Sdimint GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
743314564Sdim  // This checks for the hazard where VMEM instructions that store more than
744314564Sdim  // 8 bytes can have there store data over written by the next instruction.
745314564Sdim  if (!ST.has12DWordStoreHazard())
746314564Sdim    return 0;
747314564Sdim
748327952Sdim  const MachineRegisterInfo &MRI = MF.getRegInfo();
749314564Sdim  int WaitStatesNeeded = 0;
750314564Sdim
751314564Sdim  for (const MachineOperand &Def : VALU->defs()) {
752327952Sdim    WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));
753314564Sdim  }
754327952Sdim
755314564Sdim  return WaitStatesNeeded;
756314564Sdim}
757314564Sdim
758327952Sdimint GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
759327952Sdim  // This checks for hazards associated with inline asm statements.
760327952Sdim  // Since inline asms can contain just about anything, we use this
761327952Sdim  // to call/leverage other check*Hazard routines. Note that
762327952Sdim  // this function doesn't attempt to address all possible inline asm
763327952Sdim  // hazards (good luck), but is a collection of what has been
764327952Sdim  // problematic thus far.
765327952Sdim
766327952Sdim  // see checkVALUHazards()
767327952Sdim  if (!ST.has12DWordStoreHazard())
768327952Sdim    return 0;
769327952Sdim
770327952Sdim  const MachineRegisterInfo &MRI = MF.getRegInfo();
771327952Sdim  int WaitStatesNeeded = 0;
772327952Sdim
773327952Sdim  for (unsigned I = InlineAsm::MIOp_FirstOperand, E = IA->getNumOperands();
774327952Sdim       I != E; ++I) {
775327952Sdim    const MachineOperand &Op = IA->getOperand(I);
776327952Sdim    if (Op.isReg() && Op.isDef()) {
777327952Sdim      WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
778327952Sdim    }
779327952Sdim  }
780327952Sdim
781327952Sdim  return WaitStatesNeeded;
782327952Sdim}
783327952Sdim
784314564Sdimint GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
785314564Sdim  const SIInstrInfo *TII = ST.getInstrInfo();
786314564Sdim  const SIRegisterInfo *TRI = ST.getRegisterInfo();
787327952Sdim  const MachineRegisterInfo &MRI = MF.getRegInfo();
788314564Sdim
789314564Sdim  const MachineOperand *LaneSelectOp =
790314564Sdim      TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);
791314564Sdim
792314564Sdim  if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg()))
793314564Sdim    return 0;
794314564Sdim
795360784Sdim  Register LaneSelectReg = LaneSelectOp->getReg();
796314564Sdim  auto IsHazardFn = [TII] (MachineInstr *MI) {
797314564Sdim    return TII->isVALU(*MI);
798314564Sdim  };
799314564Sdim
800314564Sdim  const int RWLaneWaitStates = 4;
801353358Sdim  int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn,
802353358Sdim                                              RWLaneWaitStates);
803314564Sdim  return RWLaneWaitStates - WaitStatesSince;
804314564Sdim}
805314564Sdim
806314564Sdimint GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
807353358Sdim  if (!ST.hasRFEHazards())
808314564Sdim    return 0;
809314564Sdim
810314564Sdim  const SIInstrInfo *TII = ST.getInstrInfo();
811314564Sdim
812314564Sdim  const int RFEWaitStates = 1;
813314564Sdim
814314564Sdim  auto IsHazardFn = [TII] (MachineInstr *MI) {
815314564Sdim    return getHWReg(TII, *MI) == AMDGPU::Hwreg::ID_TRAPSTS;
816314564Sdim  };
817353358Sdim  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
818314564Sdim  return RFEWaitStates - WaitStatesNeeded;
819314564Sdim}
820321369Sdim
821321369Sdimint GCNHazardRecognizer::checkAnyInstHazards(MachineInstr *MI) {
822341825Sdim  if (MI->isDebugInstr())
823321369Sdim    return 0;
824321369Sdim
825321369Sdim  const SIRegisterInfo *TRI = ST.getRegisterInfo();
826321369Sdim  if (!ST.hasSMovFedHazard())
827321369Sdim    return 0;
828321369Sdim
829321369Sdim  // Check for any instruction reading an SGPR after a write from
830321369Sdim  // s_mov_fed_b32.
831321369Sdim  int MovFedWaitStates = 1;
832321369Sdim  int WaitStatesNeeded = 0;
833321369Sdim
834321369Sdim  for (const MachineOperand &Use : MI->uses()) {
835321369Sdim    if (!Use.isReg() || TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
836321369Sdim      continue;
837321369Sdim    auto IsHazardFn = [] (MachineInstr *MI) {
838321369Sdim      return MI->getOpcode() == AMDGPU::S_MOV_FED_B32;
839321369Sdim    };
840321369Sdim    int WaitStatesNeededForUse =
841353358Sdim        MovFedWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardFn,
842353358Sdim                                                 MovFedWaitStates);
843321369Sdim    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
844321369Sdim  }
845321369Sdim
846321369Sdim  return WaitStatesNeeded;
847321369Sdim}
848321369Sdim
849321369Sdimint GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
850321369Sdim  const SIInstrInfo *TII = ST.getInstrInfo();
851327952Sdim  const int SMovRelWaitStates = 1;
852321369Sdim  auto IsHazardFn = [TII] (MachineInstr *MI) {
853321369Sdim    return TII->isSALU(*MI);
854321369Sdim  };
855353358Sdim  return SMovRelWaitStates - getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn,
856353358Sdim                                                   SMovRelWaitStates);
857321369Sdim}
858353358Sdim
859353358Sdimvoid GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
860353358Sdim  fixVMEMtoScalarWriteHazards(MI);
861353358Sdim  fixVcmpxPermlaneHazards(MI);
862353358Sdim  fixSMEMtoVectorWriteHazards(MI);
863353358Sdim  fixVcmpxExecWARHazard(MI);
864353358Sdim  fixLdsBranchVmemWARHazard(MI);
865353358Sdim}
866353358Sdim
867353358Sdimbool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
868353358Sdim  if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(*MI))
869353358Sdim    return false;
870353358Sdim
871353358Sdim  const SIInstrInfo *TII = ST.getInstrInfo();
872353358Sdim  auto IsHazardFn = [TII] (MachineInstr *MI) {
873353358Sdim    return TII->isVOPC(*MI);
874353358Sdim  };
875353358Sdim
876353358Sdim  auto IsExpiredFn = [] (MachineInstr *MI, int) {
877353358Sdim    if (!MI)
878353358Sdim      return false;
879353358Sdim    unsigned Opc = MI->getOpcode();
880353358Sdim    return SIInstrInfo::isVALU(*MI) &&
881353358Sdim           Opc != AMDGPU::V_NOP_e32 &&
882353358Sdim           Opc != AMDGPU::V_NOP_e64 &&
883353358Sdim           Opc != AMDGPU::V_NOP_sdwa;
884353358Sdim  };
885353358Sdim
886353358Sdim  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
887353358Sdim      std::numeric_limits<int>::max())
888353358Sdim    return false;
889353358Sdim
890353358Sdim  // V_NOP will be discarded by SQ.
891353358Sdim  // Use V_MOB_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
892353358Sdim  // which is always a VGPR and available.
893353358Sdim  auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
894360784Sdim  Register Reg = Src0->getReg();
895353358Sdim  bool IsUndef = Src0->isUndef();
896353358Sdim  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
897353358Sdim          TII->get(AMDGPU::V_MOV_B32_e32))
898353358Sdim    .addReg(Reg, RegState::Define | (IsUndef ? RegState::Dead : 0))
899353358Sdim    .addReg(Reg, IsUndef ? RegState::Undef : RegState::Kill);
900353358Sdim
901353358Sdim  return true;
902353358Sdim}
903353358Sdim
904353358Sdimbool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
905353358Sdim  if (!ST.hasVMEMtoScalarWriteHazard())
906353358Sdim    return false;
907353358Sdim
908353358Sdim  if (!SIInstrInfo::isSALU(*MI) && !SIInstrInfo::isSMRD(*MI))
909353358Sdim    return false;
910353358Sdim
911353358Sdim  if (MI->getNumDefs() == 0)
912353358Sdim    return false;
913353358Sdim
914353358Sdim  const SIRegisterInfo *TRI = ST.getRegisterInfo();
915353358Sdim
916353358Sdim  auto IsHazardFn = [TRI, MI] (MachineInstr *I) {
917353358Sdim    if (!SIInstrInfo::isVMEM(*I) && !SIInstrInfo::isDS(*I) &&
918353358Sdim        !SIInstrInfo::isFLAT(*I))
919353358Sdim      return false;
920353358Sdim
921353358Sdim    for (const MachineOperand &Def : MI->defs()) {
922353358Sdim      MachineOperand *Op = I->findRegisterUseOperand(Def.getReg(), false, TRI);
923353358Sdim      if (!Op)
924353358Sdim        continue;
925353358Sdim      return true;
926353358Sdim    }
927353358Sdim    return false;
928353358Sdim  };
929353358Sdim
930353358Sdim  auto IsExpiredFn = [] (MachineInstr *MI, int) {
931353358Sdim    return MI && (SIInstrInfo::isVALU(*MI) ||
932353358Sdim                  (MI->getOpcode() == AMDGPU::S_WAITCNT &&
933353358Sdim                   !MI->getOperand(0).getImm()));
934353358Sdim  };
935353358Sdim
936353358Sdim  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
937353358Sdim      std::numeric_limits<int>::max())
938353358Sdim    return false;
939353358Sdim
940353358Sdim  const SIInstrInfo *TII = ST.getInstrInfo();
941353358Sdim  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
942353358Sdim  return true;
943353358Sdim}
944353358Sdim
945353358Sdimbool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
946353358Sdim  if (!ST.hasSMEMtoVectorWriteHazard())
947353358Sdim    return false;
948353358Sdim
949353358Sdim  if (!SIInstrInfo::isVALU(*MI))
950353358Sdim    return false;
951353358Sdim
952353358Sdim  unsigned SDSTName;
953353358Sdim  switch (MI->getOpcode()) {
954353358Sdim  case AMDGPU::V_READLANE_B32:
955360661Sdim  case AMDGPU::V_READLANE_B32_gfx10:
956353358Sdim  case AMDGPU::V_READFIRSTLANE_B32:
957353358Sdim    SDSTName = AMDGPU::OpName::vdst;
958353358Sdim    break;
959353358Sdim  default:
960353358Sdim    SDSTName = AMDGPU::OpName::sdst;
961353358Sdim    break;
962353358Sdim  }
963353358Sdim
964353358Sdim  const SIInstrInfo *TII = ST.getInstrInfo();
965353358Sdim  const SIRegisterInfo *TRI = ST.getRegisterInfo();
966353358Sdim  const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
967353358Sdim  const MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName);
968353358Sdim  if (!SDST) {
969353358Sdim    for (const auto &MO : MI->implicit_operands()) {
970353358Sdim      if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegClass(MO.getReg()))) {
971353358Sdim        SDST = &MO;
972353358Sdim        break;
973353358Sdim      }
974353358Sdim    }
975353358Sdim  }
976353358Sdim
977353358Sdim  if (!SDST)
978353358Sdim    return false;
979353358Sdim
980360784Sdim  const Register SDSTReg = SDST->getReg();
981353358Sdim  auto IsHazardFn = [SDSTReg, TRI] (MachineInstr *I) {
982353358Sdim    return SIInstrInfo::isSMRD(*I) && I->readsRegister(SDSTReg, TRI);
983353358Sdim  };
984353358Sdim
985353358Sdim  auto IsExpiredFn = [TII, IV] (MachineInstr *MI, int) {
986353358Sdim    if (MI) {
987353358Sdim      if (TII->isSALU(*MI)) {
988353358Sdim        switch (MI->getOpcode()) {
989353358Sdim        case AMDGPU::S_SETVSKIP:
990353358Sdim        case AMDGPU::S_VERSION:
991353358Sdim        case AMDGPU::S_WAITCNT_VSCNT:
992353358Sdim        case AMDGPU::S_WAITCNT_VMCNT:
993353358Sdim        case AMDGPU::S_WAITCNT_EXPCNT:
994353358Sdim          // These instructions cannot not mitigate the hazard.
995353358Sdim          return false;
996353358Sdim        case AMDGPU::S_WAITCNT_LGKMCNT:
997353358Sdim          // Reducing lgkmcnt count to 0 always mitigates the hazard.
998353358Sdim          return (MI->getOperand(1).getImm() == 0) &&
999353358Sdim                 (MI->getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1000353358Sdim        case AMDGPU::S_WAITCNT: {
1001353358Sdim          const int64_t Imm = MI->getOperand(0).getImm();
1002353358Sdim          AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm);
1003353358Sdim          return (Decoded.LgkmCnt == 0);
1004353358Sdim        }
1005353358Sdim        default:
1006353358Sdim          // SOPP instructions cannot mitigate the hazard.
1007353358Sdim          if (TII->isSOPP(*MI))
1008353358Sdim            return false;
1009353358Sdim          // At this point the SALU can be assumed to mitigate the hazard
1010353358Sdim          // because either:
1011353358Sdim          // (a) it is independent of the at risk SMEM (breaking chain),
1012353358Sdim          // or
1013353358Sdim          // (b) it is dependent on the SMEM, in which case an appropriate
1014353358Sdim          //     s_waitcnt lgkmcnt _must_ exist between it and the at risk
1015353358Sdim          //     SMEM instruction.
1016353358Sdim          return true;
1017353358Sdim        }
1018353358Sdim      }
1019353358Sdim    }
1020353358Sdim    return false;
1021353358Sdim  };
1022353358Sdim
1023353358Sdim  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1024353358Sdim      std::numeric_limits<int>::max())
1025353358Sdim    return false;
1026353358Sdim
1027353358Sdim  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1028353358Sdim          TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
1029353358Sdim      .addImm(0);
1030353358Sdim  return true;
1031353358Sdim}
1032353358Sdim
1033353358Sdimbool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
1034353358Sdim  if (!ST.hasVcmpxExecWARHazard() || !SIInstrInfo::isVALU(*MI))
1035353358Sdim    return false;
1036353358Sdim
1037353358Sdim  const SIRegisterInfo *TRI = ST.getRegisterInfo();
1038353358Sdim  if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))
1039353358Sdim    return false;
1040353358Sdim
1041353358Sdim  auto IsHazardFn = [TRI] (MachineInstr *I) {
1042353358Sdim    if (SIInstrInfo::isVALU(*I))
1043353358Sdim      return false;
1044353358Sdim    return I->readsRegister(AMDGPU::EXEC, TRI);
1045353358Sdim  };
1046353358Sdim
1047353358Sdim  const SIInstrInfo *TII = ST.getInstrInfo();
1048353358Sdim  auto IsExpiredFn = [TII, TRI] (MachineInstr *MI, int) {
1049353358Sdim    if (!MI)
1050353358Sdim      return false;
1051353358Sdim    if (SIInstrInfo::isVALU(*MI)) {
1052353358Sdim      if (TII->getNamedOperand(*MI, AMDGPU::OpName::sdst))
1053353358Sdim        return true;
1054353358Sdim      for (auto MO : MI->implicit_operands())
1055353358Sdim        if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegClass(MO.getReg())))
1056353358Sdim          return true;
1057353358Sdim    }
1058353358Sdim    if (MI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1059353358Sdim        (MI->getOperand(0).getImm() & 0xfffe) == 0xfffe)
1060353358Sdim      return true;
1061353358Sdim    return false;
1062353358Sdim  };
1063353358Sdim
1064353358Sdim  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1065353358Sdim      std::numeric_limits<int>::max())
1066353358Sdim    return false;
1067353358Sdim
1068353358Sdim  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1069353358Sdim          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
1070353358Sdim    .addImm(0xfffe);
1071353358Sdim  return true;
1072353358Sdim}
1073353358Sdim
1074353358Sdimbool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
1075353358Sdim  if (!ST.hasLdsBranchVmemWARHazard())
1076353358Sdim    return false;
1077353358Sdim
1078353358Sdim  auto IsHazardInst = [] (const MachineInstr *MI) {
1079353358Sdim    if (SIInstrInfo::isDS(*MI))
1080353358Sdim      return 1;
1081353358Sdim    if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isSegmentSpecificFLAT(*MI))
1082353358Sdim      return 2;
1083353358Sdim    return 0;
1084353358Sdim  };
1085353358Sdim
1086353358Sdim  auto InstType = IsHazardInst(MI);
1087353358Sdim  if (!InstType)
1088353358Sdim    return false;
1089353358Sdim
1090353358Sdim  auto IsExpiredFn = [&IsHazardInst] (MachineInstr *I, int) {
1091353358Sdim    return I && (IsHazardInst(I) ||
1092353358Sdim                 (I->getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1093353358Sdim                  I->getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
1094353358Sdim                  !I->getOperand(1).getImm()));
1095353358Sdim  };
1096353358Sdim
1097353358Sdim  auto IsHazardFn = [InstType, &IsHazardInst] (MachineInstr *I) {
1098353358Sdim    if (!I->isBranch())
1099353358Sdim      return false;
1100353358Sdim
1101353358Sdim    auto IsHazardFn = [InstType, IsHazardInst] (MachineInstr *I) {
1102353358Sdim      auto InstType2 = IsHazardInst(I);
1103353358Sdim      return InstType2 && InstType != InstType2;
1104353358Sdim    };
1105353358Sdim
1106353358Sdim    auto IsExpiredFn = [InstType, &IsHazardInst] (MachineInstr *I, int) {
1107353358Sdim      if (!I)
1108353358Sdim        return false;
1109353358Sdim
1110353358Sdim      auto InstType2 = IsHazardInst(I);
1111353358Sdim      if (InstType == InstType2)
1112353358Sdim        return true;
1113353358Sdim
1114353358Sdim      return I->getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1115353358Sdim             I->getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
1116353358Sdim             !I->getOperand(1).getImm();
1117353358Sdim    };
1118353358Sdim
1119353358Sdim    return ::getWaitStatesSince(IsHazardFn, I, IsExpiredFn) !=
1120353358Sdim           std::numeric_limits<int>::max();
1121353358Sdim  };
1122353358Sdim
1123353358Sdim  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1124353358Sdim      std::numeric_limits<int>::max())
1125353358Sdim    return false;
1126353358Sdim
1127353358Sdim  const SIInstrInfo *TII = ST.getInstrInfo();
1128353358Sdim  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1129353358Sdim          TII->get(AMDGPU::S_WAITCNT_VSCNT))
1130353358Sdim    .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1131353358Sdim    .addImm(0);
1132353358Sdim
1133353358Sdim  return true;
1134353358Sdim}
1135353358Sdim
1136353358Sdimint GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) {
1137353358Sdim  int NSAtoVMEMWaitStates = 1;
1138353358Sdim
1139353358Sdim  if (!ST.hasNSAtoVMEMBug())
1140353358Sdim    return 0;
1141353358Sdim
1142353358Sdim  if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isMTBUF(*MI))
1143353358Sdim    return 0;
1144353358Sdim
1145353358Sdim  const SIInstrInfo *TII = ST.getInstrInfo();
1146353358Sdim  const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
1147353358Sdim  if (!Offset || (Offset->getImm() & 6) == 0)
1148353358Sdim    return 0;
1149353358Sdim
1150353358Sdim  auto IsHazardFn = [TII] (MachineInstr *I) {
1151353358Sdim    if (!SIInstrInfo::isMIMG(*I))
1152353358Sdim      return false;
1153353358Sdim    const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I->getOpcode());
1154353358Sdim    return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
1155353358Sdim           TII->getInstSizeInBytes(*I) >= 16;
1156353358Sdim  };
1157353358Sdim
1158353358Sdim  return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1);
1159353358Sdim}
1160353358Sdim
1161353358Sdimint GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
1162353358Sdim  int FPAtomicToDenormModeWaitStates = 3;
1163353358Sdim
1164353358Sdim  if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
1165353358Sdim    return 0;
1166353358Sdim
1167353358Sdim  auto IsHazardFn = [] (MachineInstr *I) {
1168353358Sdim    if (!SIInstrInfo::isVMEM(*I) && !SIInstrInfo::isFLAT(*I))
1169353358Sdim      return false;
1170353358Sdim    return SIInstrInfo::isFPAtomic(*I);
1171353358Sdim  };
1172353358Sdim
1173353358Sdim  auto IsExpiredFn = [] (MachineInstr *MI, int WaitStates) {
1174353358Sdim    if (WaitStates >= 3 || SIInstrInfo::isVALU(*MI))
1175353358Sdim      return true;
1176353358Sdim
1177353358Sdim    switch (MI->getOpcode()) {
1178353358Sdim    case AMDGPU::S_WAITCNT:
1179353358Sdim    case AMDGPU::S_WAITCNT_VSCNT:
1180353358Sdim    case AMDGPU::S_WAITCNT_VMCNT:
1181353358Sdim    case AMDGPU::S_WAITCNT_EXPCNT:
1182353358Sdim    case AMDGPU::S_WAITCNT_LGKMCNT:
1183353358Sdim    case AMDGPU::S_WAITCNT_IDLE:
1184353358Sdim      return true;
1185353358Sdim    default:
1186353358Sdim      break;
1187353358Sdim    }
1188353358Sdim
1189353358Sdim    return false;
1190353358Sdim  };
1191353358Sdim
1192353358Sdim
1193353358Sdim  return FPAtomicToDenormModeWaitStates -
1194353358Sdim         ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn);
1195353358Sdim}
1196353358Sdim
1197353358Sdimint GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
1198353358Sdim  assert(SIInstrInfo::isMAI(*MI));
1199353358Sdim
1200353358Sdim  int WaitStatesNeeded = 0;
1201353358Sdim  unsigned Opc = MI->getOpcode();
1202353358Sdim
1203353358Sdim  auto IsVALUFn = [] (MachineInstr *MI) {
1204353358Sdim    return SIInstrInfo::isVALU(*MI);
1205353358Sdim  };
1206353358Sdim
1207353358Sdim  if (Opc != AMDGPU::V_ACCVGPR_READ_B32) { // MFMA or v_accvgpr_write
1208353358Sdim    const int LegacyVALUWritesVGPRWaitStates = 2;
1209353358Sdim    const int VALUWritesExecWaitStates = 4;
1210353358Sdim    const int MaxWaitStates = 4;
1211353358Sdim
1212353358Sdim    int WaitStatesNeededForUse = VALUWritesExecWaitStates -
1213353358Sdim      getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
1214353358Sdim    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1215353358Sdim
1216353358Sdim    if (WaitStatesNeeded < MaxWaitStates) {
1217353358Sdim      for (const MachineOperand &Use : MI->explicit_uses()) {
1218353358Sdim        const int MaxWaitStates = 2;
1219353358Sdim
1220353358Sdim        if (!Use.isReg() || !TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
1221353358Sdim          continue;
1222353358Sdim
1223353358Sdim        int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
1224353358Sdim          getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates);
1225353358Sdim        WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1226353358Sdim
1227353358Sdim        if (WaitStatesNeeded == MaxWaitStates)
1228353358Sdim          break;
1229353358Sdim      }
1230353358Sdim    }
1231353358Sdim  }
1232353358Sdim
1233353358Sdim  auto IsMFMAFn = [] (MachineInstr *MI) {
1234353358Sdim    return SIInstrInfo::isMAI(*MI) &&
1235353358Sdim           MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32 &&
1236353358Sdim           MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32;
1237353358Sdim  };
1238353358Sdim
1239353358Sdim  for (const MachineOperand &Op : MI->explicit_operands()) {
1240353358Sdim    if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg()))
1241353358Sdim      continue;
1242353358Sdim
1243353358Sdim    if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32)
1244353358Sdim      continue;
1245353358Sdim
1246353358Sdim    const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
1247353358Sdim    const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
1248353358Sdim    const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
1249353358Sdim    const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
1250353358Sdim    const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
1251353358Sdim    const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
1252353358Sdim    const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
1253353358Sdim    const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
1254353358Sdim    const int MaxWaitStates = 18;
1255360784Sdim    Register Reg = Op.getReg();
1256353358Sdim    unsigned HazardDefLatency = 0;
1257353358Sdim
1258353358Sdim    auto IsOverlappedMFMAFn = [Reg, &IsMFMAFn, &HazardDefLatency, this]
1259353358Sdim                              (MachineInstr *MI) {
1260353358Sdim      if (!IsMFMAFn(MI))
1261353358Sdim        return false;
1262360784Sdim      Register DstReg = MI->getOperand(0).getReg();
1263353358Sdim      if (DstReg == Reg)
1264353358Sdim        return false;
1265353358Sdim      HazardDefLatency = std::max(HazardDefLatency,
1266353358Sdim                                  TSchedModel.computeInstrLatency(MI));
1267353358Sdim      return TRI.regsOverlap(DstReg, Reg);
1268353358Sdim    };
1269353358Sdim
1270353358Sdim    int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,
1271353358Sdim                                                   MaxWaitStates);
1272353358Sdim    int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
1273353358Sdim    int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
1274353358Sdim    int OpNo = MI->getOperandNo(&Op);
1275353358Sdim    if (OpNo == SrcCIdx) {
1276353358Sdim      NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
1277353358Sdim    } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32) {
1278353358Sdim      switch (HazardDefLatency) {
1279353358Sdim      case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
1280353358Sdim               break;
1281353358Sdim      case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
1282353358Sdim               break;
1283353358Sdim      case 16: LLVM_FALLTHROUGH;
1284353358Sdim      default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
1285353358Sdim               break;
1286353358Sdim      }
1287353358Sdim    } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32) {
1288353358Sdim      switch (HazardDefLatency) {
1289353358Sdim      case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
1290353358Sdim               break;
1291353358Sdim      case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
1292353358Sdim               break;
1293353358Sdim      case 16: LLVM_FALLTHROUGH;
1294353358Sdim      default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
1295353358Sdim               break;
1296353358Sdim      }
1297353358Sdim    }
1298353358Sdim
1299353358Sdim    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
1300353358Sdim    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1301353358Sdim
1302353358Sdim    if (WaitStatesNeeded == MaxWaitStates)
1303353358Sdim      return WaitStatesNeeded; // Early exit.
1304353358Sdim
1305353358Sdim    auto IsAccVgprWriteFn = [Reg, this] (MachineInstr *MI) {
1306353358Sdim      if (MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32)
1307353358Sdim        return false;
1308360784Sdim      Register DstReg = MI->getOperand(0).getReg();
1309353358Sdim      return TRI.regsOverlap(Reg, DstReg);
1310353358Sdim    };
1311353358Sdim
1312353358Sdim    const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
1313353358Sdim    const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
1314353358Sdim    const int AccVGPRWriteAccVgprReadWaitStates = 3;
1315353358Sdim    NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
1316353358Sdim    if (OpNo == SrcCIdx)
1317353358Sdim      NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
1318353358Sdim    else if (Opc == AMDGPU::V_ACCVGPR_READ_B32)
1319353358Sdim      NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;
1320353358Sdim
1321353358Sdim    WaitStatesNeededForUse = NeedWaitStates -
1322353358Sdim      getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);
1323353358Sdim    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1324353358Sdim
1325353358Sdim    if (WaitStatesNeeded == MaxWaitStates)
1326353358Sdim      return WaitStatesNeeded; // Early exit.
1327353358Sdim  }
1328353358Sdim
1329353358Sdim  if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32) {
1330353358Sdim    const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
1331353358Sdim    const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
1332353358Sdim    const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
1333353358Sdim    const int MaxWaitStates = 13;
1334360784Sdim    Register DstReg = MI->getOperand(0).getReg();
1335353358Sdim    unsigned HazardDefLatency = 0;
1336353358Sdim
1337353358Sdim    auto IsSrcCMFMAFn = [DstReg, &IsMFMAFn, &HazardDefLatency, this]
1338353358Sdim                         (MachineInstr *MI) {
1339353358Sdim      if (!IsMFMAFn(MI))
1340353358Sdim        return false;
1341360784Sdim      Register Reg = TII.getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
1342353358Sdim      HazardDefLatency = std::max(HazardDefLatency,
1343353358Sdim                                  TSchedModel.computeInstrLatency(MI));
1344353358Sdim      return TRI.regsOverlap(Reg, DstReg);
1345353358Sdim    };
1346353358Sdim
1347353358Sdim    int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
1348353358Sdim    int NeedWaitStates;
1349353358Sdim    switch (HazardDefLatency) {
1350353358Sdim    case 2:  NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
1351353358Sdim             break;
1352353358Sdim    case 8:  NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
1353353358Sdim             break;
1354353358Sdim    case 16: LLVM_FALLTHROUGH;
1355353358Sdim    default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
1356353358Sdim             break;
1357353358Sdim    }
1358353358Sdim
1359353358Sdim    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
1360353358Sdim    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1361353358Sdim  }
1362353358Sdim
1363353358Sdim  return WaitStatesNeeded;
1364353358Sdim}
1365353358Sdim
1366353358Sdimint GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) {
1367353358Sdim  if (!ST.hasMAIInsts())
1368353358Sdim    return 0;
1369353358Sdim
1370353358Sdim  int WaitStatesNeeded = 0;
1371353358Sdim
1372353358Sdim  auto IsAccVgprReadFn = [] (MachineInstr *MI) {
1373353358Sdim    return MI->getOpcode() == AMDGPU::V_ACCVGPR_READ_B32;
1374353358Sdim  };
1375353358Sdim
1376353358Sdim  for (const MachineOperand &Op : MI->explicit_uses()) {
1377353358Sdim    if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg()))
1378353358Sdim      continue;
1379353358Sdim
1380360784Sdim    Register Reg = Op.getReg();
1381353358Sdim
1382353358Sdim    const int AccVgprReadLdStWaitStates = 2;
1383353358Sdim    const int VALUWriteAccVgprReadLdStDepVALUWaitStates = 1;
1384353358Sdim    const int MaxWaitStates = 2;
1385353358Sdim
1386353358Sdim    int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
1387353358Sdim      getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);
1388353358Sdim    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1389353358Sdim
1390353358Sdim    if (WaitStatesNeeded == MaxWaitStates)
1391353358Sdim      return WaitStatesNeeded; // Early exit.
1392353358Sdim
1393353358Sdim    auto IsVALUAccVgprReadCheckFn = [Reg, this] (MachineInstr *MI) {
1394353358Sdim      if (MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32)
1395353358Sdim        return false;
1396353358Sdim      auto IsVALUFn = [] (MachineInstr *MI) {
1397353358Sdim        return SIInstrInfo::isVALU(*MI) && !SIInstrInfo::isMAI(*MI);
1398353358Sdim      };
1399353358Sdim      return getWaitStatesSinceDef(Reg, IsVALUFn, 2 /*MaxWaitStates*/) <
1400353358Sdim             std::numeric_limits<int>::max();
1401353358Sdim    };
1402353358Sdim
1403353358Sdim    WaitStatesNeededForUse = VALUWriteAccVgprReadLdStDepVALUWaitStates -
1404353358Sdim      getWaitStatesSince(IsVALUAccVgprReadCheckFn, MaxWaitStates);
1405353358Sdim    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1406353358Sdim  }
1407353358Sdim
1408353358Sdim  return WaitStatesNeeded;
1409353358Sdim}
1410