1303231Sdim//===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls ------------===// 2303231Sdim// 3353358Sdim// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4353358Sdim// See https://llvm.org/LICENSE.txt for license information. 5353358Sdim// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6303231Sdim// 7303231Sdim//===----------------------------------------------------------------------===// 8303231Sdim// 9303231Sdim// This file implements hazard recognizers for scheduling on GCN processors. 10303231Sdim// 11303231Sdim//===----------------------------------------------------------------------===// 12303231Sdim 13303231Sdim#include "GCNHazardRecognizer.h" 14303231Sdim#include "AMDGPUSubtarget.h" 15321369Sdim#include "SIDefines.h" 16303231Sdim#include "SIInstrInfo.h" 17321369Sdim#include "SIRegisterInfo.h" 18341825Sdim#include "MCTargetDesc/AMDGPUMCTargetDesc.h" 19321369Sdim#include "Utils/AMDGPUBaseInfo.h" 20321369Sdim#include "llvm/ADT/iterator_range.h" 21321369Sdim#include "llvm/CodeGen/MachineFunction.h" 22321369Sdim#include "llvm/CodeGen/MachineInstr.h" 23353358Sdim#include "llvm/CodeGen/MachineInstrBuilder.h" 24321369Sdim#include "llvm/CodeGen/MachineOperand.h" 25303231Sdim#include "llvm/CodeGen/ScheduleDAG.h" 26321369Sdim#include "llvm/MC/MCInstrDesc.h" 27321369Sdim#include "llvm/Support/ErrorHandling.h" 28321369Sdim#include <algorithm> 29321369Sdim#include <cassert> 30321369Sdim#include <limits> 31321369Sdim#include <set> 32321369Sdim#include <vector> 33303231Sdim 34303231Sdimusing namespace llvm; 35303231Sdim 36303231Sdim//===----------------------------------------------------------------------===// 37303231Sdim// Hazard Recoginizer Implementation 38303231Sdim//===----------------------------------------------------------------------===// 39303231Sdim 40303231SdimGCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) : 41353358Sdim IsHazardRecognizerMode(false), 42303231Sdim 
CurrCycleInstr(nullptr), 43303231Sdim MF(MF), 44341825Sdim ST(MF.getSubtarget<GCNSubtarget>()), 45327952Sdim TII(*ST.getInstrInfo()), 46327952Sdim TRI(TII.getRegisterInfo()), 47327952Sdim ClauseUses(TRI.getNumRegUnits()), 48327952Sdim ClauseDefs(TRI.getNumRegUnits()) { 49353358Sdim MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 18 : 5; 50353358Sdim TSchedModel.init(&ST); 51303231Sdim} 52303231Sdim 53303231Sdimvoid GCNHazardRecognizer::EmitInstruction(SUnit *SU) { 54303231Sdim EmitInstruction(SU->getInstr()); 55303231Sdim} 56303231Sdim 57303231Sdimvoid GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) { 58303231Sdim CurrCycleInstr = MI; 59303231Sdim} 60303231Sdim 61314564Sdimstatic bool isDivFMas(unsigned Opcode) { 62314564Sdim return Opcode == AMDGPU::V_DIV_FMAS_F32 || Opcode == AMDGPU::V_DIV_FMAS_F64; 63314564Sdim} 64314564Sdim 65314564Sdimstatic bool isSGetReg(unsigned Opcode) { 66314564Sdim return Opcode == AMDGPU::S_GETREG_B32; 67314564Sdim} 68314564Sdim 69314564Sdimstatic bool isSSetReg(unsigned Opcode) { 70314564Sdim return Opcode == AMDGPU::S_SETREG_B32 || Opcode == AMDGPU::S_SETREG_IMM32_B32; 71314564Sdim} 72314564Sdim 73314564Sdimstatic bool isRWLane(unsigned Opcode) { 74314564Sdim return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32; 75314564Sdim} 76314564Sdim 77314564Sdimstatic bool isRFE(unsigned Opcode) { 78314564Sdim return Opcode == AMDGPU::S_RFE_B64; 79314564Sdim} 80314564Sdim 81321369Sdimstatic bool isSMovRel(unsigned Opcode) { 82321369Sdim switch (Opcode) { 83321369Sdim case AMDGPU::S_MOVRELS_B32: 84321369Sdim case AMDGPU::S_MOVRELS_B64: 85321369Sdim case AMDGPU::S_MOVRELD_B32: 86321369Sdim case AMDGPU::S_MOVRELD_B64: 87321369Sdim return true; 88321369Sdim default: 89321369Sdim return false; 90321369Sdim } 91321369Sdim} 92321369Sdim 93344779Sdimstatic bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII, 94344779Sdim const MachineInstr &MI) { 95344779Sdim if (TII.isAlwaysGDS(MI.getOpcode())) 96344779Sdim 
return true; 97344779Sdim 98327952Sdim switch (MI.getOpcode()) { 99327952Sdim case AMDGPU::S_SENDMSG: 100327952Sdim case AMDGPU::S_SENDMSGHALT: 101327952Sdim case AMDGPU::S_TTRACEDATA: 102327952Sdim return true; 103344779Sdim // These DS opcodes don't support GDS. 104344779Sdim case AMDGPU::DS_NOP: 105344779Sdim case AMDGPU::DS_PERMUTE_B32: 106344779Sdim case AMDGPU::DS_BPERMUTE_B32: 107344779Sdim return false; 108327952Sdim default: 109344779Sdim if (TII.isDS(MI.getOpcode())) { 110344779Sdim int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(), 111344779Sdim AMDGPU::OpName::gds); 112344779Sdim if (MI.getOperand(GDS).getImm()) 113344779Sdim return true; 114344779Sdim } 115327952Sdim return false; 116327952Sdim } 117327952Sdim} 118327952Sdim 119353358Sdimstatic bool isPermlane(const MachineInstr &MI) { 120353358Sdim unsigned Opcode = MI.getOpcode(); 121353358Sdim return Opcode == AMDGPU::V_PERMLANE16_B32 || 122353358Sdim Opcode == AMDGPU::V_PERMLANEX16_B32; 123353358Sdim} 124353358Sdim 125314564Sdimstatic unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) { 126314564Sdim const MachineOperand *RegOp = TII->getNamedOperand(RegInstr, 127314564Sdim AMDGPU::OpName::simm16); 128314564Sdim return RegOp->getImm() & AMDGPU::Hwreg::ID_MASK_; 129314564Sdim} 130314564Sdim 131303231SdimScheduleHazardRecognizer::HazardType 132303231SdimGCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) { 133303231Sdim MachineInstr *MI = SU->getInstr(); 134353358Sdim if (MI->isBundle()) 135353358Sdim return NoHazard; 136303231Sdim 137303231Sdim if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0) 138303231Sdim return NoopHazard; 139303231Sdim 140327952Sdim // FIXME: Should flat be considered vmem? 
141327952Sdim if ((SIInstrInfo::isVMEM(*MI) || 142327952Sdim SIInstrInfo::isFLAT(*MI)) 143327952Sdim && checkVMEMHazards(MI) > 0) 144303231Sdim return NoopHazard; 145303231Sdim 146353358Sdim if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0) 147353358Sdim return NoopHazard; 148353358Sdim 149353358Sdim if (checkFPAtomicToDenormModeHazard(MI) > 0) 150353358Sdim return NoopHazard; 151353358Sdim 152353358Sdim if (ST.hasNoDataDepHazard()) 153353358Sdim return NoHazard; 154353358Sdim 155314564Sdim if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0) 156314564Sdim return NoopHazard; 157314564Sdim 158303231Sdim if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0) 159303231Sdim return NoopHazard; 160303231Sdim 161314564Sdim if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0) 162314564Sdim return NoopHazard; 163314564Sdim 164314564Sdim if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0) 165314564Sdim return NoopHazard; 166314564Sdim 167314564Sdim if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0) 168314564Sdim return NoopHazard; 169314564Sdim 170314564Sdim if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0) 171314564Sdim return NoopHazard; 172314564Sdim 173314564Sdim if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0) 174314564Sdim return NoopHazard; 175314564Sdim 176327952Sdim if (ST.hasReadM0MovRelInterpHazard() && 177327952Sdim (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode())) && 178321369Sdim checkReadM0Hazards(MI) > 0) 179321369Sdim return NoopHazard; 180321369Sdim 181344779Sdim if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI) && 182327952Sdim checkReadM0Hazards(MI) > 0) 183327952Sdim return NoopHazard; 184327952Sdim 185353358Sdim if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0) 186353358Sdim return NoopHazard; 187353358Sdim 188360784Sdim if (MI->mayLoadOrStore() && checkMAILdStHazards(MI) > 0) 189353358Sdim return NoopHazard; 190353358Sdim 191327952Sdim if (MI->isInlineAsm() && 
checkInlineAsmHazards(MI) > 0) 192327952Sdim return NoopHazard; 193327952Sdim 194321369Sdim if (checkAnyInstHazards(MI) > 0) 195321369Sdim return NoopHazard; 196321369Sdim 197303231Sdim return NoHazard; 198303231Sdim} 199303231Sdim 200353358Sdimstatic void insertNoopInBundle(MachineInstr *MI, const SIInstrInfo &TII) { 201353358Sdim BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP)) 202353358Sdim .addImm(0); 203353358Sdim} 204353358Sdim 205353358Sdimvoid GCNHazardRecognizer::processBundle() { 206353358Sdim MachineBasicBlock::instr_iterator MI = std::next(CurrCycleInstr->getIterator()); 207353358Sdim MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end(); 208353358Sdim // Check bundled MachineInstr's for hazards. 209353358Sdim for (; MI != E && MI->isInsideBundle(); ++MI) { 210353358Sdim CurrCycleInstr = &*MI; 211353358Sdim unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr); 212353358Sdim 213353358Sdim if (IsHazardRecognizerMode) 214353358Sdim fixHazards(CurrCycleInstr); 215353358Sdim 216353358Sdim for (unsigned i = 0; i < WaitStates; ++i) 217353358Sdim insertNoopInBundle(CurrCycleInstr, TII); 218353358Sdim 219353358Sdim // It���s unnecessary to track more than MaxLookAhead instructions. Since we 220353358Sdim // include the bundled MI directly after, only add a maximum of 221353358Sdim // (MaxLookAhead - 1) noops to EmittedInstrs. 
222353358Sdim for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i) 223353358Sdim EmittedInstrs.push_front(nullptr); 224353358Sdim 225353358Sdim EmittedInstrs.push_front(CurrCycleInstr); 226353358Sdim EmittedInstrs.resize(MaxLookAhead); 227353358Sdim } 228353358Sdim CurrCycleInstr = nullptr; 229353358Sdim} 230353358Sdim 231303231Sdimunsigned GCNHazardRecognizer::PreEmitNoops(SUnit *SU) { 232353358Sdim IsHazardRecognizerMode = false; 233353358Sdim return PreEmitNoopsCommon(SU->getInstr()); 234303231Sdim} 235303231Sdim 236303231Sdimunsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) { 237353358Sdim IsHazardRecognizerMode = true; 238353358Sdim CurrCycleInstr = MI; 239353358Sdim unsigned W = PreEmitNoopsCommon(MI); 240353358Sdim fixHazards(MI); 241353358Sdim CurrCycleInstr = nullptr; 242353358Sdim return W; 243353358Sdim} 244353358Sdim 245353358Sdimunsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) { 246353358Sdim if (MI->isBundle()) 247353358Sdim return 0; 248353358Sdim 249321369Sdim int WaitStates = std::max(0, checkAnyInstHazards(MI)); 250321369Sdim 251303231Sdim if (SIInstrInfo::isSMRD(*MI)) 252321369Sdim return std::max(WaitStates, checkSMRDHazards(MI)); 253303231Sdim 254353358Sdim if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI)) 255353358Sdim WaitStates = std::max(WaitStates, checkVMEMHazards(MI)); 256353358Sdim 257353358Sdim if (ST.hasNSAtoVMEMBug()) 258353358Sdim WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI)); 259353358Sdim 260353358Sdim WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI)); 261353358Sdim 262353358Sdim if (ST.hasNoDataDepHazard()) 263353358Sdim return WaitStates; 264353358Sdim 265327952Sdim if (SIInstrInfo::isVALU(*MI)) 266327952Sdim WaitStates = std::max(WaitStates, checkVALUHazards(MI)); 267303231Sdim 268327952Sdim if (SIInstrInfo::isDPP(*MI)) 269327952Sdim WaitStates = std::max(WaitStates, checkDPPHazards(MI)); 270314564Sdim 271327952Sdim if 
(isDivFMas(MI->getOpcode())) 272327952Sdim WaitStates = std::max(WaitStates, checkDivFMasHazards(MI)); 273314564Sdim 274327952Sdim if (isRWLane(MI->getOpcode())) 275327952Sdim WaitStates = std::max(WaitStates, checkRWLaneHazards(MI)); 276314564Sdim 277327952Sdim if (MI->isInlineAsm()) 278327952Sdim return std::max(WaitStates, checkInlineAsmHazards(MI)); 279321369Sdim 280314564Sdim if (isSGetReg(MI->getOpcode())) 281321369Sdim return std::max(WaitStates, checkGetRegHazards(MI)); 282314564Sdim 283314564Sdim if (isSSetReg(MI->getOpcode())) 284321369Sdim return std::max(WaitStates, checkSetRegHazards(MI)); 285314564Sdim 286314564Sdim if (isRFE(MI->getOpcode())) 287321369Sdim return std::max(WaitStates, checkRFEHazards(MI)); 288314564Sdim 289327952Sdim if (ST.hasReadM0MovRelInterpHazard() && (TII.isVINTRP(*MI) || 290327952Sdim isSMovRel(MI->getOpcode()))) 291321369Sdim return std::max(WaitStates, checkReadM0Hazards(MI)); 292321369Sdim 293344779Sdim if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) 294327952Sdim return std::max(WaitStates, checkReadM0Hazards(MI)); 295327952Sdim 296353358Sdim if (SIInstrInfo::isMAI(*MI)) 297353358Sdim return std::max(WaitStates, checkMAIHazards(MI)); 298353358Sdim 299360784Sdim if (MI->mayLoadOrStore()) 300353358Sdim return std::max(WaitStates, checkMAILdStHazards(MI)); 301353358Sdim 302321369Sdim return WaitStates; 303303231Sdim} 304303231Sdim 305303231Sdimvoid GCNHazardRecognizer::EmitNoop() { 306303231Sdim EmittedInstrs.push_front(nullptr); 307303231Sdim} 308303231Sdim 309303231Sdimvoid GCNHazardRecognizer::AdvanceCycle() { 310303231Sdim // When the scheduler detects a stall, it will call AdvanceCycle() without 311303231Sdim // emitting any instructions. 312303231Sdim if (!CurrCycleInstr) 313303231Sdim return; 314303231Sdim 315344779Sdim // Do not track non-instructions which do not affect the wait states. 
316344779Sdim // If included, these instructions can lead to buffer overflow such that 317344779Sdim // detectable hazards are missed. 318353358Sdim if (CurrCycleInstr->isImplicitDef() || CurrCycleInstr->isDebugInstr() || 319353358Sdim CurrCycleInstr->isKill()) 320344779Sdim return; 321353358Sdim 322353358Sdim if (CurrCycleInstr->isBundle()) { 323353358Sdim processBundle(); 324344779Sdim return; 325353358Sdim } 326344779Sdim 327321369Sdim unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr); 328303231Sdim 329303231Sdim // Keep track of emitted instructions 330303231Sdim EmittedInstrs.push_front(CurrCycleInstr); 331303231Sdim 332303231Sdim // Add a nullptr for each additional wait state after the first. Make sure 333303231Sdim // not to add more than getMaxLookAhead() items to the list, since we 334303231Sdim // truncate the list to that size right after this loop. 335303231Sdim for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead()); 336303231Sdim i < e; ++i) { 337303231Sdim EmittedInstrs.push_front(nullptr); 338303231Sdim } 339303231Sdim 340303231Sdim // getMaxLookahead() is the largest number of wait states we will ever need 341303231Sdim // to insert, so there is no point in keeping track of more than that many 342303231Sdim // wait states. 
343303231Sdim EmittedInstrs.resize(getMaxLookAhead()); 344303231Sdim 345303231Sdim CurrCycleInstr = nullptr; 346303231Sdim} 347303231Sdim 348303231Sdimvoid GCNHazardRecognizer::RecedeCycle() { 349303231Sdim llvm_unreachable("hazard recognizer does not support bottom-up scheduling."); 350303231Sdim} 351303231Sdim 352303231Sdim//===----------------------------------------------------------------------===// 353303231Sdim// Helper Functions 354303231Sdim//===----------------------------------------------------------------------===// 355303231Sdim 356353358Sdimtypedef function_ref<bool(MachineInstr *, int WaitStates)> IsExpiredFn; 357353358Sdim 358353358Sdim// Returns a minimum wait states since \p I walking all predecessors. 359353358Sdim// Only scans until \p IsExpired does not return true. 360353358Sdim// Can only be run in a hazard recognizer mode. 361353358Sdimstatic int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard, 362353358Sdim MachineBasicBlock *MBB, 363353358Sdim MachineBasicBlock::reverse_instr_iterator I, 364353358Sdim int WaitStates, 365353358Sdim IsExpiredFn IsExpired, 366353358Sdim DenseSet<const MachineBasicBlock *> &Visited) { 367353358Sdim for (auto E = MBB->instr_rend(); I != E; ++I) { 368353358Sdim // Don't add WaitStates for parent BUNDLE instructions. 
369353358Sdim if (I->isBundle()) 370353358Sdim continue; 371353358Sdim 372353358Sdim if (IsHazard(&*I)) 373353358Sdim return WaitStates; 374353358Sdim 375353358Sdim if (I->isInlineAsm() || I->isImplicitDef() || I->isDebugInstr()) 376353358Sdim continue; 377353358Sdim 378353358Sdim WaitStates += SIInstrInfo::getNumWaitStates(*I); 379353358Sdim 380353358Sdim if (IsExpired(&*I, WaitStates)) 381353358Sdim return std::numeric_limits<int>::max(); 382353358Sdim } 383353358Sdim 384353358Sdim int MinWaitStates = WaitStates; 385353358Sdim bool Found = false; 386353358Sdim for (MachineBasicBlock *Pred : MBB->predecessors()) { 387353358Sdim if (!Visited.insert(Pred).second) 388353358Sdim continue; 389353358Sdim 390353358Sdim int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(), 391353358Sdim WaitStates, IsExpired, Visited); 392353358Sdim 393353358Sdim if (W == std::numeric_limits<int>::max()) 394353358Sdim continue; 395353358Sdim 396353358Sdim MinWaitStates = Found ? std::min(MinWaitStates, W) : W; 397353358Sdim if (IsExpired(nullptr, MinWaitStates)) 398353358Sdim return MinWaitStates; 399353358Sdim 400353358Sdim Found = true; 401353358Sdim } 402353358Sdim 403353358Sdim if (Found) 404353358Sdim return MinWaitStates; 405353358Sdim 406353358Sdim return std::numeric_limits<int>::max(); 407353358Sdim} 408353358Sdim 409353358Sdimstatic int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard, 410353358Sdim MachineInstr *MI, 411353358Sdim IsExpiredFn IsExpired) { 412353358Sdim DenseSet<const MachineBasicBlock *> Visited; 413353358Sdim return getWaitStatesSince(IsHazard, MI->getParent(), 414353358Sdim std::next(MI->getReverseIterator()), 415353358Sdim 0, IsExpired, Visited); 416353358Sdim} 417353358Sdim 418353358Sdimint GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) { 419353358Sdim if (IsHazardRecognizerMode) { 420353358Sdim auto IsExpiredFn = [Limit] (MachineInstr *, int WaitStates) { 421353358Sdim return WaitStates >= Limit; 422353358Sdim 
}; 423353358Sdim return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn); 424353358Sdim } 425353358Sdim 426326496Sdim int WaitStates = 0; 427303231Sdim for (MachineInstr *MI : EmittedInstrs) { 428326496Sdim if (MI) { 429326496Sdim if (IsHazard(MI)) 430326496Sdim return WaitStates; 431326496Sdim 432353358Sdim if (MI->isInlineAsm()) 433326496Sdim continue; 434326496Sdim } 435303231Sdim ++WaitStates; 436353358Sdim 437353358Sdim if (WaitStates >= Limit) 438353358Sdim break; 439303231Sdim } 440303231Sdim return std::numeric_limits<int>::max(); 441303231Sdim} 442303231Sdim 443353358Sdimint GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg, 444353358Sdim IsHazardFn IsHazardDef, 445353358Sdim int Limit) { 446314564Sdim const SIRegisterInfo *TRI = ST.getRegisterInfo(); 447314564Sdim 448314564Sdim auto IsHazardFn = [IsHazardDef, TRI, Reg] (MachineInstr *MI) { 449314564Sdim return IsHazardDef(MI) && MI->modifiesRegister(Reg, TRI); 450314564Sdim }; 451314564Sdim 452353358Sdim return getWaitStatesSince(IsHazardFn, Limit); 453314564Sdim} 454314564Sdim 455353358Sdimint GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard, 456353358Sdim int Limit) { 457314564Sdim auto IsHazardFn = [IsHazard] (MachineInstr *MI) { 458314564Sdim return isSSetReg(MI->getOpcode()) && IsHazard(MI); 459314564Sdim }; 460314564Sdim 461353358Sdim return getWaitStatesSince(IsHazardFn, Limit); 462314564Sdim} 463314564Sdim 464303231Sdim//===----------------------------------------------------------------------===// 465303231Sdim// No-op Hazard Detection 466303231Sdim//===----------------------------------------------------------------------===// 467303231Sdim 468327952Sdimstatic void addRegUnits(const SIRegisterInfo &TRI, 469327952Sdim BitVector &BV, unsigned Reg) { 470327952Sdim for (MCRegUnitIterator RUI(Reg, &TRI); RUI.isValid(); ++RUI) 471327952Sdim BV.set(*RUI); 472327952Sdim} 473327952Sdim 474327952Sdimstatic void addRegsToSet(const SIRegisterInfo &TRI, 475327952Sdim 
iterator_range<MachineInstr::const_mop_iterator> Ops, 476327952Sdim BitVector &Set) { 477303231Sdim for (const MachineOperand &Op : Ops) { 478303231Sdim if (Op.isReg()) 479327952Sdim addRegUnits(TRI, Set, Op.getReg()); 480303231Sdim } 481303231Sdim} 482303231Sdim 483327952Sdimvoid GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) { 484327952Sdim // XXX: Do we need to worry about implicit operands 485327952Sdim addRegsToSet(TRI, MI.defs(), ClauseDefs); 486327952Sdim addRegsToSet(TRI, MI.uses(), ClauseUses); 487327952Sdim} 488327952Sdim 489327952Sdimint GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) { 490327952Sdim // SMEM soft clause are only present on VI+, and only matter if xnack is 491327952Sdim // enabled. 492327952Sdim if (!ST.isXNACKEnabled()) 493303231Sdim return 0; 494303231Sdim 495327952Sdim bool IsSMRD = TII.isSMRD(*MEM); 496327952Sdim 497327952Sdim resetClause(); 498327952Sdim 499303231Sdim // A soft-clause is any group of consecutive SMEM instructions. The 500303231Sdim // instructions in this group may return out of order and/or may be 501303231Sdim // replayed (i.e. the same instruction issued more than once). 502303231Sdim // 503353358Sdim // In order to handle these situations correctly we need to make sure that 504353358Sdim // when a clause has more than one instruction, no instruction in the clause 505353358Sdim // writes to a register that is read by another instruction in the clause 506303231Sdim // (including itself). If we encounter this situaion, we need to break the 507303231Sdim // clause by inserting a non SMEM instruction. 508303231Sdim 509303231Sdim for (MachineInstr *MI : EmittedInstrs) { 510303231Sdim // When we hit a non-SMEM instruction then we have passed the start of the 511303231Sdim // clause and we can stop. 
512327952Sdim if (!MI) 513303231Sdim break; 514303231Sdim 515327952Sdim if (IsSMRD != SIInstrInfo::isSMRD(*MI)) 516327952Sdim break; 517327952Sdim 518327952Sdim addClauseInst(*MI); 519303231Sdim } 520303231Sdim 521327952Sdim if (ClauseDefs.none()) 522303231Sdim return 0; 523303231Sdim 524327952Sdim // We need to make sure not to put loads and stores in the same clause if they 525327952Sdim // use the same address. For now, just start a new clause whenever we see a 526327952Sdim // store. 527327952Sdim if (MEM->mayStore()) 528303231Sdim return 1; 529303231Sdim 530327952Sdim addClauseInst(*MEM); 531303231Sdim 532303231Sdim // If the set of defs and uses intersect then we cannot add this instruction 533303231Sdim // to the clause, so we have a hazard. 534327952Sdim return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0; 535303231Sdim} 536303231Sdim 537303231Sdimint GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) { 538303231Sdim int WaitStatesNeeded = 0; 539303231Sdim 540327952Sdim WaitStatesNeeded = checkSoftClauseHazards(SMRD); 541303231Sdim 542303231Sdim // This SMRD hazard only affects SI. 543353358Sdim if (!ST.hasSMRDReadVALUDefHazard()) 544303231Sdim return WaitStatesNeeded; 545303231Sdim 546303231Sdim // A read of an SGPR by SMRD instruction requires 4 wait states when the 547303231Sdim // SGPR was written by a VALU instruction. 
548303231Sdim int SmrdSgprWaitStates = 4; 549321369Sdim auto IsHazardDefFn = [this] (MachineInstr *MI) { return TII.isVALU(*MI); }; 550327952Sdim auto IsBufferHazardDefFn = [this] (MachineInstr *MI) { return TII.isSALU(*MI); }; 551303231Sdim 552327952Sdim bool IsBufferSMRD = TII.isBufferSMRD(*SMRD); 553327952Sdim 554303231Sdim for (const MachineOperand &Use : SMRD->uses()) { 555303231Sdim if (!Use.isReg()) 556303231Sdim continue; 557303231Sdim int WaitStatesNeededForUse = 558353358Sdim SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn, 559353358Sdim SmrdSgprWaitStates); 560303231Sdim WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 561327952Sdim 562327952Sdim // This fixes what appears to be undocumented hardware behavior in SI where 563327952Sdim // s_mov writing a descriptor and s_buffer_load_dword reading the descriptor 564327952Sdim // needs some number of nops in between. We don't know how many we need, but 565327952Sdim // let's use 4. This wasn't discovered before probably because the only 566327952Sdim // case when this happens is when we expand a 64-bit pointer into a full 567327952Sdim // descriptor and use s_buffer_load_dword instead of s_load_dword, which was 568327952Sdim // probably never encountered in the closed-source land. 
569327952Sdim if (IsBufferSMRD) { 570327952Sdim int WaitStatesNeededForUse = 571327952Sdim SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), 572353358Sdim IsBufferHazardDefFn, 573353358Sdim SmrdSgprWaitStates); 574327952Sdim WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 575327952Sdim } 576303231Sdim } 577327952Sdim 578303231Sdim return WaitStatesNeeded; 579303231Sdim} 580303231Sdim 581303231Sdimint GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) { 582353358Sdim if (!ST.hasVMEMReadSGPRVALUDefHazard()) 583303231Sdim return 0; 584303231Sdim 585327952Sdim int WaitStatesNeeded = checkSoftClauseHazards(VMEM); 586303231Sdim 587303231Sdim // A read of an SGPR by a VMEM instruction requires 5 wait states when the 588303231Sdim // SGPR was written by a VALU Instruction. 589327952Sdim const int VmemSgprWaitStates = 5; 590327952Sdim auto IsHazardDefFn = [this] (MachineInstr *MI) { return TII.isVALU(*MI); }; 591303231Sdim for (const MachineOperand &Use : VMEM->uses()) { 592303231Sdim if (!Use.isReg() || TRI.isVGPR(MF.getRegInfo(), Use.getReg())) 593303231Sdim continue; 594303231Sdim 595303231Sdim int WaitStatesNeededForUse = 596353358Sdim VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn, 597353358Sdim VmemSgprWaitStates); 598303231Sdim WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 599303231Sdim } 600303231Sdim return WaitStatesNeeded; 601303231Sdim} 602303231Sdim 603303231Sdimint GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) { 604303231Sdim const SIRegisterInfo *TRI = ST.getRegisterInfo(); 605327952Sdim const SIInstrInfo *TII = ST.getInstrInfo(); 606303231Sdim 607327952Sdim // Check for DPP VGPR read after VALU VGPR write and EXEC write. 
608303231Sdim int DppVgprWaitStates = 2; 609327952Sdim int DppExecWaitStates = 5; 610303231Sdim int WaitStatesNeeded = 0; 611327952Sdim auto IsHazardDefFn = [TII] (MachineInstr *MI) { return TII->isVALU(*MI); }; 612303231Sdim 613303231Sdim for (const MachineOperand &Use : DPP->uses()) { 614303231Sdim if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg())) 615303231Sdim continue; 616303231Sdim int WaitStatesNeededForUse = 617353358Sdim DppVgprWaitStates - getWaitStatesSinceDef(Use.getReg(), 618353358Sdim [](MachineInstr *) { return true; }, 619353358Sdim DppVgprWaitStates); 620303231Sdim WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); 621303231Sdim } 622303231Sdim 623327952Sdim WaitStatesNeeded = std::max( 624327952Sdim WaitStatesNeeded, 625353358Sdim DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn, 626353358Sdim DppExecWaitStates)); 627327952Sdim 628303231Sdim return WaitStatesNeeded; 629303231Sdim} 630314564Sdim 631314564Sdimint GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) { 632314564Sdim const SIInstrInfo *TII = ST.getInstrInfo(); 633314564Sdim 634314564Sdim // v_div_fmas requires 4 wait states after a write to vcc from a VALU 635314564Sdim // instruction. 
  const int DivFMasWaitStates = 4;
  auto IsHazardDefFn = [TII] (MachineInstr *MI) { return TII->isVALU(*MI); };
  int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
                                               DivFMasWaitStates);

  return DivFMasWaitStates - WaitStatesNeeded;
}

// An s_getreg needs wait states after an earlier s_setreg that wrote the
// same hardware register.
int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);

  const int GetRegWaitStates = 2;
  auto IsHazardFn = [TII, GetRegHWReg] (MachineInstr *MI) {
    // Only a setreg of the same hardware register is a hazard.
    return GetRegHWReg == getHWReg(TII, *MI);
  };
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);

  return GetRegWaitStates - WaitStatesNeeded;
}

// Back-to-back s_setregs of the same hardware register need separating wait
// states; the required count is subtarget dependent.
int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned HWReg = getHWReg(TII, *SetRegInstr);

  const int SetRegWaitStates = ST.getSetRegWaitStates();
  auto IsHazardFn = [TII, HWReg] (MachineInstr *MI) {
    return HWReg == getHWReg(TII, *MI);
  };
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
  return SetRegWaitStates - WaitStatesNeeded;
}

// If MI is a vector-memory store whose store-data operand can be clobbered
// by a following VALU write (data wider than 64 bits), return the index of
// that store-data operand; otherwise return -1 (no hazard).
int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
  if (!MI.mayStore())
    return -1;

  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned Opcode = MI.getOpcode();
  const MCInstrDesc &Desc = MI.getDesc();

  int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
  int VDataRCID = -1;
  if (VDataIdx != -1)
    VDataRCID = Desc.OpInfo[VDataIdx].RegClass;

  if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
    // There is no hazard if the instruction does not use vector regs
    // (like wbinvl1)
    if (VDataIdx == -1)
      return -1;
    // For MUBUF/MTBUF instructions this hazard only exists if the
    // instruction is not using a register in the soffset field.
    const MachineOperand *SOffset =
        TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
    // If we have no soffset operand, then assume this field has been
    // hardcoded to zero.
    if (AMDGPU::getRegBitWidth(VDataRCID) > 64 &&
        (!SOffset || !SOffset->isReg()))
      return VDataIdx;
  }

  // MIMG instructions create a hazard if they don't use a 256-bit T# and
  // the store size is greater than 8 bytes and they have more than two bits
  // of their dmask set.
  // All our MIMG definitions use a 256-bit T#, so we can skip checking for them.
  if (TII->isMIMG(MI)) {
    int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
    assert(SRsrcIdx != -1 &&
           AMDGPU::getRegBitWidth(Desc.OpInfo[SRsrcIdx].RegClass) == 256);
    (void)SRsrcIdx;
  }

  if (TII->isFLAT(MI)) {
    int DataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
    if (AMDGPU::getRegBitWidth(Desc.OpInfo[DataIdx].RegClass) > 64)
      return DataIdx;
  }

  return -1;
}

int GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
                                                const MachineRegisterInfo &MRI) {
  // Helper to check for the hazard where VMEM instructions that store more than
  // 8 bytes can have their store data overwritten by the next instruction.
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  const int VALUWaitStates = 1;
  int WaitStatesNeeded = 0;

  // Only a VGPR def can clobber the outstanding store data.
  if (!TRI->isVGPR(MRI, Def.getReg()))
    return WaitStatesNeeded;
  Register Reg = Def.getReg();
  auto IsHazardFn = [this, Reg, TRI] (MachineInstr *MI) {
    int DataIdx = createsVALUHazard(*MI);
    return DataIdx >= 0 &&
           TRI->regsOverlap(MI->getOperand(DataIdx).getReg(), Reg);
  };
  int WaitStatesNeededForDef =
    VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
  // This checks for the hazard where VMEM instructions that store more than
  // 8 bytes can have their store data
  // overwritten by the next instruction.
  if (!ST.has12DWordStoreHazard())
    return 0;

  const MachineRegisterInfo &MRI = MF.getRegInfo();
  int WaitStatesNeeded = 0;

  // Every VGPR def of the VALU can potentially clobber a pending wide store.
  for (const MachineOperand &Def : VALU->defs()) {
    WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));
  }

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
  // This checks for hazards associated with inline asm statements.
  // Since inline asms can contain just about anything, we use this
  // to call/leverage other check*Hazard routines. Note that
  // this function doesn't attempt to address all possible inline asm
  // hazards (good luck), but is a collection of what has been
  // problematic thus far.

  // see checkVALUHazards()
  if (!ST.has12DWordStoreHazard())
    return 0;

  const MachineRegisterInfo &MRI = MF.getRegInfo();
  int WaitStatesNeeded = 0;

  // Walk the asm's operand list and treat each register def like a VALU def.
  for (unsigned I = InlineAsm::MIOp_FirstOperand, E = IA->getNumOperands();
       I != E; ++I) {
    const MachineOperand &Op = IA->getOperand(I);
    if (Op.isReg() && Op.isDef()) {
      WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
    }
  }

  return WaitStatesNeeded;
}

// v_readlane/v_writelane with an SGPR lane select need wait states after a
// VALU wrote that lane-select SGPR.
int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const MachineRegisterInfo &MRI = MF.getRegInfo();

  const MachineOperand *LaneSelectOp =
      TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);

  // Only a register (SGPR) lane select can be involved in the hazard.
  if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg()))
    return 0;

  Register LaneSelectReg = LaneSelectOp->getReg();
  auto IsHazardFn = [TII] (MachineInstr *MI) {
    return TII->isVALU(*MI);
  };

  const int RWLaneWaitStates = 4;
  int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn,
                                              RWLaneWaitStates);
  return RWLaneWaitStates - WaitStatesSince;
}

// s_rfe_b64 needs a wait state after an s_setreg that wrote TRAPSTS.
int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
  if (!ST.hasRFEHazards())
    return 0;

  const SIInstrInfo *TII = ST.getInstrInfo();

  const int RFEWaitStates = 1;

  auto IsHazardFn = [TII] (MachineInstr *MI) {
    return getHWReg(TII, *MI) == AMDGPU::Hwreg::ID_TRAPSTS;
  };
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
  return RFEWaitStates - WaitStatesNeeded;
}

// Hazards that apply to any instruction kind; currently only the
// s_mov_fed_b32 followed by an SGPR read.
int GCNHazardRecognizer::checkAnyInstHazards(MachineInstr *MI) {
  if (MI->isDebugInstr())
    return 0;

  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  if (!ST.hasSMovFedHazard())
    return 0;

  // Check for any instruction reading an SGPR after a write from
  // s_mov_fed_b32.
  int MovFedWaitStates = 1;
  int WaitStatesNeeded = 0;

  for (const MachineOperand &Use : MI->uses()) {
    // Skip non-register uses and VGPR reads; only SGPR reads are at risk.
    if (!Use.isReg() || TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
      continue;
    auto IsHazardFn = [] (MachineInstr *MI) {
      return MI->getOpcode() == AMDGPU::S_MOV_FED_B32;
    };
    int WaitStatesNeededForUse =
        MovFedWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardFn,
                                                 MovFedWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  return WaitStatesNeeded;
}

// Instructions that read M0 need a wait state after a SALU write of M0.
int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  const int SMovRelWaitStates = 1;
  auto IsHazardFn = [TII] (MachineInstr *MI) {
    return TII->isSALU(*MI);
  };
  return SMovRelWaitStates - getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn,
                                                   SMovRelWaitStates);
}

// Run all pre-emit hazard fixups on MI, inserting workaround instructions
// where required.
void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
  fixVMEMtoScalarWriteHazards(MI);
  fixVcmpxPermlaneHazards(MI);
  fixSMEMtoVectorWriteHazards(MI);
  fixVcmpxExecWARHazard(MI);
  fixLdsBranchVmemWARHazard(MI);
}

// A v_permlane* preceded by a VOPC needs a real (non-nop) VALU in between;
// insert a self-move of src0 if none is found.
bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
  if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(*MI))
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  auto IsHazardFn = [TII] (MachineInstr *MI) {
    return TII->isVOPC(*MI);
  };

  // Any VALU other than a V_NOP breaks the hazard chain.
  auto IsExpiredFn = [] (MachineInstr *MI, int) {
    if (!MI)
      return false;
    unsigned Opc = MI->getOpcode();
    return SIInstrInfo::isVALU(*MI) &&
           Opc != AMDGPU::V_NOP_e32 &&
           Opc != AMDGPU::V_NOP_e64 &&
           Opc != AMDGPU::V_NOP_sdwa;
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  // V_NOP will be discarded by SQ.
  // Use V_MOV_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
  // which is always a VGPR and available.
  auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
  Register Reg = Src0->getReg();
  bool IsUndef = Src0->isUndef();
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::V_MOV_B32_e32))
    .addReg(Reg, RegState::Define | (IsUndef ? RegState::Dead : 0))
    .addReg(Reg, IsUndef ? RegState::Undef : RegState::Kill);

  return true;
}

// A SALU/SMEM write following a VMEM/DS/FLAT read of the same register needs
// a mitigating instruction; insert a v_nop if the hazard is still live.
bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
  if (!ST.hasVMEMtoScalarWriteHazard())
    return false;

  if (!SIInstrInfo::isSALU(*MI) && !SIInstrInfo::isSMRD(*MI))
    return false;

  if (MI->getNumDefs() == 0)
    return false;

  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  auto IsHazardFn = [TRI, MI] (MachineInstr *I) {
    if (!SIInstrInfo::isVMEM(*I) && !SIInstrInfo::isDS(*I) &&
        !SIInstrInfo::isFLAT(*I))
      return false;

    // Hazard exists if the memory instruction reads any register MI defines.
    for (const MachineOperand &Def : MI->defs()) {
      MachineOperand *Op = I->findRegisterUseOperand(Def.getReg(), false, TRI);
      if (!Op)
        continue;
      return true;
    }
    return false;
  };

  // A VALU or an s_waitcnt 0 mitigates the hazard.
  auto IsExpiredFn = [] (MachineInstr *MI, int) {
    return MI && (SIInstrInfo::isVALU(*MI) ||
                  (MI->getOpcode() == AMDGPU::S_WAITCNT &&
                   !MI->getOperand(0).getImm()));
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
  return true;
}

// A VALU writing an SGPR after an SMEM read of that SGPR needs mitigation;
// insert "s_mov_b32 null, 0" when required.
bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
  if (!ST.hasSMEMtoVectorWriteHazard())
    return false;

  if (!SIInstrInfo::isVALU(*MI))
    return false;

  // The lane instructions write their scalar result to vdst; everything else
  // uses the sdst operand.
  unsigned SDSTName;
  switch (MI->getOpcode()) {
  case AMDGPU::V_READLANE_B32:
  case AMDGPU::V_READLANE_B32_gfx10:
  case AMDGPU::V_READFIRSTLANE_B32:
    SDSTName = AMDGPU::OpName::vdst;
    break;
  default:
    SDSTName = AMDGPU::OpName::sdst;
    break;
  }

  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
  const MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName);
  if (!SDST) {
    // Fall back to the first implicit SGPR def, if any.
    for (const auto &MO : MI->implicit_operands()) {
      if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegClass(MO.getReg()))) {
        SDST = &MO;
        break;
      }
    }
  }

  if (!SDST)
    return false;

  const Register SDSTReg = SDST->getReg();
  auto IsHazardFn = [SDSTReg, TRI] (MachineInstr *I) {
    return SIInstrInfo::isSMRD(*I) && I->readsRegister(SDSTReg, TRI);
  };

  auto IsExpiredFn = [TII, IV] (MachineInstr *MI, int) {
    if (MI) {
      if (TII->isSALU(*MI)) {
        switch (MI->getOpcode()) {
        case AMDGPU::S_SETVSKIP:
        case AMDGPU::S_VERSION:
        case AMDGPU::S_WAITCNT_VSCNT:
        case AMDGPU::S_WAITCNT_VMCNT:
        case AMDGPU::S_WAITCNT_EXPCNT:
          // These instructions cannot mitigate the hazard.
          return false;
        case AMDGPU::S_WAITCNT_LGKMCNT:
          // Reducing lgkmcnt count to 0 always mitigates the hazard.
          return (MI->getOperand(1).getImm() == 0) &&
                 (MI->getOperand(0).getReg() == AMDGPU::SGPR_NULL);
        case AMDGPU::S_WAITCNT: {
          const int64_t Imm = MI->getOperand(0).getImm();
          AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm);
          return (Decoded.LgkmCnt == 0);
        }
        default:
          // SOPP instructions cannot mitigate the hazard.
          if (TII->isSOPP(*MI))
            return false;
          // At this point the SALU can be assumed to mitigate the hazard
          // because either:
          // (a) it is independent of the at risk SMEM (breaking chain),
          // or
          // (b) it is dependent on the SMEM, in which case an appropriate
          //     s_waitcnt lgkmcnt _must_ exist between it and the at risk
          //     SMEM instruction.
          return true;
        }
      }
    }
    return false;
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
    .addImm(0);
  return true;
}

// A VALU writing EXEC after a non-VALU read of EXEC needs mitigation;
// insert "s_waitcnt_depctr 0xfffe" when required.
bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
  if (!ST.hasVcmpxExecWARHazard() || !SIInstrInfo::isVALU(*MI))
    return false;

  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))
    return false;

  auto IsHazardFn = [TRI] (MachineInstr *I) {
    if (SIInstrInfo::isVALU(*I))
      return false;
    return I->readsRegister(AMDGPU::EXEC,
                            TRI);
  };

  const SIInstrInfo *TII = ST.getInstrInfo();
  // A VALU with an SGPR def (explicit sdst or implicit) mitigates the hazard,
  // as does an s_waitcnt_depctr covering bits 0xfffe.
  auto IsExpiredFn = [TII, TRI] (MachineInstr *MI, int) {
    if (!MI)
      return false;
    if (SIInstrInfo::isVALU(*MI)) {
      if (TII->getNamedOperand(*MI, AMDGPU::OpName::sdst))
        return true;
      for (auto MO : MI->implicit_operands())
        if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegClass(MO.getReg())))
          return true;
    }
    if (MI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
        (MI->getOperand(0).getImm() & 0xfffe) == 0xfffe)
      return true;
    return false;
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
    .addImm(0xfffe);
  return true;
}

// WAR hazard between a DS access and a VMEM access separated by a branch;
// insert "s_waitcnt_vscnt null, 0" before MI when the opposite access kind
// is reachable across a branch.
bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
  if (!ST.hasLdsBranchVmemWARHazard())
    return false;

  // Classify: 1 = DS (LDS) access, 2 = VMEM/segment-FLAT access, 0 = neither.
  auto IsHazardInst = [] (const MachineInstr *MI) {
    if (SIInstrInfo::isDS(*MI))
      return 1;
    if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isSegmentSpecificFLAT(*MI))
      return 2;
    return 0;
  };

  auto InstType = IsHazardInst(MI);
  if (!InstType)
    return false;

  auto IsExpiredFn = [&IsHazardInst] (MachineInstr *I, int) {
    return I && (IsHazardInst(I) ||
                 (I->getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
                  I->getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
                  !I->getOperand(1).getImm()));
  };

  // Outer search: find a branch; inner search (behind the branch): find an
  // access of the opposite kind that is not already fenced by a vscnt wait.
  auto IsHazardFn = [InstType, &IsHazardInst] (MachineInstr *I) {
    if (!I->isBranch())
      return false;

    auto IsHazardFn = [InstType, IsHazardInst] (MachineInstr *I) {
      auto InstType2 = IsHazardInst(I);
      return InstType2 && InstType != InstType2;
    };

    auto IsExpiredFn = [InstType, &IsHazardInst] (MachineInstr *I, int) {
      if (!I)
        return false;

      auto InstType2 = IsHazardInst(I);
      if (InstType == InstType2)
        return true;

      return I->getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
             I->getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
             !I->getOperand(1).getImm();
    };

    return ::getWaitStatesSince(IsHazardFn, I, IsExpiredFn) !=
           std::numeric_limits<int>::max();
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_VSCNT))
    .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
    .addImm(0);

  return true;
}

// A buffer access with a misaligned offset following a large NSA-encoded
// MIMG instruction needs a wait state.
int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) {
  int NSAtoVMEMWaitStates = 1;

  if (!ST.hasNSAtoVMEMBug())
    return 0;

  if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isMTBUF(*MI))
    return 0;

  const SIInstrInfo *TII = ST.getInstrInfo();
  // Only offsets with bits 1-2 set participate in the hazard.
  const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
  if (!Offset || (Offset->getImm() & 6) == 0)
    return 0;

  auto IsHazardFn = [TII] (MachineInstr *I) {
    if (!SIInstrInfo::isMIMG(*I))
      return false;
    const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I->getOpcode());
    return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
           TII->getInstSizeInBytes(*I) >= 16;
  };

  return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1);
}

// s_denorm_mode needs wait states after an FP-atomic VMEM/FLAT instruction.
int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
  int FPAtomicToDenormModeWaitStates = 3;

  if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
    return 0;

  auto IsHazardFn = [] (MachineInstr *I) {
    if (!SIInstrInfo::isVMEM(*I) && !SIInstrInfo::isFLAT(*I))
      return false;
    return SIInstrInfo::isFPAtomic(*I);
  };

  // VALUs and waitcnt instructions count as mitigating, as does having
  // already accumulated 3 wait states.
  auto IsExpiredFn = [] (MachineInstr *MI, int WaitStates) {
    if (WaitStates >= 3 || SIInstrInfo::isVALU(*MI))
      return true;

    switch (MI->getOpcode()) {
    case AMDGPU::S_WAITCNT:
    case AMDGPU::S_WAITCNT_VSCNT:
    case AMDGPU::S_WAITCNT_VMCNT:
    case AMDGPU::S_WAITCNT_EXPCNT:
    case AMDGPU::S_WAITCNT_LGKMCNT:
    case AMDGPU::S_WAITCNT_IDLE:
      return true;
    default:
      break;
    }

    return false;
  };


  return FPAtomicToDenormModeWaitStates -
         ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn);
}
// Wait states required before an MAI (MFMA / v_accvgpr_*) instruction.
// The required counts depend on which operand is involved and on the
// latency of the producing MFMA (2/8/16 cycles selects the 4x4/16x16/32x32
// wait-state table entries below).
int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
  assert(SIInstrInfo::isMAI(*MI));

  int WaitStatesNeeded = 0;
  unsigned Opc = MI->getOpcode();

  auto IsVALUFn = [] (MachineInstr *MI) {
    return SIInstrInfo::isVALU(*MI);
  };

  if (Opc != AMDGPU::V_ACCVGPR_READ_B32) { // MFMA or v_accvgpr_write
    const int LegacyVALUWritesVGPRWaitStates = 2;
    const int VALUWritesExecWaitStates = 4;
    const int MaxWaitStates = 4;

    // Hazard against a VALU writing EXEC.
    int WaitStatesNeededForUse = VALUWritesExecWaitStates -
      getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded < MaxWaitStates) {
      // Hazard against a VALU writing any VGPR this MAI reads.
      for (const MachineOperand &Use : MI->explicit_uses()) {
        const int MaxWaitStates = 2; // Intentionally shadows the outer limit.

        if (!Use.isReg() || !TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
          continue;

        int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
          getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates);
        WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

        if (WaitStatesNeeded == MaxWaitStates)
          break;
      }
    }
  }

  // True MFMA only: excludes the accvgpr read/write pseudos.
  auto IsMFMAFn = [] (MachineInstr *MI) {
    return SIInstrInfo::isMAI(*MI) &&
           MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32 &&
           MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32;
  };

  for (const MachineOperand &Op : MI->explicit_operands()) {
    if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg()))
      continue;

    // Only v_accvgpr_write's AGPR def participates; MFMA defs are skipped.
    if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32)
      continue;

    const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
    const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
    const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
    const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
    const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
    const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
    const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
    const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
    const int MaxWaitStates = 18;
    Register Reg = Op.getReg();
    unsigned HazardDefLatency = 0;

    // Matches a prior MFMA whose dst overlaps (but does not equal) Reg,
    // recording the largest producer latency seen.
    auto IsOverlappedMFMAFn = [Reg, &IsMFMAFn, &HazardDefLatency, this]
                              (MachineInstr *MI) {
      if (!IsMFMAFn(MI))
        return false;
      Register DstReg = MI->getOperand(0).getReg();
      if (DstReg == Reg)
        return false;
      HazardDefLatency = std::max(HazardDefLatency,
                                  TSchedModel.computeInstrLatency(MI));
      return TRI.regsOverlap(DstReg, Reg);
    };

    int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,
                                                   MaxWaitStates);
    int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
    int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
    int OpNo = MI->getOperandNo(&Op);
    if (OpNo == SrcCIdx) {
      NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
    } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32) {
      switch (HazardDefLatency) {
      case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
               break;
      case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
               break;
      case 16: LLVM_FALLTHROUGH;
      default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
               break;
      }
    } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32) {
      switch (HazardDefLatency) {
      case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
               break;
      case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
               break;
      case 16: LLVM_FALLTHROUGH;
      default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
               break;
      }
    }

    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded; // Early exit.

    // Hazard against a prior v_accvgpr_write overlapping Reg.
    auto IsAccVgprWriteFn = [Reg, this] (MachineInstr *MI) {
      if (MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32)
        return false;
      Register DstReg = MI->getOperand(0).getReg();
      return TRI.regsOverlap(Reg, DstReg);
    };

    const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
    const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
    const int AccVGPRWriteAccVgprReadWaitStates = 3;
    NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
    if (OpNo == SrcCIdx)
      NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
    else if (Opc == AMDGPU::V_ACCVGPR_READ_B32)
      NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;

    WaitStatesNeededForUse = NeedWaitStates -
      getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded; // Early exit.
  }

  if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32) {
    // v_accvgpr_write after an MFMA that reads its dst as src2 (SrcC).
    const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
    const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
    const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
    const int MaxWaitStates = 13;
    Register DstReg = MI->getOperand(0).getReg();
    unsigned HazardDefLatency = 0;

    auto IsSrcCMFMAFn = [DstReg, &IsMFMAFn, &HazardDefLatency, this]
                        (MachineInstr *MI) {
      if (!IsMFMAFn(MI))
        return false;
      Register Reg = TII.getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
      HazardDefLatency = std::max(HazardDefLatency,
                                  TSchedModel.computeInstrLatency(MI));
      return TRI.regsOverlap(Reg, DstReg);
    };

    int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
    int NeedWaitStates;
    switch (HazardDefLatency) {
    case 2:  NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
             break;
    case 8:  NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
             break;
    case 16: LLVM_FALLTHROUGH;
    default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
             break;
    }

    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  return WaitStatesNeeded;
}

// Wait states required before a VGPR load/store whose address/data VGPRs
// were recently produced by v_accvgpr_read (directly, or via a dependent
// VALU chain).
int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) {
  if (!ST.hasMAIInsts())
    return 0;

  int WaitStatesNeeded = 0;

  auto IsAccVgprReadFn = [] (MachineInstr *MI) {
    return MI->getOpcode() == AMDGPU::V_ACCVGPR_READ_B32;
  };

  for (const MachineOperand &Op : MI->explicit_uses()) {
    if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg()))
      continue;

    Register Reg = Op.getReg();

    const int AccVgprReadLdStWaitStates = 2;
    const int VALUWriteAccVgprReadLdStDepVALUWaitStates = 1;
    const int MaxWaitStates = 2;

    int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
      getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded; // Early exit.

    // Matches a non-MAI VALU that itself depends on a recent v_accvgpr_read.
    auto IsVALUAccVgprReadCheckFn = [Reg, this] (MachineInstr *MI) {
      if (MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32)
        return false;
      auto IsVALUFn = [] (MachineInstr *MI) {
        return SIInstrInfo::isVALU(*MI) && !SIInstrInfo::isMAI(*MI);
      };
      return getWaitStatesSinceDef(Reg, IsVALUFn, 2 /*MaxWaitStates*/) <
             std::numeric_limits<int>::max();
    };

    WaitStatesNeededForUse = VALUWriteAccVgprReadLdStDepVALUWaitStates -
      getWaitStatesSince(IsVALUAccVgprReadCheckFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  return WaitStatesNeeded;
}