1249259Sdim//===-- SILowerControlFlow.cpp - Use predicates for control flow ----------===// 2249259Sdim// 3249259Sdim// The LLVM Compiler Infrastructure 4249259Sdim// 5249259Sdim// This file is distributed under the University of Illinois Open Source 6249259Sdim// License. See LICENSE.TXT for details. 7249259Sdim// 8249259Sdim//===----------------------------------------------------------------------===// 9249259Sdim// 10249259Sdim/// \file 11249259Sdim/// \brief Insert wait instructions for memory reads and writes. 12249259Sdim/// 13249259Sdim/// Memory reads and writes are issued asynchronously, so we need to insert 14249259Sdim/// S_WAITCNT instructions when we want to access any of their results or 15249259Sdim/// overwrite any register that's used asynchronously. 16249259Sdim// 17249259Sdim//===----------------------------------------------------------------------===// 18249259Sdim 19249259Sdim#include "AMDGPU.h" 20249259Sdim#include "SIInstrInfo.h" 21249259Sdim#include "SIMachineFunctionInfo.h" 22249259Sdim#include "llvm/CodeGen/MachineFunction.h" 23249259Sdim#include "llvm/CodeGen/MachineFunctionPass.h" 24249259Sdim#include "llvm/CodeGen/MachineInstrBuilder.h" 25249259Sdim#include "llvm/CodeGen/MachineRegisterInfo.h" 26249259Sdim 27249259Sdimusing namespace llvm; 28249259Sdim 29249259Sdimnamespace { 30249259Sdim 31249259Sdim/// \brief One variable for each of the hardware counters 32249259Sdimtypedef union { 33249259Sdim struct { 34249259Sdim unsigned VM; 35249259Sdim unsigned EXP; 36249259Sdim unsigned LGKM; 37249259Sdim } Named; 38249259Sdim unsigned Array[3]; 39249259Sdim 40249259Sdim} Counters; 41249259Sdim 42249259Sdimtypedef Counters RegCounters[512]; 43249259Sdimtypedef std::pair<unsigned, unsigned> RegInterval; 44249259Sdim 45249259Sdimclass SIInsertWaits : public MachineFunctionPass { 46249259Sdim 47249259Sdimprivate: 48249259Sdim static char ID; 49249259Sdim const SIInstrInfo *TII; 50263508Sdim const SIRegisterInfo *TRI; 51249259Sdim const MachineRegisterInfo *MRI; 52249259Sdim 53249259Sdim /// \brief Constant hardware limits 54249259Sdim static const Counters WaitCounts; 55249259Sdim 56249259Sdim /// \brief Constant zero value 57249259Sdim static const Counters ZeroCounts; 58249259Sdim 59249259Sdim /// \brief Counter values we have already waited on. 60249259Sdim Counters WaitedOn; 61249259Sdim 62249259Sdim /// \brief Counter values for last instruction issued. 63249259Sdim Counters LastIssued; 64249259Sdim 65249259Sdim /// \brief Registers used by async instructions. 66249259Sdim RegCounters UsedRegs; 67249259Sdim 68249259Sdim /// \brief Registers defined by async instructions. 69249259Sdim RegCounters DefinedRegs; 70249259Sdim 71249259Sdim /// \brief Different export instruction types seen since last wait. 72249259Sdim unsigned ExpInstrTypesSeen; 73249259Sdim 74249259Sdim /// \brief Get increment/decrement amount for this instruction. 75249259Sdim Counters getHwCounts(MachineInstr &MI); 76249259Sdim 77249259Sdim /// \brief Is operand relevant for async execution? 78249259Sdim bool isOpRelevant(MachineOperand &Op); 79249259Sdim 80249259Sdim /// \brief Get register interval an operand affects. 81249259Sdim RegInterval getRegInterval(MachineOperand &Op); 82249259Sdim 83249259Sdim /// \brief Handle instructions async components 84249259Sdim void pushInstruction(MachineInstr &MI); 85249259Sdim 86249259Sdim /// \brief Insert the actual wait instruction 87249259Sdim bool insertWait(MachineBasicBlock &MBB, 88249259Sdim MachineBasicBlock::iterator I, 89249259Sdim const Counters &Counts); 90249259Sdim 91249259Sdim /// \brief Do we need def2def checks? 92249259Sdim bool unorderedDefines(MachineInstr &MI); 93249259Sdim 94249259Sdim /// \brief Resolve all operand dependencies to counter requirements 95249259Sdim Counters handleOperands(MachineInstr &MI); 96249259Sdim 97249259Sdimpublic: 98249259Sdim SIInsertWaits(TargetMachine &tm) : 99249259Sdim MachineFunctionPass(ID), 100263508Sdim TII(0), 101263508Sdim TRI(0), 102263508Sdim ExpInstrTypesSeen(0) { } 103249259Sdim 104249259Sdim virtual bool runOnMachineFunction(MachineFunction &MF); 105249259Sdim 106249259Sdim const char *getPassName() const { 107249259Sdim return "SI insert wait instructions"; 108249259Sdim } 109249259Sdim 110249259Sdim}; 111249259Sdim 112249259Sdim} // End anonymous namespace 113249259Sdim 114249259Sdimchar SIInsertWaits::ID = 0; 115249259Sdim 116249259Sdimconst Counters SIInsertWaits::WaitCounts = { { 15, 7, 7 } }; 117249259Sdimconst Counters SIInsertWaits::ZeroCounts = { { 0, 0, 0 } }; 118249259Sdim 119249259SdimFunctionPass *llvm::createSIInsertWaits(TargetMachine &tm) { 120249259Sdim return new SIInsertWaits(tm); 121249259Sdim} 122249259Sdim 123249259SdimCounters SIInsertWaits::getHwCounts(MachineInstr &MI) { 124249259Sdim 125249259Sdim uint64_t TSFlags = TII->get(MI.getOpcode()).TSFlags; 126249259Sdim Counters Result; 127249259Sdim 128249259Sdim Result.Named.VM = !!(TSFlags & SIInstrFlags::VM_CNT); 129249259Sdim 130249259Sdim // Only consider stores or EXP for EXP_CNT 131249259Sdim Result.Named.EXP = !!(TSFlags & SIInstrFlags::EXP_CNT && 132249259Sdim (MI.getOpcode() == AMDGPU::EXP || MI.getDesc().mayStore())); 133249259Sdim 134249259Sdim // LGKM may uses larger values 135249259Sdim if (TSFlags & SIInstrFlags::LGKM_CNT) { 136249259Sdim 137263508Sdim if (TII->isSMRD(MI.getOpcode())) { 138249259Sdim 139263508Sdim MachineOperand &Op = MI.getOperand(0); 140263508Sdim assert(Op.isReg() && "First LGKM operand must be a register!"); 141249259Sdim 142263508Sdim unsigned Reg = Op.getReg(); 143263508Sdim unsigned Size = TRI->getMinimalPhysRegClass(Reg)->getSize(); 144263508Sdim Result.Named.LGKM = Size > 4 ? 2 : 1; 145263508Sdim 146263508Sdim } else { 147263508Sdim // DS 148263508Sdim Result.Named.LGKM = 1; 149263508Sdim } 150263508Sdim 151249259Sdim } else { 152249259Sdim Result.Named.LGKM = 0; 153249259Sdim } 154249259Sdim 155249259Sdim return Result; 156249259Sdim} 157249259Sdim 158249259Sdimbool SIInsertWaits::isOpRelevant(MachineOperand &Op) { 159249259Sdim 160249259Sdim // Constants are always irrelevant 161249259Sdim if (!Op.isReg()) 162249259Sdim return false; 163249259Sdim 164249259Sdim // Defines are always relevant 165249259Sdim if (Op.isDef()) 166249259Sdim return true; 167249259Sdim 168249259Sdim // For exports all registers are relevant 169249259Sdim MachineInstr &MI = *Op.getParent(); 170249259Sdim if (MI.getOpcode() == AMDGPU::EXP) 171249259Sdim return true; 172249259Sdim 173249259Sdim // For stores the stored value is also relevant 174249259Sdim if (!MI.getDesc().mayStore()) 175249259Sdim return false; 176249259Sdim 177249259Sdim for (MachineInstr::mop_iterator I = MI.operands_begin(), 178249259Sdim E = MI.operands_end(); I != E; ++I) { 179249259Sdim 180249259Sdim if (I->isReg() && I->isUse()) 181249259Sdim return Op.isIdenticalTo(*I); 182249259Sdim } 183249259Sdim 184249259Sdim return false; 185249259Sdim} 186249259Sdim 187249259SdimRegInterval SIInsertWaits::getRegInterval(MachineOperand &Op) { 188249259Sdim 189263508Sdim if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg())) 190249259Sdim return std::make_pair(0, 0); 191249259Sdim 192249259Sdim unsigned Reg = Op.getReg(); 193263508Sdim unsigned Size = TRI->getMinimalPhysRegClass(Reg)->getSize(); 194249259Sdim 195249259Sdim assert(Size >= 4); 196249259Sdim 197249259Sdim RegInterval Result; 198263508Sdim Result.first = TRI->getEncodingValue(Reg); 199249259Sdim Result.second = Result.first + Size / 4; 200249259Sdim 201249259Sdim return Result; 202249259Sdim} 203249259Sdim 204249259Sdimvoid SIInsertWaits::pushInstruction(MachineInstr &MI) { 205249259Sdim 206249259Sdim // Get the hardware counter increments and sum them up 207249259Sdim Counters Increment = getHwCounts(MI); 208249259Sdim unsigned Sum = 0; 209249259Sdim 210249259Sdim for (unsigned i = 0; i < 3; ++i) { 211249259Sdim LastIssued.Array[i] += Increment.Array[i]; 212249259Sdim Sum += Increment.Array[i]; 213249259Sdim } 214249259Sdim 215249259Sdim // If we don't increase anything then that's it 216249259Sdim if (Sum == 0) 217249259Sdim return; 218249259Sdim 219249259Sdim // Remember which export instructions we have seen 220249259Sdim if (Increment.Named.EXP) { 221249259Sdim ExpInstrTypesSeen |= MI.getOpcode() == AMDGPU::EXP ? 1 : 2; 222249259Sdim } 223249259Sdim 224249259Sdim for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { 225249259Sdim 226249259Sdim MachineOperand &Op = MI.getOperand(i); 227249259Sdim if (!isOpRelevant(Op)) 228249259Sdim continue; 229249259Sdim 230249259Sdim RegInterval Interval = getRegInterval(Op); 231249259Sdim for (unsigned j = Interval.first; j < Interval.second; ++j) { 232249259Sdim 233249259Sdim // Remember which registers we define 234249259Sdim if (Op.isDef()) 235249259Sdim DefinedRegs[j] = LastIssued; 236249259Sdim 237249259Sdim // and which one we are using 238249259Sdim if (Op.isUse()) 239249259Sdim UsedRegs[j] = LastIssued; 240249259Sdim } 241249259Sdim } 242249259Sdim} 243249259Sdim 244249259Sdimbool SIInsertWaits::insertWait(MachineBasicBlock &MBB, 245249259Sdim MachineBasicBlock::iterator I, 246249259Sdim const Counters &Required) { 247249259Sdim 248249259Sdim // End of program? No need to wait on anything 249249259Sdim if (I != MBB.end() && I->getOpcode() == AMDGPU::S_ENDPGM) 250249259Sdim return false; 251249259Sdim 252249259Sdim // Figure out if the async instructions execute in order 253249259Sdim bool Ordered[3]; 254249259Sdim 255249259Sdim // VM_CNT is always ordered 256249259Sdim Ordered[0] = true; 257249259Sdim 258249259Sdim // EXP_CNT is unordered if we have both EXP & VM-writes 259249259Sdim Ordered[1] = ExpInstrTypesSeen == 3; 260249259Sdim 261249259Sdim // LGKM_CNT is handled as always unordered. TODO: Handle LDS and GDS 262249259Sdim Ordered[2] = false; 263249259Sdim 264249259Sdim // The values we are going to put into the S_WAITCNT instruction 265249259Sdim Counters Counts = WaitCounts; 266249259Sdim 267249259Sdim // Do we really need to wait? 268249259Sdim bool NeedWait = false; 269249259Sdim 270249259Sdim for (unsigned i = 0; i < 3; ++i) { 271249259Sdim 272249259Sdim if (Required.Array[i] <= WaitedOn.Array[i]) 273249259Sdim continue; 274249259Sdim 275249259Sdim NeedWait = true; 276249259Sdim 277249259Sdim if (Ordered[i]) { 278249259Sdim unsigned Value = LastIssued.Array[i] - Required.Array[i]; 279249259Sdim 280249259Sdim // adjust the value to the real hardware posibilities 281249259Sdim Counts.Array[i] = std::min(Value, WaitCounts.Array[i]); 282249259Sdim 283249259Sdim } else 284249259Sdim Counts.Array[i] = 0; 285249259Sdim 286249259Sdim // Remember on what we have waited on 287249259Sdim WaitedOn.Array[i] = LastIssued.Array[i] - Counts.Array[i]; 288249259Sdim } 289249259Sdim 290249259Sdim if (!NeedWait) 291249259Sdim return false; 292249259Sdim 293249259Sdim // Reset EXP_CNT instruction types 294249259Sdim if (Counts.Named.EXP == 0) 295249259Sdim ExpInstrTypesSeen = 0; 296249259Sdim 297249259Sdim // Build the wait instruction 298249259Sdim BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)) 299249259Sdim .addImm((Counts.Named.VM & 0xF) | 300249259Sdim ((Counts.Named.EXP & 0x7) << 4) | 301249259Sdim ((Counts.Named.LGKM & 0x7) << 8)); 302249259Sdim 303249259Sdim return true; 304249259Sdim} 305249259Sdim 306249259Sdim/// \brief helper function for handleOperands 307249259Sdimstatic void increaseCounters(Counters &Dst, const Counters &Src) { 308249259Sdim 309249259Sdim for (unsigned i = 0; i < 3; ++i) 310249259Sdim Dst.Array[i] = std::max(Dst.Array[i], Src.Array[i]); 311249259Sdim} 312249259Sdim 313249259SdimCounters SIInsertWaits::handleOperands(MachineInstr &MI) { 314249259Sdim 315249259Sdim Counters Result = ZeroCounts; 316249259Sdim 317266715Sdim // S_SENDMSG implicitly waits for all outstanding LGKM transfers to finish, 318266715Sdim // but we also want to wait for any other outstanding transfers before 319266715Sdim // signalling other hardware blocks 320266715Sdim if (MI.getOpcode() == AMDGPU::S_SENDMSG) 321266715Sdim return LastIssued; 322266715Sdim 323249259Sdim // For each register affected by this 324249259Sdim // instruction increase the result sequence 325249259Sdim for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { 326249259Sdim 327249259Sdim MachineOperand &Op = MI.getOperand(i); 328249259Sdim RegInterval Interval = getRegInterval(Op); 329249259Sdim for (unsigned j = Interval.first; j < Interval.second; ++j) { 330249259Sdim 331249259Sdim if (Op.isDef()) { 332249259Sdim increaseCounters(Result, UsedRegs[j]); 333249259Sdim increaseCounters(Result, DefinedRegs[j]); 334249259Sdim } 335249259Sdim 336249259Sdim if (Op.isUse()) 337249259Sdim increaseCounters(Result, DefinedRegs[j]); 338249259Sdim } 339249259Sdim } 340249259Sdim 341249259Sdim return Result; 342249259Sdim} 343249259Sdim 344249259Sdimbool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) { 345249259Sdim bool Changes = false; 346249259Sdim 347263508Sdim TII = static_cast<const SIInstrInfo*>(MF.getTarget().getInstrInfo()); 348263508Sdim TRI = static_cast<const SIRegisterInfo*>(MF.getTarget().getRegisterInfo()); 349263508Sdim 350249259Sdim MRI = &MF.getRegInfo(); 351249259Sdim 352249259Sdim WaitedOn = ZeroCounts; 353249259Sdim LastIssued = ZeroCounts; 354249259Sdim 355249259Sdim memset(&UsedRegs, 0, sizeof(UsedRegs)); 356249259Sdim memset(&DefinedRegs, 0, sizeof(DefinedRegs)); 357249259Sdim 358249259Sdim for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); 359249259Sdim BI != BE; ++BI) { 360249259Sdim 361249259Sdim MachineBasicBlock &MBB = *BI; 362249259Sdim for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); 363249259Sdim I != E; ++I) { 364249259Sdim 365249259Sdim Changes |= insertWait(MBB, I, handleOperands(*I)); 366249259Sdim pushInstruction(*I); 367249259Sdim } 368249259Sdim 369249259Sdim // Wait for everything at the end of the MBB 370249259Sdim Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued); 371249259Sdim } 372249259Sdim 373249259Sdim return Changes; 374249259Sdim} 375