//===-- SILowerControlFlow.cpp - Use predicates for control flow ----------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief This pass lowers the pseudo control flow instructions to real
/// machine instructions.
///
/// All control flow is handled using predicated instructions and
/// a predicate stack. Each Scalar ALU controls the operations of 64 Vector
/// ALUs. The Scalar ALU can update the predicate for any of the Vector ALUs
/// by writing to the 64-bit EXEC register (each bit corresponds to a
/// single vector ALU). Typically, for predicates, a vector ALU will write
/// to its bit of the VCC register (like EXEC, VCC is 64 bits, one for each
/// Vector ALU) and then the Scalar ALU will AND the VCC register with the
/// EXEC to update the predicates.
///
/// For example:
/// %VCC = V_CMP_GT_F32 %VGPR1, %VGPR2
/// %SGPR0 = SI_IF %VCC
/// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0
/// %SGPR0 = SI_ELSE %SGPR0
/// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0
/// SI_END_CF %SGPR0
///
/// becomes:
///
/// %SGPR0 = S_AND_SAVEEXEC_B64 %VCC  // Save and update the exec mask
/// %SGPR0 = S_XOR_B64 %SGPR0, %EXEC  // Clear live bits from saved exec mask
/// S_CBRANCH_EXECZ label0            // This instruction is an optional
///                                   // optimization which allows us to
///                                   // branch if all the bits of
///                                   // EXEC are zero.
/// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0 // Do the IF block of the branch
///
/// label0:
/// %SGPR0 = S_OR_SAVEEXEC_B64 %EXEC  // Restore the exec mask for the Then block
/// %EXEC = S_XOR_B64 %SGPR0, %EXEC   // Clear live bits from saved exec mask
/// S_CBRANCH_EXECZ label1            // Use our branch optimization
///                                   // instruction again.
/// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0 // Do the ELSE block
/// label1:
/// %EXEC = S_OR_B64 %EXEC, %SGPR0    // Re-enable saved exec mask bits
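///
/// Loops follow the same scheme (a summary; see Loop(), Break(), and
/// IfBreak() below for the exact sequences): SI_BREAK and SI_IF_BREAK OR the
/// lanes that have finished the loop into a saved SGPR pair, and SI_LOOP
/// clears those lanes from EXEC with S_ANDN2_B64, branching back to the loop
/// header with S_CBRANCH_EXECNZ while any lane remains active.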
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/Constants.h"

using namespace llvm;

namespace {

class SILowerControlFlowPass : public MachineFunctionPass {

private:
  static const unsigned SkipThreshold = 12;

  static char ID;
  const SIRegisterInfo *TRI;
  const SIInstrInfo *TII;

  bool shouldSkip(MachineBasicBlock *From, MachineBasicBlock *To);

  void Skip(MachineInstr &From, MachineOperand &To);
  void SkipIfDead(MachineInstr &MI);

  void If(MachineInstr &MI);
  void Else(MachineInstr &MI);
  void Break(MachineInstr &MI);
  void IfBreak(MachineInstr &MI);
  void ElseBreak(MachineInstr &MI);
  void Loop(MachineInstr &MI);
  void EndCf(MachineInstr &MI);

  void Kill(MachineInstr &MI);
  void Branch(MachineInstr &MI);

  void LoadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset = 0);
  void computeIndirectRegAndOffset(unsigned VecReg, unsigned &Reg, int &Offset);
  void IndirectSrc(MachineInstr &MI);
  void IndirectDst(MachineInstr &MI);

public:
  SILowerControlFlowPass(TargetMachine &tm) :
    MachineFunctionPass(ID), TRI(nullptr), TII(nullptr) { }

  bool runOnMachineFunction(MachineFunction &MF) override;

  const char *getPassName() const override {
    return "SI Lower control flow instructions";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // End anonymous namespace

char SILowerControlFlowPass::ID = 0;

FunctionPass *llvm::createSILowerControlFlowPass(TargetMachine &tm) {
  return new SILowerControlFlowPass(tm);
}

/// \brief Estimate whether it is profitable to branch over the code between
/// \p From and \p To rather than execute it with all lanes disabled. Returns
/// true once SkipThreshold top-level instructions (a bundle counts once)
/// have been seen along the fall-through chain of successors.
bool SILowerControlFlowPass::shouldSkip(MachineBasicBlock *From,
                                        MachineBasicBlock *To) {

  unsigned NumInstr = 0;

  for (MachineBasicBlock *MBB = From; MBB != To && !MBB->succ_empty();
       MBB = *MBB->succ_begin()) {

    for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
         NumInstr < SkipThreshold && I != E; ++I) {

      if (I->isBundle() || !I->isBundled())
        if (++NumInstr >= SkipThreshold)
          return true;
    }
  }

  return false;
}
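
/// \brief Insert an S_CBRANCH_EXECZ from \p From to \p To, but only when
/// shouldSkip() estimates that the region is long enough for the branch to
/// be cheaper than falling through it with all lanes disabled.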
void SILowerControlFlowPass::Skip(MachineInstr &From, MachineOperand &To) {

  if (!shouldSkip(*From.getParent()->succ_begin(), To.getMBB()))
    return;

  DebugLoc DL = From.getDebugLoc();
  BuildMI(*From.getParent(), &From, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
    .addOperand(To);
}

void SILowerControlFlowPass::SkipIfDead(MachineInstr &MI) {

  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  if (MBB.getParent()->getInfo<SIMachineFunctionInfo>()->getShaderType() !=
      ShaderType::PIXEL ||
      !shouldSkip(&MBB, &MBB.getParent()->back()))
    return;

  MachineBasicBlock::iterator Insert = &MI;
  ++Insert;

  // If the exec mask is non-zero, skip the next two instructions
  BuildMI(MBB, Insert, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
    .addImm(3);

  // Exec mask is zero: Export to NULL target...
  BuildMI(MBB, Insert, DL, TII->get(AMDGPU::EXP))
    .addImm(0)
    .addImm(0x09) // V_008DFC_SQ_EXP_NULL
    .addImm(0)
    .addImm(1)
    .addImm(1)
    .addReg(AMDGPU::VGPR0)
    .addReg(AMDGPU::VGPR0)
    .addReg(AMDGPU::VGPR0)
    .addReg(AMDGPU::VGPR0);

  // ... and terminate wavefront
  BuildMI(MBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM));
}

void SILowerControlFlowPass::If(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  unsigned Reg = MI.getOperand(0).getReg();
  unsigned Vcc = MI.getOperand(1).getReg();

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), Reg)
    .addReg(Vcc);

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), Reg)
    .addReg(AMDGPU::EXEC)
    .addReg(Reg);

  Skip(MI, MI.getOperand(2));

  MI.eraseFromParent();
}

void SILowerControlFlowPass::Else(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Src = MI.getOperand(1).getReg();

  BuildMI(MBB, MBB.getFirstNonPHI(), DL,
          TII->get(AMDGPU::S_OR_SAVEEXEC_B64), Dst)
    .addReg(Src); // Saved EXEC

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
    .addReg(AMDGPU::EXEC)
    .addReg(Dst);

  Skip(MI, MI.getOperand(2));

  MI.eraseFromParent();
}

void SILowerControlFlowPass::Break(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Src = MI.getOperand(1).getReg();

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
    .addReg(AMDGPU::EXEC)
    .addReg(Src);

  MI.eraseFromParent();
}
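
/// \brief Lower SI_IF_BREAK: OR the lanes for which the break condition
/// \p Vcc holds into the mask \p Src of lanes that have already left the
/// loop; Loop() later removes the accumulated lanes from EXEC on the back
/// edge.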
void SILowerControlFlowPass::IfBreak(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Vcc = MI.getOperand(1).getReg();
  unsigned Src = MI.getOperand(2).getReg();

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
    .addReg(Vcc)
    .addReg(Src);

  MI.eraseFromParent();
}

void SILowerControlFlowPass::ElseBreak(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Saved = MI.getOperand(1).getReg();
  unsigned Src = MI.getOperand(2).getReg();

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
    .addReg(Saved)
    .addReg(Src);

  MI.eraseFromParent();
}

void SILowerControlFlowPass::Loop(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  unsigned Src = MI.getOperand(0).getReg();

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ANDN2_B64), AMDGPU::EXEC)
    .addReg(AMDGPU::EXEC)
    .addReg(Src);

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
    .addOperand(MI.getOperand(1));

  MI.eraseFromParent();
}

void SILowerControlFlowPass::EndCf(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  unsigned Reg = MI.getOperand(0).getReg();

  BuildMI(MBB, MBB.getFirstNonPHI(), DL,
          TII->get(AMDGPU::S_OR_B64), AMDGPU::EXEC)
    .addReg(AMDGPU::EXEC)
    .addReg(Reg);

  MI.eraseFromParent();
}

void SILowerControlFlowPass::Branch(MachineInstr &MI) {
  if (MI.getOperand(0).getMBB() == MI.getParent()->getNextNode())
    MI.eraseFromParent();

  // If these aren't equal, this is probably an infinite loop.
}

void SILowerControlFlowPass::Kill(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  const MachineOperand &Op = MI.getOperand(0);

#ifndef NDEBUG
  const SIMachineFunctionInfo *MFI
    = MBB.getParent()->getInfo<SIMachineFunctionInfo>();
  // Kill is only allowed in pixel / geometry shaders.
  assert(MFI->getShaderType() == ShaderType::PIXEL ||
         MFI->getShaderType() == ShaderType::GEOMETRY);
#endif

  // Clear this thread from the exec mask if the operand is negative
  if (Op.isImm()) {
    // Constant operand: Set exec mask to 0 or do nothing
    if (Op.getImm() & 0x80000000) {
      BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
        .addImm(0);
    }
  } else {
    // V_CMPX writes the per-lane result of (0 <= Op) to EXEC, disabling
    // every lane whose operand is negative.
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMPX_LE_F32_e32))
      .addImm(0)
      .addOperand(Op);
  }

  MI.eraseFromParent();
}
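
/// \brief Set M0 to the index needed by \p MovRel and insert \p MovRel.
///
/// An SGPR index is copied (plus \p Offset) straight into M0. A VGPR index
/// may differ between lanes, so a "waterfall" loop is emitted instead; as a
/// rough sketch (register roles follow the code below, and the optional
/// \p Offset add is omitted):
///
/// %Save = S_MOV_B64 %EXEC             // Save the exec mask
/// loop:
/// %VCC_LO = V_READFIRSTLANE_B32 %Idx  // Pick the index of one active lane
/// %M0 = S_MOV_B32 %VCC_LO
/// V_CMP_EQ_U32 %M0, %Idx              // Which lanes share that index?
/// %VCC = S_AND_SAVEEXEC_B64 %VCC      // Restrict EXEC to exactly them...
/// <MovRel>                            // ...and do the move for those lanes
/// %EXEC = S_XOR_B64 %EXEC, %VCC       // Switch the finished lanes off
/// S_CBRANCH_EXECNZ loop               // Any lanes with other indices left?
/// %EXEC = S_MOV_B64 %Save             // Restore the exec mask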
void SILowerControlFlowPass::LoadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset) {

  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  MachineBasicBlock::iterator I = MI;

  unsigned Save = MI.getOperand(1).getReg();
  unsigned Idx = MI.getOperand(3).getReg();

  if (AMDGPU::SReg_32RegClass.contains(Idx)) {
    if (Offset) {
      BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
        .addReg(Idx)
        .addImm(Offset);
    } else {
      BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
        .addReg(Idx);
    }
    MBB.insert(I, MovRel);
  } else {

    assert(AMDGPU::SReg_64RegClass.contains(Save));
    assert(AMDGPU::VGPR_32RegClass.contains(Idx));

    // Save the EXEC mask
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), Save)
      .addReg(AMDGPU::EXEC);

    // Read the next variant into VCC (lower 32 bits) <- also loop target
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
            AMDGPU::VCC_LO)
      .addReg(Idx);

    // Move index from VCC into M0
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
      .addReg(AMDGPU::VCC_LO);

    // Compare the just read M0 value to all possible Idx values
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e32))
      .addReg(AMDGPU::M0)
      .addReg(Idx);

    // Update EXEC, save the original EXEC value to VCC
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), AMDGPU::VCC)
      .addReg(AMDGPU::VCC);

    if (Offset) {
      BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
        .addReg(AMDGPU::M0)
        .addImm(Offset);
    }
    // Do the actual move
    MBB.insert(I, MovRel);

    // Update EXEC, switch all done bits to 0 and all todo bits to 1
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
      .addReg(AMDGPU::EXEC)
      .addReg(AMDGPU::VCC);

    // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
      .addImm(-7);

    // Restore EXEC
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
      .addReg(Save);

  }
  MI.eraseFromParent();
}

/// \param VecReg The register which holds element zero of the vector being
///               addressed into.
/// \param[out] Reg The base register to use in the indirect addressing
///                 instruction.
/// \param[in,out] Offset As an input, this is the constant offset part of the
///                       indirect Index, e.g. v0 = v[VecReg + Offset]. As an
///                       output, this is a constant value that needs to be
///                       added to the value stored in M0.
void SILowerControlFlowPass::computeIndirectRegAndOffset(unsigned VecReg,
                                                         unsigned &Reg,
                                                         int &Offset) {
  unsigned SubReg = TRI->getSubReg(VecReg, AMDGPU::sub0);
  if (!SubReg)
    SubReg = VecReg;

  const TargetRegisterClass *RC = TRI->getPhysRegClass(SubReg);
  int RegIdx = TRI->getHWRegIndex(SubReg) + Offset;

  if (RegIdx < 0) {
    Offset = RegIdx;
    RegIdx = 0;
  } else {
    Offset = 0;
  }

  Reg = RC->getRegister(RegIdx);
}
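
/// \brief Lower SI_INDIRECT_SRC: read a single element out of a vector of
/// VGPRs. V_MOVRELS_B32 reads from VGPR[base + M0], so the element index is
/// placed in M0 via LoadM0(); IndirectDst() below handles the corresponding
/// writes with V_MOVRELD_B32.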
void SILowerControlFlowPass::IndirectSrc(MachineInstr &MI) {

  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Vec = MI.getOperand(2).getReg();
  int Off = MI.getOperand(4).getImm();
  unsigned Reg;

  computeIndirectRegAndOffset(Vec, Reg, Off);

  MachineInstr *MovRel =
    BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
      .addReg(Reg)
      .addReg(Vec, RegState::Implicit);

  LoadM0(MI, MovRel, Off);
}

void SILowerControlFlowPass::IndirectDst(MachineInstr &MI) {

  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  unsigned Dst = MI.getOperand(0).getReg();
  int Off = MI.getOperand(4).getImm();
  unsigned Val = MI.getOperand(5).getReg();
  unsigned Reg;

  computeIndirectRegAndOffset(Dst, Reg, Off);

  MachineInstr *MovRel =
    BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELD_B32_e32))
      .addReg(Reg, RegState::Define)
      .addReg(Val)
      .addReg(Dst, RegState::Implicit);

  LoadM0(MI, MovRel, Off);
}
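
/// \brief Replace all SI_* control flow and indirect addressing pseudos with
/// real instructions, tracking nesting depth for kills and noting whether
/// whole quad mode or flat scratch initialization is needed afterwards.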
bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
  TII = static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
  TRI =
      static_cast<const SIRegisterInfo *>(MF.getSubtarget().getRegisterInfo());
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  bool HaveKill = false;
  bool NeedWQM = false;
  bool NeedFlat = false;
  unsigned Depth = 0;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
       BI != BE; ++BI) {

    MachineBasicBlock &MBB = *BI;
    MachineBasicBlock::iterator I, Next;
    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);

      MachineInstr &MI = *I;
      if (TII->isWQM(MI) || TII->isDS(MI))
        NeedWQM = true;

      // Flat uses m0 in case it needs to access LDS.
      if (TII->isFLAT(MI))
        NeedFlat = true;

      switch (MI.getOpcode()) {
        default: break;
        case AMDGPU::SI_IF:
          ++Depth;
          If(MI);
          break;

        case AMDGPU::SI_ELSE:
          Else(MI);
          break;

        case AMDGPU::SI_BREAK:
          Break(MI);
          break;

        case AMDGPU::SI_IF_BREAK:
          IfBreak(MI);
          break;

        case AMDGPU::SI_ELSE_BREAK:
          ElseBreak(MI);
          break;

        case AMDGPU::SI_LOOP:
          ++Depth;
          Loop(MI);
          break;

        case AMDGPU::SI_END_CF:
          if (--Depth == 0 && HaveKill) {
            SkipIfDead(MI);
            HaveKill = false;
          }
          EndCf(MI);
          break;

        case AMDGPU::SI_KILL:
          if (Depth == 0)
            SkipIfDead(MI);
          else
            HaveKill = true;
          Kill(MI);
          break;

        case AMDGPU::S_BRANCH:
          Branch(MI);
          break;

        case AMDGPU::SI_INDIRECT_SRC_V1:
        case AMDGPU::SI_INDIRECT_SRC_V2:
        case AMDGPU::SI_INDIRECT_SRC_V4:
        case AMDGPU::SI_INDIRECT_SRC_V8:
        case AMDGPU::SI_INDIRECT_SRC_V16:
          IndirectSrc(MI);
          break;

        case AMDGPU::SI_INDIRECT_DST_V1:
        case AMDGPU::SI_INDIRECT_DST_V2:
        case AMDGPU::SI_INDIRECT_DST_V4:
        case AMDGPU::SI_INDIRECT_DST_V8:
        case AMDGPU::SI_INDIRECT_DST_V16:
          IndirectDst(MI);
          break;
      }
    }
  }
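
  // S_WQM_B64 enables, for each 4-lane quad, all four lanes if any one of
  // them is live. Pixel shaders that sample images or use LDS need this
  // whole quad mode so that the helper lanes feeding derivative computations
  // execute alongside the live pixels.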
  if (NeedWQM && MFI->getShaderType() == ShaderType::PIXEL) {
    MachineBasicBlock &MBB = MF.front();
    BuildMI(MBB, MBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
            AMDGPU::EXEC).addReg(AMDGPU::EXEC);
  }

  // FIXME: This seems inappropriate to do here.
  if (NeedFlat && MFI->IsKernel) {
    // Insert the prologue initializing the SGPRs pointing to the scratch space
    // for flat accesses.
    const MachineFrameInfo *FrameInfo = MF.getFrameInfo();

    // TODO: What to use with function calls?

    // FIXME: This is reporting stack size that is used in a scratch buffer
    // rather than registers as well.
    uint64_t StackSizeBytes = FrameInfo->getStackSize();

    int IndirectBegin
      = static_cast<const AMDGPUInstrInfo*>(TII)->getIndirectIndexBegin(MF);
    // Convert register index to 256-byte unit.
    uint64_t StackOffset = IndirectBegin < 0 ? 0 : (4 * IndirectBegin / 256);

    assert(StackSizeBytes < 0xffff && StackOffset < 0xffff &&
           "Stack limits should be smaller than 16-bits");

    // Initialize the flat scratch register pair.
    // TODO: Can we use one s_mov_b64 here?

    // Offset is in units of 256-bytes.
    MachineBasicBlock &MBB = MF.front();
    DebugLoc NoDL;
    MachineBasicBlock::iterator Start = MBB.getFirstNonPHI();
    const MCInstrDesc &SMovK = TII->get(AMDGPU::S_MOVK_I32);

    assert(isInt<16>(StackOffset) && isInt<16>(StackSizeBytes));

    BuildMI(MBB, Start, NoDL, SMovK, AMDGPU::FLAT_SCR_LO)
      .addImm(StackOffset);

    // Documentation says size is "per-thread scratch size in bytes"
    BuildMI(MBB, Start, NoDL, SMovK, AMDGPU::FLAT_SCR_HI)
      .addImm(StackSizeBytes);
  }

  return true;
}