1//===-- SIInsertSkips.cpp - Use predicates for control flow ---------------===// 2// 3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4// See https://llvm.org/LICENSE.txt for license information. 5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6// 7//===----------------------------------------------------------------------===// 8// 9/// \file 10/// This pass inserts branches on the 0 exec mask over divergent branches 11/// branches when it's expected that jumping over the untaken control flow will 12/// be cheaper than having every workitem no-op through it. 13// 14//===----------------------------------------------------------------------===// 15 16#include "AMDGPU.h" 17#include "AMDGPUSubtarget.h" 18#include "SIInstrInfo.h" 19#include "SIMachineFunctionInfo.h" 20#include "MCTargetDesc/AMDGPUMCTargetDesc.h" 21#include "llvm/ADT/DepthFirstIterator.h" 22#include "llvm/ADT/SmallVector.h" 23#include "llvm/ADT/StringRef.h" 24#include "llvm/CodeGen/MachineBasicBlock.h" 25#include "llvm/CodeGen/MachineDominators.h" 26#include "llvm/CodeGen/MachineFunction.h" 27#include "llvm/CodeGen/MachineFunctionPass.h" 28#include "llvm/CodeGen/MachineInstr.h" 29#include "llvm/CodeGen/MachineInstrBuilder.h" 30#include "llvm/CodeGen/MachineOperand.h" 31#include "llvm/IR/CallingConv.h" 32#include "llvm/IR/DebugLoc.h" 33#include "llvm/InitializePasses.h" 34#include "llvm/MC/MCAsmInfo.h" 35#include "llvm/Pass.h" 36#include "llvm/Support/CommandLine.h" 37#include "llvm/Target/TargetMachine.h" 38#include <cassert> 39#include <cstdint> 40#include <iterator> 41 42using namespace llvm; 43 44#define DEBUG_TYPE "si-insert-skips" 45 46static cl::opt<unsigned> SkipThresholdFlag( 47 "amdgpu-skip-threshold-legacy", 48 cl::desc("Number of instructions before jumping over divergent control flow"), 49 cl::init(12), cl::Hidden); 50 51namespace { 52 53class SIInsertSkips : public MachineFunctionPass { 54private: 55 const SIRegisterInfo *TRI = nullptr; 56 const SIInstrInfo *TII = nullptr; 57 unsigned SkipThreshold = 0; 58 MachineDominatorTree *MDT = nullptr; 59 60 MachineBasicBlock *EarlyExitBlock = nullptr; 61 62 bool shouldSkip(const MachineBasicBlock &From, 63 const MachineBasicBlock &To) const; 64 65 bool dominatesAllReachable(MachineBasicBlock &MBB); 66 void createEarlyExitBlock(MachineBasicBlock &MBB); 67 void skipIfDead(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, 68 DebugLoc DL); 69 70 bool kill(MachineInstr &MI); 71 72 bool skipMaskBranch(MachineInstr &MI, MachineBasicBlock &MBB); 73 74public: 75 static char ID; 76 77 SIInsertSkips() : MachineFunctionPass(ID) {} 78 79 bool runOnMachineFunction(MachineFunction &MF) override; 80 81 StringRef getPassName() const override { 82 return "SI insert s_cbranch_execz instructions"; 83 } 84 85 void getAnalysisUsage(AnalysisUsage &AU) const override { 86 AU.addRequired<MachineDominatorTree>(); 87 AU.addPreserved<MachineDominatorTree>(); 88 MachineFunctionPass::getAnalysisUsage(AU); 89 } 90}; 91 92} // end anonymous namespace 93 94char SIInsertSkips::ID = 0; 95 96INITIALIZE_PASS_BEGIN(SIInsertSkips, DEBUG_TYPE, 97 "SI insert s_cbranch_execz instructions", false, false) 98INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) 99INITIALIZE_PASS_END(SIInsertSkips, DEBUG_TYPE, 100 "SI insert s_cbranch_execz instructions", false, false) 101 102char &llvm::SIInsertSkipsPassID = SIInsertSkips::ID; 103 104static bool opcodeEmitsNoInsts(const MachineInstr &MI) { 105 if (MI.isMetaInstruction()) 106 return true; 107 108 // Handle target specific opcodes. 109 switch (MI.getOpcode()) { 110 case AMDGPU::SI_MASK_BRANCH: 111 return true; 112 default: 113 return false; 114 } 115} 116 117bool SIInsertSkips::shouldSkip(const MachineBasicBlock &From, 118 const MachineBasicBlock &To) const { 119 unsigned NumInstr = 0; 120 const MachineFunction *MF = From.getParent(); 121 122 for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end(); 123 MBBI != End && MBBI != ToI; ++MBBI) { 124 const MachineBasicBlock &MBB = *MBBI; 125 126 for (MachineBasicBlock::const_iterator I = MBB.begin(), E = MBB.end(); 127 NumInstr < SkipThreshold && I != E; ++I) { 128 if (opcodeEmitsNoInsts(*I)) 129 continue; 130 131 // FIXME: Since this is required for correctness, this should be inserted 132 // during SILowerControlFlow. 133 134 // When a uniform loop is inside non-uniform control flow, the branch 135 // leaving the loop might be an S_CBRANCH_VCCNZ, which is never taken 136 // when EXEC = 0. We should skip the loop lest it becomes infinite. 137 if (I->getOpcode() == AMDGPU::S_CBRANCH_VCCNZ || 138 I->getOpcode() == AMDGPU::S_CBRANCH_VCCZ) 139 return true; 140 141 if (TII->hasUnwantedEffectsWhenEXECEmpty(*I)) 142 return true; 143 144 // These instructions are potentially expensive even if EXEC = 0. 145 if (TII->isSMRD(*I) || TII->isVMEM(*I) || TII->isFLAT(*I) || 146 I->getOpcode() == AMDGPU::S_WAITCNT) 147 return true; 148 149 ++NumInstr; 150 if (NumInstr >= SkipThreshold) 151 return true; 152 } 153 } 154 155 return false; 156} 157 158/// Check whether \p MBB dominates all blocks that are reachable from it. 159bool SIInsertSkips::dominatesAllReachable(MachineBasicBlock &MBB) { 160 for (MachineBasicBlock *Other : depth_first(&MBB)) { 161 if (!MDT->dominates(&MBB, Other)) 162 return false; 163 } 164 return true; 165} 166 167static void generatePsEndPgm(MachineBasicBlock &MBB, 168 MachineBasicBlock::iterator I, DebugLoc DL, 169 const SIInstrInfo *TII) { 170 // Generate "null export; s_endpgm". 171 BuildMI(MBB, I, DL, TII->get(AMDGPU::EXP_DONE)) 172 .addImm(0x09) // V_008DFC_SQ_EXP_NULL 173 .addReg(AMDGPU::VGPR0, RegState::Undef) 174 .addReg(AMDGPU::VGPR0, RegState::Undef) 175 .addReg(AMDGPU::VGPR0, RegState::Undef) 176 .addReg(AMDGPU::VGPR0, RegState::Undef) 177 .addImm(1) // vm 178 .addImm(0) // compr 179 .addImm(0); // en 180 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ENDPGM)).addImm(0); 181} 182 183void SIInsertSkips::createEarlyExitBlock(MachineBasicBlock &MBB) { 184 MachineFunction *MF = MBB.getParent(); 185 DebugLoc DL; 186 187 assert(!EarlyExitBlock); 188 EarlyExitBlock = MF->CreateMachineBasicBlock(); 189 MF->insert(MF->end(), EarlyExitBlock); 190 191 generatePsEndPgm(*EarlyExitBlock, EarlyExitBlock->end(), DL, TII); 192} 193 194/// Insert an "if exec=0 { null export; s_endpgm }" sequence before the given 195/// iterator. Only applies to pixel shaders. 196void SIInsertSkips::skipIfDead(MachineBasicBlock &MBB, 197 MachineBasicBlock::iterator I, DebugLoc DL) { 198 MachineFunction *MF = MBB.getParent(); 199 assert(MF->getFunction().getCallingConv() == CallingConv::AMDGPU_PS); 200 201 // It is possible for an SI_KILL_*_TERMINATOR to sit at the bottom of a 202 // basic block that has no further successors (e.g., there was an 203 // `unreachable` there in IR). This can happen with original source of the 204 // form: 205 // 206 // if (uniform_condition) { 207 // write_to_memory(); 208 // discard; 209 // } 210 // 211 // In this case, we write the "null_export; s_endpgm" skip code in the 212 // already-existing basic block. 213 auto NextBBI = std::next(MBB.getIterator()); 214 bool NoSuccessor = I == MBB.end() && 215 llvm::find(MBB.successors(), &*NextBBI) == MBB.succ_end(); 216 217 if (NoSuccessor) { 218 generatePsEndPgm(MBB, I, DL, TII); 219 } else { 220 if (!EarlyExitBlock) { 221 createEarlyExitBlock(MBB); 222 // Update next block pointer to reflect any new blocks 223 NextBBI = std::next(MBB.getIterator()); 224 } 225 226 auto BranchMI = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ)) 227 .addMBB(EarlyExitBlock); 228 229 // Split the block if the branch will not come at the end. 230 auto Next = std::next(BranchMI->getIterator()); 231 if (Next != MBB.end() && !Next->isTerminator()) { 232 MachineBasicBlock *SplitBB = 233 MF->CreateMachineBasicBlock(MBB.getBasicBlock()); 234 MF->insert(NextBBI, SplitBB); 235 SplitBB->splice(SplitBB->begin(), &MBB, I, MBB.end()); 236 SplitBB->transferSuccessorsAndUpdatePHIs(&MBB); 237 // FIXME: the expectation is that this will be used near the beginning 238 // of a block so just assume all registers are still live. 239 for (auto LiveIn : MBB.liveins()) 240 SplitBB->addLiveIn(LiveIn); 241 MBB.addSuccessor(SplitBB); 242 243 // Update dominator tree 244 using DomTreeT = DomTreeBase<MachineBasicBlock>; 245 SmallVector<DomTreeT::UpdateType, 16> DTUpdates; 246 for (MachineBasicBlock *Succ : SplitBB->successors()) { 247 DTUpdates.push_back({DomTreeT::Insert, SplitBB, Succ}); 248 DTUpdates.push_back({DomTreeT::Delete, &MBB, Succ}); 249 } 250 DTUpdates.push_back({DomTreeT::Insert, &MBB, SplitBB}); 251 MDT->getBase().applyUpdates(DTUpdates); 252 } 253 254 MBB.addSuccessor(EarlyExitBlock); 255 MDT->getBase().insertEdge(&MBB, EarlyExitBlock); 256 } 257} 258 259/// Translate a SI_KILL_*_TERMINATOR into exec-manipulating instructions. 260/// Return true unless the terminator is a no-op. 261bool SIInsertSkips::kill(MachineInstr &MI) { 262 MachineBasicBlock &MBB = *MI.getParent(); 263 DebugLoc DL = MI.getDebugLoc(); 264 265 switch (MI.getOpcode()) { 266 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: { 267 unsigned Opcode = 0; 268 269 // The opcodes are inverted because the inline immediate has to be 270 // the first operand, e.g. from "x < imm" to "imm > x" 271 switch (MI.getOperand(2).getImm()) { 272 case ISD::SETOEQ: 273 case ISD::SETEQ: 274 Opcode = AMDGPU::V_CMPX_EQ_F32_e64; 275 break; 276 case ISD::SETOGT: 277 case ISD::SETGT: 278 Opcode = AMDGPU::V_CMPX_LT_F32_e64; 279 break; 280 case ISD::SETOGE: 281 case ISD::SETGE: 282 Opcode = AMDGPU::V_CMPX_LE_F32_e64; 283 break; 284 case ISD::SETOLT: 285 case ISD::SETLT: 286 Opcode = AMDGPU::V_CMPX_GT_F32_e64; 287 break; 288 case ISD::SETOLE: 289 case ISD::SETLE: 290 Opcode = AMDGPU::V_CMPX_GE_F32_e64; 291 break; 292 case ISD::SETONE: 293 case ISD::SETNE: 294 Opcode = AMDGPU::V_CMPX_LG_F32_e64; 295 break; 296 case ISD::SETO: 297 Opcode = AMDGPU::V_CMPX_O_F32_e64; 298 break; 299 case ISD::SETUO: 300 Opcode = AMDGPU::V_CMPX_U_F32_e64; 301 break; 302 case ISD::SETUEQ: 303 Opcode = AMDGPU::V_CMPX_NLG_F32_e64; 304 break; 305 case ISD::SETUGT: 306 Opcode = AMDGPU::V_CMPX_NGE_F32_e64; 307 break; 308 case ISD::SETUGE: 309 Opcode = AMDGPU::V_CMPX_NGT_F32_e64; 310 break; 311 case ISD::SETULT: 312 Opcode = AMDGPU::V_CMPX_NLE_F32_e64; 313 break; 314 case ISD::SETULE: 315 Opcode = AMDGPU::V_CMPX_NLT_F32_e64; 316 break; 317 case ISD::SETUNE: 318 Opcode = AMDGPU::V_CMPX_NEQ_F32_e64; 319 break; 320 default: 321 llvm_unreachable("invalid ISD:SET cond code"); 322 } 323 324 const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>(); 325 if (ST.hasNoSdstCMPX()) 326 Opcode = AMDGPU::getVCMPXNoSDstOp(Opcode); 327 328 assert(MI.getOperand(0).isReg()); 329 330 if (TRI->isVGPR(MBB.getParent()->getRegInfo(), 331 MI.getOperand(0).getReg())) { 332 Opcode = AMDGPU::getVOPe32(Opcode); 333 BuildMI(MBB, &MI, DL, TII->get(Opcode)) 334 .add(MI.getOperand(1)) 335 .add(MI.getOperand(0)); 336 } else { 337 auto I = BuildMI(MBB, &MI, DL, TII->get(Opcode)); 338 if (!ST.hasNoSdstCMPX()) 339 I.addReg(AMDGPU::VCC, RegState::Define); 340 341 I.addImm(0) // src0 modifiers 342 .add(MI.getOperand(1)) 343 .addImm(0) // src1 modifiers 344 .add(MI.getOperand(0)); 345 346 I.addImm(0); // omod 347 } 348 return true; 349 } 350 case AMDGPU::SI_KILL_I1_TERMINATOR: { 351 const MachineFunction *MF = MI.getParent()->getParent(); 352 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); 353 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; 354 const MachineOperand &Op = MI.getOperand(0); 355 int64_t KillVal = MI.getOperand(1).getImm(); 356 assert(KillVal == 0 || KillVal == -1); 357 358 // Kill all threads if Op0 is an immediate and equal to the Kill value. 359 if (Op.isImm()) { 360 int64_t Imm = Op.getImm(); 361 assert(Imm == 0 || Imm == -1); 362 363 if (Imm == KillVal) { 364 BuildMI(MBB, &MI, DL, TII->get(ST.isWave32() ? AMDGPU::S_MOV_B32 365 : AMDGPU::S_MOV_B64), Exec) 366 .addImm(0); 367 return true; 368 } 369 return false; 370 } 371 372 unsigned Opcode = KillVal ? AMDGPU::S_ANDN2_B64 : AMDGPU::S_AND_B64; 373 if (ST.isWave32()) 374 Opcode = KillVal ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_AND_B32; 375 BuildMI(MBB, &MI, DL, TII->get(Opcode), Exec) 376 .addReg(Exec) 377 .add(Op); 378 return true; 379 } 380 default: 381 llvm_unreachable("invalid opcode, expected SI_KILL_*_TERMINATOR"); 382 } 383} 384 385// Returns true if a branch over the block was inserted. 386bool SIInsertSkips::skipMaskBranch(MachineInstr &MI, 387 MachineBasicBlock &SrcMBB) { 388 MachineBasicBlock *DestBB = MI.getOperand(0).getMBB(); 389 390 if (!shouldSkip(**SrcMBB.succ_begin(), *DestBB)) 391 return false; 392 393 const DebugLoc &DL = MI.getDebugLoc(); 394 MachineBasicBlock::iterator InsPt = std::next(MI.getIterator()); 395 396 BuildMI(SrcMBB, InsPt, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ)) 397 .addMBB(DestBB); 398 399 return true; 400} 401 402bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) { 403 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 404 TII = ST.getInstrInfo(); 405 TRI = &TII->getRegisterInfo(); 406 MDT = &getAnalysis<MachineDominatorTree>(); 407 SkipThreshold = SkipThresholdFlag; 408 409 SmallVector<MachineInstr *, 4> KillInstrs; 410 bool MadeChange = false; 411 412 for (MachineBasicBlock &MBB : MF) { 413 MachineBasicBlock::iterator I, Next; 414 for (I = MBB.begin(); I != MBB.end(); I = Next) { 415 Next = std::next(I); 416 MachineInstr &MI = *I; 417 418 switch (MI.getOpcode()) { 419 case AMDGPU::SI_MASK_BRANCH: 420 MadeChange |= skipMaskBranch(MI, MBB); 421 break; 422 423 case AMDGPU::S_BRANCH: 424 // Optimize out branches to the next block. 425 // FIXME: Shouldn't this be handled by BranchFolding? 426 if (MBB.isLayoutSuccessor(MI.getOperand(0).getMBB())) { 427 assert(&MI == &MBB.back()); 428 MI.eraseFromParent(); 429 MadeChange = true; 430 } 431 break; 432 433 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: 434 case AMDGPU::SI_KILL_I1_TERMINATOR: { 435 MadeChange = true; 436 bool CanKill = kill(MI); 437 438 // Check if we can add an early "if exec=0 { end shader }". 439 // 440 // Note that we _always_ do this if it is correct, even if the kill 441 // happens fairly late in the shader, because the null export should 442 // generally still be cheaper than normal export(s). 443 // 444 // TODO: The dominatesAllReachable check is conservative: if the 445 // dominance is only missing due to _uniform_ branches, we could 446 // in fact insert the early-exit as well. 447 if (CanKill && 448 MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS && 449 dominatesAllReachable(MBB)) { 450 // Mark the instruction for kill-if-dead insertion. We delay this 451 // change because it modifies the CFG. 452 KillInstrs.push_back(&MI); 453 } else { 454 MI.eraseFromParent(); 455 } 456 break; 457 } 458 459 case AMDGPU::SI_KILL_CLEANUP: 460 if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS && 461 dominatesAllReachable(MBB)) { 462 KillInstrs.push_back(&MI); 463 } else { 464 MI.eraseFromParent(); 465 } 466 break; 467 468 default: 469 break; 470 } 471 } 472 } 473 474 for (MachineInstr *Kill : KillInstrs) { 475 skipIfDead(*Kill->getParent(), std::next(Kill->getIterator()), 476 Kill->getDebugLoc()); 477 Kill->eraseFromParent(); 478 } 479 KillInstrs.clear(); 480 EarlyExitBlock = nullptr; 481 482 return MadeChange; 483} 484