R600ControlFlowFinalizer.cpp revision 256281
//===-- R600ControlFlowFinalizer.cpp - Finalize Control Flow Inst----------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass turns all control flow pseudo instructions into native ones,
/// computing their address on the fly; it also sets the STACK_SIZE info.
//===----------------------------------------------------------------------===//

#define DEBUG_TYPE "r600cf"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

#include "AMDGPU.h"
#include "R600Defines.h"
#include "R600InstrInfo.h"
#include "R600MachineFunctionInfo.h"
#include "R600RegisterInfo.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"

#include <algorithm>
#include <set>
#include <vector>

namespace llvm {

class R600ControlFlowFinalizer : public MachineFunctionPass {

private:
  typedef std::pair<MachineInstr *, std::vector<MachineInstr *> > ClauseFile;

  enum ControlFlowInstruction {
    CF_TC,
    CF_VC,
    CF_CALL_FS,
    CF_WHILE_LOOP,
    CF_END_LOOP,
    CF_LOOP_BREAK,
    CF_LOOP_CONTINUE,
    CF_JUMP,
    CF_ELSE,
    CF_POP,
    CF_END
  };

  static char ID;
  const R600InstrInfo *TII;
  const R600RegisterInfo &TRI;
  unsigned MaxFetchInst;
  const AMDGPUSubtarget &ST;

  bool IsTrivialInst(MachineInstr *MI) const {
    switch (MI->getOpcode()) {
    case AMDGPU::KILL:
    case AMDGPU::RETURN:
      return true;
    default:
      return false;
    }
  }

  const MCInstrDesc &getHWInstrDesc(ControlFlowInstruction CFI) const {
    unsigned Opcode = 0;
    bool isEg = (ST.device()->getGeneration() >= AMDGPUDeviceInfo::HD5XXX);
    switch (CFI) {
    case CF_TC:
      Opcode = isEg ? AMDGPU::CF_TC_EG : AMDGPU::CF_TC_R600;
      break;
    case CF_VC:
      Opcode = isEg ? AMDGPU::CF_VC_EG : AMDGPU::CF_VC_R600;
      break;
    case CF_CALL_FS:
      Opcode = isEg ? AMDGPU::CF_CALL_FS_EG : AMDGPU::CF_CALL_FS_R600;
      break;
    case CF_WHILE_LOOP:
      Opcode = isEg ? AMDGPU::WHILE_LOOP_EG : AMDGPU::WHILE_LOOP_R600;
      break;
    case CF_END_LOOP:
      Opcode = isEg ? AMDGPU::END_LOOP_EG : AMDGPU::END_LOOP_R600;
      break;
    case CF_LOOP_BREAK:
      Opcode = isEg ? AMDGPU::LOOP_BREAK_EG : AMDGPU::LOOP_BREAK_R600;
      break;
    case CF_LOOP_CONTINUE:
      Opcode = isEg ? AMDGPU::CF_CONTINUE_EG : AMDGPU::CF_CONTINUE_R600;
      break;
    case CF_JUMP:
      Opcode = isEg ? AMDGPU::CF_JUMP_EG : AMDGPU::CF_JUMP_R600;
      break;
    case CF_ELSE:
      Opcode = isEg ? AMDGPU::CF_ELSE_EG : AMDGPU::CF_ELSE_R600;
      break;
    case CF_POP:
      Opcode = isEg ? AMDGPU::POP_EG : AMDGPU::POP_R600;
      break;
    case CF_END:
      if (ST.device()->getDeviceFlag() == OCL_DEVICE_CAYMAN) {
        // Cayman has its own CF_END encoding.
        Opcode = AMDGPU::CF_END_CM;
        break;
      }
      Opcode = isEg ? AMDGPU::CF_END_EG : AMDGPU::CF_END_R600;
      break;
    }
    assert(Opcode && "No opcode selected");
    return TII->get(Opcode);
  }
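  /// Check whether \p MI may join the fetch clause being formed: a clause
  /// member must not read a register that an earlier member writes, nor
  /// write a register that an earlier member reads. Source registers are
  /// tracked through their 128-bit super-register.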
  bool isCompatibleWithClause(const MachineInstr *MI,
      std::set<unsigned> &DstRegs, std::set<unsigned> &SrcRegs) const {
    unsigned DstMI = 0, SrcMI = 0;
    for (MachineInstr::const_mop_iterator I = MI->operands_begin(),
        E = MI->operands_end(); I != E; ++I) {
      const MachineOperand &MO = *I;
      if (!MO.isReg())
        continue;
      if (MO.isDef())
        DstMI = MO.getReg();
      if (MO.isUse()) {
        unsigned Reg = MO.getReg();
        if (AMDGPU::R600_Reg128RegClass.contains(Reg))
          SrcMI = Reg;
        else
          SrcMI = TRI.getMatchingSuperReg(Reg,
              TRI.getSubRegFromChannel(TRI.getHWRegChan(Reg)),
              &AMDGPU::R600_Reg128RegClass);
      }
    }
    if ((DstRegs.find(SrcMI) == DstRegs.end()) &&
        (SrcRegs.find(DstMI) == SrcRegs.end())) {
      SrcRegs.insert(SrcMI);
      DstRegs.insert(DstMI);
      return true;
    } else
      return false;
  }

  ClauseFile
  MakeFetchClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator &I)
      const {
    MachineBasicBlock::iterator ClauseHead = I;
    std::vector<MachineInstr *> ClauseContent;
    unsigned AluInstCount = 0;
    bool IsTex = TII->usesTextureCache(ClauseHead);
    std::set<unsigned> DstRegs, SrcRegs;
    for (MachineBasicBlock::iterator E = MBB.end(); I != E; ++I) {
      if (IsTrivialInst(I))
        continue;
      if (AluInstCount > MaxFetchInst)
        break;
      // A clause is either all-texture or all-vertex fetches.
      if ((IsTex && !TII->usesTextureCache(I)) ||
          (!IsTex && !TII->usesVertexCache(I)))
        break;
      if (!isCompatibleWithClause(I, DstRegs, SrcRegs))
        break;
      AluInstCount++;
      ClauseContent.push_back(I);
    }
    MachineInstr *MIb = BuildMI(MBB, ClauseHead, MBB.findDebugLoc(ClauseHead),
        getHWInstrDesc(IsTex ? CF_TC : CF_VC))
        .addImm(0) // ADDR
        .addImm(AluInstCount - 1); // COUNT
    return ClauseFile(MIb, ClauseContent);
  }

  void getLiteral(MachineInstr *MI, std::vector<int64_t> &Lits) const {
    unsigned LiteralRegs[] = {
      AMDGPU::ALU_LITERAL_X,
      AMDGPU::ALU_LITERAL_Y,
      AMDGPU::ALU_LITERAL_Z,
      AMDGPU::ALU_LITERAL_W
    };
    for (unsigned i = 0, e = MI->getNumOperands(); i < e; ++i) {
      MachineOperand &MO = MI->getOperand(i);
      if (!MO.isReg())
        continue;
      if (MO.getReg() != AMDGPU::ALU_LITERAL_X)
        continue;
      unsigned ImmIdx = TII->getOperandIdx(MI->getOpcode(), R600Operands::IMM);
      int64_t Imm = MI->getOperand(ImmIdx).getImm();
      std::vector<int64_t>::iterator It =
          std::find(Lits.begin(), Lits.end(), Imm);
      if (It != Lits.end()) {
        // Reuse the slot of an identical literal already in the group.
        unsigned Index = It - Lits.begin();
        MO.setReg(LiteralRegs[Index]);
      } else {
        assert(Lits.size() < 4 && "Too many literals in Instruction Group");
        MO.setReg(LiteralRegs[Lits.size()]);
        Lits.push_back(Imm);
      }
    }
  }

  MachineBasicBlock::iterator insertLiterals(
      MachineBasicBlock::iterator InsertPos,
      const std::vector<unsigned> &Literals) const {
    MachineBasicBlock *MBB = InsertPos->getParent();
    for (unsigned i = 0, e = Literals.size(); i < e; i += 2) {
      unsigned LiteralPair0 = Literals[i];
      unsigned LiteralPair1 = (i + 1 < e) ? Literals[i + 1] : 0;
      InsertPos = BuildMI(MBB, InsertPos->getDebugLoc(),
          TII->get(AMDGPU::LITERALS))
          .addImm(LiteralPair0)
          .addImm(LiteralPair1);
    }
    return InsertPos;
  }
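  /// Gather the instructions of the ALU clause headed by \p I (a CF_ALU or
  /// CF_ALU_PUSH_BEFORE pseudo). Bundled instruction groups are flattened
  /// back into individual instructions, literal operands are folded into
  /// LITERALS instructions emitted after each group, and the clause head's
  /// instruction-count operand is updated.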
  ClauseFile
  MakeALUClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator &I)
      const {
    MachineBasicBlock::iterator ClauseHead = I;
    std::vector<MachineInstr *> ClauseContent;
    I++;
    for (MachineBasicBlock::instr_iterator E = MBB.instr_end(); I != E;) {
      if (IsTrivialInst(I)) {
        ++I;
        continue;
      }
      if (!I->isBundle() && !TII->isALUInstr(I->getOpcode()))
        break;
      std::vector<int64_t> Literals;
      if (I->isBundle()) {
        MachineInstr *DeleteMI = I;
        MachineBasicBlock::instr_iterator BI = I.getInstrIterator();
        while (++BI != E && BI->isBundledWithPred()) {
          BI->unbundleFromPred();
          for (unsigned i = 0, e = BI->getNumOperands(); i != e; ++i) {
            MachineOperand &MO = BI->getOperand(i);
            if (MO.isReg() && MO.isInternalRead())
              MO.setIsInternalRead(false);
          }
          getLiteral(BI, Literals);
          ClauseContent.push_back(BI);
        }
        I = BI;
        DeleteMI->eraseFromParent();
      } else {
        getLiteral(I, Literals);
        ClauseContent.push_back(I);
        I++;
      }
      for (unsigned i = 0, e = Literals.size(); i < e; i += 2) {
        unsigned literal0 = Literals[i];
        unsigned literal2 = (i + 1 < e) ? Literals[i + 1] : 0;
        MachineInstr *MILit = BuildMI(MBB, I, I->getDebugLoc(),
            TII->get(AMDGPU::LITERALS))
            .addImm(literal0)
            .addImm(literal2);
        ClauseContent.push_back(MILit);
      }
    }
    ClauseHead->getOperand(7).setImm(ClauseContent.size() - 1);
    return ClauseFile(ClauseHead, ClauseContent);
  }

  void
  EmitFetchClause(MachineBasicBlock::iterator InsertPos, ClauseFile &Clause,
      unsigned &CfCount) {
    CounterPropagateAddr(Clause.first, CfCount);
    MachineBasicBlock *BB = Clause.first->getParent();
    BuildMI(BB, InsertPos->getDebugLoc(), TII->get(AMDGPU::FETCH_CLAUSE))
        .addImm(CfCount);
    for (unsigned i = 0, e = Clause.second.size(); i < e; ++i) {
      BB->splice(InsertPos, BB, Clause.second[i]);
    }
    // A fetch instruction is twice the size of a CF instruction, so each
    // clause member advances the CF address counter by two.
    CfCount += 2 * Clause.second.size();
  }

  void
  EmitALUClause(MachineBasicBlock::iterator InsertPos, ClauseFile &Clause,
      unsigned &CfCount) {
    CounterPropagateAddr(Clause.first, CfCount);
    MachineBasicBlock *BB = Clause.first->getParent();
    BuildMI(BB, InsertPos->getDebugLoc(), TII->get(AMDGPU::ALU_CLAUSE))
        .addImm(CfCount);
    for (unsigned i = 0, e = Clause.second.size(); i < e; ++i) {
      BB->splice(InsertPos, BB, Clause.second[i]);
    }
    CfCount += Clause.second.size();
  }

  void CounterPropagateAddr(MachineInstr *MI, unsigned Addr) const {
    MI->getOperand(0).setImm(Addr + MI->getOperand(0).getImm());
  }

  void CounterPropagateAddr(const std::set<MachineInstr *> &MIs,
      unsigned Addr) const {
    for (std::set<MachineInstr *>::const_iterator It = MIs.begin(),
        E = MIs.end(); It != E; ++It) {
      MachineInstr *MI = *It;
      CounterPropagateAddr(MI, Addr);
    }
  }

  unsigned getHWStackSize(unsigned StackSubEntry, bool hasPush) const {
    switch (ST.device()->getGeneration()) {
    case AMDGPUDeviceInfo::HD4XXX:
      if (hasPush)
        StackSubEntry += 2;
      break;
    case AMDGPUDeviceInfo::HD5XXX:
      if (hasPush)
        StackSubEntry++;
      // Fall through.
    case AMDGPUDeviceInfo::HD6XXX:
      StackSubEntry += 2;
      break;
    }
    return (StackSubEntry + 3) / 4; // Need ceil value of StackSubEntry/4
  }

public:
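  // Fetch clauses are capped at 8 instructions on HD4XXX (R600-family)
  // parts and at 16 on Evergreen and later, presumably reflecting the
  // hardware clause-size limit; the constructor sets MaxFetchInst
  // accordingly.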
  R600ControlFlowFinalizer(TargetMachine &tm) : MachineFunctionPass(ID),
      TII(static_cast<const R600InstrInfo *>(tm.getInstrInfo())),
      TRI(TII->getRegisterInfo()),
      ST(tm.getSubtarget<AMDGPUSubtarget>()) {
    if (ST.device()->getGeneration() <= AMDGPUDeviceInfo::HD4XXX)
      MaxFetchInst = 8;
    else
      MaxFetchInst = 16;
  }

  virtual bool runOnMachineFunction(MachineFunction &MF) {
    unsigned MaxStack = 0;
    unsigned CurrentStack = 0;
    bool HasPush = false;
    for (MachineFunction::iterator MB = MF.begin(), ME = MF.end(); MB != ME;
        ++MB) {
      MachineBasicBlock &MBB = *MB;
      unsigned CfCount = 0;
      std::vector<std::pair<unsigned, std::set<MachineInstr *> > > LoopStack;
      std::vector<MachineInstr *> IfThenElseStack;
      R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
      if (MFI->ShaderType == 1) {
        // Vertex shaders start by calling the fetch shader.
        BuildMI(MBB, MBB.begin(), MBB.findDebugLoc(MBB.begin()),
            getHWInstrDesc(CF_CALL_FS));
        CfCount++;
        MaxStack = 1;
      }
      std::vector<ClauseFile> FetchClauses, AluClauses;
      for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
          I != E;) {
        if (TII->usesTextureCache(I) || TII->usesVertexCache(I)) {
          DEBUG(dbgs() << CfCount << ":"; I->dump(););
          FetchClauses.push_back(MakeFetchClause(MBB, I));
          CfCount++;
          continue;
        }

        MachineBasicBlock::iterator MI = I;
        I++;
        switch (MI->getOpcode()) {
        case AMDGPU::CF_ALU_PUSH_BEFORE:
          CurrentStack++;
          MaxStack = std::max(MaxStack, CurrentStack);
          HasPush = true;
          // Fall through.
        case AMDGPU::CF_ALU:
          I = MI;
          AluClauses.push_back(MakeALUClause(MBB, I));
          // Fall through.
        case AMDGPU::EG_ExportBuf:
        case AMDGPU::EG_ExportSwz:
        case AMDGPU::R600_ExportBuf:
        case AMDGPU::R600_ExportSwz:
        case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
        case AMDGPU::RAT_WRITE_CACHELESS_128_eg:
          DEBUG(dbgs() << CfCount << ":"; MI->dump(););
          CfCount++;
          break;
        case AMDGPU::WHILELOOP: {
          CurrentStack += 4;
          MaxStack = std::max(MaxStack, CurrentStack);
          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
              getHWInstrDesc(CF_WHILE_LOOP))
              .addImm(1);
          std::pair<unsigned, std::set<MachineInstr *> > Pair(CfCount,
              std::set<MachineInstr *>());
          Pair.second.insert(MIb);
          LoopStack.push_back(Pair);
          MI->eraseFromParent();
          CfCount++;
          break;
        }
        case AMDGPU::ENDLOOP: {
          CurrentStack -= 4;
          std::pair<unsigned, std::set<MachineInstr *> > Pair =
              LoopStack.back();
          LoopStack.pop_back();
          CounterPropagateAddr(Pair.second, CfCount);
          BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_END_LOOP))
              .addImm(Pair.first + 1);
          MI->eraseFromParent();
          CfCount++;
          break;
        }
        case AMDGPU::IF_PREDICATE_SET: {
          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
              getHWInstrDesc(CF_JUMP))
              .addImm(0)
              .addImm(0);
          IfThenElseStack.push_back(MIb);
          DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
          MI->eraseFromParent();
          CfCount++;
          break;
        }
        case AMDGPU::ELSE: {
          MachineInstr *JumpInst = IfThenElseStack.back();
          IfThenElseStack.pop_back();
          CounterPropagateAddr(JumpInst, CfCount);
          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
              getHWInstrDesc(CF_ELSE))
              .addImm(0)
              .addImm(1);
          DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
          IfThenElseStack.push_back(MIb);
          MI->eraseFromParent();
          CfCount++;
          break;
        }
        case AMDGPU::ENDIF: {
          CurrentStack--;
          MachineInstr *IfOrElseInst = IfThenElseStack.back();
          IfThenElseStack.pop_back();
          CounterPropagateAddr(IfOrElseInst, CfCount + 1);
          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
              getHWInstrDesc(CF_POP))
              .addImm(CfCount + 1)
              .addImm(1);
          (void)MIb; // Only referenced by the DEBUG dump below.
          DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
          MI->eraseFromParent();
          CfCount++;
          break;
        }
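        // PREDICATED_BREAK expands to three CF instructions (CF_JUMP,
        // CF_LOOP_BREAK, CF_POP), so CfCount advances by 3. The break's
        // target address is patched in later, once the matching ENDLOOP's
        // address is known.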
        case AMDGPU::PREDICATED_BREAK: {
          CurrentStack--;
          CfCount += 3;
          BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_JUMP))
              .addImm(CfCount)
              .addImm(1);
          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
              getHWInstrDesc(CF_LOOP_BREAK))
              .addImm(0);
          BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_POP))
              .addImm(CfCount)
              .addImm(1);
          LoopStack.back().second.insert(MIb);
          MI->eraseFromParent();
          break;
        }
        case AMDGPU::CONTINUE: {
          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
              getHWInstrDesc(CF_LOOP_CONTINUE))
              .addImm(0);
          LoopStack.back().second.insert(MIb);
          MI->eraseFromParent();
          CfCount++;
          break;
        }
        case AMDGPU::RETURN: {
          BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_END));
          CfCount++;
          MI->eraseFromParent();
          if (CfCount % 2) {
            // Pad to an even CF address before emitting the clause bodies.
            BuildMI(MBB, I, MBB.findDebugLoc(MI), TII->get(AMDGPU::PAD));
            CfCount++;
          }
          for (unsigned i = 0, e = FetchClauses.size(); i < e; i++)
            EmitFetchClause(I, FetchClauses[i], CfCount);
          for (unsigned i = 0, e = AluClauses.size(); i < e; i++)
            EmitALUClause(I, AluClauses[i], CfCount);
        }
        default:
          break;
        }
      }
      MFI->StackSize = getHWStackSize(MaxStack, HasPush);
    }

    return false;
  }

  const char *getPassName() const {
    return "R600 Control Flow Finalizer Pass";
  }
};

char R600ControlFlowFinalizer::ID = 0;

} // end namespace llvm


llvm::FunctionPass *llvm::createR600ControlFlowFinalizer(TargetMachine &TM) {
  return new R600ControlFlowFinalizer(TM);
}