R600EmitClauseMarkers.cpp revision 288943
1214571Sdim//===-- R600EmitClauseMarkers.cpp - Emit CF_ALU ---------------------------===// 2214571Sdim// 3214571Sdim// The LLVM Compiler Infrastructure 4214571Sdim// 5214571Sdim// This file is distributed under the University of Illinois Open Source 6214571Sdim// License. See LICENSE.TXT for details. 7214571Sdim// 8214571Sdim//===----------------------------------------------------------------------===// 9214571Sdim// 10214571Sdim/// \file 11214571Sdim/// Add CF_ALU. R600 Alu instructions are grouped in clause which can hold 12214571Sdim/// 128 Alu instructions ; these instructions can access up to 4 prefetched 13214571Sdim/// 4 lines of 16 registers from constant buffers. Such ALU clauses are 14214571Sdim/// initiated by CF_ALU instructions. 15214571Sdim//===----------------------------------------------------------------------===// 16214571Sdim 17214571Sdim#include "AMDGPU.h" 18214571Sdim#include "R600Defines.h" 19214571Sdim#include "R600InstrInfo.h" 20214571Sdim#include "R600MachineFunctionInfo.h" 21214571Sdim#include "R600RegisterInfo.h" 22214571Sdim#include "AMDGPUSubtarget.h" 23214571Sdim#include "llvm/CodeGen/MachineFunctionPass.h" 24214571Sdim#include "llvm/CodeGen/MachineInstrBuilder.h" 25214571Sdim#include "llvm/CodeGen/MachineRegisterInfo.h" 26214571Sdim 27214571Sdimusing namespace llvm; 28214571Sdim 29214571Sdimnamespace llvm { 30214571Sdim void initializeR600EmitClauseMarkersPass(PassRegistry&); 31214571Sdim} 32214571Sdim 33214571Sdimnamespace { 34214571Sdim 35214571Sdimclass R600EmitClauseMarkers : public MachineFunctionPass { 36214571Sdim 37214571Sdimprivate: 38214571Sdim const R600InstrInfo *TII; 39214571Sdim int Address; 40214571Sdim 41214571Sdim unsigned OccupiedDwords(MachineInstr *MI) const { 42214571Sdim switch (MI->getOpcode()) { 43214571Sdim case AMDGPU::INTERP_PAIR_XY: 44214571Sdim case AMDGPU::INTERP_PAIR_ZW: 45214571Sdim case AMDGPU::INTERP_VEC_LOAD: 46214571Sdim case AMDGPU::DOT_4: 47214571Sdim return 4; 48214571Sdim case AMDGPU::KILL: 49214571Sdim return 0; 50214571Sdim default: 51214571Sdim break; 52214571Sdim } 53214571Sdim 54214571Sdim // These will be expanded to two ALU instructions in the 55214571Sdim // ExpandSpecialInstructions pass. 56214571Sdim if (TII->isLDSRetInstr(MI->getOpcode())) 57214571Sdim return 2; 58214571Sdim 59214571Sdim if(TII->isVector(*MI) || 60214571Sdim TII->isCubeOp(MI->getOpcode()) || 61214571Sdim TII->isReductionOp(MI->getOpcode())) 62214571Sdim return 4; 63214571Sdim 64214571Sdim unsigned NumLiteral = 0; 65214571Sdim for (MachineInstr::mop_iterator It = MI->operands_begin(), 66214571Sdim E = MI->operands_end(); It != E; ++It) { 67214571Sdim MachineOperand &MO = *It; 68214571Sdim if (MO.isReg() && MO.getReg() == AMDGPU::ALU_LITERAL_X) 69214571Sdim ++NumLiteral; 70214571Sdim } 71214571Sdim return 1 + NumLiteral; 72214571Sdim } 73214571Sdim 74214571Sdim bool isALU(const MachineInstr *MI) const { 75214571Sdim if (TII->isALUInstr(MI->getOpcode())) 76214571Sdim return true; 77214571Sdim if (TII->isVector(*MI) || TII->isCubeOp(MI->getOpcode())) 78214571Sdim return true; 79214571Sdim switch (MI->getOpcode()) { 80214571Sdim case AMDGPU::PRED_X: 81214571Sdim case AMDGPU::INTERP_PAIR_XY: 82214571Sdim case AMDGPU::INTERP_PAIR_ZW: 83214571Sdim case AMDGPU::INTERP_VEC_LOAD: 84214571Sdim case AMDGPU::COPY: 85214571Sdim case AMDGPU::DOT_4: 86214571Sdim return true; 87214571Sdim default: 88214571Sdim return false; 89214571Sdim } 90214571Sdim } 91214571Sdim 92214571Sdim bool IsTrivialInst(MachineInstr *MI) const { 93214571Sdim switch (MI->getOpcode()) { 94214571Sdim case AMDGPU::KILL: 95214571Sdim case AMDGPU::RETURN: 96214571Sdim case AMDGPU::IMPLICIT_DEF: 97214571Sdim return true; 98214571Sdim default: 99214571Sdim return false; 100214571Sdim } 101214571Sdim } 102214571Sdim 103214571Sdim std::pair<unsigned, unsigned> getAccessedBankLine(unsigned Sel) const { 104214571Sdim // Sel is (512 + (kc_bank << 12) + ConstIndex) << 2 105214571Sdim // (See also R600ISelLowering.cpp) 106214571Sdim // ConstIndex value is in [0, 4095]; 107214571Sdim return std::pair<unsigned, unsigned>( 108214571Sdim ((Sel >> 2) - 512) >> 12, // KC_BANK 109214571Sdim // Line Number of ConstIndex 110214571Sdim // A line contains 16 constant registers however KCX bank can lock 111214571Sdim // two line at the same time ; thus we want to get an even line number. 112214571Sdim // Line number can be retrieved with (>>4), using (>>5) <<1 generates 113214571Sdim // an even number. 114214571Sdim ((((Sel >> 2) - 512) & 4095) >> 5) << 1); 115214571Sdim } 116214571Sdim 117214571Sdim bool SubstituteKCacheBank(MachineInstr *MI, 118214571Sdim std::vector<std::pair<unsigned, unsigned> > &CachedConsts, 119214571Sdim bool UpdateInstr = true) const { 120214571Sdim std::vector<std::pair<unsigned, unsigned> > UsedKCache; 121214571Sdim 122214571Sdim if (!TII->isALUInstr(MI->getOpcode()) && MI->getOpcode() != AMDGPU::DOT_4) 123214571Sdim return true; 124214571Sdim 125214571Sdim const SmallVectorImpl<std::pair<MachineOperand *, int64_t> > &Consts = 126214571Sdim TII->getSrcs(MI); 127214571Sdim assert((TII->isALUInstr(MI->getOpcode()) || 128214571Sdim MI->getOpcode() == AMDGPU::DOT_4) && "Can't assign Const"); 129214571Sdim for (unsigned i = 0, n = Consts.size(); i < n; ++i) { 130214571Sdim if (Consts[i].first->getReg() != AMDGPU::ALU_CONST) 131214571Sdim continue; 132214571Sdim unsigned Sel = Consts[i].second; 133214571Sdim unsigned Chan = Sel & 3, Index = ((Sel >> 2) - 512) & 31; 134214571Sdim unsigned KCacheIndex = Index * 4 + Chan; 135214571Sdim const std::pair<unsigned, unsigned> &BankLine = getAccessedBankLine(Sel); 136214571Sdim if (CachedConsts.empty()) { 137214571Sdim CachedConsts.push_back(BankLine); 138214571Sdim UsedKCache.push_back(std::pair<unsigned, unsigned>(0, KCacheIndex)); 139214571Sdim continue; 140214571Sdim } 141214571Sdim if (CachedConsts[0] == BankLine) { 142214571Sdim UsedKCache.push_back(std::pair<unsigned, unsigned>(0, KCacheIndex)); 143214571Sdim continue; 144214571Sdim } 145214571Sdim if (CachedConsts.size() == 1) { 146214571Sdim CachedConsts.push_back(BankLine); 147214571Sdim UsedKCache.push_back(std::pair<unsigned, unsigned>(1, KCacheIndex)); 148214571Sdim continue; 149214571Sdim } 150214571Sdim if (CachedConsts[1] == BankLine) { 151214571Sdim UsedKCache.push_back(std::pair<unsigned, unsigned>(1, KCacheIndex)); 152214571Sdim continue; 153214571Sdim } 154214571Sdim return false; 155214571Sdim } 156214571Sdim 157214571Sdim if (!UpdateInstr) 158214571Sdim return true; 159214571Sdim 160214571Sdim for (unsigned i = 0, j = 0, n = Consts.size(); i < n; ++i) { 161214571Sdim if (Consts[i].first->getReg() != AMDGPU::ALU_CONST) 162214571Sdim continue; 163214571Sdim switch(UsedKCache[j].first) { 164214571Sdim case 0: 165214571Sdim Consts[i].first->setReg( 166214571Sdim AMDGPU::R600_KC0RegClass.getRegister(UsedKCache[j].second)); 167214571Sdim break; 168214571Sdim case 1: 169214571Sdim Consts[i].first->setReg( 170214571Sdim AMDGPU::R600_KC1RegClass.getRegister(UsedKCache[j].second)); 171214571Sdim break; 172214571Sdim default: 173214571Sdim llvm_unreachable("Wrong Cache Line"); 174214571Sdim } 175214571Sdim j++; 176214571Sdim } 177214571Sdim return true; 178214571Sdim } 179214571Sdim 180214571Sdim bool canClauseLocalKillFitInClause( 181214571Sdim unsigned AluInstCount, 182214571Sdim std::vector<std::pair<unsigned, unsigned> > KCacheBanks, 183214571Sdim MachineBasicBlock::iterator Def, 184214571Sdim MachineBasicBlock::iterator BBEnd) { 185214571Sdim const R600RegisterInfo &TRI = TII->getRegisterInfo(); 186214571Sdim for (MachineInstr::const_mop_iterator 187214571Sdim MOI = Def->operands_begin(), 188214571Sdim MOE = Def->operands_end(); MOI != MOE; ++MOI) { 189214571Sdim if (!MOI->isReg() || !MOI->isDef() || 190214571Sdim TRI.isPhysRegLiveAcrossClauses(MOI->getReg())) 191214571Sdim continue; 192214571Sdim 193214571Sdim // Def defines a clause local register, so check that its use will fit 194214571Sdim // in the clause. 195214571Sdim unsigned LastUseCount = 0; 196214571Sdim for (MachineBasicBlock::iterator UseI = Def; UseI != BBEnd; ++UseI) { 197214571Sdim AluInstCount += OccupiedDwords(UseI); 198214571Sdim // Make sure we won't need to end the clause due to KCache limitations. 199214571Sdim if (!SubstituteKCacheBank(UseI, KCacheBanks, false)) 200214571Sdim return false; 201214571Sdim 202214571Sdim // We have reached the maximum instruction limit before finding the 203214571Sdim // use that kills this register, so we cannot use this def in the 204214571Sdim // current clause. 205214571Sdim if (AluInstCount >= TII->getMaxAlusPerClause()) 206214571Sdim return false; 207214571Sdim 208214571Sdim // Register kill flags have been cleared by the time we get to this 209214571Sdim // pass, but it is safe to assume that all uses of this register 210214571Sdim // occur in the same basic block as its definition, because 211214571Sdim // it is illegal for the scheduler to schedule them in 212214571Sdim // different blocks. 213214571Sdim if (UseI->findRegisterUseOperandIdx(MOI->getReg())) 214214571Sdim LastUseCount = AluInstCount; 215214571Sdim 216214571Sdim if (UseI != Def && UseI->findRegisterDefOperandIdx(MOI->getReg()) != -1) 217214571Sdim break; 218214571Sdim } 219214571Sdim if (LastUseCount) 220214571Sdim return LastUseCount <= TII->getMaxAlusPerClause(); 221214571Sdim llvm_unreachable("Clause local register live at end of clause."); 222214571Sdim } 223214571Sdim return true; 224214571Sdim } 225214571Sdim 226214571Sdim MachineBasicBlock::iterator 227214571Sdim MakeALUClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator I) { 228214571Sdim MachineBasicBlock::iterator ClauseHead = I; 229214571Sdim std::vector<std::pair<unsigned, unsigned> > KCacheBanks; 230214571Sdim bool PushBeforeModifier = false; 231214571Sdim unsigned AluInstCount = 0; 232214571Sdim for (MachineBasicBlock::iterator E = MBB.end(); I != E; ++I) { 233214571Sdim if (IsTrivialInst(I)) 234214571Sdim continue; 235214571Sdim if (!isALU(I)) 236214571Sdim break; 237214571Sdim if (AluInstCount > TII->getMaxAlusPerClause()) 238214571Sdim break; 239214571Sdim if (I->getOpcode() == AMDGPU::PRED_X) { 240214571Sdim // We put PRED_X in its own clause to ensure that ifcvt won't create 241214571Sdim // clauses with more than 128 insts. 242214571Sdim // IfCvt is indeed checking that "then" and "else" branches of an if 243214571Sdim // statement have less than ~60 insts thus converted clauses can't be 244214571Sdim // bigger than ~121 insts (predicate setter needs to be in the same 245214571Sdim // clause as predicated alus). 246214571Sdim if (AluInstCount > 0) 247214571Sdim break; 248214571Sdim if (TII->getFlagOp(I).getImm() & MO_FLAG_PUSH) 249214571Sdim PushBeforeModifier = true; 250214571Sdim AluInstCount ++; 251214571Sdim continue; 252214571Sdim } 253214571Sdim // XXX: GROUP_BARRIER instructions cannot be in the same ALU clause as: 254214571Sdim // 255214571Sdim // * KILL or INTERP instructions 256214571Sdim // * Any instruction that sets UPDATE_EXEC_MASK or UPDATE_PRED bits 257214571Sdim // * Uses waterfalling (i.e. INDEX_MODE = AR.X) 258214571Sdim // 259214571Sdim // XXX: These checks have not been implemented yet. 260214571Sdim if (TII->mustBeLastInClause(I->getOpcode())) { 261214571Sdim I++; 262214571Sdim break; 263214571Sdim } 264214571Sdim 265214571Sdim // If this instruction defines a clause local register, make sure 266214571Sdim // its use can fit in this clause. 267214571Sdim if (!canClauseLocalKillFitInClause(AluInstCount, KCacheBanks, I, E)) 268214571Sdim break; 269214571Sdim 270214571Sdim if (!SubstituteKCacheBank(I, KCacheBanks)) 271214571Sdim break; 272214571Sdim AluInstCount += OccupiedDwords(I); 273214571Sdim } 274214571Sdim unsigned Opcode = PushBeforeModifier ? 275214571Sdim AMDGPU::CF_ALU_PUSH_BEFORE : AMDGPU::CF_ALU; 276214571Sdim BuildMI(MBB, ClauseHead, MBB.findDebugLoc(ClauseHead), TII->get(Opcode)) 277214571Sdim // We don't use the ADDR field until R600ControlFlowFinalizer pass, where 278214571Sdim // it is safe to assume it is 0. However if we always put 0 here, the ifcvt 279214571Sdim // pass may assume that identical ALU clause starter at the beginning of a 280214571Sdim // true and false branch can be factorized which is not the case. 281214571Sdim .addImm(Address++) // ADDR 282214571Sdim .addImm(KCacheBanks.empty()?0:KCacheBanks[0].first) // KB0 283214571Sdim .addImm((KCacheBanks.size() < 2)?0:KCacheBanks[1].first) // KB1 284214571Sdim .addImm(KCacheBanks.empty()?0:2) // KM0 285214571Sdim .addImm((KCacheBanks.size() < 2)?0:2) // KM1 286214571Sdim .addImm(KCacheBanks.empty()?0:KCacheBanks[0].second) // KLINE0 287214571Sdim .addImm((KCacheBanks.size() < 2)?0:KCacheBanks[1].second) // KLINE1 288214571Sdim .addImm(AluInstCount) // COUNT 289214571Sdim .addImm(1); // Enabled 290214571Sdim return I; 291214571Sdim } 292214571Sdim 293214571Sdimpublic: 294214571Sdim static char ID; 295214571Sdim R600EmitClauseMarkers() : MachineFunctionPass(ID), TII(nullptr), Address(0) { 296214571Sdim 297214571Sdim initializeR600EmitClauseMarkersPass(*PassRegistry::getPassRegistry()); 298214571Sdim } 299214571Sdim 300214571Sdim bool runOnMachineFunction(MachineFunction &MF) override { 301214571Sdim TII = static_cast<const R600InstrInfo *>(MF.getSubtarget().getInstrInfo()); 302214571Sdim 303214571Sdim for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end(); 304214571Sdim BB != BB_E; ++BB) { 305214571Sdim MachineBasicBlock &MBB = *BB; 306214571Sdim MachineBasicBlock::iterator I = MBB.begin(); 307214571Sdim if (I->getOpcode() == AMDGPU::CF_ALU) 308214571Sdim continue; // BB was already parsed 309214571Sdim for (MachineBasicBlock::iterator E = MBB.end(); I != E;) { 310214571Sdim if (isALU(I)) 311214571Sdim I = MakeALUClause(MBB, I); 312214571Sdim else 313214571Sdim ++I; 314214571Sdim } 315214571Sdim } 316214571Sdim return false; 317214571Sdim } 318214571Sdim 319214571Sdim const char *getPassName() const override { 320214571Sdim return "R600 Emit Clause Markers Pass"; 321214571Sdim } 322214571Sdim}; 323214571Sdim 324214571Sdimchar R600EmitClauseMarkers::ID = 0; 325214571Sdim 326214571Sdim} // end anonymous namespace 327214571Sdim 328214571SdimINITIALIZE_PASS_BEGIN(R600EmitClauseMarkers, "emitclausemarkers", 329214571Sdim "R600 Emit Clause Markters", false, false) 330214571SdimINITIALIZE_PASS_END(R600EmitClauseMarkers, "emitclausemarkers", 331214571Sdim "R600 Emit Clause Markters", false, false) 332214571Sdim 333214571Sdimllvm::FunctionPass *llvm::createR600EmitClauseMarkers() { 334214571Sdim return new R600EmitClauseMarkers(); 335214571Sdim} 336214571Sdim 337214571Sdim