1284677Sdim//===-- R600EmitClauseMarkers.cpp - Emit CF_ALU ---------------------------===// 2284677Sdim// 3284677Sdim// The LLVM Compiler Infrastructure 4284677Sdim// 5284677Sdim// This file is distributed under the University of Illinois Open Source 6284677Sdim// License. See LICENSE.TXT for details. 7284677Sdim// 8284677Sdim//===----------------------------------------------------------------------===// 9284677Sdim// 10284677Sdim/// \file 11284677Sdim/// Add CF_ALU. R600 Alu instructions are grouped in clause which can hold 12284677Sdim/// 128 Alu instructions ; these instructions can access up to 4 prefetched 13284677Sdim/// 4 lines of 16 registers from constant buffers. Such ALU clauses are 14284677Sdim/// initiated by CF_ALU instructions. 15284677Sdim//===----------------------------------------------------------------------===// 16284677Sdim 17284677Sdim#include "AMDGPU.h" 18284677Sdim#include "R600Defines.h" 19284677Sdim#include "R600InstrInfo.h" 20284677Sdim#include "R600MachineFunctionInfo.h" 21284677Sdim#include "R600RegisterInfo.h" 22284677Sdim#include "AMDGPUSubtarget.h" 23284677Sdim#include "llvm/CodeGen/MachineFunctionPass.h" 24284677Sdim#include "llvm/CodeGen/MachineInstrBuilder.h" 25284677Sdim#include "llvm/CodeGen/MachineRegisterInfo.h" 26284677Sdim 27284677Sdimusing namespace llvm; 28284677Sdim 29284677Sdimnamespace llvm { 30284677Sdim void initializeR600EmitClauseMarkersPass(PassRegistry&); 31284677Sdim} 32284677Sdim 33284677Sdimnamespace { 34284677Sdim 35284677Sdimclass R600EmitClauseMarkers : public MachineFunctionPass { 36284677Sdim 37284677Sdimprivate: 38284677Sdim const R600InstrInfo *TII; 39284677Sdim int Address; 40284677Sdim 41284677Sdim unsigned OccupiedDwords(MachineInstr *MI) const { 42284677Sdim switch (MI->getOpcode()) { 43284677Sdim case AMDGPU::INTERP_PAIR_XY: 44284677Sdim case AMDGPU::INTERP_PAIR_ZW: 45284677Sdim case AMDGPU::INTERP_VEC_LOAD: 46284677Sdim case AMDGPU::DOT_4: 47284677Sdim return 4; 48284677Sdim case AMDGPU::KILL: 49284677Sdim return 0; 50284677Sdim default: 51284677Sdim break; 52284677Sdim } 53284677Sdim 54284677Sdim // These will be expanded to two ALU instructions in the 55284677Sdim // ExpandSpecialInstructions pass. 56284677Sdim if (TII->isLDSRetInstr(MI->getOpcode())) 57284677Sdim return 2; 58284677Sdim 59284677Sdim if(TII->isVector(*MI) || 60284677Sdim TII->isCubeOp(MI->getOpcode()) || 61284677Sdim TII->isReductionOp(MI->getOpcode())) 62284677Sdim return 4; 63284677Sdim 64284677Sdim unsigned NumLiteral = 0; 65284677Sdim for (MachineInstr::mop_iterator It = MI->operands_begin(), 66284677Sdim E = MI->operands_end(); It != E; ++It) { 67284677Sdim MachineOperand &MO = *It; 68284677Sdim if (MO.isReg() && MO.getReg() == AMDGPU::ALU_LITERAL_X) 69284677Sdim ++NumLiteral; 70284677Sdim } 71284677Sdim return 1 + NumLiteral; 72284677Sdim } 73284677Sdim 74284677Sdim bool isALU(const MachineInstr *MI) const { 75284677Sdim if (TII->isALUInstr(MI->getOpcode())) 76284677Sdim return true; 77284677Sdim if (TII->isVector(*MI) || TII->isCubeOp(MI->getOpcode())) 78284677Sdim return true; 79284677Sdim switch (MI->getOpcode()) { 80284677Sdim case AMDGPU::PRED_X: 81284677Sdim case AMDGPU::INTERP_PAIR_XY: 82284677Sdim case AMDGPU::INTERP_PAIR_ZW: 83284677Sdim case AMDGPU::INTERP_VEC_LOAD: 84284677Sdim case AMDGPU::COPY: 85284677Sdim case AMDGPU::DOT_4: 86284677Sdim return true; 87284677Sdim default: 88284677Sdim return false; 89284677Sdim } 90284677Sdim } 91284677Sdim 92284677Sdim bool IsTrivialInst(MachineInstr *MI) const { 93284677Sdim switch (MI->getOpcode()) { 94284677Sdim case AMDGPU::KILL: 95284677Sdim case AMDGPU::RETURN: 96284677Sdim case AMDGPU::IMPLICIT_DEF: 97284677Sdim return true; 98284677Sdim default: 99284677Sdim return false; 100284677Sdim } 101284677Sdim } 102284677Sdim 103284677Sdim std::pair<unsigned, unsigned> getAccessedBankLine(unsigned Sel) const { 104284677Sdim // Sel is (512 + (kc_bank << 12) + ConstIndex) << 2 105284677Sdim // (See also R600ISelLowering.cpp) 106284677Sdim // ConstIndex value is in [0, 4095]; 107284677Sdim return std::pair<unsigned, unsigned>( 108284677Sdim ((Sel >> 2) - 512) >> 12, // KC_BANK 109284677Sdim // Line Number of ConstIndex 110284677Sdim // A line contains 16 constant registers however KCX bank can lock 111284677Sdim // two line at the same time ; thus we want to get an even line number. 112284677Sdim // Line number can be retrieved with (>>4), using (>>5) <<1 generates 113284677Sdim // an even number. 114284677Sdim ((((Sel >> 2) - 512) & 4095) >> 5) << 1); 115284677Sdim } 116284677Sdim 117284677Sdim bool SubstituteKCacheBank(MachineInstr *MI, 118284677Sdim std::vector<std::pair<unsigned, unsigned> > &CachedConsts, 119284677Sdim bool UpdateInstr = true) const { 120284677Sdim std::vector<std::pair<unsigned, unsigned> > UsedKCache; 121284677Sdim 122284677Sdim if (!TII->isALUInstr(MI->getOpcode()) && MI->getOpcode() != AMDGPU::DOT_4) 123284677Sdim return true; 124284677Sdim 125284677Sdim const SmallVectorImpl<std::pair<MachineOperand *, int64_t> > &Consts = 126284677Sdim TII->getSrcs(MI); 127284677Sdim assert((TII->isALUInstr(MI->getOpcode()) || 128284677Sdim MI->getOpcode() == AMDGPU::DOT_4) && "Can't assign Const"); 129284677Sdim for (unsigned i = 0, n = Consts.size(); i < n; ++i) { 130284677Sdim if (Consts[i].first->getReg() != AMDGPU::ALU_CONST) 131284677Sdim continue; 132284677Sdim unsigned Sel = Consts[i].second; 133284677Sdim unsigned Chan = Sel & 3, Index = ((Sel >> 2) - 512) & 31; 134284677Sdim unsigned KCacheIndex = Index * 4 + Chan; 135284677Sdim const std::pair<unsigned, unsigned> &BankLine = getAccessedBankLine(Sel); 136284677Sdim if (CachedConsts.empty()) { 137284677Sdim CachedConsts.push_back(BankLine); 138284677Sdim UsedKCache.push_back(std::pair<unsigned, unsigned>(0, KCacheIndex)); 139284677Sdim continue; 140284677Sdim } 141284677Sdim if (CachedConsts[0] == BankLine) { 142284677Sdim UsedKCache.push_back(std::pair<unsigned, unsigned>(0, KCacheIndex)); 143284677Sdim continue; 144284677Sdim } 145284677Sdim if (CachedConsts.size() == 1) { 146284677Sdim CachedConsts.push_back(BankLine); 147284677Sdim UsedKCache.push_back(std::pair<unsigned, unsigned>(1, KCacheIndex)); 148284677Sdim continue; 149284677Sdim } 150284677Sdim if (CachedConsts[1] == BankLine) { 151284677Sdim UsedKCache.push_back(std::pair<unsigned, unsigned>(1, KCacheIndex)); 152284677Sdim continue; 153284677Sdim } 154284677Sdim return false; 155284677Sdim } 156284677Sdim 157284677Sdim if (!UpdateInstr) 158284677Sdim return true; 159284677Sdim 160284677Sdim for (unsigned i = 0, j = 0, n = Consts.size(); i < n; ++i) { 161284677Sdim if (Consts[i].first->getReg() != AMDGPU::ALU_CONST) 162284677Sdim continue; 163284677Sdim switch(UsedKCache[j].first) { 164284677Sdim case 0: 165284677Sdim Consts[i].first->setReg( 166284677Sdim AMDGPU::R600_KC0RegClass.getRegister(UsedKCache[j].second)); 167284677Sdim break; 168284677Sdim case 1: 169284677Sdim Consts[i].first->setReg( 170284677Sdim AMDGPU::R600_KC1RegClass.getRegister(UsedKCache[j].second)); 171284677Sdim break; 172284677Sdim default: 173284677Sdim llvm_unreachable("Wrong Cache Line"); 174284677Sdim } 175284677Sdim j++; 176284677Sdim } 177284677Sdim return true; 178284677Sdim } 179284677Sdim 180284677Sdim bool canClauseLocalKillFitInClause( 181284677Sdim unsigned AluInstCount, 182284677Sdim std::vector<std::pair<unsigned, unsigned> > KCacheBanks, 183284677Sdim MachineBasicBlock::iterator Def, 184284677Sdim MachineBasicBlock::iterator BBEnd) { 185284677Sdim const R600RegisterInfo &TRI = TII->getRegisterInfo(); 186284677Sdim for (MachineInstr::const_mop_iterator 187284677Sdim MOI = Def->operands_begin(), 188284677Sdim MOE = Def->operands_end(); MOI != MOE; ++MOI) { 189284677Sdim if (!MOI->isReg() || !MOI->isDef() || 190284677Sdim TRI.isPhysRegLiveAcrossClauses(MOI->getReg())) 191284677Sdim continue; 192284677Sdim 193284677Sdim // Def defines a clause local register, so check that its use will fit 194284677Sdim // in the clause. 195284677Sdim unsigned LastUseCount = 0; 196284677Sdim for (MachineBasicBlock::iterator UseI = Def; UseI != BBEnd; ++UseI) { 197284677Sdim AluInstCount += OccupiedDwords(UseI); 198284677Sdim // Make sure we won't need to end the clause due to KCache limitations. 199284677Sdim if (!SubstituteKCacheBank(UseI, KCacheBanks, false)) 200284677Sdim return false; 201284677Sdim 202284677Sdim // We have reached the maximum instruction limit before finding the 203284677Sdim // use that kills this register, so we cannot use this def in the 204284677Sdim // current clause. 205284677Sdim if (AluInstCount >= TII->getMaxAlusPerClause()) 206284677Sdim return false; 207284677Sdim 208284677Sdim // Register kill flags have been cleared by the time we get to this 209284677Sdim // pass, but it is safe to assume that all uses of this register 210284677Sdim // occur in the same basic block as its definition, because 211284677Sdim // it is illegal for the scheduler to schedule them in 212284677Sdim // different blocks. 213284677Sdim if (UseI->findRegisterUseOperandIdx(MOI->getReg())) 214284677Sdim LastUseCount = AluInstCount; 215284677Sdim 216284677Sdim if (UseI != Def && UseI->findRegisterDefOperandIdx(MOI->getReg()) != -1) 217284677Sdim break; 218284677Sdim } 219284677Sdim if (LastUseCount) 220284677Sdim return LastUseCount <= TII->getMaxAlusPerClause(); 221284677Sdim llvm_unreachable("Clause local register live at end of clause."); 222284677Sdim } 223284677Sdim return true; 224284677Sdim } 225284677Sdim 226284677Sdim MachineBasicBlock::iterator 227284677Sdim MakeALUClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator I) { 228284677Sdim MachineBasicBlock::iterator ClauseHead = I; 229284677Sdim std::vector<std::pair<unsigned, unsigned> > KCacheBanks; 230284677Sdim bool PushBeforeModifier = false; 231284677Sdim unsigned AluInstCount = 0; 232284677Sdim for (MachineBasicBlock::iterator E = MBB.end(); I != E; ++I) { 233284677Sdim if (IsTrivialInst(I)) 234284677Sdim continue; 235284677Sdim if (!isALU(I)) 236284677Sdim break; 237284677Sdim if (AluInstCount > TII->getMaxAlusPerClause()) 238284677Sdim break; 239284677Sdim if (I->getOpcode() == AMDGPU::PRED_X) { 240284677Sdim // We put PRED_X in its own clause to ensure that ifcvt won't create 241284677Sdim // clauses with more than 128 insts. 242284677Sdim // IfCvt is indeed checking that "then" and "else" branches of an if 243284677Sdim // statement have less than ~60 insts thus converted clauses can't be 244284677Sdim // bigger than ~121 insts (predicate setter needs to be in the same 245284677Sdim // clause as predicated alus). 246284677Sdim if (AluInstCount > 0) 247284677Sdim break; 248284677Sdim if (TII->getFlagOp(I).getImm() & MO_FLAG_PUSH) 249284677Sdim PushBeforeModifier = true; 250284677Sdim AluInstCount ++; 251284677Sdim continue; 252284677Sdim } 253284677Sdim // XXX: GROUP_BARRIER instructions cannot be in the same ALU clause as: 254284677Sdim // 255284677Sdim // * KILL or INTERP instructions 256284677Sdim // * Any instruction that sets UPDATE_EXEC_MASK or UPDATE_PRED bits 257284677Sdim // * Uses waterfalling (i.e. INDEX_MODE = AR.X) 258284677Sdim // 259284677Sdim // XXX: These checks have not been implemented yet. 260284677Sdim if (TII->mustBeLastInClause(I->getOpcode())) { 261284677Sdim I++; 262284677Sdim break; 263284677Sdim } 264284677Sdim 265284677Sdim // If this instruction defines a clause local register, make sure 266284677Sdim // its use can fit in this clause. 267284677Sdim if (!canClauseLocalKillFitInClause(AluInstCount, KCacheBanks, I, E)) 268284677Sdim break; 269284677Sdim 270284677Sdim if (!SubstituteKCacheBank(I, KCacheBanks)) 271284677Sdim break; 272284677Sdim AluInstCount += OccupiedDwords(I); 273284677Sdim } 274284677Sdim unsigned Opcode = PushBeforeModifier ? 275284677Sdim AMDGPU::CF_ALU_PUSH_BEFORE : AMDGPU::CF_ALU; 276284677Sdim BuildMI(MBB, ClauseHead, MBB.findDebugLoc(ClauseHead), TII->get(Opcode)) 277284677Sdim // We don't use the ADDR field until R600ControlFlowFinalizer pass, where 278284677Sdim // it is safe to assume it is 0. However if we always put 0 here, the ifcvt 279284677Sdim // pass may assume that identical ALU clause starter at the beginning of a 280284677Sdim // true and false branch can be factorized which is not the case. 281284677Sdim .addImm(Address++) // ADDR 282284677Sdim .addImm(KCacheBanks.empty()?0:KCacheBanks[0].first) // KB0 283284677Sdim .addImm((KCacheBanks.size() < 2)?0:KCacheBanks[1].first) // KB1 284284677Sdim .addImm(KCacheBanks.empty()?0:2) // KM0 285284677Sdim .addImm((KCacheBanks.size() < 2)?0:2) // KM1 286284677Sdim .addImm(KCacheBanks.empty()?0:KCacheBanks[0].second) // KLINE0 287284677Sdim .addImm((KCacheBanks.size() < 2)?0:KCacheBanks[1].second) // KLINE1 288284677Sdim .addImm(AluInstCount) // COUNT 289284677Sdim .addImm(1); // Enabled 290284677Sdim return I; 291284677Sdim } 292284677Sdim 293284677Sdimpublic: 294284677Sdim static char ID; 295284677Sdim R600EmitClauseMarkers() : MachineFunctionPass(ID), TII(nullptr), Address(0) { 296284677Sdim 297284677Sdim initializeR600EmitClauseMarkersPass(*PassRegistry::getPassRegistry()); 298284677Sdim } 299284677Sdim 300284677Sdim bool runOnMachineFunction(MachineFunction &MF) override { 301284677Sdim TII = static_cast<const R600InstrInfo *>(MF.getSubtarget().getInstrInfo()); 302284677Sdim 303284677Sdim for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end(); 304284677Sdim BB != BB_E; ++BB) { 305284677Sdim MachineBasicBlock &MBB = *BB; 306284677Sdim MachineBasicBlock::iterator I = MBB.begin(); 307284677Sdim if (I->getOpcode() == AMDGPU::CF_ALU) 308284677Sdim continue; // BB was already parsed 309284677Sdim for (MachineBasicBlock::iterator E = MBB.end(); I != E;) { 310284677Sdim if (isALU(I)) 311284677Sdim I = MakeALUClause(MBB, I); 312284677Sdim else 313284677Sdim ++I; 314284677Sdim } 315284677Sdim } 316284677Sdim return false; 317284677Sdim } 318284677Sdim 319284677Sdim const char *getPassName() const override { 320284677Sdim return "R600 Emit Clause Markers Pass"; 321284677Sdim } 322284677Sdim}; 323284677Sdim 324284677Sdimchar R600EmitClauseMarkers::ID = 0; 325284677Sdim 326284677Sdim} // end anonymous namespace 327284677Sdim 328284677SdimINITIALIZE_PASS_BEGIN(R600EmitClauseMarkers, "emitclausemarkers", 329284677Sdim "R600 Emit Clause Markters", false, false) 330284677SdimINITIALIZE_PASS_END(R600EmitClauseMarkers, "emitclausemarkers", 331284677Sdim "R600 Emit Clause Markters", false, false) 332284677Sdim 333284677Sdimllvm::FunctionPass *llvm::createR600EmitClauseMarkers() { 334284677Sdim return new R600EmitClauseMarkers(); 335284677Sdim} 336284677Sdim 337