// R600EmitClauseMarkers.cpp revision 288943
//===-- R600EmitClauseMarkers.cpp - Emit CF_ALU ---------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Add CF_ALU. R600 ALU instructions are grouped in clauses, each of which can
/// hold 128 ALU instructions; these instructions can access up to 4 prefetched
/// lines of 16 registers from constant buffers. Such ALU clauses are
/// initiated by CF_ALU instructions.
//===----------------------------------------------------------------------===//
16214571Sdim
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "R600Defines.h"
#include "R600InstrInfo.h"
#include "R600MachineFunctionInfo.h"
#include "R600RegisterInfo.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include <cassert>
#include <utility>
#include <vector>

using namespace llvm;
28214571Sdim
29214571Sdimnamespace llvm {
30214571Sdim  void initializeR600EmitClauseMarkersPass(PassRegistry&);
31214571Sdim}
32214571Sdim
33214571Sdimnamespace {
34214571Sdim
35214571Sdimclass R600EmitClauseMarkers : public MachineFunctionPass {
36214571Sdim
37214571Sdimprivate:
38214571Sdim  const R600InstrInfo *TII;
39214571Sdim  int Address;
40214571Sdim
41214571Sdim  unsigned OccupiedDwords(MachineInstr *MI) const {
42214571Sdim    switch (MI->getOpcode()) {
43214571Sdim    case AMDGPU::INTERP_PAIR_XY:
44214571Sdim    case AMDGPU::INTERP_PAIR_ZW:
45214571Sdim    case AMDGPU::INTERP_VEC_LOAD:
46214571Sdim    case AMDGPU::DOT_4:
47214571Sdim      return 4;
48214571Sdim    case AMDGPU::KILL:
49214571Sdim      return 0;
50214571Sdim    default:
51214571Sdim      break;
52214571Sdim    }
53214571Sdim
54214571Sdim    // These will be expanded to two ALU instructions in the
55214571Sdim    // ExpandSpecialInstructions pass.
56214571Sdim    if (TII->isLDSRetInstr(MI->getOpcode()))
57214571Sdim      return 2;
58214571Sdim
59214571Sdim    if(TII->isVector(*MI) ||
60214571Sdim        TII->isCubeOp(MI->getOpcode()) ||
61214571Sdim        TII->isReductionOp(MI->getOpcode()))
62214571Sdim      return 4;
63214571Sdim
64214571Sdim    unsigned NumLiteral = 0;
65214571Sdim    for (MachineInstr::mop_iterator It = MI->operands_begin(),
66214571Sdim        E = MI->operands_end(); It != E; ++It) {
67214571Sdim      MachineOperand &MO = *It;
68214571Sdim      if (MO.isReg() && MO.getReg() == AMDGPU::ALU_LITERAL_X)
69214571Sdim        ++NumLiteral;
70214571Sdim    }
71214571Sdim    return 1 + NumLiteral;
72214571Sdim  }
73214571Sdim
74214571Sdim  bool isALU(const MachineInstr *MI) const {
75214571Sdim    if (TII->isALUInstr(MI->getOpcode()))
76214571Sdim      return true;
77214571Sdim    if (TII->isVector(*MI) || TII->isCubeOp(MI->getOpcode()))
78214571Sdim      return true;
79214571Sdim    switch (MI->getOpcode()) {
80214571Sdim    case AMDGPU::PRED_X:
81214571Sdim    case AMDGPU::INTERP_PAIR_XY:
82214571Sdim    case AMDGPU::INTERP_PAIR_ZW:
83214571Sdim    case AMDGPU::INTERP_VEC_LOAD:
84214571Sdim    case AMDGPU::COPY:
85214571Sdim    case AMDGPU::DOT_4:
86214571Sdim      return true;
87214571Sdim    default:
88214571Sdim      return false;
89214571Sdim    }
90214571Sdim  }
91214571Sdim
92214571Sdim  bool IsTrivialInst(MachineInstr *MI) const {
93214571Sdim    switch (MI->getOpcode()) {
94214571Sdim    case AMDGPU::KILL:
95214571Sdim    case AMDGPU::RETURN:
96214571Sdim    case AMDGPU::IMPLICIT_DEF:
97214571Sdim      return true;
98214571Sdim    default:
99214571Sdim      return false;
100214571Sdim    }
101214571Sdim  }
102214571Sdim
103214571Sdim  std::pair<unsigned, unsigned> getAccessedBankLine(unsigned Sel) const {
104214571Sdim    // Sel is (512 + (kc_bank << 12) + ConstIndex) << 2
105214571Sdim    // (See also R600ISelLowering.cpp)
106214571Sdim    // ConstIndex value is in [0, 4095];
107214571Sdim    return std::pair<unsigned, unsigned>(
108214571Sdim        ((Sel >> 2) - 512) >> 12, // KC_BANK
109214571Sdim        // Line Number of ConstIndex
110214571Sdim        // A line contains 16 constant registers however KCX bank can lock
111214571Sdim        // two line at the same time ; thus we want to get an even line number.
112214571Sdim        // Line number can be retrieved with (>>4), using (>>5) <<1 generates
113214571Sdim        // an even number.
114214571Sdim        ((((Sel >> 2) - 512) & 4095) >> 5) << 1);
115214571Sdim  }
116214571Sdim
117214571Sdim  bool SubstituteKCacheBank(MachineInstr *MI,
118214571Sdim      std::vector<std::pair<unsigned, unsigned> > &CachedConsts,
119214571Sdim      bool UpdateInstr = true) const {
120214571Sdim    std::vector<std::pair<unsigned, unsigned> > UsedKCache;
121214571Sdim
122214571Sdim    if (!TII->isALUInstr(MI->getOpcode()) && MI->getOpcode() != AMDGPU::DOT_4)
123214571Sdim      return true;
124214571Sdim
125214571Sdim    const SmallVectorImpl<std::pair<MachineOperand *, int64_t> > &Consts =
126214571Sdim        TII->getSrcs(MI);
127214571Sdim    assert((TII->isALUInstr(MI->getOpcode()) ||
128214571Sdim        MI->getOpcode() == AMDGPU::DOT_4) && "Can't assign Const");
129214571Sdim    for (unsigned i = 0, n = Consts.size(); i < n; ++i) {
130214571Sdim      if (Consts[i].first->getReg() != AMDGPU::ALU_CONST)
131214571Sdim        continue;
132214571Sdim      unsigned Sel = Consts[i].second;
133214571Sdim      unsigned Chan = Sel & 3, Index = ((Sel >> 2) - 512) & 31;
134214571Sdim      unsigned KCacheIndex = Index * 4 + Chan;
135214571Sdim      const std::pair<unsigned, unsigned> &BankLine = getAccessedBankLine(Sel);
136214571Sdim      if (CachedConsts.empty()) {
137214571Sdim        CachedConsts.push_back(BankLine);
138214571Sdim        UsedKCache.push_back(std::pair<unsigned, unsigned>(0, KCacheIndex));
139214571Sdim        continue;
140214571Sdim      }
141214571Sdim      if (CachedConsts[0] == BankLine) {
142214571Sdim        UsedKCache.push_back(std::pair<unsigned, unsigned>(0, KCacheIndex));
143214571Sdim        continue;
144214571Sdim      }
145214571Sdim      if (CachedConsts.size() == 1) {
146214571Sdim        CachedConsts.push_back(BankLine);
147214571Sdim        UsedKCache.push_back(std::pair<unsigned, unsigned>(1, KCacheIndex));
148214571Sdim        continue;
149214571Sdim      }
150214571Sdim      if (CachedConsts[1] == BankLine) {
151214571Sdim        UsedKCache.push_back(std::pair<unsigned, unsigned>(1, KCacheIndex));
152214571Sdim        continue;
153214571Sdim      }
154214571Sdim      return false;
155214571Sdim    }
156214571Sdim
157214571Sdim    if (!UpdateInstr)
158214571Sdim      return true;
159214571Sdim
160214571Sdim    for (unsigned i = 0, j = 0, n = Consts.size(); i < n; ++i) {
161214571Sdim      if (Consts[i].first->getReg() != AMDGPU::ALU_CONST)
162214571Sdim        continue;
163214571Sdim      switch(UsedKCache[j].first) {
164214571Sdim      case 0:
165214571Sdim        Consts[i].first->setReg(
166214571Sdim            AMDGPU::R600_KC0RegClass.getRegister(UsedKCache[j].second));
167214571Sdim        break;
168214571Sdim      case 1:
169214571Sdim        Consts[i].first->setReg(
170214571Sdim            AMDGPU::R600_KC1RegClass.getRegister(UsedKCache[j].second));
171214571Sdim        break;
172214571Sdim      default:
173214571Sdim        llvm_unreachable("Wrong Cache Line");
174214571Sdim      }
175214571Sdim      j++;
176214571Sdim    }
177214571Sdim    return true;
178214571Sdim  }
179214571Sdim
180214571Sdim  bool canClauseLocalKillFitInClause(
181214571Sdim                        unsigned AluInstCount,
182214571Sdim                        std::vector<std::pair<unsigned, unsigned> > KCacheBanks,
183214571Sdim                        MachineBasicBlock::iterator Def,
184214571Sdim                        MachineBasicBlock::iterator BBEnd) {
185214571Sdim    const R600RegisterInfo &TRI = TII->getRegisterInfo();
186214571Sdim    for (MachineInstr::const_mop_iterator
187214571Sdim           MOI = Def->operands_begin(),
188214571Sdim           MOE = Def->operands_end(); MOI != MOE; ++MOI) {
189214571Sdim      if (!MOI->isReg() || !MOI->isDef() ||
190214571Sdim          TRI.isPhysRegLiveAcrossClauses(MOI->getReg()))
191214571Sdim        continue;
192214571Sdim
193214571Sdim      // Def defines a clause local register, so check that its use will fit
194214571Sdim      // in the clause.
195214571Sdim      unsigned LastUseCount = 0;
196214571Sdim      for (MachineBasicBlock::iterator UseI = Def; UseI != BBEnd; ++UseI) {
197214571Sdim        AluInstCount += OccupiedDwords(UseI);
198214571Sdim        // Make sure we won't need to end the clause due to KCache limitations.
199214571Sdim        if (!SubstituteKCacheBank(UseI, KCacheBanks, false))
200214571Sdim          return false;
201214571Sdim
202214571Sdim        // We have reached the maximum instruction limit before finding the
203214571Sdim        // use that kills this register, so we cannot use this def in the
204214571Sdim        // current clause.
205214571Sdim        if (AluInstCount >= TII->getMaxAlusPerClause())
206214571Sdim          return false;
207214571Sdim
208214571Sdim        // Register kill flags have been cleared by the time we get to this
209214571Sdim        // pass, but it is safe to assume that all uses of this register
210214571Sdim        // occur in the same basic block as its definition, because
211214571Sdim        // it is illegal for the scheduler to schedule them in
212214571Sdim        // different blocks.
213214571Sdim        if (UseI->findRegisterUseOperandIdx(MOI->getReg()))
214214571Sdim          LastUseCount = AluInstCount;
215214571Sdim
216214571Sdim        if (UseI != Def && UseI->findRegisterDefOperandIdx(MOI->getReg()) != -1)
217214571Sdim          break;
218214571Sdim      }
219214571Sdim      if (LastUseCount)
220214571Sdim        return LastUseCount <= TII->getMaxAlusPerClause();
221214571Sdim      llvm_unreachable("Clause local register live at end of clause.");
222214571Sdim    }
223214571Sdim    return true;
224214571Sdim  }
225214571Sdim
226214571Sdim  MachineBasicBlock::iterator
227214571Sdim  MakeALUClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator I) {
228214571Sdim    MachineBasicBlock::iterator ClauseHead = I;
229214571Sdim    std::vector<std::pair<unsigned, unsigned> > KCacheBanks;
230214571Sdim    bool PushBeforeModifier = false;
231214571Sdim    unsigned AluInstCount = 0;
232214571Sdim    for (MachineBasicBlock::iterator E = MBB.end(); I != E; ++I) {
233214571Sdim      if (IsTrivialInst(I))
234214571Sdim        continue;
235214571Sdim      if (!isALU(I))
236214571Sdim        break;
237214571Sdim      if (AluInstCount > TII->getMaxAlusPerClause())
238214571Sdim        break;
239214571Sdim      if (I->getOpcode() == AMDGPU::PRED_X) {
240214571Sdim        // We put PRED_X in its own clause to ensure that ifcvt won't create
241214571Sdim        // clauses with more than 128 insts.
242214571Sdim        // IfCvt is indeed checking that "then" and "else" branches of an if
243214571Sdim        // statement have less than ~60 insts thus converted clauses can't be
244214571Sdim        // bigger than ~121 insts (predicate setter needs to be in the same
245214571Sdim        // clause as predicated alus).
246214571Sdim        if (AluInstCount > 0)
247214571Sdim          break;
248214571Sdim        if (TII->getFlagOp(I).getImm() & MO_FLAG_PUSH)
249214571Sdim          PushBeforeModifier = true;
250214571Sdim        AluInstCount ++;
251214571Sdim        continue;
252214571Sdim      }
253214571Sdim      // XXX: GROUP_BARRIER instructions cannot be in the same ALU clause as:
254214571Sdim      //
255214571Sdim      // * KILL or INTERP instructions
256214571Sdim      // * Any instruction that sets UPDATE_EXEC_MASK or UPDATE_PRED bits
257214571Sdim      // * Uses waterfalling (i.e. INDEX_MODE = AR.X)
258214571Sdim      //
259214571Sdim      // XXX: These checks have not been implemented yet.
260214571Sdim      if (TII->mustBeLastInClause(I->getOpcode())) {
261214571Sdim        I++;
262214571Sdim        break;
263214571Sdim      }
264214571Sdim
265214571Sdim      // If this instruction defines a clause local register, make sure
266214571Sdim      // its use can fit in this clause.
267214571Sdim      if (!canClauseLocalKillFitInClause(AluInstCount, KCacheBanks, I, E))
268214571Sdim        break;
269214571Sdim
270214571Sdim      if (!SubstituteKCacheBank(I, KCacheBanks))
271214571Sdim        break;
272214571Sdim      AluInstCount += OccupiedDwords(I);
273214571Sdim    }
274214571Sdim    unsigned Opcode = PushBeforeModifier ?
275214571Sdim        AMDGPU::CF_ALU_PUSH_BEFORE : AMDGPU::CF_ALU;
276214571Sdim    BuildMI(MBB, ClauseHead, MBB.findDebugLoc(ClauseHead), TII->get(Opcode))
277214571Sdim    // We don't use the ADDR field until R600ControlFlowFinalizer pass, where
278214571Sdim    // it is safe to assume it is 0. However if we always put 0 here, the ifcvt
279214571Sdim    // pass may assume that identical ALU clause starter at the beginning of a
280214571Sdim    // true and false branch can be factorized which is not the case.
281214571Sdim        .addImm(Address++) // ADDR
282214571Sdim        .addImm(KCacheBanks.empty()?0:KCacheBanks[0].first) // KB0
283214571Sdim        .addImm((KCacheBanks.size() < 2)?0:KCacheBanks[1].first) // KB1
284214571Sdim        .addImm(KCacheBanks.empty()?0:2) // KM0
285214571Sdim        .addImm((KCacheBanks.size() < 2)?0:2) // KM1
286214571Sdim        .addImm(KCacheBanks.empty()?0:KCacheBanks[0].second) // KLINE0
287214571Sdim        .addImm((KCacheBanks.size() < 2)?0:KCacheBanks[1].second) // KLINE1
288214571Sdim        .addImm(AluInstCount) // COUNT
289214571Sdim        .addImm(1); // Enabled
290214571Sdim    return I;
291214571Sdim  }
292214571Sdim
293214571Sdimpublic:
294214571Sdim  static char ID;
295214571Sdim  R600EmitClauseMarkers() : MachineFunctionPass(ID), TII(nullptr), Address(0) {
296214571Sdim
297214571Sdim    initializeR600EmitClauseMarkersPass(*PassRegistry::getPassRegistry());
298214571Sdim  }
299214571Sdim
300214571Sdim  bool runOnMachineFunction(MachineFunction &MF) override {
301214571Sdim    TII = static_cast<const R600InstrInfo *>(MF.getSubtarget().getInstrInfo());
302214571Sdim
303214571Sdim    for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
304214571Sdim                                                    BB != BB_E; ++BB) {
305214571Sdim      MachineBasicBlock &MBB = *BB;
306214571Sdim      MachineBasicBlock::iterator I = MBB.begin();
307214571Sdim      if (I->getOpcode() == AMDGPU::CF_ALU)
308214571Sdim        continue; // BB was already parsed
309214571Sdim      for (MachineBasicBlock::iterator E = MBB.end(); I != E;) {
310214571Sdim        if (isALU(I))
311214571Sdim          I = MakeALUClause(MBB, I);
312214571Sdim        else
313214571Sdim          ++I;
314214571Sdim      }
315214571Sdim    }
316214571Sdim    return false;
317214571Sdim  }
318214571Sdim
319214571Sdim  const char *getPassName() const override {
320214571Sdim    return "R600 Emit Clause Markers Pass";
321214571Sdim  }
322214571Sdim};
323214571Sdim
324214571Sdimchar R600EmitClauseMarkers::ID = 0;
325214571Sdim
326214571Sdim} // end anonymous namespace
327214571Sdim
328214571SdimINITIALIZE_PASS_BEGIN(R600EmitClauseMarkers, "emitclausemarkers",
329214571Sdim                      "R600 Emit Clause Markters", false, false)
330214571SdimINITIALIZE_PASS_END(R600EmitClauseMarkers, "emitclausemarkers",
331214571Sdim                      "R600 Emit Clause Markters", false, false)
332214571Sdim
333214571Sdimllvm::FunctionPass *llvm::createR600EmitClauseMarkers() {
334214571Sdim  return new R600EmitClauseMarkers();
335214571Sdim}
336214571Sdim
337214571Sdim