1284677Sdim//===-- R600EmitClauseMarkers.cpp - Emit CF_ALU ---------------------------===//
2284677Sdim//
3284677Sdim//                     The LLVM Compiler Infrastructure
4284677Sdim//
5284677Sdim// This file is distributed under the University of Illinois Open Source
6284677Sdim// License. See LICENSE.TXT for details.
7284677Sdim//
8284677Sdim//===----------------------------------------------------------------------===//
9284677Sdim//
10284677Sdim/// \file
/// Add CF_ALU. R600 ALU instructions are grouped into clauses which can hold
/// up to 128 ALU instructions; these instructions can access up to 4
/// prefetched lines of 16 registers from constant buffers. Such ALU clauses
/// are initiated by CF_ALU instructions.
15284677Sdim//===----------------------------------------------------------------------===//
16284677Sdim
17284677Sdim#include "AMDGPU.h"
18284677Sdim#include "R600Defines.h"
19284677Sdim#include "R600InstrInfo.h"
20284677Sdim#include "R600MachineFunctionInfo.h"
21284677Sdim#include "R600RegisterInfo.h"
22284677Sdim#include "AMDGPUSubtarget.h"
23284677Sdim#include "llvm/CodeGen/MachineFunctionPass.h"
24284677Sdim#include "llvm/CodeGen/MachineInstrBuilder.h"
25284677Sdim#include "llvm/CodeGen/MachineRegisterInfo.h"
26284677Sdim
27284677Sdimusing namespace llvm;
28284677Sdim
29284677Sdimnamespace llvm {
30284677Sdim  void initializeR600EmitClauseMarkersPass(PassRegistry&);
31284677Sdim}
32284677Sdim
33284677Sdimnamespace {
34284677Sdim
35284677Sdimclass R600EmitClauseMarkers : public MachineFunctionPass {
36284677Sdim
37284677Sdimprivate:
38284677Sdim  const R600InstrInfo *TII;
39284677Sdim  int Address;
40284677Sdim
41284677Sdim  unsigned OccupiedDwords(MachineInstr *MI) const {
42284677Sdim    switch (MI->getOpcode()) {
43284677Sdim    case AMDGPU::INTERP_PAIR_XY:
44284677Sdim    case AMDGPU::INTERP_PAIR_ZW:
45284677Sdim    case AMDGPU::INTERP_VEC_LOAD:
46284677Sdim    case AMDGPU::DOT_4:
47284677Sdim      return 4;
48284677Sdim    case AMDGPU::KILL:
49284677Sdim      return 0;
50284677Sdim    default:
51284677Sdim      break;
52284677Sdim    }
53284677Sdim
54284677Sdim    // These will be expanded to two ALU instructions in the
55284677Sdim    // ExpandSpecialInstructions pass.
56284677Sdim    if (TII->isLDSRetInstr(MI->getOpcode()))
57284677Sdim      return 2;
58284677Sdim
59284677Sdim    if(TII->isVector(*MI) ||
60284677Sdim        TII->isCubeOp(MI->getOpcode()) ||
61284677Sdim        TII->isReductionOp(MI->getOpcode()))
62284677Sdim      return 4;
63284677Sdim
64284677Sdim    unsigned NumLiteral = 0;
65284677Sdim    for (MachineInstr::mop_iterator It = MI->operands_begin(),
66284677Sdim        E = MI->operands_end(); It != E; ++It) {
67284677Sdim      MachineOperand &MO = *It;
68284677Sdim      if (MO.isReg() && MO.getReg() == AMDGPU::ALU_LITERAL_X)
69284677Sdim        ++NumLiteral;
70284677Sdim    }
71284677Sdim    return 1 + NumLiteral;
72284677Sdim  }
73284677Sdim
74284677Sdim  bool isALU(const MachineInstr *MI) const {
75284677Sdim    if (TII->isALUInstr(MI->getOpcode()))
76284677Sdim      return true;
77284677Sdim    if (TII->isVector(*MI) || TII->isCubeOp(MI->getOpcode()))
78284677Sdim      return true;
79284677Sdim    switch (MI->getOpcode()) {
80284677Sdim    case AMDGPU::PRED_X:
81284677Sdim    case AMDGPU::INTERP_PAIR_XY:
82284677Sdim    case AMDGPU::INTERP_PAIR_ZW:
83284677Sdim    case AMDGPU::INTERP_VEC_LOAD:
84284677Sdim    case AMDGPU::COPY:
85284677Sdim    case AMDGPU::DOT_4:
86284677Sdim      return true;
87284677Sdim    default:
88284677Sdim      return false;
89284677Sdim    }
90284677Sdim  }
91284677Sdim
92284677Sdim  bool IsTrivialInst(MachineInstr *MI) const {
93284677Sdim    switch (MI->getOpcode()) {
94284677Sdim    case AMDGPU::KILL:
95284677Sdim    case AMDGPU::RETURN:
96284677Sdim    case AMDGPU::IMPLICIT_DEF:
97284677Sdim      return true;
98284677Sdim    default:
99284677Sdim      return false;
100284677Sdim    }
101284677Sdim  }
102284677Sdim
103284677Sdim  std::pair<unsigned, unsigned> getAccessedBankLine(unsigned Sel) const {
104284677Sdim    // Sel is (512 + (kc_bank << 12) + ConstIndex) << 2
105284677Sdim    // (See also R600ISelLowering.cpp)
106284677Sdim    // ConstIndex value is in [0, 4095];
107284677Sdim    return std::pair<unsigned, unsigned>(
108284677Sdim        ((Sel >> 2) - 512) >> 12, // KC_BANK
109284677Sdim        // Line Number of ConstIndex
110284677Sdim        // A line contains 16 constant registers however KCX bank can lock
111284677Sdim        // two line at the same time ; thus we want to get an even line number.
112284677Sdim        // Line number can be retrieved with (>>4), using (>>5) <<1 generates
113284677Sdim        // an even number.
114284677Sdim        ((((Sel >> 2) - 512) & 4095) >> 5) << 1);
115284677Sdim  }
116284677Sdim
117284677Sdim  bool SubstituteKCacheBank(MachineInstr *MI,
118284677Sdim      std::vector<std::pair<unsigned, unsigned> > &CachedConsts,
119284677Sdim      bool UpdateInstr = true) const {
120284677Sdim    std::vector<std::pair<unsigned, unsigned> > UsedKCache;
121284677Sdim
122284677Sdim    if (!TII->isALUInstr(MI->getOpcode()) && MI->getOpcode() != AMDGPU::DOT_4)
123284677Sdim      return true;
124284677Sdim
125284677Sdim    const SmallVectorImpl<std::pair<MachineOperand *, int64_t> > &Consts =
126284677Sdim        TII->getSrcs(MI);
127284677Sdim    assert((TII->isALUInstr(MI->getOpcode()) ||
128284677Sdim        MI->getOpcode() == AMDGPU::DOT_4) && "Can't assign Const");
129284677Sdim    for (unsigned i = 0, n = Consts.size(); i < n; ++i) {
130284677Sdim      if (Consts[i].first->getReg() != AMDGPU::ALU_CONST)
131284677Sdim        continue;
132284677Sdim      unsigned Sel = Consts[i].second;
133284677Sdim      unsigned Chan = Sel & 3, Index = ((Sel >> 2) - 512) & 31;
134284677Sdim      unsigned KCacheIndex = Index * 4 + Chan;
135284677Sdim      const std::pair<unsigned, unsigned> &BankLine = getAccessedBankLine(Sel);
136284677Sdim      if (CachedConsts.empty()) {
137284677Sdim        CachedConsts.push_back(BankLine);
138284677Sdim        UsedKCache.push_back(std::pair<unsigned, unsigned>(0, KCacheIndex));
139284677Sdim        continue;
140284677Sdim      }
141284677Sdim      if (CachedConsts[0] == BankLine) {
142284677Sdim        UsedKCache.push_back(std::pair<unsigned, unsigned>(0, KCacheIndex));
143284677Sdim        continue;
144284677Sdim      }
145284677Sdim      if (CachedConsts.size() == 1) {
146284677Sdim        CachedConsts.push_back(BankLine);
147284677Sdim        UsedKCache.push_back(std::pair<unsigned, unsigned>(1, KCacheIndex));
148284677Sdim        continue;
149284677Sdim      }
150284677Sdim      if (CachedConsts[1] == BankLine) {
151284677Sdim        UsedKCache.push_back(std::pair<unsigned, unsigned>(1, KCacheIndex));
152284677Sdim        continue;
153284677Sdim      }
154284677Sdim      return false;
155284677Sdim    }
156284677Sdim
157284677Sdim    if (!UpdateInstr)
158284677Sdim      return true;
159284677Sdim
160284677Sdim    for (unsigned i = 0, j = 0, n = Consts.size(); i < n; ++i) {
161284677Sdim      if (Consts[i].first->getReg() != AMDGPU::ALU_CONST)
162284677Sdim        continue;
163284677Sdim      switch(UsedKCache[j].first) {
164284677Sdim      case 0:
165284677Sdim        Consts[i].first->setReg(
166284677Sdim            AMDGPU::R600_KC0RegClass.getRegister(UsedKCache[j].second));
167284677Sdim        break;
168284677Sdim      case 1:
169284677Sdim        Consts[i].first->setReg(
170284677Sdim            AMDGPU::R600_KC1RegClass.getRegister(UsedKCache[j].second));
171284677Sdim        break;
172284677Sdim      default:
173284677Sdim        llvm_unreachable("Wrong Cache Line");
174284677Sdim      }
175284677Sdim      j++;
176284677Sdim    }
177284677Sdim    return true;
178284677Sdim  }
179284677Sdim
180284677Sdim  bool canClauseLocalKillFitInClause(
181284677Sdim                        unsigned AluInstCount,
182284677Sdim                        std::vector<std::pair<unsigned, unsigned> > KCacheBanks,
183284677Sdim                        MachineBasicBlock::iterator Def,
184284677Sdim                        MachineBasicBlock::iterator BBEnd) {
185284677Sdim    const R600RegisterInfo &TRI = TII->getRegisterInfo();
186284677Sdim    for (MachineInstr::const_mop_iterator
187284677Sdim           MOI = Def->operands_begin(),
188284677Sdim           MOE = Def->operands_end(); MOI != MOE; ++MOI) {
189284677Sdim      if (!MOI->isReg() || !MOI->isDef() ||
190284677Sdim          TRI.isPhysRegLiveAcrossClauses(MOI->getReg()))
191284677Sdim        continue;
192284677Sdim
193284677Sdim      // Def defines a clause local register, so check that its use will fit
194284677Sdim      // in the clause.
195284677Sdim      unsigned LastUseCount = 0;
196284677Sdim      for (MachineBasicBlock::iterator UseI = Def; UseI != BBEnd; ++UseI) {
197284677Sdim        AluInstCount += OccupiedDwords(UseI);
198284677Sdim        // Make sure we won't need to end the clause due to KCache limitations.
199284677Sdim        if (!SubstituteKCacheBank(UseI, KCacheBanks, false))
200284677Sdim          return false;
201284677Sdim
202284677Sdim        // We have reached the maximum instruction limit before finding the
203284677Sdim        // use that kills this register, so we cannot use this def in the
204284677Sdim        // current clause.
205284677Sdim        if (AluInstCount >= TII->getMaxAlusPerClause())
206284677Sdim          return false;
207284677Sdim
208284677Sdim        // Register kill flags have been cleared by the time we get to this
209284677Sdim        // pass, but it is safe to assume that all uses of this register
210284677Sdim        // occur in the same basic block as its definition, because
211284677Sdim        // it is illegal for the scheduler to schedule them in
212284677Sdim        // different blocks.
213284677Sdim        if (UseI->findRegisterUseOperandIdx(MOI->getReg()))
214284677Sdim          LastUseCount = AluInstCount;
215284677Sdim
216284677Sdim        if (UseI != Def && UseI->findRegisterDefOperandIdx(MOI->getReg()) != -1)
217284677Sdim          break;
218284677Sdim      }
219284677Sdim      if (LastUseCount)
220284677Sdim        return LastUseCount <= TII->getMaxAlusPerClause();
221284677Sdim      llvm_unreachable("Clause local register live at end of clause.");
222284677Sdim    }
223284677Sdim    return true;
224284677Sdim  }
225284677Sdim
226284677Sdim  MachineBasicBlock::iterator
227284677Sdim  MakeALUClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator I) {
228284677Sdim    MachineBasicBlock::iterator ClauseHead = I;
229284677Sdim    std::vector<std::pair<unsigned, unsigned> > KCacheBanks;
230284677Sdim    bool PushBeforeModifier = false;
231284677Sdim    unsigned AluInstCount = 0;
232284677Sdim    for (MachineBasicBlock::iterator E = MBB.end(); I != E; ++I) {
233284677Sdim      if (IsTrivialInst(I))
234284677Sdim        continue;
235284677Sdim      if (!isALU(I))
236284677Sdim        break;
237284677Sdim      if (AluInstCount > TII->getMaxAlusPerClause())
238284677Sdim        break;
239284677Sdim      if (I->getOpcode() == AMDGPU::PRED_X) {
240284677Sdim        // We put PRED_X in its own clause to ensure that ifcvt won't create
241284677Sdim        // clauses with more than 128 insts.
242284677Sdim        // IfCvt is indeed checking that "then" and "else" branches of an if
243284677Sdim        // statement have less than ~60 insts thus converted clauses can't be
244284677Sdim        // bigger than ~121 insts (predicate setter needs to be in the same
245284677Sdim        // clause as predicated alus).
246284677Sdim        if (AluInstCount > 0)
247284677Sdim          break;
248284677Sdim        if (TII->getFlagOp(I).getImm() & MO_FLAG_PUSH)
249284677Sdim          PushBeforeModifier = true;
250284677Sdim        AluInstCount ++;
251284677Sdim        continue;
252284677Sdim      }
253284677Sdim      // XXX: GROUP_BARRIER instructions cannot be in the same ALU clause as:
254284677Sdim      //
255284677Sdim      // * KILL or INTERP instructions
256284677Sdim      // * Any instruction that sets UPDATE_EXEC_MASK or UPDATE_PRED bits
257284677Sdim      // * Uses waterfalling (i.e. INDEX_MODE = AR.X)
258284677Sdim      //
259284677Sdim      // XXX: These checks have not been implemented yet.
260284677Sdim      if (TII->mustBeLastInClause(I->getOpcode())) {
261284677Sdim        I++;
262284677Sdim        break;
263284677Sdim      }
264284677Sdim
265284677Sdim      // If this instruction defines a clause local register, make sure
266284677Sdim      // its use can fit in this clause.
267284677Sdim      if (!canClauseLocalKillFitInClause(AluInstCount, KCacheBanks, I, E))
268284677Sdim        break;
269284677Sdim
270284677Sdim      if (!SubstituteKCacheBank(I, KCacheBanks))
271284677Sdim        break;
272284677Sdim      AluInstCount += OccupiedDwords(I);
273284677Sdim    }
274284677Sdim    unsigned Opcode = PushBeforeModifier ?
275284677Sdim        AMDGPU::CF_ALU_PUSH_BEFORE : AMDGPU::CF_ALU;
276284677Sdim    BuildMI(MBB, ClauseHead, MBB.findDebugLoc(ClauseHead), TII->get(Opcode))
277284677Sdim    // We don't use the ADDR field until R600ControlFlowFinalizer pass, where
278284677Sdim    // it is safe to assume it is 0. However if we always put 0 here, the ifcvt
279284677Sdim    // pass may assume that identical ALU clause starter at the beginning of a
280284677Sdim    // true and false branch can be factorized which is not the case.
281284677Sdim        .addImm(Address++) // ADDR
282284677Sdim        .addImm(KCacheBanks.empty()?0:KCacheBanks[0].first) // KB0
283284677Sdim        .addImm((KCacheBanks.size() < 2)?0:KCacheBanks[1].first) // KB1
284284677Sdim        .addImm(KCacheBanks.empty()?0:2) // KM0
285284677Sdim        .addImm((KCacheBanks.size() < 2)?0:2) // KM1
286284677Sdim        .addImm(KCacheBanks.empty()?0:KCacheBanks[0].second) // KLINE0
287284677Sdim        .addImm((KCacheBanks.size() < 2)?0:KCacheBanks[1].second) // KLINE1
288284677Sdim        .addImm(AluInstCount) // COUNT
289284677Sdim        .addImm(1); // Enabled
290284677Sdim    return I;
291284677Sdim  }
292284677Sdim
293284677Sdimpublic:
294284677Sdim  static char ID;
295284677Sdim  R600EmitClauseMarkers() : MachineFunctionPass(ID), TII(nullptr), Address(0) {
296284677Sdim
297284677Sdim    initializeR600EmitClauseMarkersPass(*PassRegistry::getPassRegistry());
298284677Sdim  }
299284677Sdim
300284677Sdim  bool runOnMachineFunction(MachineFunction &MF) override {
301284677Sdim    TII = static_cast<const R600InstrInfo *>(MF.getSubtarget().getInstrInfo());
302284677Sdim
303284677Sdim    for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
304284677Sdim                                                    BB != BB_E; ++BB) {
305284677Sdim      MachineBasicBlock &MBB = *BB;
306284677Sdim      MachineBasicBlock::iterator I = MBB.begin();
307284677Sdim      if (I->getOpcode() == AMDGPU::CF_ALU)
308284677Sdim        continue; // BB was already parsed
309284677Sdim      for (MachineBasicBlock::iterator E = MBB.end(); I != E;) {
310284677Sdim        if (isALU(I))
311284677Sdim          I = MakeALUClause(MBB, I);
312284677Sdim        else
313284677Sdim          ++I;
314284677Sdim      }
315284677Sdim    }
316284677Sdim    return false;
317284677Sdim  }
318284677Sdim
319284677Sdim  const char *getPassName() const override {
320284677Sdim    return "R600 Emit Clause Markers Pass";
321284677Sdim  }
322284677Sdim};
323284677Sdim
324284677Sdimchar R600EmitClauseMarkers::ID = 0;
325284677Sdim
326284677Sdim} // end anonymous namespace
327284677Sdim
328284677SdimINITIALIZE_PASS_BEGIN(R600EmitClauseMarkers, "emitclausemarkers",
329284677Sdim                      "R600 Emit Clause Markters", false, false)
330284677SdimINITIALIZE_PASS_END(R600EmitClauseMarkers, "emitclausemarkers",
331284677Sdim                      "R600 Emit Clause Markters", false, false)
332284677Sdim
333284677Sdimllvm::FunctionPass *llvm::createR600EmitClauseMarkers() {
334284677Sdim  return new R600EmitClauseMarkers();
335284677Sdim}
336284677Sdim
337