//===-- R600MachineScheduler.cpp - R600 Scheduler Interface -*- C++ -*-----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// R600 Machine Scheduler interface
//
//===----------------------------------------------------------------------===//

#include "R600MachineScheduler.h"
#include "AMDGPUSubtarget.h"
#include "R600InstrInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/Pass.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

#define DEBUG_TYPE "machine-scheduler"

void R600SchedStrategy::initialize(ScheduleDAGMI *dag) {
  assert(dag->hasVRegLiveness() && "R600SchedStrategy needs vreg liveness");
  DAG = static_cast<ScheduleDAGMILive*>(dag);
  const R600Subtarget &ST = DAG->MF.getSubtarget<R600Subtarget>();
  TII = static_cast<const R600InstrInfo*>(DAG->TII);
  TRI = static_cast<const R600RegisterInfo*>(DAG->TRI);
  VLIW5 = !ST.hasCaymanISA();
  MRI = &DAG->MRI;
  CurInstKind = IDOther;
  CurEmitted = 0;
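  // Mark all five slots (X, Y, Z, W and Trans) as occupied so that the first
  // ALU pick starts a fresh instruction group.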
  OccupedSlotsMask = 31;
  InstKindLimit[IDAlu] = TII->getMaxAlusPerClause();
  InstKindLimit[IDOther] = 32;
  InstKindLimit[IDFetch] = ST.getTexVTXClauseSize();
  AluInstCount = 0;
  FetchInstCount = 0;
}

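// Append every unit of QSrc to the end of QDst, then empty QSrc.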
void R600SchedStrategy::MoveUnits(std::vector<SUnit *> &QSrc,
                                  std::vector<SUnit *> &QDst)
{
  QDst.insert(QDst.end(), QSrc.begin(), QSrc.end());
  QSrc.clear();
}

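// Each wavefront keeps GPRCount registers alive at once; this heuristic
// treats 248 as the number of allocatable GPRs, so at most 248 / GPRCount
// wavefronts can be resident simultaneously.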
static unsigned getWFCountLimitedByGPR(unsigned GPRCount) {
  assert(GPRCount && "GPRCount cannot be 0");
  return 248 / GPRCount;
}

SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) {
  SUnit *SU = nullptr;
  NextInstKind = IDOther;

  IsTopNode = false;

  // Check whether we might want to switch the current clause type.
  bool AllowSwitchToAlu = (CurEmitted >= InstKindLimit[CurInstKind]) ||
      (Available[CurInstKind].empty());
  bool AllowSwitchFromAlu = (CurEmitted >= InstKindLimit[CurInstKind]) &&
      (!Available[IDFetch].empty() || !Available[IDOther].empty());

  if (CurInstKind == IDAlu && !Available[IDFetch].empty()) {
    // We use the heuristic provided by the AMD Accelerated Parallel Processing
    // OpenCL Programming Guide:
    // the approximate number of WF that allows TEX instructions to hide ALU
    // instructions is:
    // 500 (cycles for TEX) / (AluFetchRatio * 8 (cycles for ALU))
    float ALUFetchRatioEstimate =
        static_cast<float>(AluInstCount + AvailablesAluCount() +
                           Pending[IDAlu].size()) /
        (FetchInstCount + Available[IDFetch].size());
    if (ALUFetchRatioEstimate == 0) {
      AllowSwitchFromAlu = true;
    } else {
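      // 62.5f is 500 (TEX latency in cycles) / 8 (ALU latency in cycles) from
      // the formula above.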
      unsigned NeededWF = 62.5f / ALUFetchRatioEstimate;
      LLVM_DEBUG(dbgs() << NeededWF << " approx. Wavefronts Required\n");
      // We assume the local GPR requirements to be "dominated" by the
      // requirement of the TEX clause (which consumes 128-bit regs); ALU
      // instructions before and after TEX are indeed likely to consume or
      // generate values from/for the TEX clause.
      // Available[IDFetch].size() * 2: GPRs required in the Fetch clause.
      // We assume that fetch instructions are either TnXYZW = TEX TnXYZW
      // (needing one GPR) or TmXYZW = TnXYZW (needing two GPRs).
      // (TODO: use RegisterPressure)
      // If we are going to use too many GPRs, we flush Fetch instructions to
      // lower register pressure on 128-bit regs.
      unsigned NearRegisterRequirement = 2 * Available[IDFetch].size();
      if (NeededWF > getWFCountLimitedByGPR(NearRegisterRequirement))
        AllowSwitchFromAlu = true;
    }
  }

  if (!SU && ((AllowSwitchToAlu && CurInstKind != IDAlu) ||
      (!AllowSwitchFromAlu && CurInstKind == IDAlu))) {
    // Try to pick ALU.
    SU = pickAlu();
    if (!SU && !PhysicalRegCopy.empty()) {
      SU = PhysicalRegCopy.front();
      PhysicalRegCopy.erase(PhysicalRegCopy.begin());
    }
    if (SU) {
      if (CurEmitted >= InstKindLimit[IDAlu])
        CurEmitted = 0;
      NextInstKind = IDAlu;
    }
  }

  if (!SU) {
    // Try to pick FETCH.
    SU = pickOther(IDFetch);
    if (SU)
      NextInstKind = IDFetch;
  }

  // Try to pick other.
  if (!SU) {
    SU = pickOther(IDOther);
    if (SU)
      NextInstKind = IDOther;
  }

  LLVM_DEBUG(if (SU) {
    dbgs() << " ** Pick node **\n";
    DAG->dumpNode(*SU);
  } else {
    dbgs() << "NO NODE\n";
    for (const SUnit &S : DAG->SUnits)
      if (!S.isScheduled)
        DAG->dumpNode(S);
  });

  return SU;
}

void R600SchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
  if (NextInstKind != CurInstKind) {
    LLVM_DEBUG(dbgs() << "Instruction Type Switch\n");
    if (NextInstKind != IDAlu)
      OccupedSlotsMask |= 31;
    CurEmitted = 0;
    CurInstKind = NextInstKind;
  }

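  // CurEmitted counts the slots consumed in the current clause: a full XYZW
  // instruction occupies four slots, and every ALU_LITERAL_X operand takes an
  // extra literal slot.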
  if (CurInstKind == IDAlu) {
    ++AluInstCount;
    switch (getAluKind(SU)) {
    case AluT_XYZW:
      CurEmitted += 4;
      break;
    case AluDiscarded:
      break;
    default: {
      ++CurEmitted;
      for (MachineOperand &MO : SU->getInstr()->operands())
        if (MO.isReg() && MO.getReg() == R600::ALU_LITERAL_X)
          ++CurEmitted;
    }
    }
  } else {
    ++CurEmitted;
  }

  LLVM_DEBUG(dbgs() << CurEmitted << " Instructions Emitted in this clause\n");

  if (CurInstKind != IDFetch)
    MoveUnits(Pending[IDFetch], Available[IDFetch]);
  else
    ++FetchInstCount;
}

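// A COPY whose source is a physical register. Such copies are set aside by
// releaseBottomNode() and only emitted when nothing else can be scheduled.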
static bool isPhysicalRegCopy(MachineInstr *MI) {
  if (MI->getOpcode() != R600::COPY)
    return false;

  return !Register::isVirtualRegister(MI->getOperand(1).getReg());
}

void R600SchedStrategy::releaseTopNode(SUnit *SU) {
  LLVM_DEBUG(dbgs() << "Top Releasing "; DAG->dumpNode(*SU));
}

void R600SchedStrategy::releaseBottomNode(SUnit *SU) {
  LLVM_DEBUG(dbgs() << "Bottom Releasing "; DAG->dumpNode(*SU));
  if (isPhysicalRegCopy(SU->getInstr())) {
    PhysicalRegCopy.push_back(SU);
    return;
  }

  int IK = getInstKind(SU);

  // There is no export clause; we can schedule an IDOther instruction as soon
  // as it's ready.
  if (IK == IDOther)
    Available[IDOther].push_back(SU);
  else
    Pending[IK].push_back(SU);
}

bool R600SchedStrategy::regBelongsToClass(unsigned Reg,
                                          const TargetRegisterClass *RC) const {
  if (!Register::isVirtualRegister(Reg))
    return RC->contains(Reg);
  return MRI->getRegClass(Reg) == RC;
}

R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const {
  MachineInstr *MI = SU->getInstr();

  if (TII->isTransOnly(*MI))
    return AluTrans;

  switch (MI->getOpcode()) {
  case R600::PRED_X:
    return AluPredX;
  case R600::INTERP_PAIR_XY:
  case R600::INTERP_PAIR_ZW:
  case R600::INTERP_VEC_LOAD:
  case R600::DOT_4:
    return AluT_XYZW;
  case R600::COPY:
    if (MI->getOperand(1).isUndef()) {
      // MI will become a KILL; don't consider it in scheduling.
      return AluDiscarded;
    }
    break;
  default:
    break;
  }

  // Does the instruction take a whole instruction group?
  // XXX: Is it possible to add a helper function in R600InstrInfo that can
  // be used here and in R600PacketizerList::isSoloInstruction()?
  if (TII->isVector(*MI) ||
      TII->isCubeOp(MI->getOpcode()) ||
      TII->isReductionOp(MI->getOpcode()) ||
      MI->getOpcode() == R600::GROUP_BARRIER) {
    return AluT_XYZW;
  }

  if (TII->isLDSInstr(MI->getOpcode())) {
    return AluT_X;
  }

  // Is the result already assigned to a channel?
  unsigned DestSubReg = MI->getOperand(0).getSubReg();
  switch (DestSubReg) {
  case R600::sub0:
    return AluT_X;
  case R600::sub1:
    return AluT_Y;
  case R600::sub2:
    return AluT_Z;
  case R600::sub3:
    return AluT_W;
  default:
    break;
  }

  // Is the result already a member of an X/Y/Z/W class?
  Register DestReg = MI->getOperand(0).getReg();
  if (regBelongsToClass(DestReg, &R600::R600_TReg32_XRegClass) ||
      regBelongsToClass(DestReg, &R600::R600_AddrRegClass))
    return AluT_X;
  if (regBelongsToClass(DestReg, &R600::R600_TReg32_YRegClass))
    return AluT_Y;
  if (regBelongsToClass(DestReg, &R600::R600_TReg32_ZRegClass))
    return AluT_Z;
  if (regBelongsToClass(DestReg, &R600::R600_TReg32_WRegClass))
    return AluT_W;
  if (regBelongsToClass(DestReg, &R600::R600_Reg128RegClass))
    return AluT_XYZW;

  // LDS src registers cannot be used in the Trans slot.
  if (TII->readsLDSSrcReg(*MI))
    return AluT_XYZW;

  return AluAny;
}

int R600SchedStrategy::getInstKind(SUnit* SU) {
  int Opcode = SU->getInstr()->getOpcode();

  if (TII->usesTextureCache(Opcode) || TII->usesVertexCache(Opcode))
    return IDFetch;

  if (TII->isALUInstr(Opcode)) {
    return IDAlu;
  }

  switch (Opcode) {
  case R600::PRED_X:
  case R600::COPY:
  case R600::CONST_COPY:
  case R600::INTERP_PAIR_XY:
  case R600::INTERP_PAIR_ZW:
  case R600::INTERP_VEC_LOAD:
  case R600::DOT_4:
    return IDAlu;
  default:
    return IDOther;
  }
}

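// Pop the most recently released SUnit from Q that still satisfies the
// constant-read limitations of the instruction group being built; when AnyALU
// is set (used when stealing an instruction for the Trans slot), vector-only
// instructions are skipped as well.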
SUnit *R600SchedStrategy::PopInst(std::vector<SUnit *> &Q, bool AnyALU) {
  if (Q.empty())
    return nullptr;
  for (std::vector<SUnit *>::reverse_iterator It = Q.rbegin(), E = Q.rend();
      It != E; ++It) {
    SUnit *SU = *It;
    // Tentatively add the candidate to the group to test the limitations.
    InstructionsGroupCandidate.push_back(SU->getInstr());
    bool Fits = TII->fitsConstReadLimitations(InstructionsGroupCandidate) &&
                (!AnyALU || !TII->isVectorOnly(*SU->getInstr()));
    InstructionsGroupCandidate.pop_back();
    if (Fits) {
      // (It + 1).base() converts the reverse iterator into the forward
      // iterator pointing at the same element, as erase() requires.
      Q.erase((It + 1).base());
      return SU;
    }
  }
  return nullptr;
}

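// Classify every pending ALU instruction by its AluKind and move it into the
// corresponding AvailableAlus bucket.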
void R600SchedStrategy::LoadAlu() {
  std::vector<SUnit *> &QSrc = Pending[IDAlu];
  for (SUnit *SU : QSrc) {
    AluKind AK = getAluKind(SU);
    AvailableAlus[AK].push_back(SU);
  }
  QSrc.clear();
}

void R600SchedStrategy::PrepareNextSlot() {
  LLVM_DEBUG(dbgs() << "New Slot\n");
  assert(OccupedSlotsMask && "Slot wasn't filled");
  OccupedSlotsMask = 0;
//  if (HwGen == AMDGPUSubtarget::NORTHERN_ISLANDS)
//    OccupedSlotsMask |= 16;
  InstructionsGroupCandidate.clear();
  LoadAlu();
}

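// Constrain the destination register of MI to the register class of the given
// channel (0 = X, 1 = Y, 2 = Z, 3 = W) so that it ends up in that slot.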
void R600SchedStrategy::AssignSlot(MachineInstr* MI, unsigned Slot) {
  int DstIndex = TII->getOperandIdx(MI->getOpcode(), R600::OpName::dst);
  if (DstIndex == -1) {
    return;
  }
  Register DestReg = MI->getOperand(DstIndex).getReg();
  // PressureRegister crashes if an operand is def and used in the same inst
  // and we try to constrain its regclass.
  for (MachineOperand &MO : MI->operands()) {
    if (MO.isReg() && !MO.isDef() &&
        MO.getReg() == DestReg)
      return;
  }
  // Constrain the regclass of DestReg to assign it to Slot.
  switch (Slot) {
  case 0:
    MRI->constrainRegClass(DestReg, &R600::R600_TReg32_XRegClass);
    break;
  case 1:
    MRI->constrainRegClass(DestReg, &R600::R600_TReg32_YRegClass);
    break;
  case 2:
    MRI->constrainRegClass(DestReg, &R600::R600_TReg32_ZRegClass);
    break;
  case 3:
    MRI->constrainRegClass(DestReg, &R600::R600_TReg32_WRegClass);
    break;
  }
}

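// Prefer an instruction already bound to this slot's channel; failing that,
// take an AluAny instruction and constrain it to the slot.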
SUnit *R600SchedStrategy::AttemptFillSlot(unsigned Slot, bool AnyAlu) {
  static const AluKind IndexToID[] = {AluT_X, AluT_Y, AluT_Z, AluT_W};
  SUnit *SlottedSU = PopInst(AvailableAlus[IndexToID[Slot]], AnyAlu);
  if (SlottedSU)
    return SlottedSU;
  SUnit *UnslottedSU = PopInst(AvailableAlus[AluAny], AnyAlu);
  if (UnslottedSU)
    AssignSlot(UnslottedSU->getInstr(), Slot);
  return UnslottedSU;
}

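// Total number of ALU instructions currently sitting in the AvailableAlus
// buckets, across all kinds.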
unsigned R600SchedStrategy::AvailablesAluCount() const {
  return AvailableAlus[AluAny].size() + AvailableAlus[AluT_XYZW].size() +
      AvailableAlus[AluT_X].size() + AvailableAlus[AluT_Y].size() +
      AvailableAlus[AluT_Z].size() + AvailableAlus[AluT_W].size() +
      AvailableAlus[AluTrans].size() + AvailableAlus[AluDiscarded].size() +
      AvailableAlus[AluPredX].size();
}

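// OccupedSlotsMask layout: bits 0-3 are the X/Y/Z/W vector slots and bit 4 is
// the Trans slot (only present on VLIW5 targets). A mask of 0 means a new
// instruction group can be started.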
SUnit* R600SchedStrategy::pickAlu() {
  while (AvailablesAluCount() || !Pending[IDAlu].empty()) {
    if (!OccupedSlotsMask) {
      // Bottom-up scheduling: PRED_X must come first.
      if (!AvailableAlus[AluPredX].empty()) {
        OccupedSlotsMask |= 31;
        return PopInst(AvailableAlus[AluPredX], false);
      }
      // Flush undef copies; they will become KILLs and RA will discard them.
      if (!AvailableAlus[AluDiscarded].empty()) {
        OccupedSlotsMask |= 31;
        return PopInst(AvailableAlus[AluDiscarded], false);
      }
      // If there is a T_XYZW alu available, use it.
      if (!AvailableAlus[AluT_XYZW].empty()) {
        OccupedSlotsMask |= 15;
        return PopInst(AvailableAlus[AluT_XYZW], false);
      }
    }
    bool TransSlotOccupied = OccupedSlotsMask & 16;
    if (!TransSlotOccupied && VLIW5) {
      if (!AvailableAlus[AluTrans].empty()) {
        OccupedSlotsMask |= 16;
        return PopInst(AvailableAlus[AluTrans], false);
      }
      SUnit *SU = AttemptFillSlot(3, true);
      if (SU) {
        OccupedSlotsMask |= 16;
        return SU;
      }
    }
    for (int Chan = 3; Chan >= 0; --Chan) {
      bool isOccupied = OccupedSlotsMask & (1 << Chan);
      if (!isOccupied) {
        SUnit *SU = AttemptFillSlot(Chan, false);
        if (SU) {
          OccupedSlotsMask |= (1 << Chan);
          InstructionsGroupCandidate.push_back(SU->getInstr());
          return SU;
        }
      }
    }
    PrepareNextSlot();
  }
  return nullptr;
}

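// Pick the most recently released unit from the Available queue for QID,
// refilling it from the Pending queue first if it is empty.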
SUnit* R600SchedStrategy::pickOther(int QID) {
  SUnit *SU = nullptr;
  std::vector<SUnit *> &AQ = Available[QID];

  if (AQ.empty()) {
    MoveUnits(Pending[QID], AQ);
  }
  if (!AQ.empty()) {
    SU = AQ.back();
    AQ.pop_back();
  }
  return SU;
}