1//===-- R600MachineScheduler.cpp - R600 Scheduler Interface -*- C++ -*-----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// R600 Machine Scheduler interface
11//
12//===----------------------------------------------------------------------===//
13
14#include "R600MachineScheduler.h"
15#include "MCTargetDesc/R600MCTargetDesc.h"
16#include "R600Subtarget.h"
17
18using namespace llvm;
19
20#define DEBUG_TYPE "machine-scheduler"
21
22void R600SchedStrategy::initialize(ScheduleDAGMI *dag) {
23  assert(dag->hasVRegLiveness() && "R600SchedStrategy needs vreg liveness");
24  DAG = static_cast<ScheduleDAGMILive*>(dag);
25  const R600Subtarget &ST = DAG->MF.getSubtarget<R600Subtarget>();
26  TII = static_cast<const R600InstrInfo*>(DAG->TII);
27  TRI = static_cast<const R600RegisterInfo*>(DAG->TRI);
28  VLIW5 = !ST.hasCaymanISA();
29  MRI = &DAG->MRI;
30  CurInstKind = IDOther;
31  CurEmitted = 0;
32  OccupiedSlotsMask = 31;
33  InstKindLimit[IDAlu] = TII->getMaxAlusPerClause();
34  InstKindLimit[IDOther] = 32;
35  InstKindLimit[IDFetch] = ST.getTexVTXClauseSize();
36  AluInstCount = 0;
37  FetchInstCount = 0;
38}
39
40void R600SchedStrategy::MoveUnits(std::vector<SUnit *> &QSrc,
41                                  std::vector<SUnit *> &QDst)
42{
43  llvm::append_range(QDst, QSrc);
44  QSrc.clear();
45}
46
/// Return how many wavefronts can be in flight when each wavefront needs
/// \p GPRCount registers, given the 248 GPRs available to divide among them.
static unsigned getWFCountLimitedByGPR(unsigned GPRCount) {
  assert(GPRCount && "GPRCount cannot be 0");
  constexpr unsigned AvailableGPRs = 248;
  return AvailableGPRs / GPRCount;
}
51
52SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) {
53  SUnit *SU = nullptr;
54  NextInstKind = IDOther;
55
56  IsTopNode = false;
57
58  // check if we might want to switch current clause type
59  bool AllowSwitchToAlu = (CurEmitted >= InstKindLimit[CurInstKind]) ||
60      (Available[CurInstKind].empty());
61  bool AllowSwitchFromAlu = (CurEmitted >= InstKindLimit[CurInstKind]) &&
62      (!Available[IDFetch].empty() || !Available[IDOther].empty());
63
64  if (CurInstKind == IDAlu && !Available[IDFetch].empty()) {
65    // We use the heuristic provided by AMD Accelerated Parallel Processing
66    // OpenCL Programming Guide :
67    // The approx. number of WF that allows TEX inst to hide ALU inst is :
68    // 500 (cycles for TEX) / (AluFetchRatio * 8 (cycles for ALU))
69    float ALUFetchRationEstimate =
70        (AluInstCount + AvailablesAluCount() + Pending[IDAlu].size()) /
71        (FetchInstCount + Available[IDFetch].size());
72    if (ALUFetchRationEstimate == 0) {
73      AllowSwitchFromAlu = true;
74    } else {
75      unsigned NeededWF = 62.5f / ALUFetchRationEstimate;
76      LLVM_DEBUG(dbgs() << NeededWF << " approx. Wavefronts Required\n");
77      // We assume the local GPR requirements to be "dominated" by the requirement
78      // of the TEX clause (which consumes 128 bits regs) ; ALU inst before and
79      // after TEX are indeed likely to consume or generate values from/for the
80      // TEX clause.
81      // Available[IDFetch].size() * 2 : GPRs required in the Fetch clause
82      // We assume that fetch instructions are either TnXYZW = TEX TnXYZW (need
83      // one GPR) or TmXYZW = TnXYZW (need 2 GPR).
84      // (TODO : use RegisterPressure)
85      // If we are going too use too many GPR, we flush Fetch instruction to lower
86      // register pressure on 128 bits regs.
87      unsigned NearRegisterRequirement = 2 * Available[IDFetch].size();
88      if (NeededWF > getWFCountLimitedByGPR(NearRegisterRequirement))
89        AllowSwitchFromAlu = true;
90    }
91  }
92
93  if (!SU && ((AllowSwitchToAlu && CurInstKind != IDAlu) ||
94      (!AllowSwitchFromAlu && CurInstKind == IDAlu))) {
95    // try to pick ALU
96    SU = pickAlu();
97    if (!SU && !PhysicalRegCopy.empty()) {
98      SU = PhysicalRegCopy.front();
99      PhysicalRegCopy.erase(PhysicalRegCopy.begin());
100    }
101    if (SU) {
102      if (CurEmitted >= InstKindLimit[IDAlu])
103        CurEmitted = 0;
104      NextInstKind = IDAlu;
105    }
106  }
107
108  if (!SU) {
109    // try to pick FETCH
110    SU = pickOther(IDFetch);
111    if (SU)
112      NextInstKind = IDFetch;
113  }
114
115  // try to pick other
116  if (!SU) {
117    SU = pickOther(IDOther);
118    if (SU)
119      NextInstKind = IDOther;
120  }
121
122  LLVM_DEBUG(if (SU) {
123    dbgs() << " ** Pick node **\n";
124    DAG->dumpNode(*SU);
125  } else {
126    dbgs() << "NO NODE \n";
127    for (const SUnit &S : DAG->SUnits)
128      if (!S.isScheduled)
129        DAG->dumpNode(S);
130  });
131
132  return SU;
133}
134
135void R600SchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
136  if (NextInstKind != CurInstKind) {
137    LLVM_DEBUG(dbgs() << "Instruction Type Switch\n");
138    if (NextInstKind != IDAlu)
139      OccupiedSlotsMask |= 31;
140    CurEmitted = 0;
141    CurInstKind = NextInstKind;
142  }
143
144  if (CurInstKind == IDAlu) {
145    AluInstCount ++;
146    switch (getAluKind(SU)) {
147    case AluT_XYZW:
148      CurEmitted += 4;
149      break;
150    case AluDiscarded:
151      break;
152    default: {
153      ++CurEmitted;
154      for (MachineInstr::mop_iterator It = SU->getInstr()->operands_begin(),
155          E = SU->getInstr()->operands_end(); It != E; ++It) {
156        MachineOperand &MO = *It;
157        if (MO.isReg() && MO.getReg() == R600::ALU_LITERAL_X)
158          ++CurEmitted;
159      }
160    }
161    }
162  } else {
163    ++CurEmitted;
164  }
165
166  LLVM_DEBUG(dbgs() << CurEmitted << " Instructions Emitted in this clause\n");
167
168  if (CurInstKind != IDFetch) {
169    MoveUnits(Pending[IDFetch], Available[IDFetch]);
170  } else
171    FetchInstCount++;
172}
173
174static bool
175isPhysicalRegCopy(MachineInstr *MI) {
176  if (MI->getOpcode() != R600::COPY)
177    return false;
178
179  return !MI->getOperand(1).getReg().isVirtual();
180}
181
// Top-down releases are ignored: this strategy schedules bottom-up only
// (pickNode() always reports IsTopNode = false).
void R600SchedStrategy::releaseTopNode(SUnit *SU) {
  LLVM_DEBUG(dbgs() << "Top Releasing "; DAG->dumpNode(*SU));
}
185
186void R600SchedStrategy::releaseBottomNode(SUnit *SU) {
187  LLVM_DEBUG(dbgs() << "Bottom Releasing "; DAG->dumpNode(*SU));
188  if (isPhysicalRegCopy(SU->getInstr())) {
189    PhysicalRegCopy.push_back(SU);
190    return;
191  }
192
193  int IK = getInstKind(SU);
194
195  // There is no export clause, we can schedule one as soon as its ready
196  if (IK == IDOther)
197    Available[IDOther].push_back(SU);
198  else
199    Pending[IK].push_back(SU);
200
201}
202
203bool R600SchedStrategy::regBelongsToClass(Register Reg,
204                                          const TargetRegisterClass *RC) const {
205  if (!Reg.isVirtual()) {
206    return RC->contains(Reg);
207  } else {
208    return MRI->getRegClass(Reg) == RC;
209  }
210}
211
/// Classify an ALU instruction by the slot(s) it can or must occupy in a
/// VLIW instruction group. The checks cascade from hard opcode constraints
/// down to register-class hints; order matters.
R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const {
  MachineInstr *MI = SU->getInstr();

  // Restricted to the trans unit.
  if (TII->isTransOnly(*MI))
    return AluTrans;

  switch (MI->getOpcode()) {
  case R600::PRED_X:
    return AluPredX;
  case R600::INTERP_PAIR_XY:
  case R600::INTERP_PAIR_ZW:
  case R600::INTERP_VEC_LOAD:
  case R600::DOT_4:
    return AluT_XYZW;
  case R600::COPY:
    if (MI->getOperand(1).isUndef()) {
      // MI will become a KILL, don't considers it in scheduling
      return AluDiscarded;
    }
    break;
  default:
    break;
  }

  // Does the instruction take a whole IG ?
  // XXX: Is it possible to add a helper function in R600InstrInfo that can
  // be used here and in R600PacketizerList::isSoloInstruction() ?
  if(TII->isVector(*MI) ||
     TII->isCubeOp(MI->getOpcode()) ||
     TII->isReductionOp(MI->getOpcode()) ||
     MI->getOpcode() == R600::GROUP_BARRIER) {
    return AluT_XYZW;
  }

  // LDS instructions are pinned to the X channel.
  if (TII->isLDSInstr(MI->getOpcode())) {
    return AluT_X;
  }

  // Is the result already assigned to a channel ?
  unsigned DestSubReg = MI->getOperand(0).getSubReg();
  switch (DestSubReg) {
  case R600::sub0:
    return AluT_X;
  case R600::sub1:
    return AluT_Y;
  case R600::sub2:
    return AluT_Z;
  case R600::sub3:
    return AluT_W;
  default:
    break;
  }

  // Is the result already member of a X/Y/Z/W class ?
  Register DestReg = MI->getOperand(0).getReg();
  if (regBelongsToClass(DestReg, &R600::R600_TReg32_XRegClass) ||
      regBelongsToClass(DestReg, &R600::R600_AddrRegClass))
    return AluT_X;
  if (regBelongsToClass(DestReg, &R600::R600_TReg32_YRegClass))
    return AluT_Y;
  if (regBelongsToClass(DestReg, &R600::R600_TReg32_ZRegClass))
    return AluT_Z;
  if (regBelongsToClass(DestReg, &R600::R600_TReg32_WRegClass))
    return AluT_W;
  if (regBelongsToClass(DestReg, &R600::R600_Reg128RegClass))
    return AluT_XYZW;

  // LDS src registers cannot be used in the Trans slot.
  if (TII->readsLDSSrcReg(*MI))
    return AluT_XYZW;

  // No constraint found: the instruction fits in any vector channel.
  return AluAny;
}
285
286int R600SchedStrategy::getInstKind(SUnit* SU) {
287  int Opcode = SU->getInstr()->getOpcode();
288
289  if (TII->usesTextureCache(Opcode) || TII->usesVertexCache(Opcode))
290    return IDFetch;
291
292  if (TII->isALUInstr(Opcode)) {
293    return IDAlu;
294  }
295
296  switch (Opcode) {
297  case R600::PRED_X:
298  case R600::COPY:
299  case R600::CONST_COPY:
300  case R600::INTERP_PAIR_XY:
301  case R600::INTERP_PAIR_ZW:
302  case R600::INTERP_VEC_LOAD:
303  case R600::DOT_4:
304    return IDAlu;
305  default:
306    return IDOther;
307  }
308}
309
/// Pop the most recently pushed unit of \p Q that still fits the constant
/// read limitations of the instruction group being built. When \p AnyALU is
/// set (candidate may land in the trans slot), vector-only instructions are
/// skipped. Returns nullptr if no candidate fits.
SUnit *R600SchedStrategy::PopInst(std::vector<SUnit *> &Q, bool AnyALU) {
  if (Q.empty())
    return nullptr;
  // Walk newest-to-oldest so the most recently released unit is preferred.
  for (std::vector<SUnit *>::reverse_iterator It = Q.rbegin(), E = Q.rend();
      It != E; ++It) {
    SUnit *SU = *It;
    // Tentatively add the candidate to the group to test the whole group's
    // constant-read limits; always remove it again afterwards.
    InstructionsGroupCandidate.push_back(SU->getInstr());
    if (TII->fitsConstReadLimitations(InstructionsGroupCandidate) &&
        (!AnyALU || !TII->isVectorOnly(*SU->getInstr()))) {
      InstructionsGroupCandidate.pop_back();
      // (It + 1).base() is the forward iterator addressing *It.
      Q.erase((It + 1).base());
      return SU;
    } else {
      InstructionsGroupCandidate.pop_back();
    }
  }
  return nullptr;
}
328
329void R600SchedStrategy::LoadAlu() {
330  std::vector<SUnit *> &QSrc = Pending[IDAlu];
331  for (SUnit *SU : QSrc) {
332    AluKind AK = getAluKind(SU);
333    AvailableAlus[AK].push_back(SU);
334  }
335  QSrc.clear();
336}
337
// Start a fresh VLIW instruction group: clear the slot mask and the group
// candidate list, and reload the ready ALU queues.
void R600SchedStrategy::PrepareNextSlot() {
  LLVM_DEBUG(dbgs() << "New Slot\n");
  assert(OccupiedSlotsMask && "Slot wasn't filled");
  OccupiedSlotsMask = 0;
  //  if (HwGen == AMDGPUSubtarget::NORTHERN_ISLANDS)
  //    OccupiedSlotsMask |= 16;
  InstructionsGroupCandidate.clear();
  LoadAlu();
}
347
348void R600SchedStrategy::AssignSlot(MachineInstr* MI, unsigned Slot) {
349  int DstIndex = TII->getOperandIdx(MI->getOpcode(), R600::OpName::dst);
350  if (DstIndex == -1) {
351    return;
352  }
353  Register DestReg = MI->getOperand(DstIndex).getReg();
354  // PressureRegister crashes if an operand is def and used in the same inst
355  // and we try to constraint its regclass
356  for (MachineInstr::mop_iterator It = MI->operands_begin(),
357      E = MI->operands_end(); It != E; ++It) {
358    MachineOperand &MO = *It;
359    if (MO.isReg() && !MO.isDef() &&
360        MO.getReg() == DestReg)
361      return;
362  }
363  // Constrains the regclass of DestReg to assign it to Slot
364  switch (Slot) {
365  case 0:
366    MRI->constrainRegClass(DestReg, &R600::R600_TReg32_XRegClass);
367    break;
368  case 1:
369    MRI->constrainRegClass(DestReg, &R600::R600_TReg32_YRegClass);
370    break;
371  case 2:
372    MRI->constrainRegClass(DestReg, &R600::R600_TReg32_ZRegClass);
373    break;
374  case 3:
375    MRI->constrainRegClass(DestReg, &R600::R600_TReg32_WRegClass);
376    break;
377  }
378}
379
380SUnit *R600SchedStrategy::AttemptFillSlot(unsigned Slot, bool AnyAlu) {
381  static const AluKind IndexToID[] = {AluT_X, AluT_Y, AluT_Z, AluT_W};
382  SUnit *SlotedSU = PopInst(AvailableAlus[IndexToID[Slot]], AnyAlu);
383  if (SlotedSU)
384    return SlotedSU;
385  SUnit *UnslotedSU = PopInst(AvailableAlus[AluAny], AnyAlu);
386  if (UnslotedSU)
387    AssignSlot(UnslotedSU->getInstr(), Slot);
388  return UnslotedSU;
389}
390
391unsigned R600SchedStrategy::AvailablesAluCount() const {
392  return AvailableAlus[AluAny].size() + AvailableAlus[AluT_XYZW].size() +
393      AvailableAlus[AluT_X].size() + AvailableAlus[AluT_Y].size() +
394      AvailableAlus[AluT_Z].size() + AvailableAlus[AluT_W].size() +
395      AvailableAlus[AluTrans].size() + AvailableAlus[AluDiscarded].size() +
396      AvailableAlus[AluPredX].size();
397}
398
/// Pick the next ALU instruction, filling the current VLIW instruction
/// group slot by slot. OccupiedSlotsMask uses bits 0-3 for vector channels
/// X-W and bit 4 (16) for the trans slot; 31 marks the group full.
SUnit* R600SchedStrategy::pickAlu() {
  while (AvailablesAluCount() || !Pending[IDAlu].empty()) {
    // On an empty group, first handle the kinds that consume it entirely.
    if (!OccupiedSlotsMask) {
      // Bottom up scheduling : predX must comes first
      if (!AvailableAlus[AluPredX].empty()) {
        OccupiedSlotsMask |= 31;
        return PopInst(AvailableAlus[AluPredX], false);
      }
      // Flush physical reg copies (RA will discard them)
      if (!AvailableAlus[AluDiscarded].empty()) {
        OccupiedSlotsMask |= 31;
        return PopInst(AvailableAlus[AluDiscarded], false);
      }
      // If there is a T_XYZW alu available, use it
      if (!AvailableAlus[AluT_XYZW].empty()) {
        OccupiedSlotsMask |= 15;
        return PopInst(AvailableAlus[AluT_XYZW], false);
      }
    }
    // Trans slot; only VLIW5 subtargets have one.
    bool TransSlotOccupied = OccupiedSlotsMask & 16;
    if (!TransSlotOccupied && VLIW5) {
      if (!AvailableAlus[AluTrans].empty()) {
        OccupiedSlotsMask |= 16;
        return PopInst(AvailableAlus[AluTrans], false);
      }
      // Any non-vector-only instruction may be moved into the trans slot
      // (AnyAlu = true makes PopInst skip vector-only candidates).
      SUnit *SU = AttemptFillSlot(3, true);
      if (SU) {
        OccupiedSlotsMask |= 16;
        return SU;
      }
    }
    // Fill vector channels from W (3) down to X (0).
    for (int Chan = 3; Chan > -1; --Chan) {
      bool isOccupied = OccupiedSlotsMask & (1 << Chan);
      if (!isOccupied) {
        SUnit *SU = AttemptFillSlot(Chan, false);
        if (SU) {
          OccupiedSlotsMask |= (1 << Chan);
          InstructionsGroupCandidate.push_back(SU->getInstr());
          return SU;
        }
      }
    }
    // Nothing fits in the current group: open a new one and retry.
    PrepareNextSlot();
  }
  return nullptr;
}
445
446SUnit* R600SchedStrategy::pickOther(int QID) {
447  SUnit *SU = nullptr;
448  std::vector<SUnit *> &AQ = Available[QID];
449
450  if (AQ.empty()) {
451    MoveUnits(Pending[QID], AQ);
452  }
453  if (!AQ.empty()) {
454    SU = AQ.back();
455    AQ.pop_back();
456  }
457  return SU;
458}
459