//===-- SIInsertWaits.cpp - Insert wait instructions ----------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Insert wait instructions for memory reads and writes.
///
/// Memory reads and writes are issued asynchronously, so we need to insert
/// S_WAITCNT instructions when we want to access any of their results or
/// overwrite any register that's used asynchronously.
//
//===----------------------------------------------------------------------===//
18249259Sdim
19249259Sdim#include "AMDGPU.h"
20249259Sdim#include "SIInstrInfo.h"
21249259Sdim#include "SIMachineFunctionInfo.h"
22249259Sdim#include "llvm/CodeGen/MachineFunction.h"
23249259Sdim#include "llvm/CodeGen/MachineFunctionPass.h"
24249259Sdim#include "llvm/CodeGen/MachineInstrBuilder.h"
25249259Sdim#include "llvm/CodeGen/MachineRegisterInfo.h"
26249259Sdim
27249259Sdimusing namespace llvm;
28249259Sdim
29249259Sdimnamespace {
30249259Sdim
/// \brief One slot per hardware counter.
///
/// The same three values can be reached either by name (Named.VM, Named.EXP,
/// Named.LGKM) or by index (Array[0..2], in that order), so callers can loop
/// over all counters uniformly.
union Counters {
  struct {
    unsigned VM;
    unsigned EXP;
    unsigned LGKM;
  } Named;

  unsigned Array[3];
};

/// \brief A table of counter snapshots with 512 entries; it is indexed by the
/// values of a RegInterval.
typedef Counters RegCounters[512];

/// \brief Half-open index range [first, second) into a RegCounters table.
typedef std::pair<unsigned, unsigned> RegInterval;
44249259Sdim
45249259Sdimclass SIInsertWaits : public MachineFunctionPass {
46249259Sdim
47249259Sdimprivate:
48249259Sdim  static char ID;
49249259Sdim  const SIInstrInfo *TII;
50263508Sdim  const SIRegisterInfo *TRI;
51249259Sdim  const MachineRegisterInfo *MRI;
52249259Sdim
53249259Sdim  /// \brief Constant hardware limits
54249259Sdim  static const Counters WaitCounts;
55249259Sdim
56249259Sdim  /// \brief Constant zero value
57249259Sdim  static const Counters ZeroCounts;
58249259Sdim
59249259Sdim  /// \brief Counter values we have already waited on.
60249259Sdim  Counters WaitedOn;
61249259Sdim
62249259Sdim  /// \brief Counter values for last instruction issued.
63249259Sdim  Counters LastIssued;
64249259Sdim
65249259Sdim  /// \brief Registers used by async instructions.
66249259Sdim  RegCounters UsedRegs;
67249259Sdim
68249259Sdim  /// \brief Registers defined by async instructions.
69249259Sdim  RegCounters DefinedRegs;
70249259Sdim
71249259Sdim  /// \brief Different export instruction types seen since last wait.
72249259Sdim  unsigned ExpInstrTypesSeen;
73249259Sdim
74249259Sdim  /// \brief Get increment/decrement amount for this instruction.
75249259Sdim  Counters getHwCounts(MachineInstr &MI);
76249259Sdim
77249259Sdim  /// \brief Is operand relevant for async execution?
78249259Sdim  bool isOpRelevant(MachineOperand &Op);
79249259Sdim
80249259Sdim  /// \brief Get register interval an operand affects.
81249259Sdim  RegInterval getRegInterval(MachineOperand &Op);
82249259Sdim
83249259Sdim  /// \brief Handle instructions async components
84249259Sdim  void pushInstruction(MachineInstr &MI);
85249259Sdim
86249259Sdim  /// \brief Insert the actual wait instruction
87249259Sdim  bool insertWait(MachineBasicBlock &MBB,
88249259Sdim                  MachineBasicBlock::iterator I,
89249259Sdim                  const Counters &Counts);
90249259Sdim
91249259Sdim  /// \brief Do we need def2def checks?
92249259Sdim  bool unorderedDefines(MachineInstr &MI);
93249259Sdim
94249259Sdim  /// \brief Resolve all operand dependencies to counter requirements
95249259Sdim  Counters handleOperands(MachineInstr &MI);
96249259Sdim
97249259Sdimpublic:
98249259Sdim  SIInsertWaits(TargetMachine &tm) :
99249259Sdim    MachineFunctionPass(ID),
100263508Sdim    TII(0),
101263508Sdim    TRI(0),
102263508Sdim    ExpInstrTypesSeen(0) { }
103249259Sdim
104249259Sdim  virtual bool runOnMachineFunction(MachineFunction &MF);
105249259Sdim
106249259Sdim  const char *getPassName() const {
107249259Sdim    return "SI insert wait  instructions";
108249259Sdim  }
109249259Sdim
110249259Sdim};
111249259Sdim
112249259Sdim} // End anonymous namespace
113249259Sdim
114249259Sdimchar SIInsertWaits::ID = 0;
115249259Sdim
116249259Sdimconst Counters SIInsertWaits::WaitCounts = { { 15, 7, 7 } };
117249259Sdimconst Counters SIInsertWaits::ZeroCounts = { { 0, 0, 0 } };
118249259Sdim
119249259SdimFunctionPass *llvm::createSIInsertWaits(TargetMachine &tm) {
120249259Sdim  return new SIInsertWaits(tm);
121249259Sdim}
122249259Sdim
123249259SdimCounters SIInsertWaits::getHwCounts(MachineInstr &MI) {
124249259Sdim
125249259Sdim  uint64_t TSFlags = TII->get(MI.getOpcode()).TSFlags;
126249259Sdim  Counters Result;
127249259Sdim
128249259Sdim  Result.Named.VM = !!(TSFlags & SIInstrFlags::VM_CNT);
129249259Sdim
130249259Sdim  // Only consider stores or EXP for EXP_CNT
131249259Sdim  Result.Named.EXP = !!(TSFlags & SIInstrFlags::EXP_CNT &&
132249259Sdim      (MI.getOpcode() == AMDGPU::EXP || MI.getDesc().mayStore()));
133249259Sdim
134249259Sdim  // LGKM may uses larger values
135249259Sdim  if (TSFlags & SIInstrFlags::LGKM_CNT) {
136249259Sdim
137263508Sdim    if (TII->isSMRD(MI.getOpcode())) {
138249259Sdim
139263508Sdim      MachineOperand &Op = MI.getOperand(0);
140263508Sdim      assert(Op.isReg() && "First LGKM operand must be a register!");
141249259Sdim
142263508Sdim      unsigned Reg = Op.getReg();
143263508Sdim      unsigned Size = TRI->getMinimalPhysRegClass(Reg)->getSize();
144263508Sdim      Result.Named.LGKM = Size > 4 ? 2 : 1;
145263508Sdim
146263508Sdim    } else {
147263508Sdim      // DS
148263508Sdim      Result.Named.LGKM = 1;
149263508Sdim    }
150263508Sdim
151249259Sdim  } else {
152249259Sdim    Result.Named.LGKM = 0;
153249259Sdim  }
154249259Sdim
155249259Sdim  return Result;
156249259Sdim}
157249259Sdim
158249259Sdimbool SIInsertWaits::isOpRelevant(MachineOperand &Op) {
159249259Sdim
160249259Sdim  // Constants are always irrelevant
161249259Sdim  if (!Op.isReg())
162249259Sdim    return false;
163249259Sdim
164249259Sdim  // Defines are always relevant
165249259Sdim  if (Op.isDef())
166249259Sdim    return true;
167249259Sdim
168249259Sdim  // For exports all registers are relevant
169249259Sdim  MachineInstr &MI = *Op.getParent();
170249259Sdim  if (MI.getOpcode() == AMDGPU::EXP)
171249259Sdim    return true;
172249259Sdim
173249259Sdim  // For stores the stored value is also relevant
174249259Sdim  if (!MI.getDesc().mayStore())
175249259Sdim    return false;
176249259Sdim
177249259Sdim  for (MachineInstr::mop_iterator I = MI.operands_begin(),
178249259Sdim       E = MI.operands_end(); I != E; ++I) {
179249259Sdim
180249259Sdim    if (I->isReg() && I->isUse())
181249259Sdim      return Op.isIdenticalTo(*I);
182249259Sdim  }
183249259Sdim
184249259Sdim  return false;
185249259Sdim}
186249259Sdim
187249259SdimRegInterval SIInsertWaits::getRegInterval(MachineOperand &Op) {
188249259Sdim
189263508Sdim  if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()))
190249259Sdim    return std::make_pair(0, 0);
191249259Sdim
192249259Sdim  unsigned Reg = Op.getReg();
193263508Sdim  unsigned Size = TRI->getMinimalPhysRegClass(Reg)->getSize();
194249259Sdim
195249259Sdim  assert(Size >= 4);
196249259Sdim
197249259Sdim  RegInterval Result;
198263508Sdim  Result.first = TRI->getEncodingValue(Reg);
199249259Sdim  Result.second = Result.first + Size / 4;
200249259Sdim
201249259Sdim  return Result;
202249259Sdim}
203249259Sdim
204249259Sdimvoid SIInsertWaits::pushInstruction(MachineInstr &MI) {
205249259Sdim
206249259Sdim  // Get the hardware counter increments and sum them up
207249259Sdim  Counters Increment = getHwCounts(MI);
208249259Sdim  unsigned Sum = 0;
209249259Sdim
210249259Sdim  for (unsigned i = 0; i < 3; ++i) {
211249259Sdim    LastIssued.Array[i] += Increment.Array[i];
212249259Sdim    Sum += Increment.Array[i];
213249259Sdim  }
214249259Sdim
215249259Sdim  // If we don't increase anything then that's it
216249259Sdim  if (Sum == 0)
217249259Sdim    return;
218249259Sdim
219249259Sdim  // Remember which export instructions we have seen
220249259Sdim  if (Increment.Named.EXP) {
221249259Sdim    ExpInstrTypesSeen |= MI.getOpcode() == AMDGPU::EXP ? 1 : 2;
222249259Sdim  }
223249259Sdim
224249259Sdim  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
225249259Sdim
226249259Sdim    MachineOperand &Op = MI.getOperand(i);
227249259Sdim    if (!isOpRelevant(Op))
228249259Sdim      continue;
229249259Sdim
230249259Sdim    RegInterval Interval = getRegInterval(Op);
231249259Sdim    for (unsigned j = Interval.first; j < Interval.second; ++j) {
232249259Sdim
233249259Sdim      // Remember which registers we define
234249259Sdim      if (Op.isDef())
235249259Sdim        DefinedRegs[j] = LastIssued;
236249259Sdim
237249259Sdim      // and which one we are using
238249259Sdim      if (Op.isUse())
239249259Sdim        UsedRegs[j] = LastIssued;
240249259Sdim    }
241249259Sdim  }
242249259Sdim}
243249259Sdim
244249259Sdimbool SIInsertWaits::insertWait(MachineBasicBlock &MBB,
245249259Sdim                               MachineBasicBlock::iterator I,
246249259Sdim                               const Counters &Required) {
247249259Sdim
248249259Sdim  // End of program? No need to wait on anything
249249259Sdim  if (I != MBB.end() && I->getOpcode() == AMDGPU::S_ENDPGM)
250249259Sdim    return false;
251249259Sdim
252249259Sdim  // Figure out if the async instructions execute in order
253249259Sdim  bool Ordered[3];
254249259Sdim
255249259Sdim  // VM_CNT is always ordered
256249259Sdim  Ordered[0] = true;
257249259Sdim
258249259Sdim  // EXP_CNT is unordered if we have both EXP & VM-writes
259249259Sdim  Ordered[1] = ExpInstrTypesSeen == 3;
260249259Sdim
261249259Sdim  // LGKM_CNT is handled as always unordered. TODO: Handle LDS and GDS
262249259Sdim  Ordered[2] = false;
263249259Sdim
264249259Sdim  // The values we are going to put into the S_WAITCNT instruction
265249259Sdim  Counters Counts = WaitCounts;
266249259Sdim
267249259Sdim  // Do we really need to wait?
268249259Sdim  bool NeedWait = false;
269249259Sdim
270249259Sdim  for (unsigned i = 0; i < 3; ++i) {
271249259Sdim
272249259Sdim    if (Required.Array[i] <= WaitedOn.Array[i])
273249259Sdim      continue;
274249259Sdim
275249259Sdim    NeedWait = true;
276249259Sdim
277249259Sdim    if (Ordered[i]) {
278249259Sdim      unsigned Value = LastIssued.Array[i] - Required.Array[i];
279249259Sdim
280249259Sdim      // adjust the value to the real hardware posibilities
281249259Sdim      Counts.Array[i] = std::min(Value, WaitCounts.Array[i]);
282249259Sdim
283249259Sdim    } else
284249259Sdim      Counts.Array[i] = 0;
285249259Sdim
286249259Sdim    // Remember on what we have waited on
287249259Sdim    WaitedOn.Array[i] = LastIssued.Array[i] - Counts.Array[i];
288249259Sdim  }
289249259Sdim
290249259Sdim  if (!NeedWait)
291249259Sdim    return false;
292249259Sdim
293249259Sdim  // Reset EXP_CNT instruction types
294249259Sdim  if (Counts.Named.EXP == 0)
295249259Sdim    ExpInstrTypesSeen = 0;
296249259Sdim
297249259Sdim  // Build the wait instruction
298249259Sdim  BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
299249259Sdim          .addImm((Counts.Named.VM & 0xF) |
300249259Sdim                  ((Counts.Named.EXP & 0x7) << 4) |
301249259Sdim                  ((Counts.Named.LGKM & 0x7) << 8));
302249259Sdim
303249259Sdim  return true;
304249259Sdim}
305249259Sdim
306249259Sdim/// \brief helper function for handleOperands
307249259Sdimstatic void increaseCounters(Counters &Dst, const Counters &Src) {
308249259Sdim
309249259Sdim  for (unsigned i = 0; i < 3; ++i)
310249259Sdim    Dst.Array[i] = std::max(Dst.Array[i], Src.Array[i]);
311249259Sdim}
312249259Sdim
313249259SdimCounters SIInsertWaits::handleOperands(MachineInstr &MI) {
314249259Sdim
315249259Sdim  Counters Result = ZeroCounts;
316249259Sdim
317266715Sdim  // S_SENDMSG implicitly waits for all outstanding LGKM transfers to finish,
318266715Sdim  // but we also want to wait for any other outstanding transfers before
319266715Sdim  // signalling other hardware blocks
320266715Sdim  if (MI.getOpcode() == AMDGPU::S_SENDMSG)
321266715Sdim    return LastIssued;
322266715Sdim
323249259Sdim  // For each register affected by this
324249259Sdim  // instruction increase the result sequence
325249259Sdim  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
326249259Sdim
327249259Sdim    MachineOperand &Op = MI.getOperand(i);
328249259Sdim    RegInterval Interval = getRegInterval(Op);
329249259Sdim    for (unsigned j = Interval.first; j < Interval.second; ++j) {
330249259Sdim
331249259Sdim      if (Op.isDef()) {
332249259Sdim        increaseCounters(Result, UsedRegs[j]);
333249259Sdim        increaseCounters(Result, DefinedRegs[j]);
334249259Sdim      }
335249259Sdim
336249259Sdim      if (Op.isUse())
337249259Sdim        increaseCounters(Result, DefinedRegs[j]);
338249259Sdim    }
339249259Sdim  }
340249259Sdim
341249259Sdim  return Result;
342249259Sdim}
343249259Sdim
344249259Sdimbool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
345249259Sdim  bool Changes = false;
346249259Sdim
347263508Sdim  TII = static_cast<const SIInstrInfo*>(MF.getTarget().getInstrInfo());
348263508Sdim  TRI = static_cast<const SIRegisterInfo*>(MF.getTarget().getRegisterInfo());
349263508Sdim
350249259Sdim  MRI = &MF.getRegInfo();
351249259Sdim
352249259Sdim  WaitedOn = ZeroCounts;
353249259Sdim  LastIssued = ZeroCounts;
354249259Sdim
355249259Sdim  memset(&UsedRegs, 0, sizeof(UsedRegs));
356249259Sdim  memset(&DefinedRegs, 0, sizeof(DefinedRegs));
357249259Sdim
358249259Sdim  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
359249259Sdim       BI != BE; ++BI) {
360249259Sdim
361249259Sdim    MachineBasicBlock &MBB = *BI;
362249259Sdim    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
363249259Sdim         I != E; ++I) {
364249259Sdim
365249259Sdim      Changes |= insertWait(MBB, I, handleOperands(*I));
366249259Sdim      pushInstruction(*I);
367249259Sdim    }
368249259Sdim
369249259Sdim    // Wait for everything at the end of the MBB
370249259Sdim    Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued);
371249259Sdim  }
372249259Sdim
373249259Sdim  return Changes;
374249259Sdim}
375