1//===-- SIWholeQuadMode.cpp - enter and suspend whole quad mode -----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This pass adds instructions to enable whole quad mode (strict or non-strict)
11/// for pixel shaders, and strict whole wavefront mode for all programs.
12///
13/// The "strict" prefix indicates that inactive lanes do not take part in
14/// control flow, specifically an inactive lane enabled by a strict WQM/WWM will
15/// always be enabled irrespective of control flow decisions. Conversely in
16/// non-strict WQM inactive lanes may control flow decisions.
17///
18/// Whole quad mode is required for derivative computations, but it interferes
19/// with shader side effects (stores and atomics). It ensures that WQM is
20/// enabled when necessary, but disabled around stores and atomics.
21///
22/// When necessary, this pass creates a function prolog
23///
24///   S_MOV_B64 LiveMask, EXEC
25///   S_WQM_B64 EXEC, EXEC
26///
27/// to enter WQM at the top of the function and surrounds blocks of Exact
28/// instructions by
29///
30///   S_AND_SAVEEXEC_B64 Tmp, LiveMask
31///   ...
32///   S_MOV_B64 EXEC, Tmp
33///
34/// We also compute when a sequence of instructions requires strict whole
35/// wavefront mode (StrictWWM) and insert instructions to save and restore it:
36///
37///   S_OR_SAVEEXEC_B64 Tmp, -1
38///   ...
39///   S_MOV_B64 EXEC, Tmp
40///
41/// When a sequence of instructions requires strict whole quad mode (StrictWQM)
42/// we use a similar save and restore mechanism and force whole quad mode for
43/// those instructions:
44///
45///  S_MOV_B64 Tmp, EXEC
46///  S_WQM_B64 EXEC, EXEC
47///  ...
48///  S_MOV_B64 EXEC, Tmp
49///
/// To avoid excessive switching during sequences of Exact instructions, the
/// pass first analyzes which instructions must be run in WQM (i.e. which
/// instructions produce values that lead to derivative computations).
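///
/// For illustration, in a simplified pixel shader sequence such as
///
///   %color = IMAGE_SAMPLE %coord, ...   ; needs derivatives => WQM
///   BUFFER_STORE_DWORD %color, ...      ; side effect => Exact
///
/// the sample and everything feeding its coordinates are marked as requiring
/// WQM, while the store is marked as requiring Exact mode (opcodes and
/// operands here are only illustrative).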
54///
55/// Basic blocks are always exited in WQM as long as some successor needs WQM.
56///
57/// There is room for improvement given better control flow analysis:
58///
59///  (1) at the top level (outside of control flow statements, and as long as
60///      kill hasn't been used), one SGPR can be saved by recovering WQM from
61///      the LiveMask (this is implemented for the entry block).
62///
63///  (2) when entire regions (e.g. if-else blocks or entire loops) only
64///      consist of exact and don't-care instructions, the switch only has to
65///      be done at the entry and exit points rather than potentially in each
66///      block of the region.
67///
68//===----------------------------------------------------------------------===//
69
70#include "AMDGPU.h"
71#include "GCNSubtarget.h"
72#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
73#include "llvm/ADT/MapVector.h"
74#include "llvm/ADT/PostOrderIterator.h"
75#include "llvm/CodeGen/LiveIntervals.h"
76#include "llvm/CodeGen/MachineBasicBlock.h"
77#include "llvm/CodeGen/MachineDominators.h"
78#include "llvm/CodeGen/MachineFunctionPass.h"
79#include "llvm/CodeGen/MachineInstr.h"
80#include "llvm/CodeGen/MachinePostDominators.h"
81#include "llvm/IR/CallingConv.h"
82#include "llvm/InitializePasses.h"
83#include "llvm/Support/raw_ostream.h"
84
85using namespace llvm;
86
87#define DEBUG_TYPE "si-wqm"
88
89namespace {
90
91enum {
92  StateWQM = 0x1,
93  StateStrictWWM = 0x2,
94  StateStrictWQM = 0x4,
95  StateExact = 0x8,
96  StateStrict = StateStrictWWM | StateStrictWQM,
97};
98
99struct PrintState {
100public:
101  int State;
102
103  explicit PrintState(int State) : State(State) {}
104};
105
106#ifndef NDEBUG
107static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) {
108
109  static const std::pair<char, const char *> Mapping[] = {
110      std::pair(StateWQM, "WQM"), std::pair(StateStrictWWM, "StrictWWM"),
111      std::pair(StateStrictWQM, "StrictWQM"), std::pair(StateExact, "Exact")};
112  char State = PS.State;
113  for (auto M : Mapping) {
114    if (State & M.first) {
115      OS << M.second;
116      State &= ~M.first;
117
118      if (State)
119        OS << '|';
120    }
121  }
122  assert(State == 0);
123  return OS;
124}
125#endif
126
127struct InstrInfo {
128  char Needs = 0;
129  char Disabled = 0;
130  char OutNeeds = 0;
131};
132
133struct BlockInfo {
134  char Needs = 0;
135  char InNeeds = 0;
136  char OutNeeds = 0;
137  char InitialState = 0;
138  bool NeedsLowering = false;
139};
140
141struct WorkItem {
142  MachineBasicBlock *MBB = nullptr;
143  MachineInstr *MI = nullptr;
144
145  WorkItem() = default;
146  WorkItem(MachineBasicBlock *MBB) : MBB(MBB) {}
147  WorkItem(MachineInstr *MI) : MI(MI) {}
148};
149
150class SIWholeQuadMode : public MachineFunctionPass {
151private:
152  const SIInstrInfo *TII;
153  const SIRegisterInfo *TRI;
154  const GCNSubtarget *ST;
155  MachineRegisterInfo *MRI;
156  LiveIntervals *LIS;
157  MachineDominatorTree *MDT;
158  MachinePostDominatorTree *PDT;
159
160  unsigned AndOpc;
161  unsigned AndN2Opc;
162  unsigned XorOpc;
163  unsigned AndSaveExecOpc;
164  unsigned OrSaveExecOpc;
165  unsigned WQMOpc;
166  Register Exec;
167  Register LiveMaskReg;
168
169  DenseMap<const MachineInstr *, InstrInfo> Instructions;
170  MapVector<MachineBasicBlock *, BlockInfo> Blocks;
171
172  // Tracks state (WQM/StrictWWM/StrictWQM/Exact) after a given instruction
173  DenseMap<const MachineInstr *, char> StateTransition;
174
175  SmallVector<MachineInstr *, 2> LiveMaskQueries;
176  SmallVector<MachineInstr *, 4> LowerToMovInstrs;
177  SmallVector<MachineInstr *, 4> LowerToCopyInstrs;
178  SmallVector<MachineInstr *, 4> KillInstrs;
179
180  void printInfo();
181
182  void markInstruction(MachineInstr &MI, char Flag,
183                       std::vector<WorkItem> &Worklist);
184  void markDefs(const MachineInstr &UseMI, LiveRange &LR, Register Reg,
185                unsigned SubReg, char Flag, std::vector<WorkItem> &Worklist);
186  void markOperand(const MachineInstr &MI, const MachineOperand &Op, char Flag,
187                   std::vector<WorkItem> &Worklist);
188  void markInstructionUses(const MachineInstr &MI, char Flag,
189                           std::vector<WorkItem> &Worklist);
190  char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
191  void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist);
192  void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);
193  char analyzeFunction(MachineFunction &MF);
194
195  MachineBasicBlock::iterator saveSCC(MachineBasicBlock &MBB,
196                                      MachineBasicBlock::iterator Before);
197  MachineBasicBlock::iterator
198  prepareInsertion(MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
199                   MachineBasicBlock::iterator Last, bool PreferLast,
200                   bool SaveSCC);
201  void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
202               Register SaveWQM);
203  void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
204             Register SavedWQM);
205  void toStrictMode(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
206                    Register SaveOrig, char StrictStateNeeded);
207  void fromStrictMode(MachineBasicBlock &MBB,
208                      MachineBasicBlock::iterator Before, Register SavedOrig,
209                      char NonStrictState, char CurrentStrictState);
210
211  MachineBasicBlock *splitBlock(MachineBasicBlock *BB, MachineInstr *TermMI);
212
213  MachineInstr *lowerKillI1(MachineBasicBlock &MBB, MachineInstr &MI,
214                            bool IsWQM);
215  MachineInstr *lowerKillF32(MachineBasicBlock &MBB, MachineInstr &MI);
216  void lowerPseudoStrictMode(MachineBasicBlock &MBB, MachineInstr *Entry,
217                             MachineInstr *Exit);
218
219  void lowerBlock(MachineBasicBlock &MBB);
220  void processBlock(MachineBasicBlock &MBB, bool IsEntry);
221
222  void lowerLiveMaskQueries();
223  void lowerCopyInstrs();
224  void lowerKillInstrs(bool IsWQM);
225
226public:
227  static char ID;
228
  SIWholeQuadMode() : MachineFunctionPass(ID) {}
231
232  bool runOnMachineFunction(MachineFunction &MF) override;
233
234  StringRef getPassName() const override { return "SI Whole Quad Mode"; }
235
236  void getAnalysisUsage(AnalysisUsage &AU) const override {
237    AU.addRequired<LiveIntervals>();
238    AU.addPreserved<SlotIndexes>();
239    AU.addPreserved<LiveIntervals>();
240    AU.addRequired<MachineDominatorTree>();
241    AU.addPreserved<MachineDominatorTree>();
242    AU.addRequired<MachinePostDominatorTree>();
243    AU.addPreserved<MachinePostDominatorTree>();
244    MachineFunctionPass::getAnalysisUsage(AU);
245  }
246
247  MachineFunctionProperties getClearedProperties() const override {
248    return MachineFunctionProperties().set(
249        MachineFunctionProperties::Property::IsSSA);
250  }
251};
252
253} // end anonymous namespace
254
255char SIWholeQuadMode::ID = 0;
256
257INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
258                      false)
259INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
260INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
261INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
262INITIALIZE_PASS_END(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
263                    false)
264
265char &llvm::SIWholeQuadModeID = SIWholeQuadMode::ID;
266
267FunctionPass *llvm::createSIWholeQuadModePass() {
268  return new SIWholeQuadMode;
269}
270
271#ifndef NDEBUG
272LLVM_DUMP_METHOD void SIWholeQuadMode::printInfo() {
273  for (const auto &BII : Blocks) {
274    dbgs() << "\n"
275           << printMBBReference(*BII.first) << ":\n"
276           << "  InNeeds = " << PrintState(BII.second.InNeeds)
277           << ", Needs = " << PrintState(BII.second.Needs)
278           << ", OutNeeds = " << PrintState(BII.second.OutNeeds) << "\n\n";
279
280    for (const MachineInstr &MI : *BII.first) {
281      auto III = Instructions.find(&MI);
282      if (III == Instructions.end())
283        continue;
284
285      dbgs() << "  " << MI << "    Needs = " << PrintState(III->second.Needs)
286             << ", OutNeeds = " << PrintState(III->second.OutNeeds) << '\n';
287    }
288  }
289}
290#endif
291
292void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
293                                      std::vector<WorkItem> &Worklist) {
294  InstrInfo &II = Instructions[&MI];
295
296  assert(!(Flag & StateExact) && Flag != 0);
297
  // Remove any disabled states from the flag. The user that required it gets
  // an undefined value in the helper lanes. For example, this can happen if
  // the result of an atomic is used by an instruction that requires WQM, where
  // ignoring the request for WQM is correct as per the relevant specs.
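  // An illustrative (simplified) sequence:
  //   %v = <atomic returning a value>   ; atomics have WQM in Disabled
  //   IMAGE_SAMPLE ..., %v, ...         ; requests WQM for %v; request dropped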
302  Flag &= ~II.Disabled;
303
304  // Ignore if the flag is already encompassed by the existing needs, or we
305  // just disabled everything.
306  if ((II.Needs & Flag) == Flag)
307    return;
308
309  LLVM_DEBUG(dbgs() << "markInstruction " << PrintState(Flag) << ": " << MI);
310  II.Needs |= Flag;
311  Worklist.push_back(&MI);
312}
313
/// Mark all relevant definitions of register \p Reg in the use \p UseMI.
315void SIWholeQuadMode::markDefs(const MachineInstr &UseMI, LiveRange &LR,
316                               Register Reg, unsigned SubReg, char Flag,
317                               std::vector<WorkItem> &Worklist) {
318  LLVM_DEBUG(dbgs() << "markDefs " << PrintState(Flag) << ": " << UseMI);
319
320  LiveQueryResult UseLRQ = LR.Query(LIS->getInstructionIndex(UseMI));
321  const VNInfo *Value = UseLRQ.valueIn();
322  if (!Value)
323    return;
324
325  // Note: this code assumes that lane masks on AMDGPU completely
326  // cover registers.
327  const LaneBitmask UseLanes =
328      SubReg ? TRI->getSubRegIndexLaneMask(SubReg)
329             : (Reg.isVirtual() ? MRI->getMaxLaneMaskForVReg(Reg)
330                                : LaneBitmask::getNone());
331
332  // Perform a depth-first iteration of the LiveRange graph marking defs.
333  // Stop processing of a given branch when all use lanes have been defined.
334  // The first definition stops processing for a physical register.
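  //
  // For illustration (opcodes are arbitrary), given
  //   %val.sub0 = V_MOV_B32 ...
  //   %val.sub1 = V_MOV_B32 ...
  //   ... = IMAGE_SAMPLE ..., %val, ...   ; uses both halves
  // both defining instructions are marked, and the walk of a branch stops
  // once the accumulated defined lanes cover all lanes of the use.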
335  struct PhiEntry {
336    const VNInfo *Phi;
337    unsigned PredIdx;
338    LaneBitmask DefinedLanes;
339
340    PhiEntry(const VNInfo *Phi, unsigned PredIdx, LaneBitmask DefinedLanes)
341        : Phi(Phi), PredIdx(PredIdx), DefinedLanes(DefinedLanes) {}
342  };
343  using VisitKey = std::pair<const VNInfo *, LaneBitmask>;
344  SmallVector<PhiEntry, 2> PhiStack;
345  SmallSet<VisitKey, 4> Visited;
346  LaneBitmask DefinedLanes;
347  unsigned NextPredIdx = 0; // Only used for processing phi nodes
348  do {
349    const VNInfo *NextValue = nullptr;
350    const VisitKey Key(Value, DefinedLanes);
351
352    if (Visited.insert(Key).second) {
      // On first visit to a phi, start processing from the first predecessor.
354      NextPredIdx = 0;
355    }
356
357    if (Value->isPHIDef()) {
358      // Each predecessor node in the phi must be processed as a subgraph
359      const MachineBasicBlock *MBB = LIS->getMBBFromIndex(Value->def);
360      assert(MBB && "Phi-def has no defining MBB");
361
362      // Find next predecessor to process
363      unsigned Idx = NextPredIdx;
364      auto PI = MBB->pred_begin() + Idx;
365      auto PE = MBB->pred_end();
366      for (; PI != PE && !NextValue; ++PI, ++Idx) {
367        if (const VNInfo *VN = LR.getVNInfoBefore(LIS->getMBBEndIdx(*PI))) {
368          if (!Visited.count(VisitKey(VN, DefinedLanes)))
369            NextValue = VN;
370        }
371      }
372
      // If there are more predecessors to process, add the phi to the stack.
374      if (PI != PE)
375        PhiStack.emplace_back(Value, Idx, DefinedLanes);
376    } else {
377      MachineInstr *MI = LIS->getInstructionFromIndex(Value->def);
378      assert(MI && "Def has no defining instruction");
379
380      if (Reg.isVirtual()) {
381        // Iterate over all operands to find relevant definitions
382        bool HasDef = false;
383        for (const MachineOperand &Op : MI->operands()) {
384          if (!(Op.isReg() && Op.isDef() && Op.getReg() == Reg))
385            continue;
386
387          // Compute lanes defined and overlap with use
388          LaneBitmask OpLanes =
389              Op.isUndef() ? LaneBitmask::getAll()
390                           : TRI->getSubRegIndexLaneMask(Op.getSubReg());
391          LaneBitmask Overlap = (UseLanes & OpLanes);
392
          // Record if this instruction defined any lanes of the use
394          HasDef |= Overlap.any();
395
396          // Mark any lanes defined
397          DefinedLanes |= OpLanes;
398        }
399
400        // Check if all lanes of use have been defined
401        if ((DefinedLanes & UseLanes) != UseLanes) {
402          // Definition not complete; need to process input value
403          LiveQueryResult LRQ = LR.Query(LIS->getInstructionIndex(*MI));
404          if (const VNInfo *VN = LRQ.valueIn()) {
405            if (!Visited.count(VisitKey(VN, DefinedLanes)))
406              NextValue = VN;
407          }
408        }
409
410        // Only mark the instruction if it defines some part of the use
411        if (HasDef)
412          markInstruction(*MI, Flag, Worklist);
413      } else {
414        // For physical registers simply mark the defining instruction
415        markInstruction(*MI, Flag, Worklist);
416      }
417    }
418
419    if (!NextValue && !PhiStack.empty()) {
      // Reached the end of a chain; revert to processing the last phi
421      PhiEntry &Entry = PhiStack.back();
422      NextValue = Entry.Phi;
423      NextPredIdx = Entry.PredIdx;
424      DefinedLanes = Entry.DefinedLanes;
425      PhiStack.pop_back();
426    }
427
428    Value = NextValue;
429  } while (Value);
430}
431
432void SIWholeQuadMode::markOperand(const MachineInstr &MI,
433                                  const MachineOperand &Op, char Flag,
434                                  std::vector<WorkItem> &Worklist) {
435  assert(Op.isReg());
436  Register Reg = Op.getReg();
437
438  // Ignore some hardware registers
439  switch (Reg) {
440  case AMDGPU::EXEC:
441  case AMDGPU::EXEC_LO:
442    return;
443  default:
444    break;
445  }
446
447  LLVM_DEBUG(dbgs() << "markOperand " << PrintState(Flag) << ": " << Op
448                    << " for " << MI);
449  if (Reg.isVirtual()) {
450    LiveRange &LR = LIS->getInterval(Reg);
451    markDefs(MI, LR, Reg, Op.getSubReg(), Flag, Worklist);
452  } else {
453    // Handle physical registers that we need to track; this is mostly relevant
454    // for VCC, which can appear as the (implicit) input of a uniform branch,
455    // e.g. when a loop counter is stored in a VGPR.
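    // For example (schematically):
    //   $vcc = V_CMP_LT_U32 %counter, %limit
    //   S_CBRANCH_VCCNZ %bb.loop          ; reads $vcc implicitly
    // Here the V_CMP defining $vcc is the instruction that gets marked.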
456    for (MCRegUnitIterator RegUnit(Reg.asMCReg(), TRI); RegUnit.isValid();
457         ++RegUnit) {
458      LiveRange &LR = LIS->getRegUnit(*RegUnit);
459      const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();
460      if (!Value)
461        continue;
462
463      markDefs(MI, LR, *RegUnit, AMDGPU::NoSubRegister, Flag, Worklist);
464    }
465  }
466}
467
468/// Mark all instructions defining the uses in \p MI with \p Flag.
469void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag,
470                                          std::vector<WorkItem> &Worklist) {
471  LLVM_DEBUG(dbgs() << "markInstructionUses " << PrintState(Flag) << ": "
472                    << MI);
473
474  for (const MachineOperand &Use : MI.uses()) {
475    if (!Use.isReg() || !Use.isUse())
476      continue;
477    markOperand(MI, Use, Flag, Worklist);
478  }
479}
480
481// Scan instructions to determine which ones require an Exact execmask and
482// which ones seed WQM requirements.
483char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
484                                       std::vector<WorkItem> &Worklist) {
485  char GlobalFlags = 0;
486  bool WQMOutputs = MF.getFunction().hasFnAttribute("amdgpu-ps-wqm-outputs");
487  SmallVector<MachineInstr *, 4> SetInactiveInstrs;
488  SmallVector<MachineInstr *, 4> SoftWQMInstrs;
489  bool HasImplicitDerivatives =
490      MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS;
491
492  // We need to visit the basic blocks in reverse post-order so that we visit
493  // defs before uses, in particular so that we don't accidentally mark an
494  // instruction as needing e.g. WQM before visiting it and realizing it needs
495  // WQM disabled.
496  ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
497  for (MachineBasicBlock *MBB : RPOT) {
498    BlockInfo &BBI = Blocks[MBB];
499
500    for (MachineInstr &MI : *MBB) {
501      InstrInfo &III = Instructions[&MI];
502      unsigned Opcode = MI.getOpcode();
503      char Flags = 0;
504
505      if (TII->isWQM(Opcode)) {
506        // If LOD is not supported WQM is not needed.
507        if (!ST->hasExtendedImageInsts())
508          continue;
509        // Only generate implicit WQM if implicit derivatives are required.
510        // This avoids inserting unintended WQM if a shader type without
511        // implicit derivatives uses an image sampling instruction.
512        if (!HasImplicitDerivatives)
513          continue;
        // Sampling instructions don't need to produce results for all pixels
        // in a quad; they just require all inputs of a quad to have been
        // computed for derivatives.
517        markInstructionUses(MI, StateWQM, Worklist);
518        GlobalFlags |= StateWQM;
519        continue;
520      } else if (Opcode == AMDGPU::WQM) {
521        // The WQM intrinsic requires its output to have all the helper lanes
522        // correct, so we need it to be in WQM.
523        Flags = StateWQM;
524        LowerToCopyInstrs.push_back(&MI);
525      } else if (Opcode == AMDGPU::SOFT_WQM) {
526        LowerToCopyInstrs.push_back(&MI);
527        SoftWQMInstrs.push_back(&MI);
528        continue;
529      } else if (Opcode == AMDGPU::STRICT_WWM) {
        // The STRICT_WWM intrinsic doesn't make the same guarantee, and in
        // addition it needs to be executed in WQM or Exact so that its copy
        // doesn't clobber inactive lanes.
533        markInstructionUses(MI, StateStrictWWM, Worklist);
534        GlobalFlags |= StateStrictWWM;
535        LowerToMovInstrs.push_back(&MI);
536        continue;
537      } else if (Opcode == AMDGPU::STRICT_WQM ||
538                 TII->isDualSourceBlendEXP(MI)) {
        // STRICT_WQM is similar to STRICT_WWM, but instead of enabling all
        // threads of the wave like STRICT_WWM, STRICT_WQM enables all threads
        // in quads that have at least one active thread.
542        markInstructionUses(MI, StateStrictWQM, Worklist);
543        GlobalFlags |= StateStrictWQM;
544
545        if (Opcode == AMDGPU::STRICT_WQM) {
546          LowerToMovInstrs.push_back(&MI);
547        } else {
          // A dual source blend export acts as an implicit strict-wqm; its
          // sources need to be shuffled in strict wqm, but the export itself
          // needs to run in exact mode.
551          BBI.Needs |= StateExact;
552          if (!(BBI.InNeeds & StateExact)) {
553            BBI.InNeeds |= StateExact;
554            Worklist.push_back(MBB);
555          }
556          GlobalFlags |= StateExact;
557          III.Disabled = StateWQM | StateStrict;
558        }
559        continue;
560      } else if (Opcode == AMDGPU::LDS_PARAM_LOAD ||
561                 Opcode == AMDGPU::LDS_DIRECT_LOAD) {
        // Mark these StrictWQM, but only for the instruction, not its
        // operands. This avoids unnecessarily marking M0 as requiring WQM.
564        InstrInfo &II = Instructions[&MI];
565        II.Needs |= StateStrictWQM;
566        GlobalFlags |= StateStrictWQM;
567        continue;
568      } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 ||
569                 Opcode == AMDGPU::V_SET_INACTIVE_B64) {
570        III.Disabled = StateStrict;
571        MachineOperand &Inactive = MI.getOperand(2);
572        if (Inactive.isReg()) {
573          if (Inactive.isUndef()) {
574            LowerToCopyInstrs.push_back(&MI);
575          } else {
576            markOperand(MI, Inactive, StateStrictWWM, Worklist);
577          }
578        }
579        SetInactiveInstrs.push_back(&MI);
580        continue;
581      } else if (TII->isDisableWQM(MI)) {
582        BBI.Needs |= StateExact;
583        if (!(BBI.InNeeds & StateExact)) {
584          BBI.InNeeds |= StateExact;
585          Worklist.push_back(MBB);
586        }
587        GlobalFlags |= StateExact;
588        III.Disabled = StateWQM | StateStrict;
589        continue;
590      } else {
591        if (Opcode == AMDGPU::SI_PS_LIVE || Opcode == AMDGPU::SI_LIVE_MASK) {
592          LiveMaskQueries.push_back(&MI);
593        } else if (Opcode == AMDGPU::SI_KILL_I1_TERMINATOR ||
594                   Opcode == AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR ||
595                   Opcode == AMDGPU::SI_DEMOTE_I1) {
596          KillInstrs.push_back(&MI);
597          BBI.NeedsLowering = true;
598        } else if (WQMOutputs) {
599          // The function is in machine SSA form, which means that physical
600          // VGPRs correspond to shader inputs and outputs. Inputs are
601          // only used, outputs are only defined.
602          // FIXME: is this still valid?
603          for (const MachineOperand &MO : MI.defs()) {
604            if (!MO.isReg())
605              continue;
606
607            Register Reg = MO.getReg();
608
609            if (!Reg.isVirtual() &&
610                TRI->hasVectorRegisters(TRI->getPhysRegBaseClass(Reg))) {
611              Flags = StateWQM;
612              break;
613            }
614          }
615        }
616
617        if (!Flags)
618          continue;
619      }
620
621      markInstruction(MI, Flags, Worklist);
622      GlobalFlags |= Flags;
623    }
624  }
625
  // Make sure that any SET_INACTIVE instructions are computed in WQM if WQM is
  // ever used anywhere in the function. This implements the corresponding
  // semantics of @llvm.amdgcn.set.inactive.
629  // Similarly for SOFT_WQM instructions, implementing @llvm.amdgcn.softwqm.
630  if (GlobalFlags & StateWQM) {
631    for (MachineInstr *MI : SetInactiveInstrs)
632      markInstruction(*MI, StateWQM, Worklist);
633    for (MachineInstr *MI : SoftWQMInstrs)
634      markInstruction(*MI, StateWQM, Worklist);
635  }
636
637  return GlobalFlags;
638}
639
void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
                                           std::vector<WorkItem> &Worklist) {
  MachineBasicBlock *MBB = MI.getParent();
  // Take a copy to prevent dangling references.
  InstrInfo II = Instructions[&MI];
644  BlockInfo &BI = Blocks[MBB];
645
646  // Control flow-type instructions and stores to temporary memory that are
647  // followed by WQM computations must themselves be in WQM.
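  // (For instance, a value stored to scratch in Exact mode and later reloaded
  // as an input to an image sample would leave helper lanes with stale data,
  // so such a store must itself be marked as needing WQM.)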
648  if ((II.OutNeeds & StateWQM) && !(II.Disabled & StateWQM) &&
649      (MI.isTerminator() || (TII->usesVM_CNT(MI) && MI.mayStore()))) {
650    Instructions[&MI].Needs = StateWQM;
651    II.Needs = StateWQM;
652  }
653
654  // Propagate to block level
655  if (II.Needs & StateWQM) {
656    BI.Needs |= StateWQM;
657    if (!(BI.InNeeds & StateWQM)) {
658      BI.InNeeds |= StateWQM;
659      Worklist.push_back(MBB);
660    }
661  }
662
663  // Propagate backwards within block
664  if (MachineInstr *PrevMI = MI.getPrevNode()) {
665    char InNeeds = (II.Needs & ~StateStrict) | II.OutNeeds;
666    if (!PrevMI->isPHI()) {
667      InstrInfo &PrevII = Instructions[PrevMI];
668      if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) {
669        PrevII.OutNeeds |= InNeeds;
670        Worklist.push_back(PrevMI);
671      }
672    }
673  }
674
675  // Propagate WQM flag to instruction inputs
676  assert(!(II.Needs & StateExact));
677
678  if (II.Needs != 0)
679    markInstructionUses(MI, II.Needs, Worklist);
680
681  // Ensure we process a block containing StrictWWM/StrictWQM, even if it does
682  // not require any WQM transitions.
683  if (II.Needs & StateStrictWWM)
684    BI.Needs |= StateStrictWWM;
685  if (II.Needs & StateStrictWQM)
686    BI.Needs |= StateStrictWQM;
687}
688
void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB,
                                     std::vector<WorkItem> &Worklist) {
691  BlockInfo BI = Blocks[&MBB]; // Make a copy to prevent dangling references.
692
693  // Propagate through instructions
694  if (!MBB.empty()) {
695    MachineInstr *LastMI = &*MBB.rbegin();
696    InstrInfo &LastII = Instructions[LastMI];
697    if ((LastII.OutNeeds | BI.OutNeeds) != LastII.OutNeeds) {
698      LastII.OutNeeds |= BI.OutNeeds;
699      Worklist.push_back(LastMI);
700    }
701  }
702
703  // Predecessor blocks must provide for our WQM/Exact needs.
704  for (MachineBasicBlock *Pred : MBB.predecessors()) {
705    BlockInfo &PredBI = Blocks[Pred];
706    if ((PredBI.OutNeeds | BI.InNeeds) == PredBI.OutNeeds)
707      continue;
708
709    PredBI.OutNeeds |= BI.InNeeds;
710    PredBI.InNeeds |= BI.InNeeds;
711    Worklist.push_back(Pred);
712  }
713
714  // All successors must be prepared to accept the same set of WQM/Exact data.
715  for (MachineBasicBlock *Succ : MBB.successors()) {
716    BlockInfo &SuccBI = Blocks[Succ];
717    if ((SuccBI.InNeeds | BI.OutNeeds) == SuccBI.InNeeds)
718      continue;
719
720    SuccBI.InNeeds |= BI.OutNeeds;
721    Worklist.push_back(Succ);
722  }
723}
724
725char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) {
726  std::vector<WorkItem> Worklist;
727  char GlobalFlags = scanInstructions(MF, Worklist);
728
729  while (!Worklist.empty()) {
730    WorkItem WI = Worklist.back();
731    Worklist.pop_back();
732
733    if (WI.MI)
734      propagateInstruction(*WI.MI, Worklist);
735    else
736      propagateBlock(*WI.MBB, Worklist);
737  }
738
739  return GlobalFlags;
740}
741
742MachineBasicBlock::iterator
743SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB,
744                         MachineBasicBlock::iterator Before) {
745  Register SaveReg = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
746
747  MachineInstr *Save =
748      BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), SaveReg)
749          .addReg(AMDGPU::SCC);
750  MachineInstr *Restore =
751      BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::SCC)
752          .addReg(SaveReg);
753
754  LIS->InsertMachineInstrInMaps(*Save);
755  LIS->InsertMachineInstrInMaps(*Restore);
756  LIS->createAndComputeVirtRegInterval(SaveReg);
757
758  return Restore;
759}
760
761MachineBasicBlock *SIWholeQuadMode::splitBlock(MachineBasicBlock *BB,
762                                               MachineInstr *TermMI) {
763  LLVM_DEBUG(dbgs() << "Split block " << printMBBReference(*BB) << " @ "
764                    << *TermMI << "\n");
765
766  MachineBasicBlock *SplitBB =
767      BB->splitAt(*TermMI, /*UpdateLiveIns*/ true, LIS);
768
  // Convert the last instruction in the block to a terminator.
  // Note: this only covers the expected patterns.
771  unsigned NewOpcode = 0;
772  switch (TermMI->getOpcode()) {
773  case AMDGPU::S_AND_B32:
774    NewOpcode = AMDGPU::S_AND_B32_term;
775    break;
776  case AMDGPU::S_AND_B64:
777    NewOpcode = AMDGPU::S_AND_B64_term;
778    break;
779  case AMDGPU::S_MOV_B32:
780    NewOpcode = AMDGPU::S_MOV_B32_term;
781    break;
782  case AMDGPU::S_MOV_B64:
783    NewOpcode = AMDGPU::S_MOV_B64_term;
784    break;
785  default:
786    break;
787  }
788  if (NewOpcode)
789    TermMI->setDesc(TII->get(NewOpcode));
790
791  if (SplitBB != BB) {
792    // Update dominator trees
793    using DomTreeT = DomTreeBase<MachineBasicBlock>;
794    SmallVector<DomTreeT::UpdateType, 16> DTUpdates;
795    for (MachineBasicBlock *Succ : SplitBB->successors()) {
796      DTUpdates.push_back({DomTreeT::Insert, SplitBB, Succ});
797      DTUpdates.push_back({DomTreeT::Delete, BB, Succ});
798    }
799    DTUpdates.push_back({DomTreeT::Insert, BB, SplitBB});
800    if (MDT)
801      MDT->getBase().applyUpdates(DTUpdates);
802    if (PDT)
803      PDT->getBase().applyUpdates(DTUpdates);
804
805    // Link blocks
806    MachineInstr *MI =
807        BuildMI(*BB, BB->end(), DebugLoc(), TII->get(AMDGPU::S_BRANCH))
808            .addMBB(SplitBB);
809    LIS->InsertMachineInstrInMaps(*MI);
810  }
811
812  return SplitBB;
813}
814
815MachineInstr *SIWholeQuadMode::lowerKillF32(MachineBasicBlock &MBB,
816                                            MachineInstr &MI) {
817  const DebugLoc &DL = MI.getDebugLoc();
818  unsigned Opcode = 0;
819
820  assert(MI.getOperand(0).isReg());
821
  // The comparison is for live lanes; however, here we compute the inverse
  // (killed lanes). This is because VCMP will always generate 0 bits
  // for inactive lanes, so a mask of live lanes would not be correct
  // inside control flow.
  // Invert the comparison by swapping the operands and adjusting
  // the comparison codes.
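  //
  // A rough sketch of the resulting lowering (wave64, e64 form; operands are
  // illustrative):
  //   $vcc = V_CMP_<inverted cond>_F32_e64 0, src1, 0, src0, 0
  //   %livemask = S_ANDN2_B64 %livemask, $vcc
  //   SI_EARLY_TERMINATE_SCC0
  //   $exec = S_ANDN2_B64 $exec, $vcc
  //   S_BRANCH %succ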
828
829  switch (MI.getOperand(2).getImm()) {
830  case ISD::SETUEQ:
831    Opcode = AMDGPU::V_CMP_LG_F32_e64;
832    break;
833  case ISD::SETUGT:
834    Opcode = AMDGPU::V_CMP_GE_F32_e64;
835    break;
836  case ISD::SETUGE:
837    Opcode = AMDGPU::V_CMP_GT_F32_e64;
838    break;
839  case ISD::SETULT:
840    Opcode = AMDGPU::V_CMP_LE_F32_e64;
841    break;
842  case ISD::SETULE:
843    Opcode = AMDGPU::V_CMP_LT_F32_e64;
844    break;
845  case ISD::SETUNE:
846    Opcode = AMDGPU::V_CMP_EQ_F32_e64;
847    break;
848  case ISD::SETO:
849    Opcode = AMDGPU::V_CMP_O_F32_e64;
850    break;
851  case ISD::SETUO:
852    Opcode = AMDGPU::V_CMP_U_F32_e64;
853    break;
854  case ISD::SETOEQ:
855  case ISD::SETEQ:
856    Opcode = AMDGPU::V_CMP_NEQ_F32_e64;
857    break;
858  case ISD::SETOGT:
859  case ISD::SETGT:
860    Opcode = AMDGPU::V_CMP_NLT_F32_e64;
861    break;
862  case ISD::SETOGE:
863  case ISD::SETGE:
864    Opcode = AMDGPU::V_CMP_NLE_F32_e64;
865    break;
866  case ISD::SETOLT:
867  case ISD::SETLT:
868    Opcode = AMDGPU::V_CMP_NGT_F32_e64;
869    break;
870  case ISD::SETOLE:
871  case ISD::SETLE:
872    Opcode = AMDGPU::V_CMP_NGE_F32_e64;
873    break;
874  case ISD::SETONE:
875  case ISD::SETNE:
876    Opcode = AMDGPU::V_CMP_NLG_F32_e64;
877    break;
878  default:
879    llvm_unreachable("invalid ISD:SET cond code");
880  }
881
  // Pick the instruction encoding: e32 if src0 is a VGPR, e64 otherwise.
883  MachineInstr *VcmpMI;
884  const MachineOperand &Op0 = MI.getOperand(0);
885  const MachineOperand &Op1 = MI.getOperand(1);
886
887  // VCC represents lanes killed.
888  Register VCC = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC;
889
890  if (TRI->isVGPR(*MRI, Op0.getReg())) {
891    Opcode = AMDGPU::getVOPe32(Opcode);
892    VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode)).add(Op1).add(Op0);
893  } else {
894    VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode))
895                 .addReg(VCC, RegState::Define)
896                 .addImm(0) // src0 modifiers
897                 .add(Op1)
898                 .addImm(0) // src1 modifiers
899                 .add(Op0)
900                 .addImm(0); // omod
901  }
902
903  MachineInstr *MaskUpdateMI =
904      BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
905          .addReg(LiveMaskReg)
906          .addReg(VCC);
907
  // The state of SCC represents whether any lanes are live in the mask;
  // if SCC is 0 then no lanes will be alive anymore.
910  MachineInstr *EarlyTermMI =
911      BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_EARLY_TERMINATE_SCC0));
912
913  MachineInstr *ExecMaskMI =
914      BuildMI(MBB, MI, DL, TII->get(AndN2Opc), Exec).addReg(Exec).addReg(VCC);
915
916  assert(MBB.succ_size() == 1);
917  MachineInstr *NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH))
918                              .addMBB(*MBB.succ_begin());
919
920  // Update live intervals
921  LIS->ReplaceMachineInstrInMaps(MI, *VcmpMI);
922  MBB.remove(&MI);
923
924  LIS->InsertMachineInstrInMaps(*MaskUpdateMI);
925  LIS->InsertMachineInstrInMaps(*ExecMaskMI);
926  LIS->InsertMachineInstrInMaps(*EarlyTermMI);
927  LIS->InsertMachineInstrInMaps(*NewTerm);
928
929  return NewTerm;
930}
931
932MachineInstr *SIWholeQuadMode::lowerKillI1(MachineBasicBlock &MBB,
933                                           MachineInstr &MI, bool IsWQM) {
934  const DebugLoc &DL = MI.getDebugLoc();
935  MachineInstr *MaskUpdateMI = nullptr;
936
937  const bool IsDemote = IsWQM && (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1);
938  const MachineOperand &Op = MI.getOperand(0);
939  int64_t KillVal = MI.getOperand(1).getImm();
940  MachineInstr *ComputeKilledMaskMI = nullptr;
941  Register CndReg = !Op.isImm() ? Op.getReg() : Register();
942  Register TmpReg;
943
944  // Is this a static or dynamic kill?
945  if (Op.isImm()) {
946    if (Op.getImm() == KillVal) {
947      // Static: all active lanes are killed
948      MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
949                         .addReg(LiveMaskReg)
950                         .addReg(Exec);
951    } else {
952      // Static: kill does nothing
953      MachineInstr *NewTerm = nullptr;
954      if (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1) {
955        LIS->RemoveMachineInstrFromMaps(MI);
956      } else {
957        assert(MBB.succ_size() == 1);
958        NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH))
959                      .addMBB(*MBB.succ_begin());
960        LIS->ReplaceMachineInstrInMaps(MI, *NewTerm);
961      }
962      MBB.remove(&MI);
963      return NewTerm;
964    }
965  } else {
966    if (!KillVal) {
967      // Op represents live lanes after kill,
968      // so exec mask needs to be factored in.
969      TmpReg = MRI->createVirtualRegister(TRI->getBoolRC());
970      ComputeKilledMaskMI =
971          BuildMI(MBB, MI, DL, TII->get(XorOpc), TmpReg).add(Op).addReg(Exec);
972      MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
973                         .addReg(LiveMaskReg)
974                         .addReg(TmpReg);
975    } else {
976      // Op represents lanes to kill
977      MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
978                         .addReg(LiveMaskReg)
979                         .add(Op);
980    }
981  }
982
  // The state of SCC represents whether any lanes are live in the mask;
  // if SCC is 0 then no lanes will be alive anymore.
985  MachineInstr *EarlyTermMI =
986      BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_EARLY_TERMINATE_SCC0));
987
  // If we got this far then some lanes are still live;
  // update EXEC to deactivate lanes as appropriate.
990  MachineInstr *NewTerm;
991  MachineInstr *WQMMaskMI = nullptr;
992  Register LiveMaskWQM;
993  if (IsDemote) {
994    // Demote - deactivate quads with only helper lanes
995    LiveMaskWQM = MRI->createVirtualRegister(TRI->getBoolRC());
996    WQMMaskMI =
997        BuildMI(MBB, MI, DL, TII->get(WQMOpc), LiveMaskWQM).addReg(LiveMaskReg);
998    NewTerm = BuildMI(MBB, MI, DL, TII->get(AndOpc), Exec)
999                  .addReg(Exec)
1000                  .addReg(LiveMaskWQM);
1001  } else {
1002    // Kill - deactivate lanes no longer in live mask
1003    if (Op.isImm()) {
1004      unsigned MovOpc = ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
1005      NewTerm = BuildMI(MBB, &MI, DL, TII->get(MovOpc), Exec).addImm(0);
1006    } else if (!IsWQM) {
1007      NewTerm = BuildMI(MBB, &MI, DL, TII->get(AndOpc), Exec)
1008                    .addReg(Exec)
1009                    .addReg(LiveMaskReg);
1010    } else {
1011      unsigned Opcode = KillVal ? AndN2Opc : AndOpc;
1012      NewTerm =
1013          BuildMI(MBB, &MI, DL, TII->get(Opcode), Exec).addReg(Exec).add(Op);
1014    }
1015  }
1016
1017  // Update live intervals
1018  LIS->RemoveMachineInstrFromMaps(MI);
1019  MBB.remove(&MI);
1020  assert(EarlyTermMI);
1021  assert(MaskUpdateMI);
1022  assert(NewTerm);
1023  if (ComputeKilledMaskMI)
1024    LIS->InsertMachineInstrInMaps(*ComputeKilledMaskMI);
1025  LIS->InsertMachineInstrInMaps(*MaskUpdateMI);
1026  LIS->InsertMachineInstrInMaps(*EarlyTermMI);
1027  if (WQMMaskMI)
1028    LIS->InsertMachineInstrInMaps(*WQMMaskMI);
1029  LIS->InsertMachineInstrInMaps(*NewTerm);
1030
1031  if (CndReg) {
1032    LIS->removeInterval(CndReg);
1033    LIS->createAndComputeVirtRegInterval(CndReg);
1034  }
1035  if (TmpReg)
1036    LIS->createAndComputeVirtRegInterval(TmpReg);
1037  if (LiveMaskWQM)
1038    LIS->createAndComputeVirtRegInterval(LiveMaskWQM);
1039
1040  return NewTerm;
1041}
1042
1043// Convert a strict mode transition to a pseudo transition.
1044// This still pre-allocates registers to prevent clobbering,
1045// but avoids any EXEC mask changes.
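//
// For illustration, a WQM -> StrictWQM -> WQM region such as
//   %orig = ENTER_STRICT_WQM -1
//   ...
//   $exec = EXIT_STRICT_WQM %orig
// becomes
//   ENTER_PSEUDO_WM
//   ...
//   EXIT_PSEUDO_WM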
1046void SIWholeQuadMode::lowerPseudoStrictMode(MachineBasicBlock &MBB,
1047                                            MachineInstr *Entry,
1048                                            MachineInstr *Exit) {
1049  assert(Entry->getOpcode() == AMDGPU::ENTER_STRICT_WQM);
1050  assert(Exit->getOpcode() == AMDGPU::EXIT_STRICT_WQM);
1051
1052  Register SaveOrig = Entry->getOperand(0).getReg();
1053
1054  MachineInstr *NewEntry =
1055    BuildMI(MBB, Entry, DebugLoc(), TII->get(AMDGPU::ENTER_PSEUDO_WM));
1056  MachineInstr *NewExit =
1057    BuildMI(MBB, Exit, DebugLoc(), TII->get(AMDGPU::EXIT_PSEUDO_WM));
1058
1059  LIS->ReplaceMachineInstrInMaps(*Exit, *NewExit);
1060  Exit->eraseFromParent();
1061
1062  LIS->ReplaceMachineInstrInMaps(*Entry, *NewEntry);
1063  Entry->eraseFromParent();
1064
1065  LIS->removeInterval(SaveOrig);
1066}
1067
// Replace (or supplement) instructions accessing the live mask.
// This can only happen once all the live mask registers have been created
// and the execution state (WQM/StrictWWM/Exact) of instructions is known.
1071void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) {
1072  auto BII = Blocks.find(&MBB);
1073  if (BII == Blocks.end())
1074    return;
1075
1076  const BlockInfo &BI = BII->second;
1077  if (!BI.NeedsLowering)
1078    return;
1079
1080  LLVM_DEBUG(dbgs() << "\nLowering block " << printMBBReference(MBB) << ":\n");
1081
1082  SmallVector<MachineInstr *, 4> SplitPoints;
1083  char State = BI.InitialState;
1084  MachineInstr *StrictEntry = nullptr;
1085
1086  for (MachineInstr &MI : llvm::make_early_inc_range(
1087           llvm::make_range(MBB.getFirstNonPHI(), MBB.end()))) {
1088    char PreviousState = State;
1089
1090    if (StateTransition.count(&MI))
1091      State = StateTransition[&MI];
1092
1093    MachineInstr *SplitPoint = nullptr;
1094    switch (MI.getOpcode()) {
1095    case AMDGPU::SI_DEMOTE_I1:
1096    case AMDGPU::SI_KILL_I1_TERMINATOR:
1097      SplitPoint = lowerKillI1(MBB, MI, State == StateWQM);
1098      break;
1099    case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
1100      SplitPoint = lowerKillF32(MBB, MI);
1101      break;
1102    case AMDGPU::ENTER_STRICT_WQM:
1103      StrictEntry = PreviousState == StateWQM ? &MI : nullptr;
1104      break;
1105    case AMDGPU::EXIT_STRICT_WQM:
1106      if (State == StateWQM && StrictEntry) {
1107        // Transition WQM -> StrictWQM -> WQM detected.
1108        lowerPseudoStrictMode(MBB, StrictEntry, &MI);
1109      }
1110      StrictEntry = nullptr;
1111      break;
1112    case AMDGPU::ENTER_STRICT_WWM:
1113    case AMDGPU::EXIT_STRICT_WWM:
1114      StrictEntry = nullptr;
1115      break;
1116    default:
1117      break;
1118    }
1119    if (SplitPoint)
1120      SplitPoints.push_back(SplitPoint);
1121  }
1122
1123  // Perform splitting after instruction scan to simplify iteration.
1124  if (!SplitPoints.empty()) {
1125    MachineBasicBlock *BB = &MBB;
1126    for (MachineInstr *MI : SplitPoints) {
1127      BB = splitBlock(BB, MI);
1128    }
1129  }
1130}
1131
1132// Return an iterator in the (inclusive) range [First, Last] at which
1133// instructions can be safely inserted, keeping in mind that some of the
1134// instructions we want to add necessarily clobber SCC.
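// For example, the S_AND / S_WQM instructions inserted for mode switches
// clobber SCC, so if an S_CMP result is still live across the candidate
// insertion point, the point is moved (or SCC is saved and restored via
// saveSCC).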
1135MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion(
1136    MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
1137    MachineBasicBlock::iterator Last, bool PreferLast, bool SaveSCC) {
1138  if (!SaveSCC)
1139    return PreferLast ? Last : First;
1140
1141  LiveRange &LR =
1142      LIS->getRegUnit(*MCRegUnitIterator(MCRegister::from(AMDGPU::SCC), TRI));
1143  auto MBBE = MBB.end();
1144  SlotIndex FirstIdx = First != MBBE ? LIS->getInstructionIndex(*First)
1145                                     : LIS->getMBBEndIdx(&MBB);
1146  SlotIndex LastIdx =
1147      Last != MBBE ? LIS->getInstructionIndex(*Last) : LIS->getMBBEndIdx(&MBB);
1148  SlotIndex Idx = PreferLast ? LastIdx : FirstIdx;
1149  const LiveRange::Segment *S;
1150
1151  for (;;) {
1152    S = LR.getSegmentContaining(Idx);
1153    if (!S)
1154      break;
1155
1156    if (PreferLast) {
1157      SlotIndex Next = S->start.getBaseIndex();
1158      if (Next < FirstIdx)
1159        break;
1160      Idx = Next;
1161    } else {
1162      MachineInstr *EndMI = LIS->getInstructionFromIndex(S->end.getBaseIndex());
1163      assert(EndMI && "Segment does not end on valid instruction");
1164      auto NextI = std::next(EndMI->getIterator());
1165      if (NextI == MBB.end())
1166        break;
1167      SlotIndex Next = LIS->getInstructionIndex(*NextI);
1168      if (Next > LastIdx)
1169        break;
1170      Idx = Next;
1171    }
1172  }
1173
1174  MachineBasicBlock::iterator MBBI;
1175
1176  if (MachineInstr *MI = LIS->getInstructionFromIndex(Idx))
1177    MBBI = MI;
1178  else {
1179    assert(Idx == LIS->getMBBEndIdx(&MBB));
1180    MBBI = MBB.end();
1181  }
1182
1183  // Move insertion point past any operations modifying EXEC.
1184  // This assumes that the value of SCC defined by any of these operations
1185  // does not need to be preserved.
1186  while (MBBI != Last) {
1187    bool IsExecDef = false;
1188    for (const MachineOperand &MO : MBBI->operands()) {
1189      if (MO.isReg() && MO.isDef()) {
1190        IsExecDef |=
1191            MO.getReg() == AMDGPU::EXEC_LO || MO.getReg() == AMDGPU::EXEC;
1192      }
1193    }
1194    if (!IsExecDef)
1195      break;
1196    MBBI++;
1197    S = nullptr;
1198  }
1199
1200  if (S)
1201    MBBI = saveSCC(MBB, MBBI);
1202
1203  return MBBI;
1204}
1205
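// Switch to Exact mode at Before. A sketch of what is emitted (wave64;
// register names illustrative): with a SaveWQM register,
//   %savewqm = S_AND_SAVEEXEC_B64 %livemask
// and without one,
//   $exec = S_AND_B64 $exec, %livemask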
1206void SIWholeQuadMode::toExact(MachineBasicBlock &MBB,
1207                              MachineBasicBlock::iterator Before,
1208                              Register SaveWQM) {
1209  MachineInstr *MI;
1210
1211  if (SaveWQM) {
1212    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AndSaveExecOpc), SaveWQM)
1213             .addReg(LiveMaskReg);
1214  } else {
1215    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AndOpc), Exec)
1216             .addReg(Exec)
1217             .addReg(LiveMaskReg);
1218  }
1219
1220  LIS->InsertMachineInstrInMaps(*MI);
1221  StateTransition[MI] = StateExact;
1222}
1223
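// Switch to WQM at Before. A sketch of what is emitted (wave64; register
// names illustrative): when restoring a previously saved WQM exec mask,
//   $exec = COPY %savedwqm
// and otherwise WQM is recomputed from the current exec mask,
//   $exec = S_WQM_B64 $exec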
1224void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,
1225                            MachineBasicBlock::iterator Before,
1226                            Register SavedWQM) {
1227  MachineInstr *MI;
1228
1229  if (SavedWQM) {
1230    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), Exec)
1231             .addReg(SavedWQM);
1232  } else {
1233    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(WQMOpc), Exec).addReg(Exec);
1234  }
1235
1236  LIS->InsertMachineInstrInMaps(*MI);
1237  StateTransition[MI] = StateWQM;
1238}
1239
1240void SIWholeQuadMode::toStrictMode(MachineBasicBlock &MBB,
1241                                   MachineBasicBlock::iterator Before,
1242                                   Register SaveOrig, char StrictStateNeeded) {
1243  MachineInstr *MI;
1244  assert(SaveOrig);
1245  assert(StrictStateNeeded == StateStrictWWM ||
1246         StrictStateNeeded == StateStrictWQM);
1247
1248  if (StrictStateNeeded == StateStrictWWM) {
1249    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_STRICT_WWM),
1250                 SaveOrig)
1251             .addImm(-1);
1252  } else {
1253    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_STRICT_WQM),
1254                 SaveOrig)
1255             .addImm(-1);
1256  }
1257  LIS->InsertMachineInstrInMaps(*MI);
1258  StateTransition[MI] = StrictStateNeeded;
1259
  // Mark the block as needing lowering so it will be checked for unnecessary
  // mode transitions.
1261  auto BII = Blocks.find(&MBB);
1262  if (BII != Blocks.end())
1263    BII->second.NeedsLowering = true;
1264}
1265
1266void SIWholeQuadMode::fromStrictMode(MachineBasicBlock &MBB,
1267                                     MachineBasicBlock::iterator Before,
1268                                     Register SavedOrig, char NonStrictState,
1269                                     char CurrentStrictState) {
1270  MachineInstr *MI;
1271
1272  assert(SavedOrig);
1273  assert(CurrentStrictState == StateStrictWWM ||
1274         CurrentStrictState == StateStrictWQM);
1275
1276  if (CurrentStrictState == StateStrictWWM) {
1277    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WWM),
1278                 Exec)
1279             .addReg(SavedOrig);
1280  } else {
1281    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WQM),
1282                 Exec)
1283             .addReg(SavedOrig);
1284  }
1285  LIS->InsertMachineInstrInMaps(*MI);
1286  StateTransition[MI] = NonStrictState;
1287}
1288
1289void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool IsEntry) {
1290  auto BII = Blocks.find(&MBB);
1291  if (BII == Blocks.end())
1292    return;
1293
1294  BlockInfo &BI = BII->second;
1295
1296  // This is a non-entry block that is WQM throughout, so no need to do
1297  // anything.
1298  if (!IsEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact) {
1299    BI.InitialState = StateWQM;
1300    return;
1301  }
1302
1303  LLVM_DEBUG(dbgs() << "\nProcessing block " << printMBBReference(MBB)
1304                    << ":\n");
1305
1306  Register SavedWQMReg;
1307  Register SavedNonStrictReg;
1308  bool WQMFromExec = IsEntry;
1309  char State = (IsEntry || !(BI.InNeeds & StateWQM)) ? StateExact : StateWQM;
1310  char NonStrictState = 0;
1311  const TargetRegisterClass *BoolRC = TRI->getBoolRC();
1312
1313  auto II = MBB.getFirstNonPHI(), IE = MBB.end();
1314  if (IsEntry) {
1315    // Skip the instruction that saves LiveMask
1316    if (II != IE && II->getOpcode() == AMDGPU::COPY)
1317      ++II;
1318  }
1319
1320  // This stores the first instruction where it's safe to switch from WQM to
1321  // Exact or vice versa.
1322  MachineBasicBlock::iterator FirstWQM = IE;
1323
1324  // This stores the first instruction where it's safe to switch from Strict
1325  // mode to Exact/WQM or to switch to Strict mode. It must always be the same
1326  // as, or after, FirstWQM since if it's safe to switch to/from Strict, it must
1327  // be safe to switch to/from WQM as well.
1328  MachineBasicBlock::iterator FirstStrict = IE;
1329
  // Record the initial state in the block information.
1331  BI.InitialState = State;
1332
1333  for (;;) {
1334    MachineBasicBlock::iterator Next = II;
1335    char Needs = StateExact | StateWQM; // Strict mode is disabled by default.
1336    char OutNeeds = 0;
1337
1338    if (FirstWQM == IE)
1339      FirstWQM = II;
1340
1341    if (FirstStrict == IE)
1342      FirstStrict = II;
1343
1344    // First, figure out the allowed states (Needs) based on the propagated
1345    // flags.
1346    if (II != IE) {
1347      MachineInstr &MI = *II;
1348
1349      if (MI.isTerminator() || TII->mayReadEXEC(*MRI, MI)) {
1350        auto III = Instructions.find(&MI);
1351        if (III != Instructions.end()) {
1352          if (III->second.Needs & StateStrictWWM)
1353            Needs = StateStrictWWM;
1354          else if (III->second.Needs & StateStrictWQM)
1355            Needs = StateStrictWQM;
1356          else if (III->second.Needs & StateWQM)
1357            Needs = StateWQM;
1358          else
1359            Needs &= ~III->second.Disabled;
1360          OutNeeds = III->second.OutNeeds;
1361        }
1362      } else {
1363        // If the instruction doesn't actually need a correct EXEC, then we can
1364        // safely leave Strict mode enabled.
1365        Needs = StateExact | StateWQM | StateStrict;
1366      }
1367
1368      if (MI.isTerminator() && OutNeeds == StateExact)
1369        Needs = StateExact;
1370
1371      ++Next;
1372    } else {
1373      // End of basic block
1374      if (BI.OutNeeds & StateWQM)
1375        Needs = StateWQM;
1376      else if (BI.OutNeeds == StateExact)
1377        Needs = StateExact;
1378      else
1379        Needs = StateWQM | StateExact;
1380    }
1381
1382    // Now, transition if necessary.
1383    if (!(Needs & State)) {
1384      MachineBasicBlock::iterator First;
1385      if (State == StateStrictWWM || Needs == StateStrictWWM ||
1386          State == StateStrictWQM || Needs == StateStrictWQM) {
1387        // We must switch to or from Strict mode.
1388        First = FirstStrict;
1389      } else {
1390        // We only need to switch to/from WQM, so we can use FirstWQM.
1391        First = FirstWQM;
1392      }
1393
1394      // Whether we need to save SCC depends on start and end states.
1395      bool SaveSCC = false;
1396      switch (State) {
1397      case StateExact:
1398      case StateStrictWWM:
1399      case StateStrictWQM:
1400        // Exact/Strict -> Strict: save SCC
1401        // Exact/Strict -> WQM: save SCC if WQM mask is generated from exec
1402        // Exact/Strict -> Exact: no save
1403        SaveSCC = (Needs & StateStrict) || ((Needs & StateWQM) && WQMFromExec);
1404        break;
1405      case StateWQM:
1406        // WQM -> Exact/Strict: save SCC
1407        SaveSCC = !(Needs & StateWQM);
1408        break;
1409      default:
1410        llvm_unreachable("Unknown state");
1411        break;
1412      }
1413      MachineBasicBlock::iterator Before =
1414          prepareInsertion(MBB, First, II, Needs == StateWQM, SaveSCC);
1415
1416      if (State & StateStrict) {
1417        assert(State == StateStrictWWM || State == StateStrictWQM);
1418        assert(SavedNonStrictReg);
1419        fromStrictMode(MBB, Before, SavedNonStrictReg, NonStrictState, State);
1420
1421        LIS->createAndComputeVirtRegInterval(SavedNonStrictReg);
1422        SavedNonStrictReg = 0;
1423        State = NonStrictState;
1424      }
1425
1426      if (Needs & StateStrict) {
1427        NonStrictState = State;
1428        assert(Needs == StateStrictWWM || Needs == StateStrictWQM);
1429        assert(!SavedNonStrictReg);
1430        SavedNonStrictReg = MRI->createVirtualRegister(BoolRC);
1431
1432        toStrictMode(MBB, Before, SavedNonStrictReg, Needs);
1433        State = Needs;
1434
1435      } else {
1436        if (State == StateWQM && (Needs & StateExact) && !(Needs & StateWQM)) {
1437          if (!WQMFromExec && (OutNeeds & StateWQM)) {
1438            assert(!SavedWQMReg);
1439            SavedWQMReg = MRI->createVirtualRegister(BoolRC);
1440          }
1441
1442          toExact(MBB, Before, SavedWQMReg);
1443          State = StateExact;
1444        } else if (State == StateExact && (Needs & StateWQM) &&
1445                   !(Needs & StateExact)) {
1446          assert(WQMFromExec == (SavedWQMReg == 0));
1447
1448          toWQM(MBB, Before, SavedWQMReg);
1449
1450          if (SavedWQMReg) {
1451            LIS->createAndComputeVirtRegInterval(SavedWQMReg);
1452            SavedWQMReg = 0;
1453          }
1454          State = StateWQM;
1455        } else {
          // We can get here if we transitioned from StrictWWM to a
          // non-StrictWWM state that already matches our needs, in which
          // case there is nothing more to do.
1459          assert(Needs & State);
1460        }
1461      }
1462    }
1463
1464    if (Needs != (StateExact | StateWQM | StateStrict)) {
1465      if (Needs != (StateExact | StateWQM))
1466        FirstWQM = IE;
1467      FirstStrict = IE;
1468    }
1469
1470    if (II == IE)
1471      break;
1472
1473    II = Next;
1474  }
1475  assert(!SavedWQMReg);
1476  assert(!SavedNonStrictReg);
1477}
1478
1479void SIWholeQuadMode::lowerLiveMaskQueries() {
1480  for (MachineInstr *MI : LiveMaskQueries) {
1481    const DebugLoc &DL = MI->getDebugLoc();
1482    Register Dest = MI->getOperand(0).getReg();
1483
1484    MachineInstr *Copy =
1485        BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest)
1486            .addReg(LiveMaskReg);
1487
1488    LIS->ReplaceMachineInstrInMaps(*MI, *Copy);
1489    MI->eraseFromParent();
1490  }
1491}
1492
1493void SIWholeQuadMode::lowerCopyInstrs() {
1494  for (MachineInstr *MI : LowerToMovInstrs) {
1495    assert(MI->getNumExplicitOperands() == 2);
1496
1497    const Register Reg = MI->getOperand(0).getReg();
1498
1499    const TargetRegisterClass *regClass =
1500        TRI->getRegClassForOperandReg(*MRI, MI->getOperand(0));
1501    if (TRI->isVGPRClass(regClass)) {
1502      const unsigned MovOp = TII->getMovOpcode(regClass);
1503      MI->setDesc(TII->get(MovOp));
1504
1505      // Check that it already implicitly depends on exec (like all VALU movs
1506      // should do).
1507      assert(any_of(MI->implicit_operands(), [](const MachineOperand &MO) {
1508        return MO.isUse() && MO.getReg() == AMDGPU::EXEC;
1509      }));
1510    } else {
1511      // Remove early-clobber and exec dependency from simple SGPR copies.
1512      // This allows some to be eliminated during/post RA.
1513      LLVM_DEBUG(dbgs() << "simplify SGPR copy: " << *MI);
1514      if (MI->getOperand(0).isEarlyClobber()) {
1515        LIS->removeInterval(Reg);
1516        MI->getOperand(0).setIsEarlyClobber(false);
1517        LIS->createAndComputeVirtRegInterval(Reg);
1518      }
1519      int Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC);
1520      while (Index >= 0) {
1521        MI->removeOperand(Index);
1522        Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC);
1523      }
1524      MI->setDesc(TII->get(AMDGPU::COPY));
1525      LLVM_DEBUG(dbgs() << "  -> " << *MI);
1526    }
1527  }
1528  for (MachineInstr *MI : LowerToCopyInstrs) {
1529    if (MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B32 ||
1530        MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B64) {
1531      assert(MI->getNumExplicitOperands() == 3);
      // The only reason we should be here is that V_SET_INACTIVE has
      // an undef input, so it is being replaced by a simple copy.
      // There should be a second undef source that we should remove.
1535      assert(MI->getOperand(2).isUndef());
1536      MI->removeOperand(2);
1537      MI->untieRegOperand(1);
1538    } else {
1539      assert(MI->getNumExplicitOperands() == 2);
1540    }
1541
1542    MI->setDesc(TII->get(AMDGPU::COPY));
1543  }
1544}
1545
1546void SIWholeQuadMode::lowerKillInstrs(bool IsWQM) {
1547  for (MachineInstr *MI : KillInstrs) {
1548    MachineBasicBlock *MBB = MI->getParent();
1549    MachineInstr *SplitPoint = nullptr;
1550    switch (MI->getOpcode()) {
1551    case AMDGPU::SI_DEMOTE_I1:
1552    case AMDGPU::SI_KILL_I1_TERMINATOR:
1553      SplitPoint = lowerKillI1(*MBB, *MI, IsWQM);
1554      break;
1555    case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
1556      SplitPoint = lowerKillF32(*MBB, *MI);
1557      break;
1558    default:
1559      continue;
1560    }
1561    if (SplitPoint)
1562      splitBlock(MBB, SplitPoint);
1563  }
1564}
1565
1566bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
1567  LLVM_DEBUG(dbgs() << "SI Whole Quad Mode on " << MF.getName()
1568                    << " ------------- \n");
1569  LLVM_DEBUG(MF.dump(););
1570
1571  Instructions.clear();
1572  Blocks.clear();
1573  LiveMaskQueries.clear();
1574  LowerToCopyInstrs.clear();
1575  LowerToMovInstrs.clear();
1576  KillInstrs.clear();
1577  StateTransition.clear();
1578
1579  ST = &MF.getSubtarget<GCNSubtarget>();
1580
1581  TII = ST->getInstrInfo();
1582  TRI = &TII->getRegisterInfo();
1583  MRI = &MF.getRegInfo();
1584  LIS = &getAnalysis<LiveIntervals>();
1585  MDT = &getAnalysis<MachineDominatorTree>();
1586  PDT = &getAnalysis<MachinePostDominatorTree>();
1587
1588  if (ST->isWave32()) {
1589    AndOpc = AMDGPU::S_AND_B32;
1590    AndN2Opc = AMDGPU::S_ANDN2_B32;
1591    XorOpc = AMDGPU::S_XOR_B32;
1592    AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32;
1593    OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B32;
1594    WQMOpc = AMDGPU::S_WQM_B32;
1595    Exec = AMDGPU::EXEC_LO;
1596  } else {
1597    AndOpc = AMDGPU::S_AND_B64;
1598    AndN2Opc = AMDGPU::S_ANDN2_B64;
1599    XorOpc = AMDGPU::S_XOR_B64;
1600    AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64;
1601    OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B64;
1602    WQMOpc = AMDGPU::S_WQM_B64;
1603    Exec = AMDGPU::EXEC;
1604  }
1605
1606  const char GlobalFlags = analyzeFunction(MF);
1607  const bool NeedsLiveMask = !(KillInstrs.empty() && LiveMaskQueries.empty());
1608
1609  LiveMaskReg = Exec;
1610
  // The shader is simple and does not need any state changes or complex
  // lowering.
1612  if (!(GlobalFlags & (StateWQM | StateStrict)) && LowerToCopyInstrs.empty() &&
1613      LowerToMovInstrs.empty() && KillInstrs.empty()) {
1614    lowerLiveMaskQueries();
1615    return !LiveMaskQueries.empty();
1616  }
1617
1618  MachineBasicBlock &Entry = MF.front();
1619  MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI();
1620
1621  // Store a copy of the original live mask when required
1622  if (NeedsLiveMask || (GlobalFlags & StateWQM)) {
1623    LiveMaskReg = MRI->createVirtualRegister(TRI->getBoolRC());
1624    MachineInstr *MI =
1625        BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::COPY), LiveMaskReg)
1626            .addReg(Exec);
1627    LIS->InsertMachineInstrInMaps(*MI);
1628  }
1629
1630  LLVM_DEBUG(printInfo());
1631
1632  lowerLiveMaskQueries();
1633  lowerCopyInstrs();
1634
1635  // Shader only needs WQM
1636  if (GlobalFlags == StateWQM) {
1637    auto MI = BuildMI(Entry, EntryMI, DebugLoc(), TII->get(WQMOpc), Exec)
1638                  .addReg(Exec);
1639    LIS->InsertMachineInstrInMaps(*MI);
1640    lowerKillInstrs(true);
1641  } else {
1642    for (auto BII : Blocks)
1643      processBlock(*BII.first, BII.first == &Entry);
    // Lowering blocks causes block splitting, so perform it as a second pass.
1645    for (auto BII : Blocks)
1646      lowerBlock(*BII.first);
1647  }
1648
1649  // Compute live range for live mask
1650  if (LiveMaskReg != Exec)
1651    LIS->createAndComputeVirtRegInterval(LiveMaskReg);
1652
1653  // Physical registers like SCC aren't tracked by default anyway, so just
1654  // removing the ranges we computed is the simplest option for maintaining
1655  // the analysis results.
1656  LIS->removeAllRegUnitsForPhysReg(AMDGPU::SCC);
1657
1658  // If we performed any kills then recompute EXEC
1659  if (!KillInstrs.empty())
1660    LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC);
1661
1662  return true;
1663}
1664