//===-- SIWholeQuadMode.cpp - enter and suspend whole quad mode -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass adds instructions to enable whole quad mode for pixel
/// shaders, and whole wavefront mode for all programs.
///
/// Whole quad mode is required for derivative computations, but it interferes
/// with shader side effects (stores and atomics). This pass is run on the
/// scheduled machine IR but before register coalescing, so that machine SSA is
/// available for analysis. It ensures that WQM is enabled when necessary, but
/// disabled around stores and atomics.
///
/// When necessary, this pass creates a function prolog
///
///   S_MOV_B64 LiveMask, EXEC
///   S_WQM_B64 EXEC, EXEC
///
/// to enter WQM at the top of the function and surrounds blocks of Exact
/// instructions by
///
///   S_AND_SAVEEXEC_B64 Tmp, LiveMask
///   ...
///   S_MOV_B64 EXEC, Tmp
///
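/// As an illustrative sketch (not verbatim output of this pass), a pixel
/// shader that samples a texture and then stores the result might end up as:
///
///   S_MOV_B64 LiveMask, EXEC         ; save the truly live lanes
///   S_WQM_B64 EXEC, EXEC             ; enter WQM for the derivative inputs
///   ...                              ; interpolation, IMAGE_SAMPLE, ...
///   S_AND_B64 EXEC, EXEC, LiveMask   ; back to Exact for the side effect
///   BUFFER_STORE_DWORD ...           ; store runs without helper lanes
///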
/// We also compute when a sequence of instructions requires Whole Wavefront
/// Mode (WWM) and insert instructions to save and restore it:
///
///   S_OR_SAVEEXEC_B64 Tmp, -1
///   ...
///   S_MOV_B64 EXEC, Tmp
///
/// In order to avoid excessive switching during sequences of Exact
/// instructions, the pass first analyzes which instructions must be run in WQM
/// (aka which instructions produce values that lead to derivative
/// computations).
///
/// Basic blocks are always exited in WQM as long as some successor needs WQM.
///
/// There is room for improvement given better control flow analysis:
///
///  (1) at the top level (outside of control flow statements, and as long as
///      kill hasn't been used), one SGPR can be saved by recovering WQM from
///      the LiveMask (this is implemented for the entry block).
///
///  (2) when entire regions (e.g. if-else blocks or entire loops) only
///      consist of exact and don't-care instructions, the switch only has to
///      be done at the entry and exit points rather than potentially in each
///      block of the region.
///
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/LiveInterval.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SlotIndexes.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/InitializePasses.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include <cassert>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "si-wqm"

namespace {

enum {
  StateWQM = 0x1,
  StateWWM = 0x2,
  StateExact = 0x4,
};
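
// The state flags above are used as bitmasks: a set of them describes the
// execmask states an instruction is allowed to run in. For example, the
// lowering code uses StateExact | StateWQM to mean "either Exact or WQM is
// acceptable here".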

struct PrintState {
public:
  int State;

  explicit PrintState(int State) : State(State) {}
};

#ifndef NDEBUG
static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) {
  if (PS.State & StateWQM)
    OS << "WQM";
  if (PS.State & StateWWM) {
    if (PS.State & StateWQM)
      OS << '|';
    OS << "WWM";
  }
  if (PS.State & StateExact) {
    if (PS.State & (StateWQM | StateWWM))
      OS << '|';
    OS << "Exact";
  }

  return OS;
}
#endif

struct InstrInfo {
  char Needs = 0;    // States this instruction must execute in.
  char Disabled = 0; // States this instruction must not execute in.
  char OutNeeds = 0; // States required after this instruction.
};

struct BlockInfo {
  char Needs = 0;    // States needed somewhere inside the block.
  char InNeeds = 0;  // States needed at the block entry.
  char OutNeeds = 0; // States needed at the block exit.
};

struct WorkItem {
  MachineBasicBlock *MBB = nullptr;
  MachineInstr *MI = nullptr;

  WorkItem() = default;
  WorkItem(MachineBasicBlock *MBB) : MBB(MBB) {}
  WorkItem(MachineInstr *MI) : MI(MI) {}
};

class SIWholeQuadMode : public MachineFunctionPass {
private:
  CallingConv::ID CallingConv;
  const SIInstrInfo *TII;
  const SIRegisterInfo *TRI;
  const GCNSubtarget *ST;
  MachineRegisterInfo *MRI;
  LiveIntervals *LIS;

  DenseMap<const MachineInstr *, InstrInfo> Instructions;
  MapVector<MachineBasicBlock *, BlockInfo> Blocks;
  SmallVector<MachineInstr *, 1> LiveMaskQueries;
  SmallVector<MachineInstr *, 4> LowerToMovInstrs;
  SmallVector<MachineInstr *, 4> LowerToCopyInstrs;

  void printInfo();

  void markInstruction(MachineInstr &MI, char Flag,
                       std::vector<WorkItem> &Worklist);
  void markInstructionUses(const MachineInstr &MI, char Flag,
                           std::vector<WorkItem> &Worklist);
  char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
  void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist);
  void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);
  char analyzeFunction(MachineFunction &MF);

  MachineBasicBlock::iterator saveSCC(MachineBasicBlock &MBB,
                                      MachineBasicBlock::iterator Before);
  MachineBasicBlock::iterator
  prepareInsertion(MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
                   MachineBasicBlock::iterator Last, bool PreferLast,
                   bool SaveSCC);
  void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
               unsigned SaveWQM, unsigned LiveMaskReg);
  void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
             unsigned SavedWQM);
  void toWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
             unsigned SaveOrig);
  void fromWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
               unsigned SavedOrig);
  void processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, bool isEntry);

  void lowerLiveMaskQueries(unsigned LiveMaskReg);
  void lowerCopyInstrs();

public:
  static char ID;

  SIWholeQuadMode() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Whole Quad Mode"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<LiveIntervals>();
    AU.addPreserved<SlotIndexes>();
    AU.addPreserved<LiveIntervals>();
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // end anonymous namespace

char SIWholeQuadMode::ID = 0;

INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
INITIALIZE_PASS_END(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
                    false)

char &llvm::SIWholeQuadModeID = SIWholeQuadMode::ID;

FunctionPass *llvm::createSIWholeQuadModePass() {
  return new SIWholeQuadMode;
}

#ifndef NDEBUG
LLVM_DUMP_METHOD void SIWholeQuadMode::printInfo() {
  for (const auto &BII : Blocks) {
    dbgs() << "\n"
           << printMBBReference(*BII.first) << ":\n"
           << "  InNeeds = " << PrintState(BII.second.InNeeds)
           << ", Needs = " << PrintState(BII.second.Needs)
           << ", OutNeeds = " << PrintState(BII.second.OutNeeds) << "\n\n";

    for (const MachineInstr &MI : *BII.first) {
      auto III = Instructions.find(&MI);
      if (III == Instructions.end())
        continue;

      dbgs() << "  " << MI << "    Needs = " << PrintState(III->second.Needs)
             << ", OutNeeds = " << PrintState(III->second.OutNeeds) << '\n';
    }
  }
}
#endif

void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
                                      std::vector<WorkItem> &Worklist) {
  InstrInfo &II = Instructions[&MI];

  assert(!(Flag & StateExact) && Flag != 0);

  // Remove any disabled states from the flag. The user that required it gets
  // an undefined value in the helper lanes. For example, this can happen if
  // the result of an atomic is used by an instruction that requires WQM, where
  // ignoring the request for WQM is correct as per the relevant specs.
  Flag &= ~II.Disabled;

  // Ignore if the flag is already encompassed by the existing needs, or we
  // just disabled everything.
  if ((II.Needs & Flag) == Flag)
    return;

  II.Needs |= Flag;
  Worklist.push_back(&MI);
}

/// Mark all instructions defining the uses in \p MI with \p Flag.
void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag,
                                          std::vector<WorkItem> &Worklist) {
  for (const MachineOperand &Use : MI.uses()) {
    if (!Use.isReg() || !Use.isUse())
      continue;

    Register Reg = Use.getReg();

    // Handle physical registers that we need to track; this is mostly relevant
    // for VCC, which can appear as the (implicit) input of a uniform branch,
    // e.g. when a loop counter is stored in a VGPR.
    if (!Register::isVirtualRegister(Reg)) {
      if (Reg == AMDGPU::EXEC || Reg == AMDGPU::EXEC_LO)
        continue;

      for (MCRegUnitIterator RegUnit(Reg, TRI); RegUnit.isValid(); ++RegUnit) {
        LiveRange &LR = LIS->getRegUnit(*RegUnit);
        const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();
        if (!Value)
          continue;

        // Since we're in machine SSA, we do not need to track physical
        // registers across basic blocks.
        if (Value->isPHIDef())
          continue;

        markInstruction(*LIS->getInstructionFromIndex(Value->def), Flag,
                        Worklist);
      }

      continue;
    }

    for (MachineInstr &DefMI : MRI->def_instructions(Reg))
      markInstruction(DefMI, Flag, Worklist);
  }
}

// Scan instructions to determine which ones require an Exact execmask and
// which ones seed WQM requirements.
char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
                                       std::vector<WorkItem> &Worklist) {
  char GlobalFlags = 0;
  bool WQMOutputs = MF.getFunction().hasFnAttribute("amdgpu-ps-wqm-outputs");
  SmallVector<MachineInstr *, 4> SetInactiveInstrs;
  SmallVector<MachineInstr *, 4> SoftWQMInstrs;

  // We need to visit the basic blocks in reverse post-order so that we visit
  // defs before uses, in particular so that we don't accidentally mark an
  // instruction as needing e.g. WQM before visiting it and realizing it needs
  // WQM disabled.
  ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
  for (auto BI = RPOT.begin(), BE = RPOT.end(); BI != BE; ++BI) {
    MachineBasicBlock &MBB = **BI;
    BlockInfo &BBI = Blocks[&MBB];

    for (auto II = MBB.begin(), IE = MBB.end(); II != IE; ++II) {
      MachineInstr &MI = *II;
      InstrInfo &III = Instructions[&MI];
      unsigned Opcode = MI.getOpcode();
      char Flags = 0;

      if (TII->isWQM(Opcode)) {
        // Sampling instructions don't need to produce results for all pixels
        // in a quad, they just require all inputs of a quad to have been
        // computed for derivatives.
        markInstructionUses(MI, StateWQM, Worklist);
        GlobalFlags |= StateWQM;
        continue;
      } else if (Opcode == AMDGPU::WQM) {
        // The WQM intrinsic requires its output to have all the helper lanes
        // correct, so we need it to be in WQM.
        Flags = StateWQM;
        LowerToCopyInstrs.push_back(&MI);
      } else if (Opcode == AMDGPU::SOFT_WQM) {
        LowerToCopyInstrs.push_back(&MI);
        SoftWQMInstrs.push_back(&MI);
        continue;
      } else if (Opcode == AMDGPU::WWM) {
        // The WWM intrinsic doesn't make the same guarantee, and it also needs
        // to be executed in WQM or Exact so that its copy doesn't clobber
        // inactive lanes.
        markInstructionUses(MI, StateWWM, Worklist);
        GlobalFlags |= StateWWM;
        LowerToMovInstrs.push_back(&MI);
        continue;
      } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 ||
                 Opcode == AMDGPU::V_SET_INACTIVE_B64) {
        III.Disabled = StateWWM;
        MachineOperand &Inactive = MI.getOperand(2);
        if (Inactive.isReg()) {
          if (Inactive.isUndef()) {
            LowerToCopyInstrs.push_back(&MI);
          } else {
            Register Reg = Inactive.getReg();
            if (Register::isVirtualRegister(Reg)) {
              for (MachineInstr &DefMI : MRI->def_instructions(Reg))
                markInstruction(DefMI, StateWWM, Worklist);
            }
          }
        }
        SetInactiveInstrs.push_back(&MI);
        continue;
      } else if (TII->isDisableWQM(MI)) {
        BBI.Needs |= StateExact;
        if (!(BBI.InNeeds & StateExact)) {
          BBI.InNeeds |= StateExact;
          Worklist.push_back(&MBB);
        }
        GlobalFlags |= StateExact;
        III.Disabled = StateWQM | StateWWM;
        continue;
      } else {
        if (Opcode == AMDGPU::SI_PS_LIVE) {
          LiveMaskQueries.push_back(&MI);
        } else if (WQMOutputs) {
          // The function is in machine SSA form, which means that physical
          // VGPRs correspond to shader inputs and outputs. Inputs are
          // only used, outputs are only defined.
          for (const MachineOperand &MO : MI.defs()) {
            if (!MO.isReg())
              continue;

            Register Reg = MO.getReg();

            if (!Register::isVirtualRegister(Reg) &&
                TRI->hasVectorRegisters(TRI->getPhysRegClass(Reg))) {
              Flags = StateWQM;
              break;
            }
          }
        }

        if (!Flags)
          continue;
      }

      markInstruction(MI, Flags, Worklist);
      GlobalFlags |= Flags;
    }
  }

  // Make sure that any SET_INACTIVE instructions are computed in WQM if WQM is
  // ever used anywhere in the function. This implements the corresponding
  // semantics of @llvm.amdgcn.set.inactive.
  // Similarly for SOFT_WQM instructions, implementing @llvm.amdgcn.softwqm.
  if (GlobalFlags & StateWQM) {
    for (MachineInstr *MI : SetInactiveInstrs)
      markInstruction(*MI, StateWQM, Worklist);
    for (MachineInstr *MI : SoftWQMInstrs)
      markInstruction(*MI, StateWQM, Worklist);
  }

  return GlobalFlags;
}

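/// Propagate the state requirements of \p MI: terminators and VMEM stores
/// that are followed by WQM computations are marked as needing WQM themselves,
/// and \p MI's needs are pushed up to its block, backwards to the preceding
/// instruction, and into the instructions defining its uses.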
void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
                                           std::vector<WorkItem> &Worklist) {
  MachineBasicBlock *MBB = MI.getParent();
  // Take a copy to prevent dangling references.
  InstrInfo II = Instructions[&MI];
  BlockInfo &BI = Blocks[MBB];

  // Control flow-type instructions and stores to temporary memory that are
  // followed by WQM computations must themselves be in WQM.
  if ((II.OutNeeds & StateWQM) && !(II.Disabled & StateWQM) &&
      (MI.isTerminator() || (TII->usesVM_CNT(MI) && MI.mayStore()))) {
    Instructions[&MI].Needs = StateWQM;
    II.Needs = StateWQM;
  }

  // Propagate to block level
  if (II.Needs & StateWQM) {
    BI.Needs |= StateWQM;
    if (!(BI.InNeeds & StateWQM)) {
      BI.InNeeds |= StateWQM;
      Worklist.push_back(MBB);
    }
  }

  // Propagate backwards within block
  if (MachineInstr *PrevMI = MI.getPrevNode()) {
    char InNeeds = (II.Needs & ~StateWWM) | II.OutNeeds;
    if (!PrevMI->isPHI()) {
      InstrInfo &PrevII = Instructions[PrevMI];
      if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) {
        PrevII.OutNeeds |= InNeeds;
        Worklist.push_back(PrevMI);
      }
    }
  }

  // Propagate WQM flag to instruction inputs
  assert(!(II.Needs & StateExact));

  if (II.Needs != 0)
    markInstructionUses(MI, II.Needs, Worklist);

  // Ensure we process a block containing WWM, even if it does not require any
  // WQM transitions.
  if (II.Needs & StateWWM)
    BI.Needs |= StateWWM;
}

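/// Propagate a block's needs outwards: into its last instruction's OutNeeds,
/// into the OutNeeds/InNeeds of its predecessors, and into the InNeeds of its
/// successors, queueing any item whose state set grew.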
void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB,
                                     std::vector<WorkItem> &Worklist) {
  BlockInfo BI = Blocks[&MBB]; // Make a copy to prevent dangling references.

  // Propagate through instructions
  if (!MBB.empty()) {
    MachineInstr *LastMI = &*MBB.rbegin();
    InstrInfo &LastII = Instructions[LastMI];
    if ((LastII.OutNeeds | BI.OutNeeds) != LastII.OutNeeds) {
      LastII.OutNeeds |= BI.OutNeeds;
      Worklist.push_back(LastMI);
    }
  }

  // Predecessor blocks must provide for our WQM/Exact needs.
  for (MachineBasicBlock *Pred : MBB.predecessors()) {
    BlockInfo &PredBI = Blocks[Pred];
    if ((PredBI.OutNeeds | BI.InNeeds) == PredBI.OutNeeds)
      continue;

    PredBI.OutNeeds |= BI.InNeeds;
    PredBI.InNeeds |= BI.InNeeds;
    Worklist.push_back(Pred);
  }

  // All successors must be prepared to accept the same set of WQM/Exact data.
  for (MachineBasicBlock *Succ : MBB.successors()) {
    BlockInfo &SuccBI = Blocks[Succ];
    if ((SuccBI.InNeeds | BI.OutNeeds) == SuccBI.InNeeds)
      continue;

    SuccBI.InNeeds |= BI.OutNeeds;
    Worklist.push_back(Succ);
  }
}

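/// Drive the backwards dataflow to a fixpoint: scanInstructions seeds the
/// worklist, and instruction/block items are re-propagated until no Needs,
/// InNeeds or OutNeeds set changes anymore.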
char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) {
  std::vector<WorkItem> Worklist;
  char GlobalFlags = scanInstructions(MF, Worklist);

  while (!Worklist.empty()) {
    WorkItem WI = Worklist.back();
    Worklist.pop_back();

    if (WI.MI)
      propagateInstruction(*WI.MI, Worklist);
    else
      propagateBlock(*WI.MBB, Worklist);
  }

  return GlobalFlags;
}

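// Save SCC across the point where state-transition instructions will be
// inserted: copy SCC into a fresh SGPR before \p Before and copy it back
// afterwards. Returns an iterator at the restoring copy, so instructions
// inserted there land between the save and the restore.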
MachineBasicBlock::iterator
SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB,
                         MachineBasicBlock::iterator Before) {
  Register SaveReg = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

  MachineInstr *Save =
      BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), SaveReg)
          .addReg(AMDGPU::SCC);
  MachineInstr *Restore =
      BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::SCC)
          .addReg(SaveReg);

  LIS->InsertMachineInstrInMaps(*Save);
  LIS->InsertMachineInstrInMaps(*Restore);
  LIS->createAndComputeVirtRegInterval(SaveReg);

  return Restore;
}

// Return an iterator in the (inclusive) range [First, Last] at which
// instructions can be safely inserted, keeping in mind that some of the
// instructions we want to add necessarily clobber SCC.
MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
    MachineBasicBlock::iterator Last, bool PreferLast, bool SaveSCC) {
  if (!SaveSCC)
    return PreferLast ? Last : First;

  LiveRange &LR = LIS->getRegUnit(*MCRegUnitIterator(AMDGPU::SCC, TRI));
  auto MBBE = MBB.end();
  SlotIndex FirstIdx = First != MBBE ? LIS->getInstructionIndex(*First)
                                     : LIS->getMBBEndIdx(&MBB);
  SlotIndex LastIdx =
      Last != MBBE ? LIS->getInstructionIndex(*Last) : LIS->getMBBEndIdx(&MBB);
  SlotIndex Idx = PreferLast ? LastIdx : FirstIdx;
  const LiveRange::Segment *S;

  for (;;) {
    S = LR.getSegmentContaining(Idx);
    if (!S)
      break;

    if (PreferLast) {
      SlotIndex Next = S->start.getBaseIndex();
      if (Next < FirstIdx)
        break;
      Idx = Next;
    } else {
      SlotIndex Next = S->end.getNextIndex().getBaseIndex();
      if (Next > LastIdx)
        break;
      Idx = Next;
    }
  }

  MachineBasicBlock::iterator MBBI;

  if (MachineInstr *MI = LIS->getInstructionFromIndex(Idx))
    MBBI = MI;
  else {
    assert(Idx == LIS->getMBBEndIdx(&MBB));
    MBBI = MBB.end();
  }

  if (S)
    MBBI = saveSCC(MBB, MBBI);

  return MBBI;
}

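// Switch to Exact mode by ANDing EXEC with the live mask. If \p SaveWQM is
// set, the previous (WQM) exec mask is saved there so it can be restored
// later; otherwise EXEC is simply narrowed in place.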
void SIWholeQuadMode::toExact(MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator Before,
                              unsigned SaveWQM, unsigned LiveMaskReg) {
  MachineInstr *MI;

  if (SaveWQM) {
    MI = BuildMI(MBB, Before, DebugLoc(),
                 TII->get(ST->isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
                                         : AMDGPU::S_AND_SAVEEXEC_B64),
                 SaveWQM)
             .addReg(LiveMaskReg);
  } else {
    unsigned Exec = ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
    MI = BuildMI(MBB, Before, DebugLoc(),
                 TII->get(ST->isWave32() ? AMDGPU::S_AND_B32
                                         : AMDGPU::S_AND_B64),
                 Exec)
             .addReg(Exec)
             .addReg(LiveMaskReg);
  }

  LIS->InsertMachineInstrInMaps(*MI);
}

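// Switch back to WQM, either by restoring a previously saved exec mask from
// \p SavedWQM or by recomputing it from the current EXEC with S_WQM.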
void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator Before,
                            unsigned SavedWQM) {
  MachineInstr *MI;

  unsigned Exec = ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
  if (SavedWQM) {
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), Exec)
             .addReg(SavedWQM);
  } else {
    MI = BuildMI(MBB, Before, DebugLoc(),
                 TII->get(ST->isWave32() ? AMDGPU::S_WQM_B32
                                         : AMDGPU::S_WQM_B64),
                 Exec)
             .addReg(Exec);
  }

  LIS->InsertMachineInstrInMaps(*MI);
}

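// Enter Whole Wavefront Mode through the ENTER_WWM pseudo: the current EXEC
// is saved in \p SaveOrig and all lanes are enabled (the S_OR_SAVEEXEC
// pattern shown in the file header).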
void SIWholeQuadMode::toWWM(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator Before,
                            unsigned SaveOrig) {
  MachineInstr *MI;

  assert(SaveOrig);
  MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_WWM), SaveOrig)
           .addImm(-1);
  LIS->InsertMachineInstrInMaps(*MI);
}

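// Leave Whole Wavefront Mode by restoring EXEC from \p SavedOrig through the
// EXIT_WWM pseudo.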
void SIWholeQuadMode::fromWWM(MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator Before,
                              unsigned SavedOrig) {
  MachineInstr *MI;

  assert(SavedOrig);
  MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_WWM),
               ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC)
           .addReg(SavedOrig);
  LIS->InsertMachineInstrInMaps(*MI);
}

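// Walk a block and insert the Exact/WQM/WWM transitions its instructions
// require, based on the analysis results. Transitions are placed within the
// window where no instruction cares about the exact state, preferring late
// positions when entering WQM and early ones when entering Exact.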
void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
                                   bool isEntry) {
  auto BII = Blocks.find(&MBB);
  if (BII == Blocks.end())
    return;

  const BlockInfo &BI = BII->second;

  // This is a non-entry block that is WQM throughout, so no need to do
  // anything.
  if (!isEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact)
    return;

  LLVM_DEBUG(dbgs() << "\nProcessing block " << printMBBReference(MBB)
                    << ":\n");

  unsigned SavedWQMReg = 0;
  unsigned SavedNonWWMReg = 0;
  bool WQMFromExec = isEntry;
  char State = (isEntry || !(BI.InNeeds & StateWQM)) ? StateExact : StateWQM;
  char NonWWMState = 0;
  const TargetRegisterClass *BoolRC = TRI->getBoolRC();

  auto II = MBB.getFirstNonPHI(), IE = MBB.end();
  if (isEntry)
    ++II; // Skip the instruction that saves LiveMask

  // This stores the first instruction where it's safe to switch from WQM to
  // Exact or vice versa.
  MachineBasicBlock::iterator FirstWQM = IE;

  // This stores the first instruction where it's safe to switch from WWM to
  // Exact/WQM or to switch to WWM. It must always be the same as, or after,
  // FirstWQM since if it's safe to switch to/from WWM, it must be safe to
  // switch to/from WQM as well.
  MachineBasicBlock::iterator FirstWWM = IE;
  for (;;) {
    MachineBasicBlock::iterator Next = II;
    char Needs = StateExact | StateWQM; // WWM is disabled by default
    char OutNeeds = 0;

    if (FirstWQM == IE)
      FirstWQM = II;

    if (FirstWWM == IE)
      FirstWWM = II;

    // First, figure out the allowed states (Needs) based on the propagated
    // flags.
    if (II != IE) {
      MachineInstr &MI = *II;

      if (MI.isTerminator() || TII->mayReadEXEC(*MRI, MI)) {
        auto III = Instructions.find(&MI);
        if (III != Instructions.end()) {
          if (III->second.Needs & StateWWM)
            Needs = StateWWM;
          else if (III->second.Needs & StateWQM)
            Needs = StateWQM;
          else
            Needs &= ~III->second.Disabled;
          OutNeeds = III->second.OutNeeds;
        }
      } else {
        // If the instruction doesn't actually need a correct EXEC, then we can
        // safely leave WWM enabled.
        Needs = StateExact | StateWQM | StateWWM;
      }

      if (MI.isTerminator() && OutNeeds == StateExact)
        Needs = StateExact;

      if (MI.getOpcode() == AMDGPU::SI_ELSE && BI.OutNeeds == StateExact)
        MI.getOperand(3).setImm(1);

      ++Next;
    } else {
      // End of basic block
      if (BI.OutNeeds & StateWQM)
        Needs = StateWQM;
      else if (BI.OutNeeds == StateExact)
        Needs = StateExact;
      else
        Needs = StateWQM | StateExact;
    }

    // Now, transition if necessary.
    if (!(Needs & State)) {
      MachineBasicBlock::iterator First;
      if (State == StateWWM || Needs == StateWWM) {
        // We must switch to or from WWM
        First = FirstWWM;
      } else {
        // We only need to switch to/from WQM, so we can use FirstWQM
        First = FirstWQM;
      }

      MachineBasicBlock::iterator Before =
          prepareInsertion(MBB, First, II, Needs == StateWQM,
                           Needs == StateExact || WQMFromExec);

      if (State == StateWWM) {
        assert(SavedNonWWMReg);
        fromWWM(MBB, Before, SavedNonWWMReg);
        LIS->createAndComputeVirtRegInterval(SavedNonWWMReg);
        SavedNonWWMReg = 0;
        State = NonWWMState;
      }

      if (Needs == StateWWM) {
        NonWWMState = State;
        assert(!SavedNonWWMReg);
        SavedNonWWMReg = MRI->createVirtualRegister(BoolRC);
        toWWM(MBB, Before, SavedNonWWMReg);
        State = StateWWM;
      } else {
        if (State == StateWQM && (Needs & StateExact) && !(Needs & StateWQM)) {
          if (!WQMFromExec && (OutNeeds & StateWQM)) {
            assert(!SavedWQMReg);
            SavedWQMReg = MRI->createVirtualRegister(BoolRC);
          }

          toExact(MBB, Before, SavedWQMReg, LiveMaskReg);
          State = StateExact;
        } else if (State == StateExact && (Needs & StateWQM) &&
                   !(Needs & StateExact)) {
          assert(WQMFromExec == (SavedWQMReg == 0));

          toWQM(MBB, Before, SavedWQMReg);

          if (SavedWQMReg) {
            LIS->createAndComputeVirtRegInterval(SavedWQMReg);
            SavedWQMReg = 0;
          }
          State = StateWQM;
        } else {
          // We can get here if we transitioned from WWM to a non-WWM state
          // that already matches our needs; in that case there is nothing
          // left to do.
          assert(Needs & State);
        }
      }
    }

    if (Needs != (StateExact | StateWQM | StateWWM)) {
      if (Needs != (StateExact | StateWQM))
        FirstWQM = IE;
      FirstWWM = IE;
    }

    if (II == IE)
      break;
    II = Next;
  }
  assert(!SavedWQMReg);
  assert(!SavedNonWWMReg);
}

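// Replace each SI_PS_LIVE pseudo with a copy of the live mask register, which
// holds the lanes that were active when the shader was launched.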
void SIWholeQuadMode::lowerLiveMaskQueries(unsigned LiveMaskReg) {
  for (MachineInstr *MI : LiveMaskQueries) {
    const DebugLoc &DL = MI->getDebugLoc();
    Register Dest = MI->getOperand(0).getReg();
    MachineInstr *Copy =
        BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest)
            .addReg(LiveMaskReg);

    LIS->ReplaceMachineInstrInMaps(*MI, *Copy);
    MI->eraseFromParent();
  }
}

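// Lower the WQM/SOFT_WQM/WWM and undef V_SET_INACTIVE pseudos collected
// during scanning into plain V_MOVs or COPYs now that the analysis is done.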
void SIWholeQuadMode::lowerCopyInstrs() {
  for (MachineInstr *MI : LowerToMovInstrs) {
    assert(MI->getNumExplicitOperands() == 2);

    const Register Reg = MI->getOperand(0).getReg();

    if (TRI->isVGPR(*MRI, Reg)) {
      const TargetRegisterClass *RegClass = Register::isVirtualRegister(Reg)
                                                ? MRI->getRegClass(Reg)
                                                : TRI->getPhysRegClass(Reg);

      const unsigned MovOp = TII->getMovOpcode(RegClass);
      MI->setDesc(TII->get(MovOp));

      // And make it implicitly depend on exec (like all VALU movs should do).
      MI->addOperand(MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
    } else {
      MI->setDesc(TII->get(AMDGPU::COPY));
    }
  }
  for (MachineInstr *MI : LowerToCopyInstrs) {
    if (MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B32 ||
        MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B64) {
      assert(MI->getNumExplicitOperands() == 3);
      // The only reason we should be here is that the V_SET_INACTIVE has an
      // undef input, so it is being replaced by a simple copy. There should
      // be a second undef source that we should remove.
      assert(MI->getOperand(2).isUndef());
      MI->RemoveOperand(2);
      MI->untieRegOperand(1);
    } else {
      assert(MI->getNumExplicitOperands() == 2);
    }

    MI->setDesc(TII->get(AMDGPU::COPY));
  }
}

bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
  Instructions.clear();
  Blocks.clear();
  LiveMaskQueries.clear();
  LowerToCopyInstrs.clear();
  LowerToMovInstrs.clear();
  CallingConv = MF.getFunction().getCallingConv();

  ST = &MF.getSubtarget<GCNSubtarget>();

  TII = ST->getInstrInfo();
  TRI = &TII->getRegisterInfo();
  MRI = &MF.getRegInfo();
  LIS = &getAnalysis<LiveIntervals>();

  char GlobalFlags = analyzeFunction(MF);
  unsigned LiveMaskReg = 0;
  unsigned Exec = ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
  if (!(GlobalFlags & StateWQM)) {
    lowerLiveMaskQueries(Exec);
    if (!(GlobalFlags & StateWWM) && LowerToCopyInstrs.empty() &&
        LowerToMovInstrs.empty())
      return !LiveMaskQueries.empty();
  } else {
    // Store a copy of the original live mask when required
    MachineBasicBlock &Entry = MF.front();
    MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI();

    if (GlobalFlags & StateExact || !LiveMaskQueries.empty()) {
      LiveMaskReg = MRI->createVirtualRegister(TRI->getBoolRC());
      MachineInstr *MI = BuildMI(Entry, EntryMI, DebugLoc(),
                                 TII->get(AMDGPU::COPY), LiveMaskReg)
                             .addReg(Exec);
      LIS->InsertMachineInstrInMaps(*MI);
    }

    lowerLiveMaskQueries(LiveMaskReg);

    if (GlobalFlags == StateWQM) {
      // For a shader that needs only WQM, we can just set it once.
      auto MI = BuildMI(Entry, EntryMI, DebugLoc(),
                        TII->get(ST->isWave32() ? AMDGPU::S_WQM_B32
                                                : AMDGPU::S_WQM_B64),
                        Exec)
                    .addReg(Exec);
      LIS->InsertMachineInstrInMaps(*MI);

      lowerCopyInstrs();
      // EntryMI may become invalid here
      return true;
    }
  }

  LLVM_DEBUG(printInfo());

  lowerCopyInstrs();

  // Handle the general case
  for (auto BII : Blocks)
    processBlock(*BII.first, LiveMaskReg, BII.first == &*MF.begin());

  if (LiveMaskReg)
    LIS->createAndComputeVirtRegInterval(LiveMaskReg);

  // Physical registers like SCC aren't tracked by default anyway, so just
  // removing the ranges we computed is the simplest option for maintaining
  // the analysis results.
  LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::SCC, TRI));

  return true;
}