//===-- SIPreEmitPeephole.cpp ------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass performs peephole optimizations just before code emission.
///
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/Support/CommandLine.h"

using namespace llvm;

#define DEBUG_TYPE "si-pre-emit-peephole"

static unsigned SkipThreshold;

static cl::opt<unsigned, true> SkipThresholdFlag(
    "amdgpu-skip-threshold", cl::Hidden,
    cl::desc(
        "Number of instructions before jumping over divergent control flow"),
    cl::location(SkipThreshold), cl::init(12));

namespace {

class SIPreEmitPeephole : public MachineFunctionPass {
private:
  const SIInstrInfo *TII = nullptr;
  const SIRegisterInfo *TRI = nullptr;

  bool optimizeVccBranch(MachineInstr &MI) const;
  bool optimizeSetGPR(MachineInstr &First, MachineInstr &MI) const;
  bool getBlockDestinations(MachineBasicBlock &SrcMBB,
                            MachineBasicBlock *&TrueMBB,
                            MachineBasicBlock *&FalseMBB,
                            SmallVectorImpl<MachineOperand> &Cond);
  bool mustRetainExeczBranch(const MachineBasicBlock &From,
                             const MachineBasicBlock &To) const;
  bool removeExeczBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB);

public:
  static char ID;

  SIPreEmitPeephole() : MachineFunctionPass(ID) {
    initializeSIPreEmitPeepholePass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;
};

} // End anonymous namespace.

INITIALIZE_PASS(SIPreEmitPeephole, DEBUG_TYPE,
                "SI peephole optimizations", false, false)

char SIPreEmitPeephole::ID = 0;

char &llvm::SIPreEmitPeepholeID = SIPreEmitPeephole::ID;

bool SIPreEmitPeephole::optimizeVccBranch(MachineInstr &MI) const {
  // Match:
  // sreg = -1 or 0
  // vcc = S_AND_B64 exec, sreg or S_ANDN2_B64 exec, sreg
  // S_CBRANCH_VCC[N]Z
  // =>
  // S_CBRANCH_EXEC[N]Z
  // We end up with this pattern sometimes after basic block placement.
  // It happens while combining a block which assigns -1 or 0 to a saved mask
  // and another block which consumes that saved mask and then branches.
  bool Changed = false;
  MachineBasicBlock &MBB = *MI.getParent();
  const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
  const bool IsWave32 = ST.isWave32();
  const unsigned CondReg = TRI->getVCC();
  const unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
  const unsigned And = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
  const unsigned AndN2 = IsWave32 ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_ANDN2_B64;
  const unsigned Mov = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;

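  // Walk backwards from the branch, looking for the instruction that defines
  // vcc. Give up if EXEC is clobbered first, if vcc is modified by anything
  // other than the expected S_AND/S_ANDN2, or once a small instruction budget
  // is exhausted.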
  MachineBasicBlock::reverse_iterator A = MI.getReverseIterator(),
                                      E = MBB.rend();
  bool ReadsCond = false;
  unsigned Threshold = 5;
  for (++A; A != E; ++A) {
    if (!--Threshold)
      return false;
    if (A->modifiesRegister(ExecReg, TRI))
      return false;
    if (A->modifiesRegister(CondReg, TRI)) {
      if (!A->definesRegister(CondReg, TRI) ||
          (A->getOpcode() != And && A->getOpcode() != AndN2))
        return false;
      break;
    }
    ReadsCond |= A->readsRegister(CondReg, TRI);
  }
  if (A == E)
    return false;

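  // A is now the S_AND/S_ANDN2 defining vcc. Canonicalize EXEC into the first
  // source operand; bail out if EXEC is not an input at all, or if the mask
  // is an immediate other than 0 or -1.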
  MachineOperand &Op1 = A->getOperand(1);
  MachineOperand &Op2 = A->getOperand(2);
  if (Op1.getReg() != ExecReg && Op2.isReg() && Op2.getReg() == ExecReg) {
    TII->commuteInstruction(*A);
    Changed = true;
  }
  if (Op1.getReg() != ExecReg)
    return Changed;
  if (Op2.isImm() && !(Op2.getImm() == -1 || Op2.getImm() == 0))
    return Changed;

  int64_t MaskValue = 0;
  Register SReg;
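  // Determine the mask value. A register mask is traced back to the
  // move-immediate that defines it, giving up if anything else clobbers the
  // register on the way; when the register is killed here and has no readers
  // in between, the immediate is also folded directly into the AND.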
  if (Op2.isReg()) {
    SReg = Op2.getReg();
    auto M = std::next(A);
    bool ReadsSreg = false;
    for (; M != E; ++M) {
      if (M->definesRegister(SReg, TRI))
        break;
      if (M->modifiesRegister(SReg, TRI))
        return Changed;
      ReadsSreg |= M->readsRegister(SReg, TRI);
    }
    if (M == E || !M->isMoveImmediate() || !M->getOperand(1).isImm() ||
        (M->getOperand(1).getImm() != -1 && M->getOperand(1).getImm() != 0))
      return Changed;
    MaskValue = M->getOperand(1).getImm();
    // First, if sreg is only used in the AND instruction, fold the immediate
    // into the AND.
    if (!ReadsSreg && Op2.isKill()) {
      A->getOperand(2).ChangeToImmediate(MaskValue);
      M->eraseFromParent();
    }
  } else if (Op2.isImm()) {
    MaskValue = Op2.getImm();
  } else {
    llvm_unreachable("Op2 must be register or immediate");
  }

  // Invert mask for s_andn2
  assert(MaskValue == 0 || MaskValue == -1);
  if (A->getOpcode() == AndN2)
    MaskValue = ~MaskValue;

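  // If SCC from the AND is dead and vcc is not read before the branch, the
  // AND itself is removable. If the branch does not kill vcc, first
  // materialize the known value with a plain move so later readers still see
  // it.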
  if (!ReadsCond && A->registerDefIsDead(AMDGPU::SCC)) {
    if (!MI.killsRegister(CondReg, TRI)) {
      // Replace AND with MOV
      if (MaskValue == 0) {
        BuildMI(*A->getParent(), *A, A->getDebugLoc(), TII->get(Mov), CondReg)
            .addImm(0);
      } else {
        BuildMI(*A->getParent(), *A, A->getDebugLoc(), TII->get(Mov), CondReg)
            .addReg(ExecReg);
      }
    }
    // Remove AND instruction
    A->eraseFromParent();
  }

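  // At this point the branch outcome is fully determined by MaskValue and
  // EXEC, so the conditional branch can be deleted, rewritten as an
  // unconditional branch, or retargeted at EXEC.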
  bool IsVCCZ = MI.getOpcode() == AMDGPU::S_CBRANCH_VCCZ;
  if (SReg == ExecReg) {
    // EXEC is updated directly
    if (IsVCCZ) {
      MI.eraseFromParent();
      return true;
    }
    MI.setDesc(TII->get(AMDGPU::S_BRANCH));
  } else if (IsVCCZ && MaskValue == 0) {
    // Will always branch
    // Remove all successors shadowed by the new unconditional branch
    MachineBasicBlock *Parent = MI.getParent();
    SmallVector<MachineInstr *, 4> ToRemove;
    bool Found = false;
    for (MachineInstr &Term : Parent->terminators()) {
      if (Found) {
        if (Term.isBranch())
          ToRemove.push_back(&Term);
      } else {
        Found = Term.isIdenticalTo(MI);
      }
    }
    assert(Found && "conditional branch is not terminator");
    for (auto BranchMI : ToRemove) {
      MachineOperand &Dst = BranchMI->getOperand(0);
      assert(Dst.isMBB() && "destination is not basic block");
      Parent->removeSuccessor(Dst.getMBB());
      BranchMI->eraseFromParent();
    }

    if (MachineBasicBlock *Succ = Parent->getFallThrough()) {
      Parent->removeSuccessor(Succ);
    }

    // Rewrite to unconditional branch
    MI.setDesc(TII->get(AMDGPU::S_BRANCH));
  } else if (!IsVCCZ && MaskValue == 0) {
    // Will never branch
    MachineOperand &Dst = MI.getOperand(0);
    assert(Dst.isMBB() && "destination is not basic block");
    MI.getParent()->removeSuccessor(Dst.getMBB());
    MI.eraseFromParent();
    return true;
  } else if (MaskValue == -1) {
    // Depends only on EXEC
    MI.setDesc(
        TII->get(IsVCCZ ? AMDGPU::S_CBRANCH_EXECZ : AMDGPU::S_CBRANCH_EXECNZ));
  }

  MI.RemoveOperand(MI.findRegisterUseOperandIdx(CondReg, false /*Kill*/, TRI));
  MI.addImplicitDefUseOperands(*MBB.getParent());

  return true;
}

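// Remove the second of two identical S_SET_GPR_IDX_ON instructions when the
// instructions between them change neither M0, the index register, nor the
// indexing mode, erasing any intervening S_SET_GPR_IDX_OFF as well. Returns
// true if \p MI was removed. A sketch of the pattern (operand details
// elided, not taken from a real compilation):
//
//   s_set_gpr_idx_on s0, gpr_idx(SRC0)
//   v_mov_b32 v1, v5
//   s_set_gpr_idx_off
//   s_set_gpr_idx_on s0, gpr_idx(SRC0)  ; identical to the first -> removable
//   v_mov_b32 v2, v6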
bool SIPreEmitPeephole::optimizeSetGPR(MachineInstr &First,
                                       MachineInstr &MI) const {
  MachineBasicBlock &MBB = *MI.getParent();
  const MachineFunction &MF = *MBB.getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  Register IdxReg = Idx->isReg() ? Idx->getReg() : Register();
  SmallVector<MachineInstr *, 4> ToRemove;
  bool IdxOn = true;

  if (!MI.isIdenticalTo(First))
    return false;

  // Scan the instructions between the two identical S_SET_GPR_IDX_ON to make
  // sure removing the second one is safe.
  for (MachineBasicBlock::instr_iterator I = std::next(First.getIterator()),
                                         E = MI.getIterator();
       I != E; ++I) {
    if (I->isBundle())
      continue;
    switch (I->getOpcode()) {
    case AMDGPU::S_SET_GPR_IDX_MODE:
      return false;
    case AMDGPU::S_SET_GPR_IDX_OFF:
      IdxOn = false;
      ToRemove.push_back(&*I);
      break;
    default:
      if (I->modifiesRegister(AMDGPU::M0, TRI))
        return false;
      if (IdxReg && I->modifiesRegister(IdxReg, TRI))
        return false;
      if (llvm::any_of(I->operands(),
                       [&MRI, this](const MachineOperand &MO) {
                         return MO.isReg() &&
                                TRI->isVectorRegister(MRI, MO.getReg());
                       })) {
        // The only exception allowed here is another indirect vector move
        // with the same mode.
        if (!IdxOn ||
            !((I->getOpcode() == AMDGPU::V_MOV_B32_e32 &&
               I->hasRegisterImplicitUseOperand(AMDGPU::M0)) ||
              I->getOpcode() == AMDGPU::V_MOV_B32_indirect))
          return false;
      }
    }
  }

  MI.eraseFromBundle();
  for (MachineInstr *RI : ToRemove)
    RI->eraseFromBundle();
  return true;
}

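// Use analyzeBranch to find the true and false destinations of the branch
// terminating \p SrcMBB. A missing false destination means a fall-through,
// so the next block in layout order is used. Returns false if the branch
// cannot be analyzed.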
bool SIPreEmitPeephole::getBlockDestinations(
    MachineBasicBlock &SrcMBB, MachineBasicBlock *&TrueMBB,
    MachineBasicBlock *&FalseMBB, SmallVectorImpl<MachineOperand> &Cond) {
  if (TII->analyzeBranch(SrcMBB, TrueMBB, FalseMBB, Cond))
    return false;

  if (!FalseMBB)
    FalseMBB = SrcMBB.getNextNode();

  return true;
}

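// Conservatively decide whether the S_CBRANCH_EXECZ that skips the blocks
// from \p From up to (but not including) \p To has to be kept. The branch is
// retained if the region contains a conditional branch, an instruction with
// unwanted side effects when EXEC = 0, a potentially expensive memory
// operation, or at least SkipThreshold instructions.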
bool SIPreEmitPeephole::mustRetainExeczBranch(
    const MachineBasicBlock &From, const MachineBasicBlock &To) const {
  unsigned NumInstr = 0;
  const MachineFunction *MF = From.getParent();

  for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end();
       MBBI != End && MBBI != ToI; ++MBBI) {
    const MachineBasicBlock &MBB = *MBBI;

    for (MachineBasicBlock::const_iterator I = MBB.begin(), E = MBB.end();
         I != E; ++I) {
      // When a uniform loop is inside non-uniform control flow, the branch
      // leaving the loop might never be taken when EXEC = 0.
      // Hence we should retain the cbranch out of the loop lest it become
      // infinite.
      if (I->isConditionalBranch())
        return true;

      if (TII->hasUnwantedEffectsWhenEXECEmpty(*I))
        return true;

      // These instructions are potentially expensive even if EXEC = 0.
      if (TII->isSMRD(*I) || TII->isVMEM(*I) || TII->isFLAT(*I) ||
          TII->isDS(*I) || I->getOpcode() == AMDGPU::S_WAITCNT)
        return true;

      ++NumInstr;
      if (NumInstr >= SkipThreshold)
        return true;
    }
  }

  return false;
}

// Returns true if the skip branch instruction is removed.
bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI,
                                          MachineBasicBlock &SrcMBB) {
  MachineBasicBlock *TrueMBB = nullptr;
  MachineBasicBlock *FalseMBB = nullptr;
  SmallVector<MachineOperand, 1> Cond;

  if (!getBlockDestinations(SrcMBB, TrueMBB, FalseMBB, Cond))
    return false;

  // Consider only the forward branches.
  if ((SrcMBB.getNumber() >= TrueMBB->getNumber()) ||
      mustRetainExeczBranch(*FalseMBB, *TrueMBB))
    return false;

  LLVM_DEBUG(dbgs() << "Removing the execz branch: " << MI);
  MI.eraseFromParent();
  SrcMBB.removeSuccessor(TrueMBB);

  return true;
}

bool SIPreEmitPeephole::runOnMachineFunction(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  TII = ST.getInstrInfo();
  TRI = &TII->getRegisterInfo();
  bool Changed = false;

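  // Renumber blocks so that MachineBasicBlock::getNumber() reflects layout
  // order; removeExeczBranch relies on the numbering to recognize forward
  // branches.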
  MF.RenumberBlocks();

  for (MachineBasicBlock &MBB : MF) {
    MachineBasicBlock::iterator TermI = MBB.getFirstTerminator();
    // Check first terminator for branches to optimize
    if (TermI != MBB.end()) {
      MachineInstr &MI = *TermI;
      switch (MI.getOpcode()) {
      case AMDGPU::S_CBRANCH_VCCZ:
      case AMDGPU::S_CBRANCH_VCCNZ:
        Changed |= optimizeVccBranch(MI);
        break;
      case AMDGPU::S_CBRANCH_EXECZ:
        Changed |= removeExeczBranch(MI, MBB);
        break;
      }
    }

    if (!ST.hasVGPRIndexMode())
      continue;

    MachineInstr *SetGPRMI = nullptr;
    const unsigned Threshold = 20;
    unsigned Count = 0;
    // Scan the block for pairs of S_SET_GPR_IDX_ON instructions to see if the
    // second one is redundant. Expensive checks are done in optimizeSetGPR(),
    // and the distance between the pair is limited to 20 instructions for
    // compile-time purposes.
    // Note: this needs to work on bundles as S_SET_GPR_IDX* instructions
    // may be bundled with the instructions they modify.
    for (auto &MI :
         make_early_inc_range(make_range(MBB.instr_begin(), MBB.instr_end()))) {
      if (Count == Threshold)
        SetGPRMI = nullptr;
      else
        ++Count;

      if (MI.getOpcode() != AMDGPU::S_SET_GPR_IDX_ON)
        continue;

      Count = 0;
      if (!SetGPRMI) {
        SetGPRMI = &MI;
        continue;
      }

      if (optimizeSetGPR(*SetGPRMI, MI))
        Changed = true;
      else
        SetGPRMI = &MI;
    }
  }

  return Changed;
}
