1//=== lib/CodeGen/GlobalISel/AMDGPURegBankCombiner.cpp ---------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass does combining of machine instructions at the generic MI level,
10// after register banks are known.
11//
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPU.h"
15#include "AMDGPULegalizerInfo.h"
16#include "AMDGPURegisterBankInfo.h"
17#include "GCNSubtarget.h"
18#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
19#include "SIMachineFunctionInfo.h"
20#include "llvm/CodeGen/GlobalISel/Combiner.h"
21#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
22#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
23#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h"
24#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
25#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26#include "llvm/CodeGen/MachineDominators.h"
27#include "llvm/CodeGen/TargetPassConfig.h"
28#include "llvm/IR/IntrinsicsAMDGPU.h"
29#include "llvm/Target/TargetMachine.h"
30
31#define GET_GICOMBINER_DEPS
32#include "AMDGPUGenPreLegalizeGICombiner.inc"
33#undef GET_GICOMBINER_DEPS
34
35#define DEBUG_TYPE "amdgpu-regbank-combiner"
36
37using namespace llvm;
38using namespace MIPatternMatch;
39
40namespace {
41#define GET_GICOMBINER_TYPES
42#include "AMDGPUGenRegBankGICombiner.inc"
43#undef GET_GICOMBINER_TYPES
44
45class AMDGPURegBankCombinerImpl : public Combiner {
46protected:
47  const AMDGPURegBankCombinerImplRuleConfig &RuleConfig;
48  const GCNSubtarget &STI;
49  const RegisterBankInfo &RBI;
50  const TargetRegisterInfo &TRI;
51  const SIInstrInfo &TII;
52  // TODO: Make CombinerHelper methods const.
53  mutable CombinerHelper Helper;
54
55public:
56  AMDGPURegBankCombinerImpl(
57      MachineFunction &MF, CombinerInfo &CInfo, const TargetPassConfig *TPC,
58      GISelKnownBits &KB, GISelCSEInfo *CSEInfo,
59      const AMDGPURegBankCombinerImplRuleConfig &RuleConfig,
60      const GCNSubtarget &STI, MachineDominatorTree *MDT,
61      const LegalizerInfo *LI);
62
63  static const char *getName() { return "AMDGPURegBankCombinerImpl"; }
64
65  bool tryCombineAll(MachineInstr &I) const override;
66
67  bool isVgprRegBank(Register Reg) const;
68  Register getAsVgpr(Register Reg) const;
69
70  struct MinMaxMedOpc {
71    unsigned Min, Max, Med;
72  };
73
74  struct Med3MatchInfo {
75    unsigned Opc;
76    Register Val0, Val1, Val2;
77  };
78
79  MinMaxMedOpc getMinMaxPair(unsigned Opc) const;
80
81  template <class m_Cst, typename CstTy>
82  bool matchMed(MachineInstr &MI, MachineRegisterInfo &MRI, MinMaxMedOpc MMMOpc,
83                Register &Val, CstTy &K0, CstTy &K1) const;
84
85  bool matchIntMinMaxToMed3(MachineInstr &MI, Med3MatchInfo &MatchInfo) const;
86  bool matchFPMinMaxToMed3(MachineInstr &MI, Med3MatchInfo &MatchInfo) const;
87  bool matchFPMinMaxToClamp(MachineInstr &MI, Register &Reg) const;
88  bool matchFPMed3ToClamp(MachineInstr &MI, Register &Reg) const;
89  void applyMed3(MachineInstr &MI, Med3MatchInfo &MatchInfo) const;
90  void applyClamp(MachineInstr &MI, Register &Reg) const;
91
92private:
93  SIModeRegisterDefaults getMode() const;
94  bool getIEEE() const;
95  bool getDX10Clamp() const;
96  bool isFminnumIeee(const MachineInstr &MI) const;
97  bool isFCst(MachineInstr *MI) const;
98  bool isClampZeroToOne(MachineInstr *K0, MachineInstr *K1) const;
99
100#define GET_GICOMBINER_CLASS_MEMBERS
101#define AMDGPUSubtarget GCNSubtarget
102#include "AMDGPUGenRegBankGICombiner.inc"
103#undef GET_GICOMBINER_CLASS_MEMBERS
104#undef AMDGPUSubtarget
105};
106
107#define GET_GICOMBINER_IMPL
108#define AMDGPUSubtarget GCNSubtarget
109#include "AMDGPUGenRegBankGICombiner.inc"
110#undef AMDGPUSubtarget
111#undef GET_GICOMBINER_IMPL
112
113AMDGPURegBankCombinerImpl::AMDGPURegBankCombinerImpl(
114    MachineFunction &MF, CombinerInfo &CInfo, const TargetPassConfig *TPC,
115    GISelKnownBits &KB, GISelCSEInfo *CSEInfo,
116    const AMDGPURegBankCombinerImplRuleConfig &RuleConfig,
117    const GCNSubtarget &STI, MachineDominatorTree *MDT, const LegalizerInfo *LI)
118    : Combiner(MF, CInfo, TPC, &KB, CSEInfo), RuleConfig(RuleConfig), STI(STI),
119      RBI(*STI.getRegBankInfo()), TRI(*STI.getRegisterInfo()),
120      TII(*STI.getInstrInfo()),
121      Helper(Observer, B, /*IsPreLegalize*/ false, &KB, MDT, LI),
122#define GET_GICOMBINER_CONSTRUCTOR_INITS
123#include "AMDGPUGenRegBankGICombiner.inc"
124#undef GET_GICOMBINER_CONSTRUCTOR_INITS
125{
126}
127
128bool AMDGPURegBankCombinerImpl::isVgprRegBank(Register Reg) const {
129  return RBI.getRegBank(Reg, MRI, TRI)->getID() == AMDGPU::VGPRRegBankID;
130}
131
132Register AMDGPURegBankCombinerImpl::getAsVgpr(Register Reg) const {
133  if (isVgprRegBank(Reg))
134    return Reg;
135
136  // Search for existing copy of Reg to vgpr.
137  for (MachineInstr &Use : MRI.use_instructions(Reg)) {
138    Register Def = Use.getOperand(0).getReg();
139    if (Use.getOpcode() == AMDGPU::COPY && isVgprRegBank(Def))
140      return Def;
141  }
142
143  // Copy Reg to vgpr.
144  Register VgprReg = B.buildCopy(MRI.getType(Reg), Reg).getReg(0);
145  MRI.setRegBank(VgprReg, RBI.getRegBank(AMDGPU::VGPRRegBankID));
146  return VgprReg;
147}
148
149AMDGPURegBankCombinerImpl::MinMaxMedOpc
150AMDGPURegBankCombinerImpl::getMinMaxPair(unsigned Opc) const {
151  switch (Opc) {
152  default:
153    llvm_unreachable("Unsupported opcode");
154  case AMDGPU::G_SMAX:
155  case AMDGPU::G_SMIN:
156    return {AMDGPU::G_SMIN, AMDGPU::G_SMAX, AMDGPU::G_AMDGPU_SMED3};
157  case AMDGPU::G_UMAX:
158  case AMDGPU::G_UMIN:
159    return {AMDGPU::G_UMIN, AMDGPU::G_UMAX, AMDGPU::G_AMDGPU_UMED3};
160  case AMDGPU::G_FMAXNUM:
161  case AMDGPU::G_FMINNUM:
162    return {AMDGPU::G_FMINNUM, AMDGPU::G_FMAXNUM, AMDGPU::G_AMDGPU_FMED3};
163  case AMDGPU::G_FMAXNUM_IEEE:
164  case AMDGPU::G_FMINNUM_IEEE:
165    return {AMDGPU::G_FMINNUM_IEEE, AMDGPU::G_FMAXNUM_IEEE,
166            AMDGPU::G_AMDGPU_FMED3};
167  }
168}
169
170template <class m_Cst, typename CstTy>
171bool AMDGPURegBankCombinerImpl::matchMed(MachineInstr &MI,
172                                         MachineRegisterInfo &MRI,
173                                         MinMaxMedOpc MMMOpc, Register &Val,
174                                         CstTy &K0, CstTy &K1) const {
175  // 4 operand commutes of: min(max(Val, K0), K1).
176  // Find K1 from outer instr: min(max(...), K1) or min(K1, max(...)).
177  // Find K0 and Val from inner instr: max(K0, Val) or max(Val, K0).
178  // 4 operand commutes of: max(min(Val, K1), K0).
179  // Find K0 from outer instr: max(min(...), K0) or max(K0, min(...)).
180  // Find K1 and Val from inner instr: min(K1, Val) or min(Val, K1).
181  return mi_match(
182      MI, MRI,
183      m_any_of(
184          m_CommutativeBinOp(
185              MMMOpc.Min, m_CommutativeBinOp(MMMOpc.Max, m_Reg(Val), m_Cst(K0)),
186              m_Cst(K1)),
187          m_CommutativeBinOp(
188              MMMOpc.Max, m_CommutativeBinOp(MMMOpc.Min, m_Reg(Val), m_Cst(K1)),
189              m_Cst(K0))));
190}
191
192bool AMDGPURegBankCombinerImpl::matchIntMinMaxToMed3(
193    MachineInstr &MI, Med3MatchInfo &MatchInfo) const {
194  Register Dst = MI.getOperand(0).getReg();
195  if (!isVgprRegBank(Dst))
196    return false;
197
198  // med3 for i16 is only available on gfx9+, and not available for v2i16.
199  LLT Ty = MRI.getType(Dst);
200  if ((Ty != LLT::scalar(16) || !STI.hasMed3_16()) && Ty != LLT::scalar(32))
201    return false;
202
203  MinMaxMedOpc OpcodeTriple = getMinMaxPair(MI.getOpcode());
204  Register Val;
205  std::optional<ValueAndVReg> K0, K1;
206  // Match min(max(Val, K0), K1) or max(min(Val, K1), K0). Then see if K0 <= K1.
207  if (!matchMed<GCstAndRegMatch>(MI, MRI, OpcodeTriple, Val, K0, K1))
208    return false;
209
210  if (OpcodeTriple.Med == AMDGPU::G_AMDGPU_SMED3 && K0->Value.sgt(K1->Value))
211    return false;
212  if (OpcodeTriple.Med == AMDGPU::G_AMDGPU_UMED3 && K0->Value.ugt(K1->Value))
213    return false;
214
215  MatchInfo = {OpcodeTriple.Med, Val, K0->VReg, K1->VReg};
216  return true;
217}
218
219// fmed3(NaN, K0, K1) = min(min(NaN, K0), K1)
220// ieee = true  : min/max(SNaN, K) = QNaN, min/max(QNaN, K) = K
221// ieee = false : min/max(NaN, K) = K
222// clamp(NaN) = dx10_clamp ? 0.0 : NaN
223// Consider values of min(max(Val, K0), K1) and max(min(Val, K1), K0) as input.
224// Other operand commutes (see matchMed) give same result since min and max are
225// commutative.
226
// Try to replace fp min(max(Val, K0), K1) or max(min(Val, K1), K0), K0 <= K1,
// with fmed3(Val, K0, K1) or clamp(Val). Clamp requires K0 = 0.0 and K1 = 1.0.
229// Val = SNaN only for ieee = true
230// fmed3(SNaN, K0, K1) = min(min(SNaN, K0), K1) = min(QNaN, K1) = K1
231// min(max(SNaN, K0), K1) = min(QNaN, K1) = K1
232// max(min(SNaN, K1), K0) = max(K1, K0) = K1
233// Val = NaN,ieee = false or Val = QNaN,ieee = true
234// fmed3(NaN, K0, K1) = min(min(NaN, K0), K1) = min(K0, K1) = K0
235// min(max(NaN, K0), K1) = min(K0, K1) = K0 (can clamp when dx10_clamp = true)
236// max(min(NaN, K1), K0) = max(K1, K0) = K1 != K0
237bool AMDGPURegBankCombinerImpl::matchFPMinMaxToMed3(
238    MachineInstr &MI, Med3MatchInfo &MatchInfo) const {
239  Register Dst = MI.getOperand(0).getReg();
240  LLT Ty = MRI.getType(Dst);
241
242  // med3 for f16 is only available on gfx9+, and not available for v2f16.
243  if ((Ty != LLT::scalar(16) || !STI.hasMed3_16()) && Ty != LLT::scalar(32))
244    return false;
245
246  auto OpcodeTriple = getMinMaxPair(MI.getOpcode());
247
248  Register Val;
249  std::optional<FPValueAndVReg> K0, K1;
250  // Match min(max(Val, K0), K1) or max(min(Val, K1), K0). Then see if K0 <= K1.
251  if (!matchMed<GFCstAndRegMatch>(MI, MRI, OpcodeTriple, Val, K0, K1))
252    return false;
253
254  if (K0->Value > K1->Value)
255    return false;
256
257  // For IEEE=false perform combine only when it's safe to assume that there are
258  // no NaN inputs. Most often MI is marked with nnan fast math flag.
259  // For IEEE=true consider NaN inputs. fmed3(NaN, K0, K1) is equivalent to
260  // min(min(NaN, K0), K1). Safe to fold for min(max(Val, K0), K1) since inner
261  // nodes(max/min) have same behavior when one input is NaN and other isn't.
262  // Don't consider max(min(SNaN, K1), K0) since there is no isKnownNeverQNaN,
263  // also post-legalizer inputs to min/max are fcanonicalized (never SNaN).
264  if ((getIEEE() && isFminnumIeee(MI)) || isKnownNeverNaN(Dst, MRI)) {
265    // Don't fold single use constant that can't be inlined.
266    if ((!MRI.hasOneNonDBGUse(K0->VReg) || TII.isInlineConstant(K0->Value)) &&
267        (!MRI.hasOneNonDBGUse(K1->VReg) || TII.isInlineConstant(K1->Value))) {
268      MatchInfo = {OpcodeTriple.Med, Val, K0->VReg, K1->VReg};
269      return true;
270    }
271  }
272
273  return false;
274}
275
276bool AMDGPURegBankCombinerImpl::matchFPMinMaxToClamp(MachineInstr &MI,
277                                                     Register &Reg) const {
278  // Clamp is available on all types after regbankselect (f16, f32, f64, v2f16).
279  auto OpcodeTriple = getMinMaxPair(MI.getOpcode());
280  Register Val;
281  std::optional<FPValueAndVReg> K0, K1;
282  // Match min(max(Val, K0), K1) or max(min(Val, K1), K0).
283  if (!matchMed<GFCstOrSplatGFCstMatch>(MI, MRI, OpcodeTriple, Val, K0, K1))
284    return false;
285
286  if (!K0->Value.isExactlyValue(0.0) || !K1->Value.isExactlyValue(1.0))
287    return false;
288
289  // For IEEE=false perform combine only when it's safe to assume that there are
290  // no NaN inputs. Most often MI is marked with nnan fast math flag.
291  // For IEEE=true consider NaN inputs. Only min(max(QNaN, 0.0), 1.0) evaluates
292  // to 0.0 requires dx10_clamp = true.
293  if ((getIEEE() && getDX10Clamp() && isFminnumIeee(MI) &&
294       isKnownNeverSNaN(Val, MRI)) ||
295      isKnownNeverNaN(MI.getOperand(0).getReg(), MRI)) {
296    Reg = Val;
297    return true;
298  }
299
300  return false;
301}
302
303// Replacing fmed3(NaN, 0.0, 1.0) with clamp. Requires dx10_clamp = true.
304// Val = SNaN only for ieee = true. It is important which operand is NaN.
305// min(min(SNaN, 0.0), 1.0) = min(QNaN, 1.0) = 1.0
306// min(min(SNaN, 1.0), 0.0) = min(QNaN, 0.0) = 0.0
307// min(min(0.0, 1.0), SNaN) = min(0.0, SNaN) = QNaN
308// Val = NaN,ieee = false or Val = QNaN,ieee = true
309// min(min(NaN, 0.0), 1.0) = min(0.0, 1.0) = 0.0
310// min(min(NaN, 1.0), 0.0) = min(1.0, 0.0) = 0.0
311// min(min(0.0, 1.0), NaN) = min(0.0, NaN) = 0.0
312bool AMDGPURegBankCombinerImpl::matchFPMed3ToClamp(MachineInstr &MI,
313                                                   Register &Reg) const {
314  // In llvm-ir, clamp is often represented as an intrinsic call to
315  // @llvm.amdgcn.fmed3.f32(%Val, 0.0, 1.0). Check for other operand orders.
316  MachineInstr *Src0 = getDefIgnoringCopies(MI.getOperand(1).getReg(), MRI);
317  MachineInstr *Src1 = getDefIgnoringCopies(MI.getOperand(2).getReg(), MRI);
318  MachineInstr *Src2 = getDefIgnoringCopies(MI.getOperand(3).getReg(), MRI);
319
320  if (isFCst(Src0) && !isFCst(Src1))
321    std::swap(Src0, Src1);
322  if (isFCst(Src1) && !isFCst(Src2))
323    std::swap(Src1, Src2);
324  if (isFCst(Src0) && !isFCst(Src1))
325    std::swap(Src0, Src1);
326  if (!isClampZeroToOne(Src1, Src2))
327    return false;
328
329  Register Val = Src0->getOperand(0).getReg();
330
331  auto isOp3Zero = [&]() {
332    MachineInstr *Op3 = getDefIgnoringCopies(MI.getOperand(4).getReg(), MRI);
333    if (Op3->getOpcode() == TargetOpcode::G_FCONSTANT)
334      return Op3->getOperand(1).getFPImm()->isExactlyValue(0.0);
335    return false;
336  };
337  // For IEEE=false perform combine only when it's safe to assume that there are
338  // no NaN inputs. Most often MI is marked with nnan fast math flag.
339  // For IEEE=true consider NaN inputs. Requires dx10_clamp = true. Safe to fold
340  // when Val could be QNaN. If Val can also be SNaN third input should be 0.0.
341  if (isKnownNeverNaN(MI.getOperand(0).getReg(), MRI) ||
342      (getIEEE() && getDX10Clamp() &&
343       (isKnownNeverSNaN(Val, MRI) || isOp3Zero()))) {
344    Reg = Val;
345    return true;
346  }
347
348  return false;
349}
350
351void AMDGPURegBankCombinerImpl::applyClamp(MachineInstr &MI,
352                                           Register &Reg) const {
353  B.setInstrAndDebugLoc(MI);
354  B.buildInstr(AMDGPU::G_AMDGPU_CLAMP, {MI.getOperand(0)}, {Reg},
355               MI.getFlags());
356  MI.eraseFromParent();
357}
358
359void AMDGPURegBankCombinerImpl::applyMed3(MachineInstr &MI,
360                                          Med3MatchInfo &MatchInfo) const {
361  B.setInstrAndDebugLoc(MI);
362  B.buildInstr(MatchInfo.Opc, {MI.getOperand(0)},
363               {getAsVgpr(MatchInfo.Val0), getAsVgpr(MatchInfo.Val1),
364                getAsVgpr(MatchInfo.Val2)},
365               MI.getFlags());
366  MI.eraseFromParent();
367}
368
369SIModeRegisterDefaults AMDGPURegBankCombinerImpl::getMode() const {
370  return MF.getInfo<SIMachineFunctionInfo>()->getMode();
371}
372
373bool AMDGPURegBankCombinerImpl::getIEEE() const { return getMode().IEEE; }
374
375bool AMDGPURegBankCombinerImpl::getDX10Clamp() const {
376  return getMode().DX10Clamp;
377}
378
379bool AMDGPURegBankCombinerImpl::isFminnumIeee(const MachineInstr &MI) const {
380  return MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE;
381}
382
383bool AMDGPURegBankCombinerImpl::isFCst(MachineInstr *MI) const {
384  return MI->getOpcode() == AMDGPU::G_FCONSTANT;
385}
386
387bool AMDGPURegBankCombinerImpl::isClampZeroToOne(MachineInstr *K0,
388                                                 MachineInstr *K1) const {
389  if (isFCst(K0) && isFCst(K1)) {
390    const ConstantFP *KO_FPImm = K0->getOperand(1).getFPImm();
391    const ConstantFP *K1_FPImm = K1->getOperand(1).getFPImm();
392    return (KO_FPImm->isExactlyValue(0.0) && K1_FPImm->isExactlyValue(1.0)) ||
393           (KO_FPImm->isExactlyValue(1.0) && K1_FPImm->isExactlyValue(0.0));
394  }
395  return false;
396}
397
398// Pass boilerplate
399// ================
400
401class AMDGPURegBankCombiner : public MachineFunctionPass {
402public:
403  static char ID;
404
405  AMDGPURegBankCombiner(bool IsOptNone = false);
406
407  StringRef getPassName() const override { return "AMDGPURegBankCombiner"; }
408
409  bool runOnMachineFunction(MachineFunction &MF) override;
410
411  void getAnalysisUsage(AnalysisUsage &AU) const override;
412
413private:
414  bool IsOptNone;
415  AMDGPURegBankCombinerImplRuleConfig RuleConfig;
416};
417} // end anonymous namespace
418
419void AMDGPURegBankCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
420  AU.addRequired<TargetPassConfig>();
421  AU.setPreservesCFG();
422  getSelectionDAGFallbackAnalysisUsage(AU);
423  AU.addRequired<GISelKnownBitsAnalysis>();
424  AU.addPreserved<GISelKnownBitsAnalysis>();
425  if (!IsOptNone) {
426    AU.addRequired<MachineDominatorTree>();
427    AU.addPreserved<MachineDominatorTree>();
428  }
429  MachineFunctionPass::getAnalysisUsage(AU);
430}
431
432AMDGPURegBankCombiner::AMDGPURegBankCombiner(bool IsOptNone)
433    : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
434  initializeAMDGPURegBankCombinerPass(*PassRegistry::getPassRegistry());
435
436  if (!RuleConfig.parseCommandLineOption())
437    report_fatal_error("Invalid rule identifier");
438}
439
440bool AMDGPURegBankCombiner::runOnMachineFunction(MachineFunction &MF) {
441  if (MF.getProperties().hasProperty(
442          MachineFunctionProperties::Property::FailedISel))
443    return false;
444  auto *TPC = &getAnalysis<TargetPassConfig>();
445  const Function &F = MF.getFunction();
446  bool EnableOpt =
447      MF.getTarget().getOptLevel() != CodeGenOptLevel::None && !skipFunction(F);
448
449  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
450  GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
451
452  const auto *LI = ST.getLegalizerInfo();
453  MachineDominatorTree *MDT =
454      IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
455
456  CombinerInfo CInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true,
457                     LI, EnableOpt, F.hasOptSize(), F.hasMinSize());
458  AMDGPURegBankCombinerImpl Impl(MF, CInfo, TPC, *KB, /*CSEInfo*/ nullptr,
459                                 RuleConfig, ST, MDT, LI);
460  return Impl.combineMachineInstrs();
461}
462
463char AMDGPURegBankCombiner::ID = 0;
464INITIALIZE_PASS_BEGIN(AMDGPURegBankCombiner, DEBUG_TYPE,
465                      "Combine AMDGPU machine instrs after regbankselect",
466                      false, false)
467INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
468INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
469INITIALIZE_PASS_END(AMDGPURegBankCombiner, DEBUG_TYPE,
470                    "Combine AMDGPU machine instrs after regbankselect", false,
471                    false)
472
473namespace llvm {
474FunctionPass *createAMDGPURegBankCombiner(bool IsOptNone) {
475  return new AMDGPURegBankCombiner(IsOptNone);
476}
477} // end namespace llvm
478