//=== lib/CodeGen/GlobalISel/AMDGPUPostLegalizerCombiner.cpp ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass does combining of machine instructions at the generic MI level,
// after the legalizer.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUTargetMachine.h"
#include "AMDGPULegalizerInfo.h"
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/Support/Debug.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"

#define DEBUG_TYPE "amdgpu-postlegalizer-combiner"

using namespace llvm;
using namespace MIPatternMatch;

struct FMinFMaxLegacyInfo {
  Register LHS;
  Register RHS;
  Register True;
  Register False;
  CmpInst::Predicate Pred;
};

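// Match a G_SELECT whose condition is a single-use G_FCMP and whose
// true/false values are the compare's own operands, so that the select can be
// rewritten as G_AMDGPU_FMIN_LEGACY / G_AMDGPU_FMAX_LEGACY below.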
// TODO: Make sure fmin_legacy/fmax_legacy don't canonicalize
static bool matchFMinFMaxLegacy(MachineInstr &MI, MachineRegisterInfo &MRI,
                                MachineFunction &MF, FMinFMaxLegacyInfo &Info) {
  // FIXME: Combines should have subtarget predicates, and we shouldn't need
  // this here.
  if (!MF.getSubtarget<GCNSubtarget>().hasFminFmaxLegacy())
    return false;

  // FIXME: Type predicate on pattern
  if (MRI.getType(MI.getOperand(0).getReg()) != LLT::scalar(32))
    return false;

  Register Cond = MI.getOperand(1).getReg();
  if (!MRI.hasOneNonDBGUse(Cond) ||
      !mi_match(Cond, MRI,
                m_GFCmp(m_Pred(Info.Pred), m_Reg(Info.LHS), m_Reg(Info.RHS))))
    return false;

  Info.True = MI.getOperand(2).getReg();
  Info.False = MI.getOperand(3).getReg();

  if (!(Info.LHS == Info.True && Info.RHS == Info.False) &&
      !(Info.LHS == Info.False && Info.RHS == Info.True))
    return false;

  switch (Info.Pred) {
  case CmpInst::FCMP_FALSE:
  case CmpInst::FCMP_OEQ:
  case CmpInst::FCMP_ONE:
  case CmpInst::FCMP_ORD:
  case CmpInst::FCMP_UNO:
  case CmpInst::FCMP_UEQ:
  case CmpInst::FCMP_UNE:
  case CmpInst::FCMP_TRUE:
    return false;
  default:
    return true;
  }
}

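// Rewrite the matched select+fcmp into G_AMDGPU_FMIN_LEGACY /
// G_AMDGPU_FMAX_LEGACY, choosing the operand order that preserves the legacy
// NaN behavior. Illustrative example (register names are placeholders, not
// taken from any particular test):
//
//   %c:_(s1) = G_FCMP floatpred(olt), %a:_(s32), %b:_(s32)
//   %d:_(s32) = G_SELECT %c:_(s1), %a:_(s32), %b:_(s32)
// -->
//   %d:_(s32) = G_AMDGPU_FMIN_LEGACY %a:_(s32), %b:_(s32)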
static void applySelectFCmpToFMinToFMaxLegacy(MachineInstr &MI,
                                              const FMinFMaxLegacyInfo &Info) {

  auto buildNewInst = [&MI](unsigned Opc, Register X, Register Y) {
    MachineIRBuilder MIB(MI);
    MIB.buildInstr(Opc, {MI.getOperand(0)}, {X, Y}, MI.getFlags());
  };

  switch (Info.Pred) {
  case CmpInst::FCMP_ULT:
  case CmpInst::FCMP_ULE:
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
    break;
  case CmpInst::FCMP_OLE:
  case CmpInst::FCMP_OLT: {
    // We need to permute the operands to get the correct NaN behavior. The
    // selected operand is the second one based on the failing compare with
    // NaN, so permute it based on the compare type the hardware uses.
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
    break;
  }
  case CmpInst::FCMP_UGE:
  case CmpInst::FCMP_UGT: {
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
    break;
  }
  case CmpInst::FCMP_OGT:
  case CmpInst::FCMP_OGE: {
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
    break;
  }
  default:
    llvm_unreachable("predicate should not have matched");
  }

  MI.eraseFromParent();
}

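// Match an integer-to-float conversion where known-bits analysis proves that
// everything above the low byte of the source is zero, so the conversion can
// be lowered to G_AMDGPU_CVT_F32_UBYTE0 (see applyUCharToFloat below).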
static bool matchUCharToFloat(MachineInstr &MI, MachineRegisterInfo &MRI,
                              MachineFunction &MF, CombinerHelper &Helper) {
  Register DstReg = MI.getOperand(0).getReg();

  // TODO: We could try to match extracting the higher bytes, which would be
  // easier if i8 vectors weren't promoted to i32 vectors, particularly after
  // types are legalized. v4i8 -> v4f32 is probably the only case to worry
  // about in practice.
  LLT Ty = MRI.getType(DstReg);
  if (Ty == LLT::scalar(32) || Ty == LLT::scalar(16)) {
    Register SrcReg = MI.getOperand(1).getReg();
    unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
    assert(SrcSize == 16 || SrcSize == 32 || SrcSize == 64);
    const APInt Mask = APInt::getHighBitsSet(SrcSize, SrcSize - 8);
    return Helper.getKnownBits()->maskedValueIsZero(SrcReg, Mask);
  }

  return false;
}

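// Replace the matched conversion with G_AMDGPU_CVT_F32_UBYTE0 on an s32
// source, inserting an any-extend/truncate of the source and an FP truncate
// of the result as needed for non-s32 source and destination types.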
static void applyUCharToFloat(MachineInstr &MI) {
  MachineIRBuilder B(MI);

  const LLT S32 = LLT::scalar(32);

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT Ty = B.getMRI()->getType(DstReg);
  LLT SrcTy = B.getMRI()->getType(SrcReg);
  if (SrcTy != S32)
    SrcReg = B.buildAnyExtOrTrunc(S32, SrcReg).getReg(0);

  if (Ty == S32) {
    B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {DstReg},
                 {SrcReg}, MI.getFlags());
  } else {
    auto Cvt0 = B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {S32},
                             {SrcReg}, MI.getFlags());
    B.buildFPTrunc(DstReg, Cvt0, MI.getFlags());
  }

  MI.eraseFromParent();
}

// FIXME: Should be able to have 2 separate matchdatas rather than custom struct
// boilerplate.
struct CvtF32UByteMatchInfo {
  Register CvtVal;
  unsigned ShiftOffset;
};

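// Fold a constant shift of the source (looking through a G_ZEXT) into the
// byte index of a G_AMDGPU_CVT_F32_UBYTEn instruction. Illustrative example
// (register names are placeholders):
//
//   %sixteen:_(s32) = G_CONSTANT i32 16
//   %shifted:_(s32) = G_LSHR %x:_(s32), %sixteen:_(s32)
//   %f:_(s32) = G_AMDGPU_CVT_F32_UBYTE0 %shifted:_(s32)
// -->
//   %f:_(s32) = G_AMDGPU_CVT_F32_UBYTE2 %x:_(s32)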
static bool matchCvtF32UByteN(MachineInstr &MI, MachineRegisterInfo &MRI,
                              MachineFunction &MF,
                              CvtF32UByteMatchInfo &MatchInfo) {
  Register SrcReg = MI.getOperand(1).getReg();

  // Look through G_ZEXT.
  mi_match(SrcReg, MRI, m_GZExt(m_Reg(SrcReg)));

  Register Src0;
  int64_t ShiftAmt;
  bool IsShr = mi_match(SrcReg, MRI, m_GLShr(m_Reg(Src0), m_ICst(ShiftAmt)));
  if (IsShr || mi_match(SrcReg, MRI, m_GShl(m_Reg(Src0), m_ICst(ShiftAmt)))) {
    const unsigned Offset = MI.getOpcode() - AMDGPU::G_AMDGPU_CVT_F32_UBYTE0;

    unsigned ShiftOffset = 8 * Offset;
    if (IsShr)
      ShiftOffset += ShiftAmt;
    else
      ShiftOffset -= ShiftAmt;

    MatchInfo.CvtVal = Src0;
    MatchInfo.ShiftOffset = ShiftOffset;
    return ShiftOffset < 32 && ShiftOffset >= 8 && (ShiftOffset % 8) == 0;
  }

  // TODO: Simplify demanded bits.
  return false;
}

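// Rebuild the conversion using the byte index computed by matchCvtF32UByteN,
// any-extending the new source to s32 if necessary.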
static void applyCvtF32UByteN(MachineInstr &MI,
                              const CvtF32UByteMatchInfo &MatchInfo) {
  MachineIRBuilder B(MI);
  unsigned NewOpc = AMDGPU::G_AMDGPU_CVT_F32_UBYTE0 + MatchInfo.ShiftOffset / 8;

  const LLT S32 = LLT::scalar(32);
  Register CvtSrc = MatchInfo.CvtVal;
  LLT SrcTy = B.getMRI()->getType(MatchInfo.CvtVal);
  if (SrcTy != S32) {
    assert(SrcTy.isScalar() && SrcTy.getSizeInBits() >= 8);
    CvtSrc = B.buildAnyExt(S32, CvtSrc).getReg(0);
  }

  assert(MI.getOpcode() != NewOpc);
  B.buildInstr(NewOpc, {MI.getOperand(0)}, {CvtSrc}, MI.getFlags());
  MI.eraseFromParent();
}

#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS

namespace {
#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H

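// Drives the post-legalizer combines: the TableGen-generated rules run first,
// followed by the manually written combines in combine() below.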
class AMDGPUPostLegalizerCombinerInfo : public CombinerInfo {
  GISelKnownBits *KB;
  MachineDominatorTree *MDT;

public:
  AMDGPUGenPostLegalizerCombinerHelperRuleConfig GeneratedRuleCfg;

  AMDGPUPostLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize,
                                  const AMDGPULegalizerInfo *LI,
                                  GISelKnownBits *KB, MachineDominatorTree *MDT)
      : CombinerInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true,
                     /*LegalizerInfo*/ LI, EnableOpt, OptSize, MinSize),
        KB(KB), MDT(MDT) {
    if (!GeneratedRuleCfg.parseCommandLineOption())
      report_fatal_error("Invalid rule identifier");
  }

  bool combine(GISelChangeObserver &Observer, MachineInstr &MI,
               MachineIRBuilder &B) const override;
};

bool AMDGPUPostLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
                                              MachineInstr &MI,
                                              MachineIRBuilder &B) const {
  CombinerHelper Helper(Observer, B, KB, MDT);
  AMDGPUGenPostLegalizerCombinerHelper Generated(GeneratedRuleCfg);

  if (Generated.tryCombineAll(Observer, MI, B, Helper))
    return true;

  switch (MI.getOpcode()) {
  case TargetOpcode::G_SHL:
  case TargetOpcode::G_LSHR:
  case TargetOpcode::G_ASHR:
    // On some subtargets, 64-bit shift is a quarter rate instruction. In the
    // common case, splitting this into a move and a 32-bit shift is faster and
    // the same code size.
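    //
    // A rough sketch of what tryCombineShiftToUnmerge does for a 64-bit
    // logical shift right by a constant >= 32 (register names are
    // placeholders):
    //   %lo:_(s32), %hi:_(s32) = G_UNMERGE_VALUES %x:_(s64)
    //   %amt:_(s32) = G_CONSTANT i32 1      ; original amount minus 32
    //   %shift:_(s32) = G_LSHR %hi:_(s32), %amt:_(s32)
    //   %zero:_(s32) = G_CONSTANT i32 0
    //   %d:_(s64) = G_MERGE_VALUES %shift:_(s32), %zero:_(s32)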
    return Helper.tryCombineShiftToUnmerge(MI, 32);
  }

  return false;
}

#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP

// Pass boilerplate
// ================

class AMDGPUPostLegalizerCombiner : public MachineFunctionPass {
public:
  static char ID;

  AMDGPUPostLegalizerCombiner(bool IsOptNone = false);

  StringRef getPassName() const override {
    return "AMDGPUPostLegalizerCombiner";
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override;
private:
  bool IsOptNone;
};
} // end anonymous namespace

void AMDGPUPostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<TargetPassConfig>();
  AU.setPreservesCFG();
  getSelectionDAGFallbackAnalysisUsage(AU);
  AU.addRequired<GISelKnownBitsAnalysis>();
  AU.addPreserved<GISelKnownBitsAnalysis>();
  if (!IsOptNone) {
    AU.addRequired<MachineDominatorTree>();
    AU.addPreserved<MachineDominatorTree>();
  }
  MachineFunctionPass::getAnalysisUsage(AU);
}

AMDGPUPostLegalizerCombiner::AMDGPUPostLegalizerCombiner(bool IsOptNone)
  : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
  initializeAMDGPUPostLegalizerCombinerPass(*PassRegistry::getPassRegistry());
}

bool AMDGPUPostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
  if (MF.getProperties().hasProperty(
          MachineFunctionProperties::Property::FailedISel))
    return false;
  auto *TPC = &getAnalysis<TargetPassConfig>();
  const Function &F = MF.getFunction();
  bool EnableOpt =
      MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F);

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const AMDGPULegalizerInfo *LI
    = static_cast<const AMDGPULegalizerInfo *>(ST.getLegalizerInfo());

  GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
  MachineDominatorTree *MDT =
      IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
  AMDGPUPostLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(),
                                         F.hasMinSize(), LI, KB, MDT);
  Combiner C(PCInfo, TPC);
  return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr);
}

char AMDGPUPostLegalizerCombiner::ID = 0;
INITIALIZE_PASS_BEGIN(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
                      "Combine AMDGPU machine instrs after legalization",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
INITIALIZE_PASS_END(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
                    "Combine AMDGPU machine instrs after legalization", false,
                    false)

namespace llvm {
FunctionPass *createAMDGPUPostLegalizeCombiner(bool IsOptNone) {
  return new AMDGPUPostLegalizerCombiner(IsOptNone);
}
} // end namespace llvm