//=== lib/CodeGen/GlobalISel/AMDGPUPostLegalizerCombiner.cpp ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass does combining of machine instructions at the generic MI level,
// after the legalizer.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUTargetMachine.h"
#include "AMDGPULegalizerInfo.h"
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/Support/Debug.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"

#define DEBUG_TYPE "amdgpu-postlegalizer-combiner"

using namespace llvm;
using namespace MIPatternMatch;

// Match data for the select(fcmp) -> fmin_legacy/fmax_legacy combine below.
struct FMinFMaxLegacyInfo {
  Register LHS;            // left operand of the matched G_FCMP
  Register RHS;            // right operand of the matched G_FCMP
  Register True;           // value the G_SELECT produces when cond is true
  Register False;          // value the G_SELECT produces when cond is false
  CmpInst::Predicate Pred; // predicate of the matched G_FCMP
};

// TODO: Make sure fmin_legacy/fmax_legacy don't canonicalize
//
// Match a G_SELECT whose condition is a single-use G_FCMP comparing exactly
// the two values the select chooses between; such a pattern can be lowered
// to the hardware legacy fmin/fmax (see the apply function below). Fills
// Info on success.
static bool matchFMinFMaxLegacy(MachineInstr &MI, MachineRegisterInfo &MRI,
                                MachineFunction &MF, FMinFMaxLegacyInfo &Info) {
  // FIXME: Combines should have subtarget predicates, and we shouldn't need
  // this here.
  if (!MF.getSubtarget<GCNSubtarget>().hasFminFmaxLegacy())
    return false;

  // FIXME: Type predicate on pattern
  // Only the scalar-f32 form of the select is handled.
  if (MRI.getType(MI.getOperand(0).getReg()) != LLT::scalar(32))
    return false;

  Register Cond = MI.getOperand(1).getReg();
  // The compare must have this select as its only (non-debug) user so the
  // pair can be replaced as a unit.
  if (!MRI.hasOneNonDBGUse(Cond) ||
      !mi_match(Cond, MRI,
                m_GFCmp(m_Pred(Info.Pred), m_Reg(Info.LHS), m_Reg(Info.RHS))))
    return false;

  Info.True = MI.getOperand(2).getReg();
  Info.False = MI.getOperand(3).getReg();

  // The select operands must be the two compared values, in either order.
  if (!(Info.LHS == Info.True && Info.RHS == Info.False) &&
      !(Info.LHS == Info.False && Info.RHS == Info.True))
    return false;

  // Only ordering predicates map onto a min/max selection; equality and
  // ordered/unordered-only predicates are rejected.
  switch (Info.Pred) {
  case CmpInst::FCMP_FALSE:
  case CmpInst::FCMP_OEQ:
  case CmpInst::FCMP_ONE:
  case CmpInst::FCMP_ORD:
  case CmpInst::FCMP_UNO:
  case CmpInst::FCMP_UEQ:
  case CmpInst::FCMP_UNE:
  case CmpInst::FCMP_TRUE:
    return false;
  default:
    return true;
  }
}

// Rewrite the matched select(fcmp) into G_AMDGPU_FMIN_LEGACY /
// G_AMDGPU_FMAX_LEGACY, choosing the opcode and operand order from the
// compare predicate, then erase the original select.
static void applySelectFCmpToFMinToFMaxLegacy(MachineInstr &MI,
                                              const FMinFMaxLegacyInfo &Info) {

  // Helper: emit a two-source instruction reusing the select's def operand
  // and MI flags.
  auto buildNewInst = [&MI](unsigned Opc, Register X, Register Y) {
    MachineIRBuilder MIB(MI);
    MIB.buildInstr(Opc, {MI.getOperand(0)}, {X, Y}, MI.getFlags());
  };

  switch (Info.Pred) {
  case CmpInst::FCMP_ULT:
  case CmpInst::FCMP_ULE:
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
    break;
  case CmpInst::FCMP_OLE:
  case CmpInst::FCMP_OLT: {
    // We need to permute the operands to get the correct NaN behavior. The
    // selected operand is the second one based on the failing compare with NaN,
    // so permute it based on the compare type the hardware uses.
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
    break;
  }
  case CmpInst::FCMP_UGE:
  case CmpInst::FCMP_UGT: {
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
    break;
  }
  case CmpInst::FCMP_OGT:
  case CmpInst::FCMP_OGE: {
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
    break;
  }
  default:
    // The match function rejected every other predicate.
    llvm_unreachable("predicate should not have matched");
  }

  MI.eraseFromParent();
}

// Match a G_UITOFP whose source provably has all bits above the low byte
// clear, so the conversion can use the byte-convert instruction (see
// applyUCharToFloat).
static bool matchUCharToFloat(MachineInstr &MI, MachineRegisterInfo &MRI,
                              MachineFunction &MF, CombinerHelper &Helper) {
  Register DstReg = MI.getOperand(0).getReg();

  // TODO: We could try to match extracting the higher bytes, which would be
  // easier if i8 vectors weren't promoted to i32 vectors, particularly after
  // types are legalized. v4i8 -> v4f32 is probably the only case to worry
  // about in practice.
  // Only scalar f32 or f16 destinations are handled.
  LLT Ty = MRI.getType(DstReg);
  if (Ty == LLT::scalar(32) || Ty == LLT::scalar(16)) {
    Register SrcReg = MI.getOperand(1).getReg();
    unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
    assert(SrcSize == 16 || SrcSize == 32 || SrcSize == 64);
    // Combine only when everything above the low 8 bits is known zero.
    const APInt Mask = APInt::getHighBitsSet(SrcSize, SrcSize - 8);
    return Helper.getKnownBits()->maskedValueIsZero(SrcReg, Mask);
  }

  return false;
}

// Replace the matched G_UITOFP with G_AMDGPU_CVT_F32_UBYTE0. The source is
// any-extended/truncated to s32 as needed; an f16 destination takes the f32
// result through G_FPTRUNC.
static void applyUCharToFloat(MachineInstr &MI) {
  MachineIRBuilder B(MI);

  const LLT S32 = LLT::scalar(32);

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT Ty = B.getMRI()->getType(DstReg);
  LLT SrcTy = B.getMRI()->getType(SrcReg);
  // The byte-convert instruction is built with an s32 source; the high bits
  // were already proven zero by the match, so any-extend is sufficient.
  if (SrcTy != S32)
    SrcReg = B.buildAnyExtOrTrunc(S32, SrcReg).getReg(0);

  if (Ty == S32) {
    B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {DstReg},
                 {SrcReg}, MI.getFlags());
  } else {
    // f16 destination: convert to f32 first, then truncate.
    auto Cvt0 = B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {S32},
                             {SrcReg}, MI.getFlags());
    B.buildFPTrunc(DstReg, Cvt0, MI.getFlags());
  }

  MI.eraseFromParent();
}

// FIXME: Should be able to have 2 separate matchdatas rather than custom struct
// boilerplate.
struct CvtF32UByteMatchInfo {
  Register CvtVal;      // conversion source with the shift folded away
  unsigned ShiftOffset; // bit offset of the byte to select (multiple of 8)
};

// Fold a constant logical shift (or shl) feeding a G_AMDGPU_CVT_F32_UBYTEn
// into the byte index of the conversion opcode itself.
static bool matchCvtF32UByteN(MachineInstr &MI, MachineRegisterInfo &MRI,
                              MachineFunction &MF,
                              CvtF32UByteMatchInfo &MatchInfo) {
  Register SrcReg = MI.getOperand(1).getReg();

  // Look through G_ZEXT.
  mi_match(SrcReg, MRI, m_GZExt(m_Reg(SrcReg)));

  Register Src0;
  int64_t ShiftAmt;
  bool IsShr = mi_match(SrcReg, MRI, m_GLShr(m_Reg(Src0), m_ICst(ShiftAmt)));
  if (IsShr || mi_match(SrcReg, MRI, m_GShl(m_Reg(Src0), m_ICst(ShiftAmt)))) {
    // Byte index already encoded in the current opcode (UBYTE0..UBYTE3).
    const unsigned Offset = MI.getOpcode() - AMDGPU::G_AMDGPU_CVT_F32_UBYTE0;

    // A right shift moves the selected byte towards the high end of the
    // pre-shift value; a left shift moves it down.
    unsigned ShiftOffset = 8 * Offset;
    if (IsShr)
      ShiftOffset += ShiftAmt;
    else
      ShiftOffset -= ShiftAmt;

    MatchInfo.CvtVal = Src0;
    MatchInfo.ShiftOffset = ShiftOffset;
    // Succeed only for a whole-byte offset inside 32 bits that differs from
    // byte 0. (An over-large shl wraps the unsigned offset and fails the
    // range check.)
    return ShiftOffset < 32 && ShiftOffset >= 8 && (ShiftOffset % 8) == 0;
  }

  // TODO: Simplify demanded bits.
  return false;
}

// Rewrite to the G_AMDGPU_CVT_F32_UBYTEn opcode selecting the new byte and
// read directly from the pre-shift value recorded in MatchInfo.
static void applyCvtF32UByteN(MachineInstr &MI,
                              const CvtF32UByteMatchInfo &MatchInfo) {
  MachineIRBuilder B(MI);
  unsigned NewOpc = AMDGPU::G_AMDGPU_CVT_F32_UBYTE0 + MatchInfo.ShiftOffset / 8;

  const LLT S32 = LLT::scalar(32);
  Register CvtSrc = MatchInfo.CvtVal;
  LLT SrcTy = B.getMRI()->getType(MatchInfo.CvtVal);
  // The pre-shift value may be narrower than s32 (it was behind a zext);
  // widen it for the conversion instruction.
  if (SrcTy != S32) {
    assert(SrcTy.isScalar() && SrcTy.getSizeInBits() >= 8);
    CvtSrc = B.buildAnyExt(S32, CvtSrc).getReg(0);
  }

  // The match guarantees a byte index different from the current opcode's.
  assert(MI.getOpcode() != NewOpc);
  B.buildInstr(NewOpc, {MI.getOperand(0)}, {CvtSrc}, MI.getFlags());
  MI.eraseFromParent();
}

#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS

namespace {
#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H

// CombinerInfo driving this pass: runs the tablegen-generated rule set plus
// a few manually dispatched combines (see combine() below).
class AMDGPUPostLegalizerCombinerInfo : public CombinerInfo {
  GISelKnownBits *KB;
  MachineDominatorTree *MDT; // may be null (pass is built without it at -O0)

public:
  AMDGPUGenPostLegalizerCombinerHelperRuleConfig GeneratedRuleCfg;

  AMDGPUPostLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize,
                                  const
                                      AMDGPULegalizerInfo *LI,
                                  GISelKnownBits *KB, MachineDominatorTree *MDT)
      : CombinerInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true,
                     /*LegalizerInfo*/ LI, EnableOpt, OptSize, MinSize),
        KB(KB), MDT(MDT) {
    // Reject malformed rule-filter options given on the command line.
    if (!GeneratedRuleCfg.parseCommandLineOption())
      report_fatal_error("Invalid rule identifier");
  }

  bool combine(GISelChangeObserver &Observer, MachineInstr &MI,
               MachineIRBuilder &B) const override;
};

// Try the generated combines first, then fall back to the manual
// opcode-specific combines.
bool AMDGPUPostLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
                                              MachineInstr &MI,
                                              MachineIRBuilder &B) const {
  CombinerHelper Helper(Observer, B, KB, MDT);
  AMDGPUGenPostLegalizerCombinerHelper Generated(GeneratedRuleCfg);

  if (Generated.tryCombineAll(Observer, MI, B, Helper))
    return true;

  switch (MI.getOpcode()) {
  case TargetOpcode::G_SHL:
  case TargetOpcode::G_LSHR:
  case TargetOpcode::G_ASHR:
    // On some subtargets, 64-bit shift is a quarter rate instruction. In the
    // common case, splitting this into a move and a 32-bit shift is faster and
    // the same code size.
    return Helper.tryCombineShiftToUnmerge(MI, 32);
  }

  return false;
}

#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP

// Pass boilerplate
// ================

// MachineFunctionPass wrapper that runs the combiner over a function.
class AMDGPUPostLegalizerCombiner : public MachineFunctionPass {
public:
  static char ID;

  AMDGPUPostLegalizerCombiner(bool IsOptNone = false);

  StringRef getPassName() const override {
    return "AMDGPUPostLegalizerCombiner";
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override;
private:
  bool IsOptNone; // at -O0 the dominator tree is neither required nor used
};
} // end anonymous namespace

void AMDGPUPostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<TargetPassConfig>();
  AU.setPreservesCFG();
  getSelectionDAGFallbackAnalysisUsage(AU);
  AU.addRequired<GISelKnownBitsAnalysis>();
  AU.addPreserved<GISelKnownBitsAnalysis>();
  // Only request the dominator tree when optimizing; see IsOptNone.
  if (!IsOptNone) {
    AU.addRequired<MachineDominatorTree>();
    AU.addPreserved<MachineDominatorTree>();
  }
  MachineFunctionPass::getAnalysisUsage(AU);
}

AMDGPUPostLegalizerCombiner::AMDGPUPostLegalizerCombiner(bool IsOptNone)
  : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
  initializeAMDGPUPostLegalizerCombinerPass(*PassRegistry::getPassRegistry());
}

bool AMDGPUPostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
  // Don't touch functions where instruction selection already failed.
  if (MF.getProperties().hasProperty(
        MachineFunctionProperties::Property::FailedISel))
    return false;
  auto *TPC = &getAnalysis<TargetPassConfig>();
  const Function &F = MF.getFunction();
  // Optimization-dependent combines are disabled at -O0 and for functions
  // marked optnone.
  bool EnableOpt =
      MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F);

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const AMDGPULegalizerInfo *LI
    = static_cast<const AMDGPULegalizerInfo *>(ST.getLegalizerInfo());

  GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
  // Dominator tree is skipped at -O0 (it was not requested in
  // getAnalysisUsage in that case).
  MachineDominatorTree *MDT =
      IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
  AMDGPUPostLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(),
                                         F.hasMinSize(), LI, KB, MDT);
  Combiner C(PCInfo, TPC);
  // No CSE info: run without a CSE-aware MIRBuilder.
  return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr);
}

char AMDGPUPostLegalizerCombiner::ID = 0;
INITIALIZE_PASS_BEGIN(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
                      "Combine AMDGPU machine instrs after legalization",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
INITIALIZE_PASS_END(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
                    "Combine AMDGPU machine instrs after legalization", false,
                    false)

namespace llvm {
// Factory used by the AMDGPU target pass pipeline setup.
FunctionPass *createAMDGPUPostLegalizeCombiner(bool IsOptNone) {
  return new AMDGPUPostLegalizerCombiner(IsOptNone);
}
} // end namespace llvm