// AMDGPUInline.cpp revision 360784
1//===- AMDGPUInline.cpp - Code to perform simple function inlining --------===// 2// 3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4// See https://llvm.org/LICENSE.txt for license information. 5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6// 7//===----------------------------------------------------------------------===// 8// 9/// \file 10/// This is AMDGPU specific replacement of the standard inliner. 11/// The main purpose is to account for the fact that calls not only expensive 12/// on the AMDGPU, but much more expensive if a private memory pointer is 13/// passed to a function as an argument. In this situation, we are unable to 14/// eliminate private memory in the caller unless inlined and end up with slow 15/// and expensive scratch access. Thus, we boost the inline threshold for such 16/// functions here. 17/// 18//===----------------------------------------------------------------------===// 19 20#include "AMDGPU.h" 21#include "llvm/Analysis/AssumptionCache.h" 22#include "llvm/Analysis/CallGraph.h" 23#include "llvm/Analysis/InlineCost.h" 24#include "llvm/Analysis/TargetTransformInfo.h" 25#include "llvm/Analysis/ValueTracking.h" 26#include "llvm/IR/CallSite.h" 27#include "llvm/IR/DataLayout.h" 28#include "llvm/IR/Instructions.h" 29#include "llvm/IR/Module.h" 30#include "llvm/IR/Type.h" 31#include "llvm/InitializePasses.h" 32#include "llvm/Support/CommandLine.h" 33#include "llvm/Support/Debug.h" 34#include "llvm/Transforms/IPO.h" 35#include "llvm/Transforms/IPO/Inliner.h" 36 37using namespace llvm; 38 39#define DEBUG_TYPE "inline" 40 41static cl::opt<int> 42ArgAllocaCost("amdgpu-inline-arg-alloca-cost", cl::Hidden, cl::init(4000), 43 cl::desc("Cost of alloca argument")); 44 45// If the amount of scratch memory to eliminate exceeds our ability to allocate 46// it into registers we gain nothing by aggressively inlining functions for that 47// heuristic. 
48static cl::opt<unsigned> 49ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden, cl::init(256), 50 cl::desc("Maximum alloca size to use for inline cost")); 51 52// Inliner constraint to achieve reasonable compilation time 53static cl::opt<size_t> 54MaxBB("amdgpu-inline-max-bb", cl::Hidden, cl::init(1100), 55 cl::desc("Maximum BB number allowed in a function after inlining" 56 " (compile time constraint)")); 57 58namespace { 59 60class AMDGPUInliner : public LegacyInlinerBase { 61 62public: 63 AMDGPUInliner() : LegacyInlinerBase(ID) { 64 initializeAMDGPUInlinerPass(*PassRegistry::getPassRegistry()); 65 Params = getInlineParams(); 66 } 67 68 static char ID; // Pass identification, replacement for typeid 69 70 unsigned getInlineThreshold(CallSite CS) const; 71 72 InlineCost getInlineCost(CallSite CS) override; 73 74 bool runOnSCC(CallGraphSCC &SCC) override; 75 76 void getAnalysisUsage(AnalysisUsage &AU) const override; 77 78private: 79 TargetTransformInfoWrapperPass *TTIWP; 80 81 InlineParams Params; 82}; 83 84} // end anonymous namespace 85 86char AMDGPUInliner::ID = 0; 87INITIALIZE_PASS_BEGIN(AMDGPUInliner, "amdgpu-inline", 88 "AMDGPU Function Integration/Inlining", false, false) 89INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 90INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass) 91INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 92INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 93INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) 94INITIALIZE_PASS_END(AMDGPUInliner, "amdgpu-inline", 95 "AMDGPU Function Integration/Inlining", false, false) 96 97Pass *llvm::createAMDGPUFunctionInliningPass() { return new AMDGPUInliner(); } 98 99bool AMDGPUInliner::runOnSCC(CallGraphSCC &SCC) { 100 TTIWP = &getAnalysis<TargetTransformInfoWrapperPass>(); 101 return LegacyInlinerBase::runOnSCC(SCC); 102} 103 104void AMDGPUInliner::getAnalysisUsage(AnalysisUsage &AU) const { 105 AU.addRequired<TargetTransformInfoWrapperPass>(); 106 
LegacyInlinerBase::getAnalysisUsage(AU); 107} 108 109unsigned AMDGPUInliner::getInlineThreshold(CallSite CS) const { 110 int Thres = Params.DefaultThreshold; 111 112 Function *Caller = CS.getCaller(); 113 // Listen to the inlinehint attribute when it would increase the threshold 114 // and the caller does not need to minimize its size. 115 Function *Callee = CS.getCalledFunction(); 116 bool InlineHint = Callee && !Callee->isDeclaration() && 117 Callee->hasFnAttribute(Attribute::InlineHint); 118 if (InlineHint && Params.HintThreshold && Params.HintThreshold > Thres 119 && !Caller->hasFnAttribute(Attribute::MinSize)) 120 Thres = Params.HintThreshold.getValue() * 121 TTIWP->getTTI(*Callee).getInliningThresholdMultiplier(); 122 123 const DataLayout &DL = Caller->getParent()->getDataLayout(); 124 if (!Callee) 125 return (unsigned)Thres; 126 127 // If we have a pointer to private array passed into a function 128 // it will not be optimized out, leaving scratch usage. 129 // Increase the inline threshold to allow inliniting in this case. 130 uint64_t AllocaSize = 0; 131 SmallPtrSet<const AllocaInst *, 8> AIVisited; 132 for (Value *PtrArg : CS.args()) { 133 PointerType *Ty = dyn_cast<PointerType>(PtrArg->getType()); 134 if (!Ty || (Ty->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS && 135 Ty->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS)) 136 continue; 137 138 PtrArg = GetUnderlyingObject(PtrArg, DL); 139 if (const AllocaInst *AI = dyn_cast<AllocaInst>(PtrArg)) { 140 if (!AI->isStaticAlloca() || !AIVisited.insert(AI).second) 141 continue; 142 AllocaSize += DL.getTypeAllocSize(AI->getAllocatedType()); 143 // If the amount of stack memory is excessive we will not be able 144 // to get rid of the scratch anyway, bail out. 145 if (AllocaSize > ArgAllocaCutoff) { 146 AllocaSize = 0; 147 break; 148 } 149 } 150 } 151 if (AllocaSize) 152 Thres += ArgAllocaCost; 153 154 return (unsigned)Thres; 155} 156 157// Check if call is just a wrapper around another call. 
158// In this case we only have call and ret instructions. 159static bool isWrapperOnlyCall(CallSite CS) { 160 Function *Callee = CS.getCalledFunction(); 161 if (!Callee || Callee->size() != 1) 162 return false; 163 const BasicBlock &BB = Callee->getEntryBlock(); 164 if (const Instruction *I = BB.getFirstNonPHI()) { 165 if (!isa<CallInst>(I)) { 166 return false; 167 } 168 if (isa<ReturnInst>(*std::next(I->getIterator()))) { 169 LLVM_DEBUG(dbgs() << " Wrapper only call detected: " 170 << Callee->getName() << '\n'); 171 return true; 172 } 173 } 174 return false; 175} 176 177InlineCost AMDGPUInliner::getInlineCost(CallSite CS) { 178 Function *Callee = CS.getCalledFunction(); 179 Function *Caller = CS.getCaller(); 180 181 if (!Callee || Callee->isDeclaration()) 182 return llvm::InlineCost::getNever("undefined callee"); 183 184 if (CS.isNoInline()) 185 return llvm::InlineCost::getNever("noinline"); 186 187 TargetTransformInfo &TTI = TTIWP->getTTI(*Callee); 188 if (!TTI.areInlineCompatible(Caller, Callee)) 189 return llvm::InlineCost::getNever("incompatible"); 190 191 if (CS.hasFnAttr(Attribute::AlwaysInline)) { 192 auto IsViable = isInlineViable(*Callee); 193 if (IsViable) 194 return llvm::InlineCost::getAlways("alwaysinline viable"); 195 return llvm::InlineCost::getNever(IsViable.message); 196 } 197 198 if (isWrapperOnlyCall(CS)) 199 return llvm::InlineCost::getAlways("wrapper-only call"); 200 201 InlineParams LocalParams = Params; 202 LocalParams.DefaultThreshold = (int)getInlineThreshold(CS); 203 bool RemarksEnabled = false; 204 const auto &BBs = Caller->getBasicBlockList(); 205 if (!BBs.empty()) { 206 auto DI = OptimizationRemark(DEBUG_TYPE, "", DebugLoc(), &BBs.front()); 207 if (DI.isEnabled()) 208 RemarksEnabled = true; 209 } 210 211 OptimizationRemarkEmitter ORE(Caller); 212 std::function<AssumptionCache &(Function &)> GetAssumptionCache = 213 [this](Function &F) -> AssumptionCache & { 214 return ACT->getAssumptionCache(F); 215 }; 216 217 auto IC = 
llvm::getInlineCost(cast<CallBase>(*CS.getInstruction()), Callee, 218 LocalParams, TTI, GetAssumptionCache, None, PSI, 219 RemarksEnabled ? &ORE : nullptr); 220 221 if (IC && !IC.isAlways() && !Callee->hasFnAttribute(Attribute::InlineHint)) { 222 // Single BB does not increase total BB amount, thus subtract 1 223 size_t Size = Caller->size() + Callee->size() - 1; 224 if (MaxBB && Size > MaxBB) 225 return llvm::InlineCost::getNever("max number of bb exceeded"); 226 } 227 return IC; 228} 229