1//===- AMDGPUInline.cpp - Code to perform simple function inlining --------===// 2// 3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4// See https://llvm.org/LICENSE.txt for license information. 5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6// 7//===----------------------------------------------------------------------===// 8// 9/// \file 10/// This is AMDGPU specific replacement of the standard inliner. 11/// The main purpose is to account for the fact that calls not only expensive 12/// on the AMDGPU, but much more expensive if a private memory pointer is 13/// passed to a function as an argument. In this situation, we are unable to 14/// eliminate private memory in the caller unless inlined and end up with slow 15/// and expensive scratch access. Thus, we boost the inline threshold for such 16/// functions here. 17/// 18//===----------------------------------------------------------------------===// 19 20#include "AMDGPU.h" 21#include "llvm/Analysis/AssumptionCache.h" 22#include "llvm/Analysis/CallGraph.h" 23#include "llvm/Analysis/InlineCost.h" 24#include "llvm/Analysis/TargetTransformInfo.h" 25#include "llvm/Analysis/ValueTracking.h" 26#include "llvm/IR/DataLayout.h" 27#include "llvm/IR/Instructions.h" 28#include "llvm/IR/Module.h" 29#include "llvm/IR/Type.h" 30#include "llvm/InitializePasses.h" 31#include "llvm/Support/CommandLine.h" 32#include "llvm/Support/Debug.h" 33#include "llvm/Transforms/IPO.h" 34#include "llvm/Transforms/IPO/Inliner.h" 35 36using namespace llvm; 37 38#define DEBUG_TYPE "inline" 39 40static cl::opt<int> 41ArgAllocaCost("amdgpu-inline-arg-alloca-cost", cl::Hidden, cl::init(4000), 42 cl::desc("Cost of alloca argument")); 43 44// If the amount of scratch memory to eliminate exceeds our ability to allocate 45// it into registers we gain nothing by aggressively inlining functions for that 46// heuristic. 47static cl::opt<unsigned> 48ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden, cl::init(256), 49 cl::desc("Maximum alloca size to use for inline cost")); 50 51// Inliner constraint to achieve reasonable compilation time 52static cl::opt<size_t> 53MaxBB("amdgpu-inline-max-bb", cl::Hidden, cl::init(1100), 54 cl::desc("Maximum BB number allowed in a function after inlining" 55 " (compile time constraint)")); 56 57namespace { 58 59class AMDGPUInliner : public LegacyInlinerBase { 60 61public: 62 AMDGPUInliner() : LegacyInlinerBase(ID) { 63 initializeAMDGPUInlinerPass(*PassRegistry::getPassRegistry()); 64 Params = getInlineParams(); 65 } 66 67 static char ID; // Pass identification, replacement for typeid 68 69 unsigned getInlineThreshold(CallBase &CB) const; 70 71 InlineCost getInlineCost(CallBase &CB) override; 72 73 bool runOnSCC(CallGraphSCC &SCC) override; 74 75 void getAnalysisUsage(AnalysisUsage &AU) const override; 76 77private: 78 TargetTransformInfoWrapperPass *TTIWP; 79 80 InlineParams Params; 81}; 82 83} // end anonymous namespace 84 85char AMDGPUInliner::ID = 0; 86INITIALIZE_PASS_BEGIN(AMDGPUInliner, "amdgpu-inline", 87 "AMDGPU Function Integration/Inlining", false, false) 88INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 89INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass) 90INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 91INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 92INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) 93INITIALIZE_PASS_END(AMDGPUInliner, "amdgpu-inline", 94 "AMDGPU Function Integration/Inlining", false, false) 95 96Pass *llvm::createAMDGPUFunctionInliningPass() { return new AMDGPUInliner(); } 97 98bool AMDGPUInliner::runOnSCC(CallGraphSCC &SCC) { 99 TTIWP = &getAnalysis<TargetTransformInfoWrapperPass>(); 100 return LegacyInlinerBase::runOnSCC(SCC); 101} 102 103void AMDGPUInliner::getAnalysisUsage(AnalysisUsage &AU) const { 104 AU.addRequired<TargetTransformInfoWrapperPass>(); 105 LegacyInlinerBase::getAnalysisUsage(AU); 106} 107 108unsigned AMDGPUInliner::getInlineThreshold(CallBase &CB) const { 109 int Thres = Params.DefaultThreshold; 110 111 Function *Caller = CB.getCaller(); 112 // Listen to the inlinehint attribute when it would increase the threshold 113 // and the caller does not need to minimize its size. 114 Function *Callee = CB.getCalledFunction(); 115 bool InlineHint = Callee && !Callee->isDeclaration() && 116 Callee->hasFnAttribute(Attribute::InlineHint); 117 if (InlineHint && Params.HintThreshold && Params.HintThreshold > Thres 118 && !Caller->hasFnAttribute(Attribute::MinSize)) 119 Thres = Params.HintThreshold.getValue() * 120 TTIWP->getTTI(*Callee).getInliningThresholdMultiplier(); 121 122 const DataLayout &DL = Caller->getParent()->getDataLayout(); 123 if (!Callee) 124 return (unsigned)Thres; 125 126 // If we have a pointer to private array passed into a function 127 // it will not be optimized out, leaving scratch usage. 128 // Increase the inline threshold to allow inliniting in this case. 129 uint64_t AllocaSize = 0; 130 SmallPtrSet<const AllocaInst *, 8> AIVisited; 131 for (Value *PtrArg : CB.args()) { 132 PointerType *Ty = dyn_cast<PointerType>(PtrArg->getType()); 133 if (!Ty || (Ty->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS && 134 Ty->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS)) 135 continue; 136 137 PtrArg = GetUnderlyingObject(PtrArg, DL); 138 if (const AllocaInst *AI = dyn_cast<AllocaInst>(PtrArg)) { 139 if (!AI->isStaticAlloca() || !AIVisited.insert(AI).second) 140 continue; 141 AllocaSize += DL.getTypeAllocSize(AI->getAllocatedType()); 142 // If the amount of stack memory is excessive we will not be able 143 // to get rid of the scratch anyway, bail out. 144 if (AllocaSize > ArgAllocaCutoff) { 145 AllocaSize = 0; 146 break; 147 } 148 } 149 } 150 if (AllocaSize) 151 Thres += ArgAllocaCost; 152 153 return (unsigned)Thres; 154} 155 156// Check if call is just a wrapper around another call. 157// In this case we only have call and ret instructions. 158static bool isWrapperOnlyCall(CallBase &CB) { 159 Function *Callee = CB.getCalledFunction(); 160 if (!Callee || Callee->size() != 1) 161 return false; 162 const BasicBlock &BB = Callee->getEntryBlock(); 163 if (const Instruction *I = BB.getFirstNonPHI()) { 164 if (!isa<CallInst>(I)) { 165 return false; 166 } 167 if (isa<ReturnInst>(*std::next(I->getIterator()))) { 168 LLVM_DEBUG(dbgs() << " Wrapper only call detected: " 169 << Callee->getName() << '\n'); 170 return true; 171 } 172 } 173 return false; 174} 175 176InlineCost AMDGPUInliner::getInlineCost(CallBase &CB) { 177 Function *Callee = CB.getCalledFunction(); 178 Function *Caller = CB.getCaller(); 179 180 if (!Callee || Callee->isDeclaration()) 181 return llvm::InlineCost::getNever("undefined callee"); 182 183 if (CB.isNoInline()) 184 return llvm::InlineCost::getNever("noinline"); 185 186 TargetTransformInfo &TTI = TTIWP->getTTI(*Callee); 187 if (!TTI.areInlineCompatible(Caller, Callee)) 188 return llvm::InlineCost::getNever("incompatible"); 189 190 if (CB.hasFnAttr(Attribute::AlwaysInline)) { 191 auto IsViable = isInlineViable(*Callee); 192 if (IsViable.isSuccess()) 193 return llvm::InlineCost::getAlways("alwaysinline viable"); 194 return llvm::InlineCost::getNever(IsViable.getFailureReason()); 195 } 196 197 if (isWrapperOnlyCall(CB)) 198 return llvm::InlineCost::getAlways("wrapper-only call"); 199 200 InlineParams LocalParams = Params; 201 LocalParams.DefaultThreshold = (int)getInlineThreshold(CB); 202 bool RemarksEnabled = false; 203 const auto &BBs = Caller->getBasicBlockList(); 204 if (!BBs.empty()) { 205 auto DI = OptimizationRemark(DEBUG_TYPE, "", DebugLoc(), &BBs.front()); 206 if (DI.isEnabled()) 207 RemarksEnabled = true; 208 } 209 210 OptimizationRemarkEmitter ORE(Caller); 211 auto GetAssumptionCache = [this](Function &F) -> AssumptionCache & { 212 return ACT->getAssumptionCache(F); 213 }; 214 215 auto IC = llvm::getInlineCost(CB, Callee, LocalParams, TTI, 216 GetAssumptionCache, GetTLI, nullptr, PSI, 217 RemarksEnabled ? &ORE : nullptr); 218 219 if (IC && !IC.isAlways() && !Callee->hasFnAttribute(Attribute::InlineHint)) { 220 // Single BB does not increase total BB amount, thus subtract 1 221 size_t Size = Caller->size() + Callee->size() - 1; 222 if (MaxBB && Size > MaxBB) 223 return llvm::InlineCost::getNever("max number of bb exceeded"); 224 } 225 return IC; 226} 227