//===- AMDGPUInline.cpp - Code to perform simple function inlining --------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This is the AMDGPU-specific replacement of the standard inliner.
/// The main purpose is to account for the fact that calls are not only
/// expensive on the AMDGPU, but much more expensive if a private memory
/// pointer is passed to a function as an argument. In this situation, we are
/// unable to eliminate private memory in the caller unless the callee is
/// inlined, and we end up with slow and expensive scratch access. Thus, we
/// boost the inline threshold for such functions here.
17326938Sdim/// 18326938Sdim//===----------------------------------------------------------------------===// 19326938Sdim 20326938Sdim#include "AMDGPU.h" 21326938Sdim#include "llvm/Analysis/AssumptionCache.h" 22326938Sdim#include "llvm/Analysis/CallGraph.h" 23326938Sdim#include "llvm/Analysis/InlineCost.h" 24360784Sdim#include "llvm/Analysis/TargetTransformInfo.h" 25326938Sdim#include "llvm/Analysis/ValueTracking.h" 26326938Sdim#include "llvm/IR/CallSite.h" 27326938Sdim#include "llvm/IR/DataLayout.h" 28326938Sdim#include "llvm/IR/Instructions.h" 29326938Sdim#include "llvm/IR/Module.h" 30326938Sdim#include "llvm/IR/Type.h" 31360784Sdim#include "llvm/InitializePasses.h" 32326938Sdim#include "llvm/Support/CommandLine.h" 33326938Sdim#include "llvm/Support/Debug.h" 34360784Sdim#include "llvm/Transforms/IPO.h" 35326938Sdim#include "llvm/Transforms/IPO/Inliner.h" 36326938Sdim 37326938Sdimusing namespace llvm; 38326938Sdim 39326938Sdim#define DEBUG_TYPE "inline" 40326938Sdim 41326938Sdimstatic cl::opt<int> 42360784SdimArgAllocaCost("amdgpu-inline-arg-alloca-cost", cl::Hidden, cl::init(4000), 43326938Sdim cl::desc("Cost of alloca argument")); 44326938Sdim 45326938Sdim// If the amount of scratch memory to eliminate exceeds our ability to allocate 46344779Sdim// it into registers we gain nothing by aggressively inlining functions for that 47326938Sdim// heuristic. 
48326938Sdimstatic cl::opt<unsigned> 49326938SdimArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden, cl::init(256), 50326938Sdim cl::desc("Maximum alloca size to use for inline cost")); 51326938Sdim 52353358Sdim// Inliner constraint to achieve reasonable compilation time 53353358Sdimstatic cl::opt<size_t> 54360784SdimMaxBB("amdgpu-inline-max-bb", cl::Hidden, cl::init(1100), 55353358Sdim cl::desc("Maximum BB number allowed in a function after inlining" 56353358Sdim " (compile time constraint)")); 57353358Sdim 58326938Sdimnamespace { 59326938Sdim 60326938Sdimclass AMDGPUInliner : public LegacyInlinerBase { 61326938Sdim 62326938Sdimpublic: 63326938Sdim AMDGPUInliner() : LegacyInlinerBase(ID) { 64326938Sdim initializeAMDGPUInlinerPass(*PassRegistry::getPassRegistry()); 65326938Sdim Params = getInlineParams(); 66326938Sdim } 67326938Sdim 68326938Sdim static char ID; // Pass identification, replacement for typeid 69326938Sdim 70326938Sdim unsigned getInlineThreshold(CallSite CS) const; 71326938Sdim 72326938Sdim InlineCost getInlineCost(CallSite CS) override; 73326938Sdim 74326938Sdim bool runOnSCC(CallGraphSCC &SCC) override; 75326938Sdim 76326938Sdim void getAnalysisUsage(AnalysisUsage &AU) const override; 77326938Sdim 78326938Sdimprivate: 79326938Sdim TargetTransformInfoWrapperPass *TTIWP; 80326938Sdim 81326938Sdim InlineParams Params; 82326938Sdim}; 83326938Sdim 84326938Sdim} // end anonymous namespace 85326938Sdim 86326938Sdimchar AMDGPUInliner::ID = 0; 87326938SdimINITIALIZE_PASS_BEGIN(AMDGPUInliner, "amdgpu-inline", 88326938Sdim "AMDGPU Function Integration/Inlining", false, false) 89326938SdimINITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 90326938SdimINITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass) 91326938SdimINITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 92326938SdimINITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 93326938SdimINITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) 
94326938SdimINITIALIZE_PASS_END(AMDGPUInliner, "amdgpu-inline", 95326938Sdim "AMDGPU Function Integration/Inlining", false, false) 96326938Sdim 97326938SdimPass *llvm::createAMDGPUFunctionInliningPass() { return new AMDGPUInliner(); } 98326938Sdim 99326938Sdimbool AMDGPUInliner::runOnSCC(CallGraphSCC &SCC) { 100326938Sdim TTIWP = &getAnalysis<TargetTransformInfoWrapperPass>(); 101326938Sdim return LegacyInlinerBase::runOnSCC(SCC); 102326938Sdim} 103326938Sdim 104326938Sdimvoid AMDGPUInliner::getAnalysisUsage(AnalysisUsage &AU) const { 105326938Sdim AU.addRequired<TargetTransformInfoWrapperPass>(); 106326938Sdim LegacyInlinerBase::getAnalysisUsage(AU); 107326938Sdim} 108326938Sdim 109326938Sdimunsigned AMDGPUInliner::getInlineThreshold(CallSite CS) const { 110326938Sdim int Thres = Params.DefaultThreshold; 111326938Sdim 112326938Sdim Function *Caller = CS.getCaller(); 113326938Sdim // Listen to the inlinehint attribute when it would increase the threshold 114326938Sdim // and the caller does not need to minimize its size. 115326938Sdim Function *Callee = CS.getCalledFunction(); 116326938Sdim bool InlineHint = Callee && !Callee->isDeclaration() && 117326938Sdim Callee->hasFnAttribute(Attribute::InlineHint); 118326938Sdim if (InlineHint && Params.HintThreshold && Params.HintThreshold > Thres 119326938Sdim && !Caller->hasFnAttribute(Attribute::MinSize)) 120353358Sdim Thres = Params.HintThreshold.getValue() * 121353358Sdim TTIWP->getTTI(*Callee).getInliningThresholdMultiplier(); 122326938Sdim 123326938Sdim const DataLayout &DL = Caller->getParent()->getDataLayout(); 124326938Sdim if (!Callee) 125326938Sdim return (unsigned)Thres; 126326938Sdim 127326938Sdim // If we have a pointer to private array passed into a function 128326938Sdim // it will not be optimized out, leaving scratch usage. 129326938Sdim // Increase the inline threshold to allow inliniting in this case. 
130326938Sdim uint64_t AllocaSize = 0; 131326938Sdim SmallPtrSet<const AllocaInst *, 8> AIVisited; 132326938Sdim for (Value *PtrArg : CS.args()) { 133353358Sdim PointerType *Ty = dyn_cast<PointerType>(PtrArg->getType()); 134353358Sdim if (!Ty || (Ty->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS && 135353358Sdim Ty->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS)) 136326938Sdim continue; 137353358Sdim 138326938Sdim PtrArg = GetUnderlyingObject(PtrArg, DL); 139326938Sdim if (const AllocaInst *AI = dyn_cast<AllocaInst>(PtrArg)) { 140326938Sdim if (!AI->isStaticAlloca() || !AIVisited.insert(AI).second) 141326938Sdim continue; 142326938Sdim AllocaSize += DL.getTypeAllocSize(AI->getAllocatedType()); 143326938Sdim // If the amount of stack memory is excessive we will not be able 144326938Sdim // to get rid of the scratch anyway, bail out. 145326938Sdim if (AllocaSize > ArgAllocaCutoff) { 146326938Sdim AllocaSize = 0; 147326938Sdim break; 148326938Sdim } 149326938Sdim } 150326938Sdim } 151326938Sdim if (AllocaSize) 152326938Sdim Thres += ArgAllocaCost; 153326938Sdim 154326938Sdim return (unsigned)Thres; 155326938Sdim} 156326938Sdim 157326938Sdim// Check if call is just a wrapper around another call. 158326938Sdim// In this case we only have call and ret instructions. 
159326938Sdimstatic bool isWrapperOnlyCall(CallSite CS) { 160326938Sdim Function *Callee = CS.getCalledFunction(); 161326938Sdim if (!Callee || Callee->size() != 1) 162326938Sdim return false; 163326938Sdim const BasicBlock &BB = Callee->getEntryBlock(); 164326938Sdim if (const Instruction *I = BB.getFirstNonPHI()) { 165326938Sdim if (!isa<CallInst>(I)) { 166326938Sdim return false; 167326938Sdim } 168326938Sdim if (isa<ReturnInst>(*std::next(I->getIterator()))) { 169341825Sdim LLVM_DEBUG(dbgs() << " Wrapper only call detected: " 170341825Sdim << Callee->getName() << '\n'); 171326938Sdim return true; 172326938Sdim } 173326938Sdim } 174326938Sdim return false; 175326938Sdim} 176326938Sdim 177326938SdimInlineCost AMDGPUInliner::getInlineCost(CallSite CS) { 178326938Sdim Function *Callee = CS.getCalledFunction(); 179326938Sdim Function *Caller = CS.getCaller(); 180326938Sdim 181344779Sdim if (!Callee || Callee->isDeclaration()) 182344779Sdim return llvm::InlineCost::getNever("undefined callee"); 183326938Sdim 184344779Sdim if (CS.isNoInline()) 185344779Sdim return llvm::InlineCost::getNever("noinline"); 186344779Sdim 187353358Sdim TargetTransformInfo &TTI = TTIWP->getTTI(*Callee); 188344779Sdim if (!TTI.areInlineCompatible(Caller, Callee)) 189344779Sdim return llvm::InlineCost::getNever("incompatible"); 190344779Sdim 191326938Sdim if (CS.hasFnAttr(Attribute::AlwaysInline)) { 192353358Sdim auto IsViable = isInlineViable(*Callee); 193353358Sdim if (IsViable) 194344779Sdim return llvm::InlineCost::getAlways("alwaysinline viable"); 195353358Sdim return llvm::InlineCost::getNever(IsViable.message); 196326938Sdim } 197326938Sdim 198326938Sdim if (isWrapperOnlyCall(CS)) 199344779Sdim return llvm::InlineCost::getAlways("wrapper-only call"); 200326938Sdim 201326938Sdim InlineParams LocalParams = Params; 202326938Sdim LocalParams.DefaultThreshold = (int)getInlineThreshold(CS); 203326938Sdim bool RemarksEnabled = false; 204326938Sdim const auto &BBs = 
Caller->getBasicBlockList(); 205326938Sdim if (!BBs.empty()) { 206326938Sdim auto DI = OptimizationRemark(DEBUG_TYPE, "", DebugLoc(), &BBs.front()); 207326938Sdim if (DI.isEnabled()) 208326938Sdim RemarksEnabled = true; 209326938Sdim } 210326938Sdim 211326938Sdim OptimizationRemarkEmitter ORE(Caller); 212326938Sdim std::function<AssumptionCache &(Function &)> GetAssumptionCache = 213326938Sdim [this](Function &F) -> AssumptionCache & { 214326938Sdim return ACT->getAssumptionCache(F); 215326938Sdim }; 216326938Sdim 217353358Sdim auto IC = llvm::getInlineCost(cast<CallBase>(*CS.getInstruction()), Callee, 218353358Sdim LocalParams, TTI, GetAssumptionCache, None, PSI, 219353358Sdim RemarksEnabled ? &ORE : nullptr); 220353358Sdim 221353358Sdim if (IC && !IC.isAlways() && !Callee->hasFnAttribute(Attribute::InlineHint)) { 222353358Sdim // Single BB does not increase total BB amount, thus subtract 1 223353358Sdim size_t Size = Caller->size() + Callee->size() - 1; 224353358Sdim if (MaxBB && Size > MaxBB) 225353358Sdim return llvm::InlineCost::getNever("max number of bb exceeded"); 226353358Sdim } 227353358Sdim return IC; 228326938Sdim} 229