AMDGPUInline.cpp revision 360784
1//===- AMDGPUInline.cpp - Code to perform simple function inlining --------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
/// This is an AMDGPU-specific replacement of the standard inliner.
/// The main purpose is to account for the fact that calls are not only
/// expensive on the AMDGPU, but much more expensive if a private memory
/// pointer is passed to a function as an argument. In this situation, we are
/// unable to eliminate private memory in the caller unless the callee is
/// inlined, and we end up with slow and expensive scratch access. Thus, we
/// boost the inline threshold for such functions here.
17///
18//===----------------------------------------------------------------------===//
19
20#include "AMDGPU.h"
21#include "llvm/Analysis/AssumptionCache.h"
22#include "llvm/Analysis/CallGraph.h"
23#include "llvm/Analysis/InlineCost.h"
24#include "llvm/Analysis/TargetTransformInfo.h"
25#include "llvm/Analysis/ValueTracking.h"
26#include "llvm/IR/CallSite.h"
27#include "llvm/IR/DataLayout.h"
28#include "llvm/IR/Instructions.h"
29#include "llvm/IR/Module.h"
30#include "llvm/IR/Type.h"
31#include "llvm/InitializePasses.h"
32#include "llvm/Support/CommandLine.h"
33#include "llvm/Support/Debug.h"
34#include "llvm/Transforms/IPO.h"
35#include "llvm/Transforms/IPO/Inliner.h"
36
37using namespace llvm;
38
39#define DEBUG_TYPE "inline"
40
41static cl::opt<int>
42ArgAllocaCost("amdgpu-inline-arg-alloca-cost", cl::Hidden, cl::init(4000),
43              cl::desc("Cost of alloca argument"));
44
45// If the amount of scratch memory to eliminate exceeds our ability to allocate
46// it into registers we gain nothing by aggressively inlining functions for that
47// heuristic.
48static cl::opt<unsigned>
49ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden, cl::init(256),
50                cl::desc("Maximum alloca size to use for inline cost"));
51
52// Inliner constraint to achieve reasonable compilation time
53static cl::opt<size_t>
54MaxBB("amdgpu-inline-max-bb", cl::Hidden, cl::init(1100),
55      cl::desc("Maximum BB number allowed in a function after inlining"
56               " (compile time constraint)"));
57
58namespace {
59
60class AMDGPUInliner : public LegacyInlinerBase {
61
62public:
63  AMDGPUInliner() : LegacyInlinerBase(ID) {
64    initializeAMDGPUInlinerPass(*PassRegistry::getPassRegistry());
65    Params = getInlineParams();
66  }
67
68  static char ID; // Pass identification, replacement for typeid
69
70  unsigned getInlineThreshold(CallSite CS) const;
71
72  InlineCost getInlineCost(CallSite CS) override;
73
74  bool runOnSCC(CallGraphSCC &SCC) override;
75
76  void getAnalysisUsage(AnalysisUsage &AU) const override;
77
78private:
79  TargetTransformInfoWrapperPass *TTIWP;
80
81  InlineParams Params;
82};
83
84} // end anonymous namespace
85
86char AMDGPUInliner::ID = 0;
87INITIALIZE_PASS_BEGIN(AMDGPUInliner, "amdgpu-inline",
88                "AMDGPU Function Integration/Inlining", false, false)
89INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
90INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
91INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
92INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
93INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
94INITIALIZE_PASS_END(AMDGPUInliner, "amdgpu-inline",
95                "AMDGPU Function Integration/Inlining", false, false)
96
97Pass *llvm::createAMDGPUFunctionInliningPass() { return new AMDGPUInliner(); }
98
99bool AMDGPUInliner::runOnSCC(CallGraphSCC &SCC) {
100  TTIWP = &getAnalysis<TargetTransformInfoWrapperPass>();
101  return LegacyInlinerBase::runOnSCC(SCC);
102}
103
104void AMDGPUInliner::getAnalysisUsage(AnalysisUsage &AU) const {
105  AU.addRequired<TargetTransformInfoWrapperPass>();
106  LegacyInlinerBase::getAnalysisUsage(AU);
107}
108
109unsigned AMDGPUInliner::getInlineThreshold(CallSite CS) const {
110  int Thres = Params.DefaultThreshold;
111
112  Function *Caller = CS.getCaller();
113  // Listen to the inlinehint attribute when it would increase the threshold
114  // and the caller does not need to minimize its size.
115  Function *Callee = CS.getCalledFunction();
116  bool InlineHint = Callee && !Callee->isDeclaration() &&
117    Callee->hasFnAttribute(Attribute::InlineHint);
118  if (InlineHint && Params.HintThreshold && Params.HintThreshold > Thres
119      && !Caller->hasFnAttribute(Attribute::MinSize))
120    Thres = Params.HintThreshold.getValue() *
121            TTIWP->getTTI(*Callee).getInliningThresholdMultiplier();
122
123  const DataLayout &DL = Caller->getParent()->getDataLayout();
124  if (!Callee)
125    return (unsigned)Thres;
126
127  // If we have a pointer to private array passed into a function
128  // it will not be optimized out, leaving scratch usage.
129  // Increase the inline threshold to allow inliniting in this case.
130  uint64_t AllocaSize = 0;
131  SmallPtrSet<const AllocaInst *, 8> AIVisited;
132  for (Value *PtrArg : CS.args()) {
133    PointerType *Ty = dyn_cast<PointerType>(PtrArg->getType());
134    if (!Ty || (Ty->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS &&
135                Ty->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS))
136      continue;
137
138    PtrArg = GetUnderlyingObject(PtrArg, DL);
139    if (const AllocaInst *AI = dyn_cast<AllocaInst>(PtrArg)) {
140      if (!AI->isStaticAlloca() || !AIVisited.insert(AI).second)
141        continue;
142      AllocaSize += DL.getTypeAllocSize(AI->getAllocatedType());
143      // If the amount of stack memory is excessive we will not be able
144      // to get rid of the scratch anyway, bail out.
145      if (AllocaSize > ArgAllocaCutoff) {
146        AllocaSize = 0;
147        break;
148      }
149    }
150  }
151  if (AllocaSize)
152    Thres += ArgAllocaCost;
153
154  return (unsigned)Thres;
155}
156
157// Check if call is just a wrapper around another call.
158// In this case we only have call and ret instructions.
159static bool isWrapperOnlyCall(CallSite CS) {
160  Function *Callee = CS.getCalledFunction();
161  if (!Callee || Callee->size() != 1)
162    return false;
163  const BasicBlock &BB = Callee->getEntryBlock();
164  if (const Instruction *I = BB.getFirstNonPHI()) {
165    if (!isa<CallInst>(I)) {
166      return false;
167    }
168    if (isa<ReturnInst>(*std::next(I->getIterator()))) {
169      LLVM_DEBUG(dbgs() << "    Wrapper only call detected: "
170                        << Callee->getName() << '\n');
171      return true;
172    }
173  }
174  return false;
175}
176
177InlineCost AMDGPUInliner::getInlineCost(CallSite CS) {
178  Function *Callee = CS.getCalledFunction();
179  Function *Caller = CS.getCaller();
180
181  if (!Callee || Callee->isDeclaration())
182    return llvm::InlineCost::getNever("undefined callee");
183
184  if (CS.isNoInline())
185    return llvm::InlineCost::getNever("noinline");
186
187  TargetTransformInfo &TTI = TTIWP->getTTI(*Callee);
188  if (!TTI.areInlineCompatible(Caller, Callee))
189    return llvm::InlineCost::getNever("incompatible");
190
191  if (CS.hasFnAttr(Attribute::AlwaysInline)) {
192    auto IsViable = isInlineViable(*Callee);
193    if (IsViable)
194      return llvm::InlineCost::getAlways("alwaysinline viable");
195    return llvm::InlineCost::getNever(IsViable.message);
196  }
197
198  if (isWrapperOnlyCall(CS))
199    return llvm::InlineCost::getAlways("wrapper-only call");
200
201  InlineParams LocalParams = Params;
202  LocalParams.DefaultThreshold = (int)getInlineThreshold(CS);
203  bool RemarksEnabled = false;
204  const auto &BBs = Caller->getBasicBlockList();
205  if (!BBs.empty()) {
206    auto DI = OptimizationRemark(DEBUG_TYPE, "", DebugLoc(), &BBs.front());
207    if (DI.isEnabled())
208      RemarksEnabled = true;
209  }
210
211  OptimizationRemarkEmitter ORE(Caller);
212  std::function<AssumptionCache &(Function &)> GetAssumptionCache =
213      [this](Function &F) -> AssumptionCache & {
214    return ACT->getAssumptionCache(F);
215  };
216
217  auto IC = llvm::getInlineCost(cast<CallBase>(*CS.getInstruction()), Callee,
218                             LocalParams, TTI, GetAssumptionCache, None, PSI,
219                             RemarksEnabled ? &ORE : nullptr);
220
221  if (IC && !IC.isAlways() && !Callee->hasFnAttribute(Attribute::InlineHint)) {
222    // Single BB does not increase total BB amount, thus subtract 1
223    size_t Size = Caller->size() + Callee->size() - 1;
224    if (MaxBB && Size > MaxBB)
225      return llvm::InlineCost::getNever("max number of bb exceeded");
226  }
227  return IC;
228}
229