1326938Sdim//===- AMDGPUInline.cpp - Code to perform simple function inlining --------===//
2326938Sdim//
3353358Sdim// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4353358Sdim// See https://llvm.org/LICENSE.txt for license information.
5353358Sdim// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6326938Sdim//
7326938Sdim//===----------------------------------------------------------------------===//
8326938Sdim//
9326938Sdim/// \file
10341825Sdim/// This is AMDGPU specific replacement of the standard inliner.
/// The main purpose is to account for the fact that calls are not only
/// expensive on the AMDGPU, but much more expensive if a private memory pointer
/// is passed to a function as an argument. In this situation, we are unable to
/// eliminate private memory in the caller unless the callee is inlined, and we
/// end up with slow and expensive scratch access. Thus, we boost the inline
/// threshold for such functions here.
17326938Sdim///
18326938Sdim//===----------------------------------------------------------------------===//
19326938Sdim
20326938Sdim#include "AMDGPU.h"
21326938Sdim#include "llvm/Analysis/AssumptionCache.h"
22326938Sdim#include "llvm/Analysis/CallGraph.h"
23326938Sdim#include "llvm/Analysis/InlineCost.h"
24360784Sdim#include "llvm/Analysis/TargetTransformInfo.h"
25326938Sdim#include "llvm/Analysis/ValueTracking.h"
26326938Sdim#include "llvm/IR/CallSite.h"
27326938Sdim#include "llvm/IR/DataLayout.h"
28326938Sdim#include "llvm/IR/Instructions.h"
29326938Sdim#include "llvm/IR/Module.h"
30326938Sdim#include "llvm/IR/Type.h"
31360784Sdim#include "llvm/InitializePasses.h"
32326938Sdim#include "llvm/Support/CommandLine.h"
33326938Sdim#include "llvm/Support/Debug.h"
34360784Sdim#include "llvm/Transforms/IPO.h"
35326938Sdim#include "llvm/Transforms/IPO/Inliner.h"
36326938Sdim
37326938Sdimusing namespace llvm;
38326938Sdim
39326938Sdim#define DEBUG_TYPE "inline"
40326938Sdim
41326938Sdimstatic cl::opt<int>
42360784SdimArgAllocaCost("amdgpu-inline-arg-alloca-cost", cl::Hidden, cl::init(4000),
43326938Sdim              cl::desc("Cost of alloca argument"));
44326938Sdim
45326938Sdim// If the amount of scratch memory to eliminate exceeds our ability to allocate
46344779Sdim// it into registers we gain nothing by aggressively inlining functions for that
47326938Sdim// heuristic.
48326938Sdimstatic cl::opt<unsigned>
49326938SdimArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden, cl::init(256),
50326938Sdim                cl::desc("Maximum alloca size to use for inline cost"));
51326938Sdim
52353358Sdim// Inliner constraint to achieve reasonable compilation time
53353358Sdimstatic cl::opt<size_t>
54360784SdimMaxBB("amdgpu-inline-max-bb", cl::Hidden, cl::init(1100),
55353358Sdim      cl::desc("Maximum BB number allowed in a function after inlining"
56353358Sdim               " (compile time constraint)"));
57353358Sdim
58326938Sdimnamespace {
59326938Sdim
60326938Sdimclass AMDGPUInliner : public LegacyInlinerBase {
61326938Sdim
62326938Sdimpublic:
63326938Sdim  AMDGPUInliner() : LegacyInlinerBase(ID) {
64326938Sdim    initializeAMDGPUInlinerPass(*PassRegistry::getPassRegistry());
65326938Sdim    Params = getInlineParams();
66326938Sdim  }
67326938Sdim
68326938Sdim  static char ID; // Pass identification, replacement for typeid
69326938Sdim
70326938Sdim  unsigned getInlineThreshold(CallSite CS) const;
71326938Sdim
72326938Sdim  InlineCost getInlineCost(CallSite CS) override;
73326938Sdim
74326938Sdim  bool runOnSCC(CallGraphSCC &SCC) override;
75326938Sdim
76326938Sdim  void getAnalysisUsage(AnalysisUsage &AU) const override;
77326938Sdim
78326938Sdimprivate:
79326938Sdim  TargetTransformInfoWrapperPass *TTIWP;
80326938Sdim
81326938Sdim  InlineParams Params;
82326938Sdim};
83326938Sdim
84326938Sdim} // end anonymous namespace
85326938Sdim
86326938Sdimchar AMDGPUInliner::ID = 0;
87326938SdimINITIALIZE_PASS_BEGIN(AMDGPUInliner, "amdgpu-inline",
88326938Sdim                "AMDGPU Function Integration/Inlining", false, false)
89326938SdimINITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
90326938SdimINITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
91326938SdimINITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
92326938SdimINITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
93326938SdimINITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
94326938SdimINITIALIZE_PASS_END(AMDGPUInliner, "amdgpu-inline",
95326938Sdim                "AMDGPU Function Integration/Inlining", false, false)
96326938Sdim
97326938SdimPass *llvm::createAMDGPUFunctionInliningPass() { return new AMDGPUInliner(); }
98326938Sdim
99326938Sdimbool AMDGPUInliner::runOnSCC(CallGraphSCC &SCC) {
100326938Sdim  TTIWP = &getAnalysis<TargetTransformInfoWrapperPass>();
101326938Sdim  return LegacyInlinerBase::runOnSCC(SCC);
102326938Sdim}
103326938Sdim
104326938Sdimvoid AMDGPUInliner::getAnalysisUsage(AnalysisUsage &AU) const {
105326938Sdim  AU.addRequired<TargetTransformInfoWrapperPass>();
106326938Sdim  LegacyInlinerBase::getAnalysisUsage(AU);
107326938Sdim}
108326938Sdim
109326938Sdimunsigned AMDGPUInliner::getInlineThreshold(CallSite CS) const {
110326938Sdim  int Thres = Params.DefaultThreshold;
111326938Sdim
112326938Sdim  Function *Caller = CS.getCaller();
113326938Sdim  // Listen to the inlinehint attribute when it would increase the threshold
114326938Sdim  // and the caller does not need to minimize its size.
115326938Sdim  Function *Callee = CS.getCalledFunction();
116326938Sdim  bool InlineHint = Callee && !Callee->isDeclaration() &&
117326938Sdim    Callee->hasFnAttribute(Attribute::InlineHint);
118326938Sdim  if (InlineHint && Params.HintThreshold && Params.HintThreshold > Thres
119326938Sdim      && !Caller->hasFnAttribute(Attribute::MinSize))
120353358Sdim    Thres = Params.HintThreshold.getValue() *
121353358Sdim            TTIWP->getTTI(*Callee).getInliningThresholdMultiplier();
122326938Sdim
123326938Sdim  const DataLayout &DL = Caller->getParent()->getDataLayout();
124326938Sdim  if (!Callee)
125326938Sdim    return (unsigned)Thres;
126326938Sdim
127326938Sdim  // If we have a pointer to private array passed into a function
128326938Sdim  // it will not be optimized out, leaving scratch usage.
129326938Sdim  // Increase the inline threshold to allow inliniting in this case.
130326938Sdim  uint64_t AllocaSize = 0;
131326938Sdim  SmallPtrSet<const AllocaInst *, 8> AIVisited;
132326938Sdim  for (Value *PtrArg : CS.args()) {
133353358Sdim    PointerType *Ty = dyn_cast<PointerType>(PtrArg->getType());
134353358Sdim    if (!Ty || (Ty->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS &&
135353358Sdim                Ty->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS))
136326938Sdim      continue;
137353358Sdim
138326938Sdim    PtrArg = GetUnderlyingObject(PtrArg, DL);
139326938Sdim    if (const AllocaInst *AI = dyn_cast<AllocaInst>(PtrArg)) {
140326938Sdim      if (!AI->isStaticAlloca() || !AIVisited.insert(AI).second)
141326938Sdim        continue;
142326938Sdim      AllocaSize += DL.getTypeAllocSize(AI->getAllocatedType());
143326938Sdim      // If the amount of stack memory is excessive we will not be able
144326938Sdim      // to get rid of the scratch anyway, bail out.
145326938Sdim      if (AllocaSize > ArgAllocaCutoff) {
146326938Sdim        AllocaSize = 0;
147326938Sdim        break;
148326938Sdim      }
149326938Sdim    }
150326938Sdim  }
151326938Sdim  if (AllocaSize)
152326938Sdim    Thres += ArgAllocaCost;
153326938Sdim
154326938Sdim  return (unsigned)Thres;
155326938Sdim}
156326938Sdim
157326938Sdim// Check if call is just a wrapper around another call.
158326938Sdim// In this case we only have call and ret instructions.
159326938Sdimstatic bool isWrapperOnlyCall(CallSite CS) {
160326938Sdim  Function *Callee = CS.getCalledFunction();
161326938Sdim  if (!Callee || Callee->size() != 1)
162326938Sdim    return false;
163326938Sdim  const BasicBlock &BB = Callee->getEntryBlock();
164326938Sdim  if (const Instruction *I = BB.getFirstNonPHI()) {
165326938Sdim    if (!isa<CallInst>(I)) {
166326938Sdim      return false;
167326938Sdim    }
168326938Sdim    if (isa<ReturnInst>(*std::next(I->getIterator()))) {
169341825Sdim      LLVM_DEBUG(dbgs() << "    Wrapper only call detected: "
170341825Sdim                        << Callee->getName() << '\n');
171326938Sdim      return true;
172326938Sdim    }
173326938Sdim  }
174326938Sdim  return false;
175326938Sdim}
176326938Sdim
177326938SdimInlineCost AMDGPUInliner::getInlineCost(CallSite CS) {
178326938Sdim  Function *Callee = CS.getCalledFunction();
179326938Sdim  Function *Caller = CS.getCaller();
180326938Sdim
181344779Sdim  if (!Callee || Callee->isDeclaration())
182344779Sdim    return llvm::InlineCost::getNever("undefined callee");
183326938Sdim
184344779Sdim  if (CS.isNoInline())
185344779Sdim    return llvm::InlineCost::getNever("noinline");
186344779Sdim
187353358Sdim  TargetTransformInfo &TTI = TTIWP->getTTI(*Callee);
188344779Sdim  if (!TTI.areInlineCompatible(Caller, Callee))
189344779Sdim    return llvm::InlineCost::getNever("incompatible");
190344779Sdim
191326938Sdim  if (CS.hasFnAttr(Attribute::AlwaysInline)) {
192353358Sdim    auto IsViable = isInlineViable(*Callee);
193353358Sdim    if (IsViable)
194344779Sdim      return llvm::InlineCost::getAlways("alwaysinline viable");
195353358Sdim    return llvm::InlineCost::getNever(IsViable.message);
196326938Sdim  }
197326938Sdim
198326938Sdim  if (isWrapperOnlyCall(CS))
199344779Sdim    return llvm::InlineCost::getAlways("wrapper-only call");
200326938Sdim
201326938Sdim  InlineParams LocalParams = Params;
202326938Sdim  LocalParams.DefaultThreshold = (int)getInlineThreshold(CS);
203326938Sdim  bool RemarksEnabled = false;
204326938Sdim  const auto &BBs = Caller->getBasicBlockList();
205326938Sdim  if (!BBs.empty()) {
206326938Sdim    auto DI = OptimizationRemark(DEBUG_TYPE, "", DebugLoc(), &BBs.front());
207326938Sdim    if (DI.isEnabled())
208326938Sdim      RemarksEnabled = true;
209326938Sdim  }
210326938Sdim
211326938Sdim  OptimizationRemarkEmitter ORE(Caller);
212326938Sdim  std::function<AssumptionCache &(Function &)> GetAssumptionCache =
213326938Sdim      [this](Function &F) -> AssumptionCache & {
214326938Sdim    return ACT->getAssumptionCache(F);
215326938Sdim  };
216326938Sdim
217353358Sdim  auto IC = llvm::getInlineCost(cast<CallBase>(*CS.getInstruction()), Callee,
218353358Sdim                             LocalParams, TTI, GetAssumptionCache, None, PSI,
219353358Sdim                             RemarksEnabled ? &ORE : nullptr);
220353358Sdim
221353358Sdim  if (IC && !IC.isAlways() && !Callee->hasFnAttribute(Attribute::InlineHint)) {
222353358Sdim    // Single BB does not increase total BB amount, thus subtract 1
223353358Sdim    size_t Size = Caller->size() + Callee->size() - 1;
224353358Sdim    if (MaxBB && Size > MaxBB)
225353358Sdim      return llvm::InlineCost::getNever("max number of bb exceeded");
226353358Sdim  }
227353358Sdim  return IC;
228326938Sdim}
229