1//===- InlineCost.cpp - Cost analysis for inliner -------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements inline cost analysis.
10//
11//===----------------------------------------------------------------------===//
12
13#include "llvm/Analysis/InlineCost.h"
14#include "llvm/ADT/STLExtras.h"
15#include "llvm/ADT/SetVector.h"
16#include "llvm/ADT/SmallPtrSet.h"
17#include "llvm/ADT/SmallVector.h"
18#include "llvm/ADT/Statistic.h"
19#include "llvm/Analysis/AssumptionCache.h"
20#include "llvm/Analysis/BlockFrequencyInfo.h"
21#include "llvm/Analysis/CodeMetrics.h"
22#include "llvm/Analysis/ConstantFolding.h"
23#include "llvm/Analysis/InstructionSimplify.h"
24#include "llvm/Analysis/LoopInfo.h"
25#include "llvm/Analysis/MemoryBuiltins.h"
26#include "llvm/Analysis/OptimizationRemarkEmitter.h"
27#include "llvm/Analysis/ProfileSummaryInfo.h"
28#include "llvm/Analysis/TargetLibraryInfo.h"
29#include "llvm/Analysis/TargetTransformInfo.h"
30#include "llvm/Analysis/ValueTracking.h"
31#include "llvm/Config/llvm-config.h"
32#include "llvm/IR/AssemblyAnnotationWriter.h"
33#include "llvm/IR/CallingConv.h"
34#include "llvm/IR/DataLayout.h"
35#include "llvm/IR/Dominators.h"
36#include "llvm/IR/GetElementPtrTypeIterator.h"
37#include "llvm/IR/GlobalAlias.h"
38#include "llvm/IR/InstVisitor.h"
39#include "llvm/IR/IntrinsicInst.h"
40#include "llvm/IR/Operator.h"
41#include "llvm/IR/PatternMatch.h"
42#include "llvm/Support/CommandLine.h"
43#include "llvm/Support/Debug.h"
44#include "llvm/Support/FormattedStream.h"
45#include "llvm/Support/raw_ostream.h"
46#include <climits>
47#include <limits>
48#include <optional>
49
50using namespace llvm;
51
52#define DEBUG_TYPE "inline-cost"
53
54STATISTIC(NumCallsAnalyzed, "Number of call sites analyzed");
55
56static cl::opt<int>
57    DefaultThreshold("inlinedefault-threshold", cl::Hidden, cl::init(225),
58                     cl::desc("Default amount of inlining to perform"));
59
60// We introduce this option since there is a minor compile-time win by avoiding
61// addition of TTI attributes (target-features in particular) to inline
62// candidates when they are guaranteed to be the same as top level methods in
63// some use cases. If we avoid adding the attribute, we need an option to avoid
64// checking these attributes.
65static cl::opt<bool> IgnoreTTIInlineCompatible(
66    "ignore-tti-inline-compatible", cl::Hidden, cl::init(false),
67    cl::desc("Ignore TTI attributes compatibility check between callee/caller "
68             "during inline cost calculation"));
69
70static cl::opt<bool> PrintInstructionComments(
71    "print-instruction-comments", cl::Hidden, cl::init(false),
72    cl::desc("Prints comments for instruction based on inline cost analysis"));
73
74static cl::opt<int> InlineThreshold(
75    "inline-threshold", cl::Hidden, cl::init(225),
76    cl::desc("Control the amount of inlining to perform (default = 225)"));
77
78static cl::opt<int> HintThreshold(
79    "inlinehint-threshold", cl::Hidden, cl::init(325),
80    cl::desc("Threshold for inlining functions with inline hint"));
81
82static cl::opt<int>
83    ColdCallSiteThreshold("inline-cold-callsite-threshold", cl::Hidden,
84                          cl::init(45),
85                          cl::desc("Threshold for inlining cold callsites"));
86
87static cl::opt<bool> InlineEnableCostBenefitAnalysis(
88    "inline-enable-cost-benefit-analysis", cl::Hidden, cl::init(false),
89    cl::desc("Enable the cost-benefit analysis for the inliner"));
90
91// InlineSavingsMultiplier overrides per TTI multipliers iff it is
92// specified explicitly in command line options. This option is exposed
93// for tuning and testing.
94static cl::opt<int> InlineSavingsMultiplier(
95    "inline-savings-multiplier", cl::Hidden, cl::init(8),
96    cl::desc("Multiplier to multiply cycle savings by during inlining"));
97
98// InlineSavingsProfitableMultiplier overrides per TTI multipliers iff it is
99// specified explicitly in command line options. This option is exposed
100// for tuning and testing.
101static cl::opt<int> InlineSavingsProfitableMultiplier(
102    "inline-savings-profitable-multiplier", cl::Hidden, cl::init(4),
103    cl::desc("A multiplier on top of cycle savings to decide whether the "
104             "savings won't justify the cost"));
105
106static cl::opt<int>
107    InlineSizeAllowance("inline-size-allowance", cl::Hidden, cl::init(100),
108                        cl::desc("The maximum size of a callee that get's "
109                                 "inlined without sufficient cycle savings"));
110
111// We introduce this threshold to help performance of instrumentation based
112// PGO before we actually hook up inliner with analysis passes such as BPI and
113// BFI.
114static cl::opt<int> ColdThreshold(
115    "inlinecold-threshold", cl::Hidden, cl::init(45),
116    cl::desc("Threshold for inlining functions with cold attribute"));
117
118static cl::opt<int>
119    HotCallSiteThreshold("hot-callsite-threshold", cl::Hidden, cl::init(3000),
120                         cl::desc("Threshold for hot callsites "));
121
122static cl::opt<int> LocallyHotCallSiteThreshold(
123    "locally-hot-callsite-threshold", cl::Hidden, cl::init(525),
124    cl::desc("Threshold for locally hot callsites "));
125
126static cl::opt<int> ColdCallSiteRelFreq(
127    "cold-callsite-rel-freq", cl::Hidden, cl::init(2),
128    cl::desc("Maximum block frequency, expressed as a percentage of caller's "
129             "entry frequency, for a callsite to be cold in the absence of "
130             "profile information."));
131
132static cl::opt<uint64_t> HotCallSiteRelFreq(
133    "hot-callsite-rel-freq", cl::Hidden, cl::init(60),
134    cl::desc("Minimum block frequency, expressed as a multiple of caller's "
135             "entry frequency, for a callsite to be hot in the absence of "
136             "profile information."));
137
138static cl::opt<int>
139    InstrCost("inline-instr-cost", cl::Hidden, cl::init(5),
140              cl::desc("Cost of a single instruction when inlining"));
141
142static cl::opt<int>
143    MemAccessCost("inline-memaccess-cost", cl::Hidden, cl::init(0),
144                  cl::desc("Cost of load/store instruction when inlining"));
145
146static cl::opt<int> CallPenalty(
147    "inline-call-penalty", cl::Hidden, cl::init(25),
148    cl::desc("Call penalty that is applied per callsite when inlining"));
149
150static cl::opt<size_t>
151    StackSizeThreshold("inline-max-stacksize", cl::Hidden,
152                       cl::init(std::numeric_limits<size_t>::max()),
153                       cl::desc("Do not inline functions with a stack size "
154                                "that exceeds the specified limit"));
155
156static cl::opt<size_t> RecurStackSizeThreshold(
157    "recursive-inline-max-stacksize", cl::Hidden,
158    cl::init(InlineConstants::TotalAllocaSizeRecursiveCaller),
159    cl::desc("Do not inline recursive functions with a stack "
160             "size that exceeds the specified limit"));
161
162static cl::opt<bool> OptComputeFullInlineCost(
163    "inline-cost-full", cl::Hidden,
164    cl::desc("Compute the full inline cost of a call site even when the cost "
165             "exceeds the threshold."));
166
167static cl::opt<bool> InlineCallerSupersetNoBuiltin(
168    "inline-caller-superset-nobuiltin", cl::Hidden, cl::init(true),
169    cl::desc("Allow inlining when caller has a superset of callee's nobuiltin "
170             "attributes."));
171
172static cl::opt<bool> DisableGEPConstOperand(
173    "disable-gep-const-evaluation", cl::Hidden, cl::init(false),
174    cl::desc("Disables evaluation of GetElementPtr with constant operands"));
175
176namespace llvm {
177std::optional<int> getStringFnAttrAsInt(const Attribute &Attr) {
178  if (Attr.isValid()) {
179    int AttrValue = 0;
180    if (!Attr.getValueAsString().getAsInteger(10, AttrValue))
181      return AttrValue;
182  }
183  return std::nullopt;
184}
185
186std::optional<int> getStringFnAttrAsInt(CallBase &CB, StringRef AttrKind) {
187  return getStringFnAttrAsInt(CB.getFnAttr(AttrKind));
188}
189
190std::optional<int> getStringFnAttrAsInt(Function *F, StringRef AttrKind) {
191  return getStringFnAttrAsInt(F->getFnAttribute(AttrKind));
192}
193
194namespace InlineConstants {
195int getInstrCost() { return InstrCost; }
196
197} // namespace InlineConstants
198
199} // namespace llvm
200
201namespace {
202class InlineCostCallAnalyzer;
203
204// This struct is used to store information about inline cost of a
205// particular instruction
206struct InstructionCostDetail {
207  int CostBefore = 0;
208  int CostAfter = 0;
209  int ThresholdBefore = 0;
210  int ThresholdAfter = 0;
211
212  int getThresholdDelta() const { return ThresholdAfter - ThresholdBefore; }
213
214  int getCostDelta() const { return CostAfter - CostBefore; }
215
216  bool hasThresholdChanged() const { return ThresholdAfter != ThresholdBefore; }
217};
218
219class InlineCostAnnotationWriter : public AssemblyAnnotationWriter {
220private:
221  InlineCostCallAnalyzer *const ICCA;
222
223public:
224  InlineCostAnnotationWriter(InlineCostCallAnalyzer *ICCA) : ICCA(ICCA) {}
225  void emitInstructionAnnot(const Instruction *I,
226                            formatted_raw_ostream &OS) override;
227};
228
229/// Carry out call site analysis, in order to evaluate inlinability.
230/// NOTE: the type is currently used as implementation detail of functions such
231/// as llvm::getInlineCost. Note the function_ref constructor parameters - the
232/// expectation is that they come from the outer scope, from the wrapper
233/// functions. If we want to support constructing CallAnalyzer objects where
234/// lambdas are provided inline at construction, or where the object needs to
235/// otherwise survive past the scope of the provided functions, we need to
236/// revisit the argument types.
237class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
238  typedef InstVisitor<CallAnalyzer, bool> Base;
239  friend class InstVisitor<CallAnalyzer, bool>;
240
241protected:
242  virtual ~CallAnalyzer() = default;
243  /// The TargetTransformInfo available for this compilation.
244  const TargetTransformInfo &TTI;
245
246  /// Getter for the cache of @llvm.assume intrinsics.
247  function_ref<AssumptionCache &(Function &)> GetAssumptionCache;
248
249  /// Getter for BlockFrequencyInfo
250  function_ref<BlockFrequencyInfo &(Function &)> GetBFI;
251
252  /// Profile summary information.
253  ProfileSummaryInfo *PSI;
254
255  /// The called function.
256  Function &F;
257
258  // Cache the DataLayout since we use it a lot.
259  const DataLayout &DL;
260
261  /// The OptimizationRemarkEmitter available for this compilation.
262  OptimizationRemarkEmitter *ORE;
263
264  /// The candidate callsite being analyzed. Please do not use this to do
265  /// analysis in the caller function; we want the inline cost query to be
266  /// easily cacheable. Instead, use the cover function paramHasAttr.
267  CallBase &CandidateCall;
268
269  /// Extension points for handling callsite features.
270  // Called before a basic block was analyzed.
271  virtual void onBlockStart(const BasicBlock *BB) {}
272
273  /// Called after a basic block was analyzed.
274  virtual void onBlockAnalyzed(const BasicBlock *BB) {}
275
276  /// Called before an instruction was analyzed
277  virtual void onInstructionAnalysisStart(const Instruction *I) {}
278
279  /// Called after an instruction was analyzed
280  virtual void onInstructionAnalysisFinish(const Instruction *I) {}
281
282  /// Called at the end of the analysis of the callsite. Return the outcome of
283  /// the analysis, i.e. 'InlineResult(true)' if the inlining may happen, or
284  /// the reason it can't.
285  virtual InlineResult finalizeAnalysis() { return InlineResult::success(); }
286  /// Called when we're about to start processing a basic block, and every time
287  /// we are done processing an instruction. Return true if there is no point in
288  /// continuing the analysis (e.g. we've determined already the call site is
289  /// too expensive to inline)
290  virtual bool shouldStop() { return false; }
291
292  /// Called before the analysis of the callee body starts (with callsite
293  /// contexts propagated).  It checks callsite-specific information. Return a
294  /// reason analysis can't continue if that's the case, or 'true' if it may
295  /// continue.
296  virtual InlineResult onAnalysisStart() { return InlineResult::success(); }
297  /// Called if the analysis engine decides SROA cannot be done for the given
298  /// alloca.
299  virtual void onDisableSROA(AllocaInst *Arg) {}
300
301  /// Called the analysis engine determines load elimination won't happen.
302  virtual void onDisableLoadElimination() {}
303
304  /// Called when we visit a CallBase, before the analysis starts. Return false
305  /// to stop further processing of the instruction.
306  virtual bool onCallBaseVisitStart(CallBase &Call) { return true; }
307
308  /// Called to account for a call.
309  virtual void onCallPenalty() {}
310
311  /// Called to account for a load or store.
312  virtual void onMemAccess(){};
313
314  /// Called to account for the expectation the inlining would result in a load
315  /// elimination.
316  virtual void onLoadEliminationOpportunity() {}
317
318  /// Called to account for the cost of argument setup for the Call in the
319  /// callee's body (not the callsite currently under analysis).
320  virtual void onCallArgumentSetup(const CallBase &Call) {}
321
322  /// Called to account for a load relative intrinsic.
323  virtual void onLoadRelativeIntrinsic() {}
324
325  /// Called to account for a lowered call.
326  virtual void onLoweredCall(Function *F, CallBase &Call, bool IsIndirectCall) {
327  }
328
329  /// Account for a jump table of given size. Return false to stop further
330  /// processing the switch instruction
331  virtual bool onJumpTable(unsigned JumpTableSize) { return true; }
332
333  /// Account for a case cluster of given size. Return false to stop further
334  /// processing of the instruction.
335  virtual bool onCaseCluster(unsigned NumCaseCluster) { return true; }
336
337  /// Called at the end of processing a switch instruction, with the given
338  /// number of case clusters.
339  virtual void onFinalizeSwitch(unsigned JumpTableSize,
340                                unsigned NumCaseCluster) {}
341
342  /// Called to account for any other instruction not specifically accounted
343  /// for.
344  virtual void onMissedSimplification() {}
345
346  /// Start accounting potential benefits due to SROA for the given alloca.
347  virtual void onInitializeSROAArg(AllocaInst *Arg) {}
348
349  /// Account SROA savings for the AllocaInst value.
350  virtual void onAggregateSROAUse(AllocaInst *V) {}
351
352  bool handleSROA(Value *V, bool DoNotDisable) {
353    // Check for SROA candidates in comparisons.
354    if (auto *SROAArg = getSROAArgForValueOrNull(V)) {
355      if (DoNotDisable) {
356        onAggregateSROAUse(SROAArg);
357        return true;
358      }
359      disableSROAForArg(SROAArg);
360    }
361    return false;
362  }
363
364  bool IsCallerRecursive = false;
365  bool IsRecursiveCall = false;
366  bool ExposesReturnsTwice = false;
367  bool HasDynamicAlloca = false;
368  bool ContainsNoDuplicateCall = false;
369  bool HasReturn = false;
370  bool HasIndirectBr = false;
371  bool HasUninlineableIntrinsic = false;
372  bool InitsVargArgs = false;
373
374  /// Number of bytes allocated statically by the callee.
375  uint64_t AllocatedSize = 0;
376  unsigned NumInstructions = 0;
377  unsigned NumVectorInstructions = 0;
378
379  /// While we walk the potentially-inlined instructions, we build up and
380  /// maintain a mapping of simplified values specific to this callsite. The
381  /// idea is to propagate any special information we have about arguments to
382  /// this call through the inlinable section of the function, and account for
383  /// likely simplifications post-inlining. The most important aspect we track
384  /// is CFG altering simplifications -- when we prove a basic block dead, that
385  /// can cause dramatic shifts in the cost of inlining a function.
386  DenseMap<Value *, Constant *> SimplifiedValues;
387
388  /// Keep track of the values which map back (through function arguments) to
389  /// allocas on the caller stack which could be simplified through SROA.
390  DenseMap<Value *, AllocaInst *> SROAArgValues;
391
392  /// Keep track of Allocas for which we believe we may get SROA optimization.
393  DenseSet<AllocaInst *> EnabledSROAAllocas;
394
395  /// Keep track of values which map to a pointer base and constant offset.
396  DenseMap<Value *, std::pair<Value *, APInt>> ConstantOffsetPtrs;
397
398  /// Keep track of dead blocks due to the constant arguments.
399  SmallPtrSet<BasicBlock *, 16> DeadBlocks;
400
401  /// The mapping of the blocks to their known unique successors due to the
402  /// constant arguments.
403  DenseMap<BasicBlock *, BasicBlock *> KnownSuccessors;
404
405  /// Model the elimination of repeated loads that is expected to happen
406  /// whenever we simplify away the stores that would otherwise cause them to be
407  /// loads.
408  bool EnableLoadElimination = true;
409
410  /// Whether we allow inlining for recursive call.
411  bool AllowRecursiveCall = false;
412
413  SmallPtrSet<Value *, 16> LoadAddrSet;
414
415  AllocaInst *getSROAArgForValueOrNull(Value *V) const {
416    auto It = SROAArgValues.find(V);
417    if (It == SROAArgValues.end() || EnabledSROAAllocas.count(It->second) == 0)
418      return nullptr;
419    return It->second;
420  }
421
422  // Custom simplification helper routines.
423  bool isAllocaDerivedArg(Value *V);
424  void disableSROAForArg(AllocaInst *SROAArg);
425  void disableSROA(Value *V);
426  void findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB);
427  void disableLoadElimination();
428  bool isGEPFree(GetElementPtrInst &GEP);
429  bool canFoldInboundsGEP(GetElementPtrInst &I);
430  bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset);
431  bool simplifyCallSite(Function *F, CallBase &Call);
432  bool simplifyInstruction(Instruction &I);
433  bool simplifyIntrinsicCallIsConstant(CallBase &CB);
434  bool simplifyIntrinsicCallObjectSize(CallBase &CB);
435  ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V);
436
437  /// Return true if the given argument to the function being considered for
438  /// inlining has the given attribute set either at the call site or the
439  /// function declaration.  Primarily used to inspect call site specific
440  /// attributes since these can be more precise than the ones on the callee
441  /// itself.
442  bool paramHasAttr(Argument *A, Attribute::AttrKind Attr);
443
444  /// Return true if the given value is known non null within the callee if
445  /// inlined through this particular callsite.
446  bool isKnownNonNullInCallee(Value *V);
447
448  /// Return true if size growth is allowed when inlining the callee at \p Call.
449  bool allowSizeGrowth(CallBase &Call);
450
451  // Custom analysis routines.
452  InlineResult analyzeBlock(BasicBlock *BB,
453                            SmallPtrSetImpl<const Value *> &EphValues);
454
455  // Disable several entry points to the visitor so we don't accidentally use
456  // them by declaring but not defining them here.
457  void visit(Module *);
458  void visit(Module &);
459  void visit(Function *);
460  void visit(Function &);
461  void visit(BasicBlock *);
462  void visit(BasicBlock &);
463
464  // Provide base case for our instruction visit.
465  bool visitInstruction(Instruction &I);
466
467  // Our visit overrides.
468  bool visitAlloca(AllocaInst &I);
469  bool visitPHI(PHINode &I);
470  bool visitGetElementPtr(GetElementPtrInst &I);
471  bool visitBitCast(BitCastInst &I);
472  bool visitPtrToInt(PtrToIntInst &I);
473  bool visitIntToPtr(IntToPtrInst &I);
474  bool visitCastInst(CastInst &I);
475  bool visitCmpInst(CmpInst &I);
476  bool visitSub(BinaryOperator &I);
477  bool visitBinaryOperator(BinaryOperator &I);
478  bool visitFNeg(UnaryOperator &I);
479  bool visitLoad(LoadInst &I);
480  bool visitStore(StoreInst &I);
481  bool visitExtractValue(ExtractValueInst &I);
482  bool visitInsertValue(InsertValueInst &I);
483  bool visitCallBase(CallBase &Call);
484  bool visitReturnInst(ReturnInst &RI);
485  bool visitBranchInst(BranchInst &BI);
486  bool visitSelectInst(SelectInst &SI);
487  bool visitSwitchInst(SwitchInst &SI);
488  bool visitIndirectBrInst(IndirectBrInst &IBI);
489  bool visitResumeInst(ResumeInst &RI);
490  bool visitCleanupReturnInst(CleanupReturnInst &RI);
491  bool visitCatchReturnInst(CatchReturnInst &RI);
492  bool visitUnreachableInst(UnreachableInst &I);
493
494public:
495  CallAnalyzer(Function &Callee, CallBase &Call, const TargetTransformInfo &TTI,
496               function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
497               function_ref<BlockFrequencyInfo &(Function &)> GetBFI = nullptr,
498               ProfileSummaryInfo *PSI = nullptr,
499               OptimizationRemarkEmitter *ORE = nullptr)
500      : TTI(TTI), GetAssumptionCache(GetAssumptionCache), GetBFI(GetBFI),
501        PSI(PSI), F(Callee), DL(F.getParent()->getDataLayout()), ORE(ORE),
502        CandidateCall(Call) {}
503
504  InlineResult analyze();
505
506  std::optional<Constant *> getSimplifiedValue(Instruction *I) {
507    if (SimplifiedValues.contains(I))
508      return SimplifiedValues[I];
509    return std::nullopt;
510  }
511
512  // Keep a bunch of stats about the cost savings found so we can print them
513  // out when debugging.
514  unsigned NumConstantArgs = 0;
515  unsigned NumConstantOffsetPtrArgs = 0;
516  unsigned NumAllocaArgs = 0;
517  unsigned NumConstantPtrCmps = 0;
518  unsigned NumConstantPtrDiffs = 0;
519  unsigned NumInstructionsSimplified = 0;
520
521  void dump();
522};
523
524// Considering forming a binary search, we should find the number of nodes
525// which is same as the number of comparisons when lowered. For a given
526// number of clusters, n, we can define a recursive function, f(n), to find
527// the number of nodes in the tree. The recursion is :
528// f(n) = 1 + f(n/2) + f (n - n/2), when n > 3,
529// and f(n) = n, when n <= 3.
530// This will lead a binary tree where the leaf should be either f(2) or f(3)
531// when n > 3.  So, the number of comparisons from leaves should be n, while
532// the number of non-leaf should be :
533//   2^(log2(n) - 1) - 1
534//   = 2^log2(n) * 2^-1 - 1
535//   = n / 2 - 1.
536// Considering comparisons from leaf and non-leaf nodes, we can estimate the
537// number of comparisons in a simple closed form :
538//   n + n / 2 - 1 = n * 3 / 2 - 1
539int64_t getExpectedNumberOfCompare(int NumCaseCluster) {
540  return 3 * static_cast<int64_t>(NumCaseCluster) / 2 - 1;
541}
542
543/// FIXME: if it is necessary to derive from InlineCostCallAnalyzer, note
544/// the FIXME in onLoweredCall, when instantiating an InlineCostCallAnalyzer
545class InlineCostCallAnalyzer final : public CallAnalyzer {
546  const bool ComputeFullInlineCost;
547  int LoadEliminationCost = 0;
548  /// Bonus to be applied when percentage of vector instructions in callee is
549  /// high (see more details in updateThreshold).
550  int VectorBonus = 0;
551  /// Bonus to be applied when the callee has only one reachable basic block.
552  int SingleBBBonus = 0;
553
554  /// Tunable parameters that control the analysis.
555  const InlineParams &Params;
556
557  // This DenseMap stores the delta change in cost and threshold after
558  // accounting for the given instruction. The map is filled only with the
559  // flag PrintInstructionComments on.
560  DenseMap<const Instruction *, InstructionCostDetail> InstructionCostDetailMap;
561
562  /// Upper bound for the inlining cost. Bonuses are being applied to account
563  /// for speculative "expected profit" of the inlining decision.
564  int Threshold = 0;
565
566  /// The amount of StaticBonus applied.
567  int StaticBonusApplied = 0;
568
569  /// Attempt to evaluate indirect calls to boost its inline cost.
570  const bool BoostIndirectCalls;
571
572  /// Ignore the threshold when finalizing analysis.
573  const bool IgnoreThreshold;
574
575  // True if the cost-benefit-analysis-based inliner is enabled.
576  const bool CostBenefitAnalysisEnabled;
577
578  /// Inlining cost measured in abstract units, accounts for all the
579  /// instructions expected to be executed for a given function invocation.
580  /// Instructions that are statically proven to be dead based on call-site
581  /// arguments are not counted here.
582  int Cost = 0;
583
584  // The cumulative cost at the beginning of the basic block being analyzed.  At
585  // the end of analyzing each basic block, "Cost - CostAtBBStart" represents
586  // the size of that basic block.
587  int CostAtBBStart = 0;
588
589  // The static size of live but cold basic blocks.  This is "static" in the
590  // sense that it's not weighted by profile counts at all.
591  int ColdSize = 0;
592
593  // Whether inlining is decided by cost-threshold analysis.
594  bool DecidedByCostThreshold = false;
595
596  // Whether inlining is decided by cost-benefit analysis.
597  bool DecidedByCostBenefit = false;
598
599  // The cost-benefit pair computed by cost-benefit analysis.
600  std::optional<CostBenefitPair> CostBenefit;
601
602  bool SingleBB = true;
603
604  unsigned SROACostSavings = 0;
605  unsigned SROACostSavingsLost = 0;
606
607  /// The mapping of caller Alloca values to their accumulated cost savings. If
608  /// we have to disable SROA for one of the allocas, this tells us how much
609  /// cost must be added.
610  DenseMap<AllocaInst *, int> SROAArgCosts;
611
612  /// Return true if \p Call is a cold callsite.
613  bool isColdCallSite(CallBase &Call, BlockFrequencyInfo *CallerBFI);
614
615  /// Update Threshold based on callsite properties such as callee
616  /// attributes and callee hotness for PGO builds. The Callee is explicitly
617  /// passed to support analyzing indirect calls whose target is inferred by
618  /// analysis.
619  void updateThreshold(CallBase &Call, Function &Callee);
620  /// Return a higher threshold if \p Call is a hot callsite.
621  std::optional<int> getHotCallSiteThreshold(CallBase &Call,
622                                             BlockFrequencyInfo *CallerBFI);
623
624  /// Handle a capped 'int' increment for Cost.
625  void addCost(int64_t Inc) {
626    Inc = std::clamp<int64_t>(Inc, INT_MIN, INT_MAX);
627    Cost = std::clamp<int64_t>(Inc + Cost, INT_MIN, INT_MAX);
628  }
629
630  void onDisableSROA(AllocaInst *Arg) override {
631    auto CostIt = SROAArgCosts.find(Arg);
632    if (CostIt == SROAArgCosts.end())
633      return;
634    addCost(CostIt->second);
635    SROACostSavings -= CostIt->second;
636    SROACostSavingsLost += CostIt->second;
637    SROAArgCosts.erase(CostIt);
638  }
639
640  void onDisableLoadElimination() override {
641    addCost(LoadEliminationCost);
642    LoadEliminationCost = 0;
643  }
644
645  bool onCallBaseVisitStart(CallBase &Call) override {
646    if (std::optional<int> AttrCallThresholdBonus =
647            getStringFnAttrAsInt(Call, "call-threshold-bonus"))
648      Threshold += *AttrCallThresholdBonus;
649
650    if (std::optional<int> AttrCallCost =
651            getStringFnAttrAsInt(Call, "call-inline-cost")) {
652      addCost(*AttrCallCost);
653      // Prevent further processing of the call since we want to override its
654      // inline cost, not just add to it.
655      return false;
656    }
657    return true;
658  }
659
660  void onCallPenalty() override { addCost(CallPenalty); }
661
662  void onMemAccess() override { addCost(MemAccessCost); }
663
664  void onCallArgumentSetup(const CallBase &Call) override {
665    // Pay the price of the argument setup. We account for the average 1
666    // instruction per call argument setup here.
667    addCost(Call.arg_size() * InstrCost);
668  }
669  void onLoadRelativeIntrinsic() override {
670    // This is normally lowered to 4 LLVM instructions.
671    addCost(3 * InstrCost);
672  }
673  void onLoweredCall(Function *F, CallBase &Call,
674                     bool IsIndirectCall) override {
675    // We account for the average 1 instruction per call argument setup here.
676    addCost(Call.arg_size() * InstrCost);
677
678    // If we have a constant that we are calling as a function, we can peer
679    // through it and see the function target. This happens not infrequently
680    // during devirtualization and so we want to give it a hefty bonus for
681    // inlining, but cap that bonus in the event that inlining wouldn't pan out.
682    // Pretend to inline the function, with a custom threshold.
683    if (IsIndirectCall && BoostIndirectCalls) {
684      auto IndirectCallParams = Params;
685      IndirectCallParams.DefaultThreshold =
686          InlineConstants::IndirectCallThreshold;
687      /// FIXME: if InlineCostCallAnalyzer is derived from, this may need
688      /// to instantiate the derived class.
689      InlineCostCallAnalyzer CA(*F, Call, IndirectCallParams, TTI,
690                                GetAssumptionCache, GetBFI, PSI, ORE, false);
691      if (CA.analyze().isSuccess()) {
692        // We were able to inline the indirect call! Subtract the cost from the
693        // threshold to get the bonus we want to apply, but don't go below zero.
694        Cost -= std::max(0, CA.getThreshold() - CA.getCost());
695      }
696    } else
697      // Otherwise simply add the cost for merely making the call.
698      addCost(TTI.getInlineCallPenalty(CandidateCall.getCaller(), Call,
699                                       CallPenalty));
700  }
701
702  void onFinalizeSwitch(unsigned JumpTableSize,
703                        unsigned NumCaseCluster) override {
704    // If suitable for a jump table, consider the cost for the table size and
705    // branch to destination.
706    // Maximum valid cost increased in this function.
707    if (JumpTableSize) {
708      int64_t JTCost =
709          static_cast<int64_t>(JumpTableSize) * InstrCost + 4 * InstrCost;
710
711      addCost(JTCost);
712      return;
713    }
714
715    if (NumCaseCluster <= 3) {
716      // Suppose a comparison includes one compare and one conditional branch.
717      addCost(NumCaseCluster * 2 * InstrCost);
718      return;
719    }
720
721    int64_t ExpectedNumberOfCompare =
722        getExpectedNumberOfCompare(NumCaseCluster);
723    int64_t SwitchCost = ExpectedNumberOfCompare * 2 * InstrCost;
724
725    addCost(SwitchCost);
726  }
727  void onMissedSimplification() override { addCost(InstrCost); }
728
729  void onInitializeSROAArg(AllocaInst *Arg) override {
730    assert(Arg != nullptr &&
731           "Should not initialize SROA costs for null value.");
732    auto SROAArgCost = TTI.getCallerAllocaCost(&CandidateCall, Arg);
733    SROACostSavings += SROAArgCost;
734    SROAArgCosts[Arg] = SROAArgCost;
735  }
736
737  void onAggregateSROAUse(AllocaInst *SROAArg) override {
738    auto CostIt = SROAArgCosts.find(SROAArg);
739    assert(CostIt != SROAArgCosts.end() &&
740           "expected this argument to have a cost");
741    CostIt->second += InstrCost;
742    SROACostSavings += InstrCost;
743  }
744
745  void onBlockStart(const BasicBlock *BB) override { CostAtBBStart = Cost; }
746
747  void onBlockAnalyzed(const BasicBlock *BB) override {
748    if (CostBenefitAnalysisEnabled) {
749      // Keep track of the static size of live but cold basic blocks.  For now,
750      // we define a cold basic block to be one that's never executed.
751      assert(GetBFI && "GetBFI must be available");
752      BlockFrequencyInfo *BFI = &(GetBFI(F));
753      assert(BFI && "BFI must be available");
754      auto ProfileCount = BFI->getBlockProfileCount(BB);
755      if (*ProfileCount == 0)
756        ColdSize += Cost - CostAtBBStart;
757    }
758
759    auto *TI = BB->getTerminator();
760    // If we had any successors at this point, than post-inlining is likely to
761    // have them as well. Note that we assume any basic blocks which existed
762    // due to branches or switches which folded above will also fold after
763    // inlining.
764    if (SingleBB && TI->getNumSuccessors() > 1) {
765      // Take off the bonus we applied to the threshold.
766      Threshold -= SingleBBBonus;
767      SingleBB = false;
768    }
769  }
770
771  void onInstructionAnalysisStart(const Instruction *I) override {
772    // This function is called to store the initial cost of inlining before
773    // the given instruction was assessed.
774    if (!PrintInstructionComments)
775      return;
776    InstructionCostDetailMap[I].CostBefore = Cost;
777    InstructionCostDetailMap[I].ThresholdBefore = Threshold;
778  }
779
780  void onInstructionAnalysisFinish(const Instruction *I) override {
781    // This function is called to find new values of cost and threshold after
782    // the instruction has been assessed.
783    if (!PrintInstructionComments)
784      return;
785    InstructionCostDetailMap[I].CostAfter = Cost;
786    InstructionCostDetailMap[I].ThresholdAfter = Threshold;
787  }
788
789  bool isCostBenefitAnalysisEnabled() {
790    if (!PSI || !PSI->hasProfileSummary())
791      return false;
792
793    if (!GetBFI)
794      return false;
795
796    if (InlineEnableCostBenefitAnalysis.getNumOccurrences()) {
797      // Honor the explicit request from the user.
798      if (!InlineEnableCostBenefitAnalysis)
799        return false;
800    } else {
801      // Otherwise, require instrumentation profile.
802      if (!(PSI->hasInstrumentationProfile() || PSI->hasSampleProfile()))
803        return false;
804    }
805
806    auto *Caller = CandidateCall.getParent()->getParent();
807    if (!Caller->getEntryCount())
808      return false;
809
810    BlockFrequencyInfo *CallerBFI = &(GetBFI(*Caller));
811    if (!CallerBFI)
812      return false;
813
814    // For now, limit to hot call site.
815    if (!PSI->isHotCallSite(CandidateCall, CallerBFI))
816      return false;
817
818    // Make sure we have a nonzero entry count.
819    auto EntryCount = F.getEntryCount();
820    if (!EntryCount || !EntryCount->getCount())
821      return false;
822
823    BlockFrequencyInfo *CalleeBFI = &(GetBFI(F));
824    if (!CalleeBFI)
825      return false;
826
827    return true;
828  }
829
830  // A helper function to choose between command line override and default.
831  unsigned getInliningCostBenefitAnalysisSavingsMultiplier() const {
832    if (InlineSavingsMultiplier.getNumOccurrences())
833      return InlineSavingsMultiplier;
834    return TTI.getInliningCostBenefitAnalysisSavingsMultiplier();
835  }
836
837  // A helper function to choose between command line override and default.
838  unsigned getInliningCostBenefitAnalysisProfitableMultiplier() const {
839    if (InlineSavingsProfitableMultiplier.getNumOccurrences())
840      return InlineSavingsProfitableMultiplier;
841    return TTI.getInliningCostBenefitAnalysisProfitableMultiplier();
842  }
843
844  void OverrideCycleSavingsAndSizeForTesting(APInt &CycleSavings, int &Size) {
845    if (std::optional<int> AttrCycleSavings = getStringFnAttrAsInt(
846            CandidateCall, "inline-cycle-savings-for-test")) {
847      CycleSavings = *AttrCycleSavings;
848    }
849
850    if (std::optional<int> AttrRuntimeCost = getStringFnAttrAsInt(
851            CandidateCall, "inline-runtime-cost-for-test")) {
852      Size = *AttrRuntimeCost;
853    }
854  }
855
856  // Determine whether we should inline the given call site, taking into account
857  // both the size cost and the cycle savings.  Return std::nullopt if we don't
858  // have sufficient profiling information to determine.
859  std::optional<bool> costBenefitAnalysis() {
860    if (!CostBenefitAnalysisEnabled)
861      return std::nullopt;
862
863    // buildInlinerPipeline in the pass builder sets HotCallSiteThreshold to 0
864    // for the prelink phase of the AutoFDO + ThinLTO build.  Honor the logic by
865    // falling back to the cost-based metric.
866    // TODO: Improve this hacky condition.
867    if (Threshold == 0)
868      return std::nullopt;
869
870    assert(GetBFI);
871    BlockFrequencyInfo *CalleeBFI = &(GetBFI(F));
872    assert(CalleeBFI);
873
874    // The cycle savings expressed as the sum of InstrCost
875    // multiplied by the estimated dynamic count of each instruction we can
876    // avoid.  Savings come from the call site cost, such as argument setup and
877    // the call instruction, as well as the instructions that are folded.
878    //
879    // We use 128-bit APInt here to avoid potential overflow.  This variable
880    // should stay well below 10^^24 (or 2^^80) in practice.  This "worst" case
881    // assumes that we can avoid or fold a billion instructions, each with a
882    // profile count of 10^^15 -- roughly the number of cycles for a 24-hour
883    // period on a 4GHz machine.
884    APInt CycleSavings(128, 0);
885
886    for (auto &BB : F) {
887      APInt CurrentSavings(128, 0);
888      for (auto &I : BB) {
889        if (BranchInst *BI = dyn_cast<BranchInst>(&I)) {
890          // Count a conditional branch as savings if it becomes unconditional.
891          if (BI->isConditional() &&
892              isa_and_nonnull<ConstantInt>(
893                  SimplifiedValues.lookup(BI->getCondition()))) {
894            CurrentSavings += InstrCost;
895          }
896        } else if (SwitchInst *SI = dyn_cast<SwitchInst>(&I)) {
897          if (isa_and_present<ConstantInt>(SimplifiedValues.lookup(SI->getCondition())))
898            CurrentSavings += InstrCost;
899        } else if (Value *V = dyn_cast<Value>(&I)) {
900          // Count an instruction as savings if we can fold it.
901          if (SimplifiedValues.count(V)) {
902            CurrentSavings += InstrCost;
903          }
904        }
905      }
906
907      auto ProfileCount = CalleeBFI->getBlockProfileCount(&BB);
908      CurrentSavings *= *ProfileCount;
909      CycleSavings += CurrentSavings;
910    }
911
912    // Compute the cycle savings per call.
913    auto EntryProfileCount = F.getEntryCount();
914    assert(EntryProfileCount && EntryProfileCount->getCount());
915    auto EntryCount = EntryProfileCount->getCount();
916    CycleSavings += EntryCount / 2;
917    CycleSavings = CycleSavings.udiv(EntryCount);
918
919    // Compute the total savings for the call site.
920    auto *CallerBB = CandidateCall.getParent();
921    BlockFrequencyInfo *CallerBFI = &(GetBFI(*(CallerBB->getParent())));
922    CycleSavings += getCallsiteCost(TTI, this->CandidateCall, DL);
923    CycleSavings *= *CallerBFI->getBlockProfileCount(CallerBB);
924
925    // Remove the cost of the cold basic blocks to model the runtime cost more
926    // accurately. Both machine block placement and function splitting could
927    // place cold blocks further from hot blocks.
928    int Size = Cost - ColdSize;
929
930    // Allow tiny callees to be inlined regardless of whether they meet the
931    // savings threshold.
932    Size = Size > InlineSizeAllowance ? Size - InlineSizeAllowance : 1;
933
934    OverrideCycleSavingsAndSizeForTesting(CycleSavings, Size);
935    CostBenefit.emplace(APInt(128, Size), CycleSavings);
936
937    // Let R be the ratio of CycleSavings to Size.  We accept the inlining
938    // opportunity if R is really high and reject if R is really low.  If R is
939    // somewhere in the middle, we fall back to the cost-based analysis.
940    //
941    // Specifically, let R = CycleSavings / Size, we accept the inlining
942    // opportunity if:
943    //
944    //             PSI->getOrCompHotCountThreshold()
945    // R > -------------------------------------------------
946    //     getInliningCostBenefitAnalysisSavingsMultiplier()
947    //
948    // and reject the inlining opportunity if:
949    //
950    //                PSI->getOrCompHotCountThreshold()
951    // R <= ----------------------------------------------------
952    //      getInliningCostBenefitAnalysisProfitableMultiplier()
953    //
954    // Otherwise, we fall back to the cost-based analysis.
955    //
956    // Implementation-wise, use multiplication (CycleSavings * Multiplier,
957    // HotCountThreshold * Size) rather than division to avoid precision loss.
958    APInt Threshold(128, PSI->getOrCompHotCountThreshold());
959    Threshold *= Size;
960
961    APInt UpperBoundCycleSavings = CycleSavings;
962    UpperBoundCycleSavings *= getInliningCostBenefitAnalysisSavingsMultiplier();
963    if (UpperBoundCycleSavings.uge(Threshold))
964      return true;
965
966    APInt LowerBoundCycleSavings = CycleSavings;
967    LowerBoundCycleSavings *=
968        getInliningCostBenefitAnalysisProfitableMultiplier();
969    if (LowerBoundCycleSavings.ult(Threshold))
970      return false;
971
972    // Otherwise, fall back to the cost-based analysis.
973    return std::nullopt;
974  }
975
976  InlineResult finalizeAnalysis() override {
977    // Loops generally act a lot like calls in that they act like barriers to
978    // movement, require a certain amount of setup, etc. So when optimising for
979    // size, we penalise any call sites that perform loops. We do this after all
980    // other costs here, so will likely only be dealing with relatively small
981    // functions (and hence DT and LI will hopefully be cheap).
982    auto *Caller = CandidateCall.getFunction();
983    if (Caller->hasMinSize()) {
984      DominatorTree DT(F);
985      LoopInfo LI(DT);
986      int NumLoops = 0;
987      for (Loop *L : LI) {
988        // Ignore loops that will not be executed
989        if (DeadBlocks.count(L->getHeader()))
990          continue;
991        NumLoops++;
992      }
993      addCost(NumLoops * InlineConstants::LoopPenalty);
994    }
995
996    // We applied the maximum possible vector bonus at the beginning. Now,
997    // subtract the excess bonus, if any, from the Threshold before
998    // comparing against Cost.
999    if (NumVectorInstructions <= NumInstructions / 10)
1000      Threshold -= VectorBonus;
1001    else if (NumVectorInstructions <= NumInstructions / 2)
1002      Threshold -= VectorBonus / 2;
1003
1004    if (std::optional<int> AttrCost =
1005            getStringFnAttrAsInt(CandidateCall, "function-inline-cost"))
1006      Cost = *AttrCost;
1007
1008    if (std::optional<int> AttrCostMult = getStringFnAttrAsInt(
1009            CandidateCall,
1010            InlineConstants::FunctionInlineCostMultiplierAttributeName))
1011      Cost *= *AttrCostMult;
1012
1013    if (std::optional<int> AttrThreshold =
1014            getStringFnAttrAsInt(CandidateCall, "function-inline-threshold"))
1015      Threshold = *AttrThreshold;
1016
1017    if (auto Result = costBenefitAnalysis()) {
1018      DecidedByCostBenefit = true;
1019      if (*Result)
1020        return InlineResult::success();
1021      else
1022        return InlineResult::failure("Cost over threshold.");
1023    }
1024
1025    if (IgnoreThreshold)
1026      return InlineResult::success();
1027
1028    DecidedByCostThreshold = true;
1029    return Cost < std::max(1, Threshold)
1030               ? InlineResult::success()
1031               : InlineResult::failure("Cost over threshold.");
1032  }
1033
1034  bool shouldStop() override {
1035    if (IgnoreThreshold || ComputeFullInlineCost)
1036      return false;
1037    // Bail out the moment we cross the threshold. This means we'll under-count
1038    // the cost, but only when undercounting doesn't matter.
1039    if (Cost < Threshold)
1040      return false;
1041    DecidedByCostThreshold = true;
1042    return true;
1043  }
1044
1045  void onLoadEliminationOpportunity() override {
1046    LoadEliminationCost += InstrCost;
1047  }
1048
1049  InlineResult onAnalysisStart() override {
1050    // Perform some tweaks to the cost and threshold based on the direct
1051    // callsite information.
1052
1053    // We want to more aggressively inline vector-dense kernels, so up the
1054    // threshold, and we'll lower it if the % of vector instructions gets too
1055    // low. Note that these bonuses are some what arbitrary and evolved over
1056    // time by accident as much as because they are principled bonuses.
1057    //
1058    // FIXME: It would be nice to remove all such bonuses. At least it would be
1059    // nice to base the bonus values on something more scientific.
1060    assert(NumInstructions == 0);
1061    assert(NumVectorInstructions == 0);
1062
1063    // Update the threshold based on callsite properties
1064    updateThreshold(CandidateCall, F);
1065
1066    // While Threshold depends on commandline options that can take negative
1067    // values, we want to enforce the invariant that the computed threshold and
1068    // bonuses are non-negative.
1069    assert(Threshold >= 0);
1070    assert(SingleBBBonus >= 0);
1071    assert(VectorBonus >= 0);
1072
1073    // Speculatively apply all possible bonuses to Threshold. If cost exceeds
1074    // this Threshold any time, and cost cannot decrease, we can stop processing
1075    // the rest of the function body.
1076    Threshold += (SingleBBBonus + VectorBonus);
1077
1078    // Give out bonuses for the callsite, as the instructions setting them up
1079    // will be gone after inlining.
1080    addCost(-getCallsiteCost(TTI, this->CandidateCall, DL));
1081
1082    // If this function uses the coldcc calling convention, prefer not to inline
1083    // it.
1084    if (F.getCallingConv() == CallingConv::Cold)
1085      Cost += InlineConstants::ColdccPenalty;
1086
1087    LLVM_DEBUG(dbgs() << "      Initial cost: " << Cost << "\n");
1088
1089    // Check if we're done. This can happen due to bonuses and penalties.
1090    if (Cost >= Threshold && !ComputeFullInlineCost)
1091      return InlineResult::failure("high cost");
1092
1093    return InlineResult::success();
1094  }
1095
1096public:
1097  InlineCostCallAnalyzer(
1098      Function &Callee, CallBase &Call, const InlineParams &Params,
1099      const TargetTransformInfo &TTI,
1100      function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
1101      function_ref<BlockFrequencyInfo &(Function &)> GetBFI = nullptr,
1102      ProfileSummaryInfo *PSI = nullptr,
1103      OptimizationRemarkEmitter *ORE = nullptr, bool BoostIndirect = true,
1104      bool IgnoreThreshold = false)
1105      : CallAnalyzer(Callee, Call, TTI, GetAssumptionCache, GetBFI, PSI, ORE),
1106        ComputeFullInlineCost(OptComputeFullInlineCost ||
1107                              Params.ComputeFullInlineCost || ORE ||
1108                              isCostBenefitAnalysisEnabled()),
1109        Params(Params), Threshold(Params.DefaultThreshold),
1110        BoostIndirectCalls(BoostIndirect), IgnoreThreshold(IgnoreThreshold),
1111        CostBenefitAnalysisEnabled(isCostBenefitAnalysisEnabled()),
1112        Writer(this) {
1113    AllowRecursiveCall = *Params.AllowRecursiveCall;
1114  }
1115
1116  /// Annotation Writer for instruction details
1117  InlineCostAnnotationWriter Writer;
1118
1119  void dump();
1120
1121  // Prints the same analysis as dump(), but its definition is not dependent
1122  // on the build.
1123  void print(raw_ostream &OS);
1124
1125  std::optional<InstructionCostDetail> getCostDetails(const Instruction *I) {
1126    if (InstructionCostDetailMap.contains(I))
1127      return InstructionCostDetailMap[I];
1128    return std::nullopt;
1129  }
1130
1131  virtual ~InlineCostCallAnalyzer() = default;
1132  int getThreshold() const { return Threshold; }
1133  int getCost() const { return Cost; }
1134  int getStaticBonusApplied() const { return StaticBonusApplied; }
1135  std::optional<CostBenefitPair> getCostBenefitPair() { return CostBenefit; }
1136  bool wasDecidedByCostBenefit() const { return DecidedByCostBenefit; }
1137  bool wasDecidedByCostThreshold() const { return DecidedByCostThreshold; }
1138};
1139
1140// Return true if CB is the sole call to local function Callee.
1141static bool isSoleCallToLocalFunction(const CallBase &CB,
1142                                      const Function &Callee) {
1143  return Callee.hasLocalLinkage() && Callee.hasOneLiveUse() &&
1144         &Callee == CB.getCalledFunction();
1145}
1146
1147class InlineCostFeaturesAnalyzer final : public CallAnalyzer {
1148private:
1149  InlineCostFeatures Cost = {};
1150
1151  // FIXME: These constants are taken from the heuristic-based cost visitor.
1152  // These should be removed entirely in a later revision to avoid reliance on
1153  // heuristics in the ML inliner.
1154  static constexpr int JTCostMultiplier = 4;
1155  static constexpr int CaseClusterCostMultiplier = 2;
1156  static constexpr int SwitchCostMultiplier = 2;
1157
1158  // FIXME: These are taken from the heuristic-based cost visitor: we should
1159  // eventually abstract these to the CallAnalyzer to avoid duplication.
1160  unsigned SROACostSavingOpportunities = 0;
1161  int VectorBonus = 0;
1162  int SingleBBBonus = 0;
1163  int Threshold = 5;
1164
1165  DenseMap<AllocaInst *, unsigned> SROACosts;
1166
1167  void increment(InlineCostFeatureIndex Feature, int64_t Delta = 1) {
1168    Cost[static_cast<size_t>(Feature)] += Delta;
1169  }
1170
1171  void set(InlineCostFeatureIndex Feature, int64_t Value) {
1172    Cost[static_cast<size_t>(Feature)] = Value;
1173  }
1174
1175  void onDisableSROA(AllocaInst *Arg) override {
1176    auto CostIt = SROACosts.find(Arg);
1177    if (CostIt == SROACosts.end())
1178      return;
1179
1180    increment(InlineCostFeatureIndex::sroa_losses, CostIt->second);
1181    SROACostSavingOpportunities -= CostIt->second;
1182    SROACosts.erase(CostIt);
1183  }
1184
1185  void onDisableLoadElimination() override {
1186    set(InlineCostFeatureIndex::load_elimination, 1);
1187  }
1188
1189  void onCallPenalty() override {
1190    increment(InlineCostFeatureIndex::call_penalty, CallPenalty);
1191  }
1192
1193  void onCallArgumentSetup(const CallBase &Call) override {
1194    increment(InlineCostFeatureIndex::call_argument_setup,
1195              Call.arg_size() * InstrCost);
1196  }
1197
1198  void onLoadRelativeIntrinsic() override {
1199    increment(InlineCostFeatureIndex::load_relative_intrinsic, 3 * InstrCost);
1200  }
1201
1202  void onLoweredCall(Function *F, CallBase &Call,
1203                     bool IsIndirectCall) override {
1204    increment(InlineCostFeatureIndex::lowered_call_arg_setup,
1205              Call.arg_size() * InstrCost);
1206
1207    if (IsIndirectCall) {
1208      InlineParams IndirectCallParams = {/* DefaultThreshold*/ 0,
1209                                         /*HintThreshold*/ {},
1210                                         /*ColdThreshold*/ {},
1211                                         /*OptSizeThreshold*/ {},
1212                                         /*OptMinSizeThreshold*/ {},
1213                                         /*HotCallSiteThreshold*/ {},
1214                                         /*LocallyHotCallSiteThreshold*/ {},
1215                                         /*ColdCallSiteThreshold*/ {},
1216                                         /*ComputeFullInlineCost*/ true,
1217                                         /*EnableDeferral*/ true};
1218      IndirectCallParams.DefaultThreshold =
1219          InlineConstants::IndirectCallThreshold;
1220
1221      InlineCostCallAnalyzer CA(*F, Call, IndirectCallParams, TTI,
1222                                GetAssumptionCache, GetBFI, PSI, ORE, false,
1223                                true);
1224      if (CA.analyze().isSuccess()) {
1225        increment(InlineCostFeatureIndex::nested_inline_cost_estimate,
1226                  CA.getCost());
1227        increment(InlineCostFeatureIndex::nested_inlines, 1);
1228      }
1229    } else {
1230      onCallPenalty();
1231    }
1232  }
1233
1234  void onFinalizeSwitch(unsigned JumpTableSize,
1235                        unsigned NumCaseCluster) override {
1236
1237    if (JumpTableSize) {
1238      int64_t JTCost = static_cast<int64_t>(JumpTableSize) * InstrCost +
1239                       JTCostMultiplier * InstrCost;
1240      increment(InlineCostFeatureIndex::jump_table_penalty, JTCost);
1241      return;
1242    }
1243
1244    if (NumCaseCluster <= 3) {
1245      increment(InlineCostFeatureIndex::case_cluster_penalty,
1246                NumCaseCluster * CaseClusterCostMultiplier * InstrCost);
1247      return;
1248    }
1249
1250    int64_t ExpectedNumberOfCompare =
1251        getExpectedNumberOfCompare(NumCaseCluster);
1252
1253    int64_t SwitchCost =
1254        ExpectedNumberOfCompare * SwitchCostMultiplier * InstrCost;
1255    increment(InlineCostFeatureIndex::switch_penalty, SwitchCost);
1256  }
1257
1258  void onMissedSimplification() override {
1259    increment(InlineCostFeatureIndex::unsimplified_common_instructions,
1260              InstrCost);
1261  }
1262
1263  void onInitializeSROAArg(AllocaInst *Arg) override {
1264    auto SROAArgCost = TTI.getCallerAllocaCost(&CandidateCall, Arg);
1265    SROACosts[Arg] = SROAArgCost;
1266    SROACostSavingOpportunities += SROAArgCost;
1267  }
1268
1269  void onAggregateSROAUse(AllocaInst *Arg) override {
1270    SROACosts.find(Arg)->second += InstrCost;
1271    SROACostSavingOpportunities += InstrCost;
1272  }
1273
1274  void onBlockAnalyzed(const BasicBlock *BB) override {
1275    if (BB->getTerminator()->getNumSuccessors() > 1)
1276      set(InlineCostFeatureIndex::is_multiple_blocks, 1);
1277    Threshold -= SingleBBBonus;
1278  }
1279
1280  InlineResult finalizeAnalysis() override {
1281    auto *Caller = CandidateCall.getFunction();
1282    if (Caller->hasMinSize()) {
1283      DominatorTree DT(F);
1284      LoopInfo LI(DT);
1285      for (Loop *L : LI) {
1286        // Ignore loops that will not be executed
1287        if (DeadBlocks.count(L->getHeader()))
1288          continue;
1289        increment(InlineCostFeatureIndex::num_loops,
1290                  InlineConstants::LoopPenalty);
1291      }
1292    }
1293    set(InlineCostFeatureIndex::dead_blocks, DeadBlocks.size());
1294    set(InlineCostFeatureIndex::simplified_instructions,
1295        NumInstructionsSimplified);
1296    set(InlineCostFeatureIndex::constant_args, NumConstantArgs);
1297    set(InlineCostFeatureIndex::constant_offset_ptr_args,
1298        NumConstantOffsetPtrArgs);
1299    set(InlineCostFeatureIndex::sroa_savings, SROACostSavingOpportunities);
1300
1301    if (NumVectorInstructions <= NumInstructions / 10)
1302      Threshold -= VectorBonus;
1303    else if (NumVectorInstructions <= NumInstructions / 2)
1304      Threshold -= VectorBonus / 2;
1305
1306    set(InlineCostFeatureIndex::threshold, Threshold);
1307
1308    return InlineResult::success();
1309  }
1310
1311  bool shouldStop() override { return false; }
1312
1313  void onLoadEliminationOpportunity() override {
1314    increment(InlineCostFeatureIndex::load_elimination, 1);
1315  }
1316
1317  InlineResult onAnalysisStart() override {
1318    increment(InlineCostFeatureIndex::callsite_cost,
1319              -1 * getCallsiteCost(TTI, this->CandidateCall, DL));
1320
1321    set(InlineCostFeatureIndex::cold_cc_penalty,
1322        (F.getCallingConv() == CallingConv::Cold));
1323
1324    set(InlineCostFeatureIndex::last_call_to_static_bonus,
1325        isSoleCallToLocalFunction(CandidateCall, F));
1326
1327    // FIXME: we shouldn't repeat this logic in both the Features and Cost
1328    // analyzer - instead, we should abstract it to a common method in the
1329    // CallAnalyzer
1330    int SingleBBBonusPercent = 50;
1331    int VectorBonusPercent = TTI.getInlinerVectorBonusPercent();
1332    Threshold += TTI.adjustInliningThreshold(&CandidateCall);
1333    Threshold *= TTI.getInliningThresholdMultiplier();
1334    SingleBBBonus = Threshold * SingleBBBonusPercent / 100;
1335    VectorBonus = Threshold * VectorBonusPercent / 100;
1336    Threshold += (SingleBBBonus + VectorBonus);
1337
1338    return InlineResult::success();
1339  }
1340
1341public:
1342  InlineCostFeaturesAnalyzer(
1343      const TargetTransformInfo &TTI,
1344      function_ref<AssumptionCache &(Function &)> &GetAssumptionCache,
1345      function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
1346      ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE, Function &Callee,
1347      CallBase &Call)
1348      : CallAnalyzer(Callee, Call, TTI, GetAssumptionCache, GetBFI, PSI) {}
1349
1350  const InlineCostFeatures &features() const { return Cost; }
1351};
1352
1353} // namespace
1354
1355/// Test whether the given value is an Alloca-derived function argument.
1356bool CallAnalyzer::isAllocaDerivedArg(Value *V) {
1357  return SROAArgValues.count(V);
1358}
1359
1360void CallAnalyzer::disableSROAForArg(AllocaInst *SROAArg) {
1361  onDisableSROA(SROAArg);
1362  EnabledSROAAllocas.erase(SROAArg);
1363  disableLoadElimination();
1364}
1365
1366void InlineCostAnnotationWriter::emitInstructionAnnot(
1367    const Instruction *I, formatted_raw_ostream &OS) {
1368  // The cost of inlining of the given instruction is printed always.
1369  // The threshold delta is printed only when it is non-zero. It happens
1370  // when we decided to give a bonus at a particular instruction.
1371  std::optional<InstructionCostDetail> Record = ICCA->getCostDetails(I);
1372  if (!Record)
1373    OS << "; No analysis for the instruction";
1374  else {
1375    OS << "; cost before = " << Record->CostBefore
1376       << ", cost after = " << Record->CostAfter
1377       << ", threshold before = " << Record->ThresholdBefore
1378       << ", threshold after = " << Record->ThresholdAfter << ", ";
1379    OS << "cost delta = " << Record->getCostDelta();
1380    if (Record->hasThresholdChanged())
1381      OS << ", threshold delta = " << Record->getThresholdDelta();
1382  }
1383  auto C = ICCA->getSimplifiedValue(const_cast<Instruction *>(I));
1384  if (C) {
1385    OS << ", simplified to ";
1386    (*C)->print(OS, true);
1387  }
1388  OS << "\n";
1389}
1390
1391/// If 'V' maps to a SROA candidate, disable SROA for it.
1392void CallAnalyzer::disableSROA(Value *V) {
1393  if (auto *SROAArg = getSROAArgForValueOrNull(V)) {
1394    disableSROAForArg(SROAArg);
1395  }
1396}
1397
1398void CallAnalyzer::disableLoadElimination() {
1399  if (EnableLoadElimination) {
1400    onDisableLoadElimination();
1401    EnableLoadElimination = false;
1402  }
1403}
1404
1405/// Accumulate a constant GEP offset into an APInt if possible.
1406///
1407/// Returns false if unable to compute the offset for any reason. Respects any
1408/// simplified values known during the analysis of this callsite.
1409bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) {
1410  unsigned IntPtrWidth = DL.getIndexTypeSizeInBits(GEP.getType());
1411  assert(IntPtrWidth == Offset.getBitWidth());
1412
1413  for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP);
1414       GTI != GTE; ++GTI) {
1415    ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand());
1416    if (!OpC)
1417      if (Constant *SimpleOp = SimplifiedValues.lookup(GTI.getOperand()))
1418        OpC = dyn_cast<ConstantInt>(SimpleOp);
1419    if (!OpC)
1420      return false;
1421    if (OpC->isZero())
1422      continue;
1423
1424    // Handle a struct index, which adds its field offset to the pointer.
1425    if (StructType *STy = GTI.getStructTypeOrNull()) {
1426      unsigned ElementIdx = OpC->getZExtValue();
1427      const StructLayout *SL = DL.getStructLayout(STy);
1428      Offset += APInt(IntPtrWidth, SL->getElementOffset(ElementIdx));
1429      continue;
1430    }
1431
1432    APInt TypeSize(IntPtrWidth, GTI.getSequentialElementStride(DL));
1433    Offset += OpC->getValue().sextOrTrunc(IntPtrWidth) * TypeSize;
1434  }
1435  return true;
1436}
1437
1438/// Use TTI to check whether a GEP is free.
1439///
1440/// Respects any simplified values known during the analysis of this callsite.
1441bool CallAnalyzer::isGEPFree(GetElementPtrInst &GEP) {
1442  SmallVector<Value *, 4> Operands;
1443  Operands.push_back(GEP.getOperand(0));
1444  for (const Use &Op : GEP.indices())
1445    if (Constant *SimpleOp = SimplifiedValues.lookup(Op))
1446      Operands.push_back(SimpleOp);
1447    else
1448      Operands.push_back(Op);
1449  return TTI.getInstructionCost(&GEP, Operands,
1450                                TargetTransformInfo::TCK_SizeAndLatency) ==
1451         TargetTransformInfo::TCC_Free;
1452}
1453
1454bool CallAnalyzer::visitAlloca(AllocaInst &I) {
1455  disableSROA(I.getOperand(0));
1456
1457  // Check whether inlining will turn a dynamic alloca into a static
1458  // alloca and handle that case.
1459  if (I.isArrayAllocation()) {
1460    Constant *Size = SimplifiedValues.lookup(I.getArraySize());
1461    if (auto *AllocSize = dyn_cast_or_null<ConstantInt>(Size)) {
1462      // Sometimes a dynamic alloca could be converted into a static alloca
1463      // after this constant prop, and become a huge static alloca on an
1464      // unconditional CFG path. Avoid inlining if this is going to happen above
1465      // a threshold.
1466      // FIXME: If the threshold is removed or lowered too much, we could end up
1467      // being too pessimistic and prevent inlining non-problematic code. This
1468      // could result in unintended perf regressions. A better overall strategy
1469      // is needed to track stack usage during inlining.
1470      Type *Ty = I.getAllocatedType();
1471      AllocatedSize = SaturatingMultiplyAdd(
1472          AllocSize->getLimitedValue(),
1473          DL.getTypeAllocSize(Ty).getKnownMinValue(), AllocatedSize);
1474      if (AllocatedSize > InlineConstants::MaxSimplifiedDynamicAllocaToInline)
1475        HasDynamicAlloca = true;
1476      return false;
1477    }
1478  }
1479
1480  // Accumulate the allocated size.
1481  if (I.isStaticAlloca()) {
1482    Type *Ty = I.getAllocatedType();
1483    AllocatedSize = SaturatingAdd(DL.getTypeAllocSize(Ty).getKnownMinValue(),
1484                                  AllocatedSize);
1485  }
1486
1487  // FIXME: This is overly conservative. Dynamic allocas are inefficient for
1488  // a variety of reasons, and so we would like to not inline them into
1489  // functions which don't currently have a dynamic alloca. This simply
1490  // disables inlining altogether in the presence of a dynamic alloca.
1491  if (!I.isStaticAlloca())
1492    HasDynamicAlloca = true;
1493
1494  return false;
1495}
1496
1497bool CallAnalyzer::visitPHI(PHINode &I) {
1498  // FIXME: We need to propagate SROA *disabling* through phi nodes, even
1499  // though we don't want to propagate it's bonuses. The idea is to disable
1500  // SROA if it *might* be used in an inappropriate manner.
1501
1502  // Phi nodes are always zero-cost.
1503  // FIXME: Pointer sizes may differ between different address spaces, so do we
1504  // need to use correct address space in the call to getPointerSizeInBits here?
1505  // Or could we skip the getPointerSizeInBits call completely? As far as I can
1506  // see the ZeroOffset is used as a dummy value, so we can probably use any
1507  // bit width for the ZeroOffset?
1508  APInt ZeroOffset = APInt::getZero(DL.getPointerSizeInBits(0));
1509  bool CheckSROA = I.getType()->isPointerTy();
1510
1511  // Track the constant or pointer with constant offset we've seen so far.
1512  Constant *FirstC = nullptr;
1513  std::pair<Value *, APInt> FirstBaseAndOffset = {nullptr, ZeroOffset};
1514  Value *FirstV = nullptr;
1515
1516  for (unsigned i = 0, e = I.getNumIncomingValues(); i != e; ++i) {
1517    BasicBlock *Pred = I.getIncomingBlock(i);
1518    // If the incoming block is dead, skip the incoming block.
1519    if (DeadBlocks.count(Pred))
1520      continue;
1521    // If the parent block of phi is not the known successor of the incoming
1522    // block, skip the incoming block.
1523    BasicBlock *KnownSuccessor = KnownSuccessors[Pred];
1524    if (KnownSuccessor && KnownSuccessor != I.getParent())
1525      continue;
1526
1527    Value *V = I.getIncomingValue(i);
1528    // If the incoming value is this phi itself, skip the incoming value.
1529    if (&I == V)
1530      continue;
1531
1532    Constant *C = dyn_cast<Constant>(V);
1533    if (!C)
1534      C = SimplifiedValues.lookup(V);
1535
1536    std::pair<Value *, APInt> BaseAndOffset = {nullptr, ZeroOffset};
1537    if (!C && CheckSROA)
1538      BaseAndOffset = ConstantOffsetPtrs.lookup(V);
1539
1540    if (!C && !BaseAndOffset.first)
1541      // The incoming value is neither a constant nor a pointer with constant
1542      // offset, exit early.
1543      return true;
1544
1545    if (FirstC) {
1546      if (FirstC == C)
1547        // If we've seen a constant incoming value before and it is the same
1548        // constant we see this time, continue checking the next incoming value.
1549        continue;
1550      // Otherwise early exit because we either see a different constant or saw
1551      // a constant before but we have a pointer with constant offset this time.
1552      return true;
1553    }
1554
1555    if (FirstV) {
1556      // The same logic as above, but check pointer with constant offset here.
1557      if (FirstBaseAndOffset == BaseAndOffset)
1558        continue;
1559      return true;
1560    }
1561
1562    if (C) {
1563      // This is the 1st time we've seen a constant, record it.
1564      FirstC = C;
1565      continue;
1566    }
1567
1568    // The remaining case is that this is the 1st time we've seen a pointer with
1569    // constant offset, record it.
1570    FirstV = V;
1571    FirstBaseAndOffset = BaseAndOffset;
1572  }
1573
1574  // Check if we can map phi to a constant.
1575  if (FirstC) {
1576    SimplifiedValues[&I] = FirstC;
1577    return true;
1578  }
1579
1580  // Check if we can map phi to a pointer with constant offset.
1581  if (FirstBaseAndOffset.first) {
1582    ConstantOffsetPtrs[&I] = FirstBaseAndOffset;
1583
1584    if (auto *SROAArg = getSROAArgForValueOrNull(FirstV))
1585      SROAArgValues[&I] = SROAArg;
1586  }
1587
1588  return true;
1589}
1590
1591/// Check we can fold GEPs of constant-offset call site argument pointers.
1592/// This requires target data and inbounds GEPs.
1593///
1594/// \return true if the specified GEP can be folded.
1595bool CallAnalyzer::canFoldInboundsGEP(GetElementPtrInst &I) {
1596  // Check if we have a base + offset for the pointer.
1597  std::pair<Value *, APInt> BaseAndOffset =
1598      ConstantOffsetPtrs.lookup(I.getPointerOperand());
1599  if (!BaseAndOffset.first)
1600    return false;
1601
1602  // Check if the offset of this GEP is constant, and if so accumulate it
1603  // into Offset.
1604  if (!accumulateGEPOffset(cast<GEPOperator>(I), BaseAndOffset.second))
1605    return false;
1606
1607  // Add the result as a new mapping to Base + Offset.
1608  ConstantOffsetPtrs[&I] = BaseAndOffset;
1609
1610  return true;
1611}
1612
1613bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
1614  auto *SROAArg = getSROAArgForValueOrNull(I.getPointerOperand());
1615
1616  // Lambda to check whether a GEP's indices are all constant.
1617  auto IsGEPOffsetConstant = [&](GetElementPtrInst &GEP) {
1618    for (const Use &Op : GEP.indices())
1619      if (!isa<Constant>(Op) && !SimplifiedValues.lookup(Op))
1620        return false;
1621    return true;
1622  };
1623
1624  if (!DisableGEPConstOperand)
1625    if (simplifyInstruction(I))
1626      return true;
1627
1628  if ((I.isInBounds() && canFoldInboundsGEP(I)) || IsGEPOffsetConstant(I)) {
1629    if (SROAArg)
1630      SROAArgValues[&I] = SROAArg;
1631
1632    // Constant GEPs are modeled as free.
1633    return true;
1634  }
1635
1636  // Variable GEPs will require math and will disable SROA.
1637  if (SROAArg)
1638    disableSROAForArg(SROAArg);
1639  return isGEPFree(I);
1640}
1641
1642/// Simplify \p I if its operands are constants and update SimplifiedValues.
1643bool CallAnalyzer::simplifyInstruction(Instruction &I) {
1644  SmallVector<Constant *> COps;
1645  for (Value *Op : I.operands()) {
1646    Constant *COp = dyn_cast<Constant>(Op);
1647    if (!COp)
1648      COp = SimplifiedValues.lookup(Op);
1649    if (!COp)
1650      return false;
1651    COps.push_back(COp);
1652  }
1653  auto *C = ConstantFoldInstOperands(&I, COps, DL);
1654  if (!C)
1655    return false;
1656  SimplifiedValues[&I] = C;
1657  return true;
1658}
1659
1660/// Try to simplify a call to llvm.is.constant.
1661///
1662/// Duplicate the argument checking from CallAnalyzer::simplifyCallSite since
1663/// we expect calls of this specific intrinsic to be infrequent.
1664///
1665/// FIXME: Given that we know CB's parent (F) caller
1666/// (CandidateCall->getParent()->getParent()), we might be able to determine
1667/// whether inlining F into F's caller would change how the call to
1668/// llvm.is.constant would evaluate.
1669bool CallAnalyzer::simplifyIntrinsicCallIsConstant(CallBase &CB) {
1670  Value *Arg = CB.getArgOperand(0);
1671  auto *C = dyn_cast<Constant>(Arg);
1672
1673  if (!C)
1674    C = dyn_cast_or_null<Constant>(SimplifiedValues.lookup(Arg));
1675
1676  Type *RT = CB.getFunctionType()->getReturnType();
1677  SimplifiedValues[&CB] = ConstantInt::get(RT, C ? 1 : 0);
1678  return true;
1679}
1680
1681bool CallAnalyzer::simplifyIntrinsicCallObjectSize(CallBase &CB) {
1682  // As per the langref, "The fourth argument to llvm.objectsize determines if
1683  // the value should be evaluated at runtime."
1684  if (cast<ConstantInt>(CB.getArgOperand(3))->isOne())
1685    return false;
1686
1687  Value *V = lowerObjectSizeCall(&cast<IntrinsicInst>(CB), DL, nullptr,
1688                                 /*MustSucceed=*/true);
1689  Constant *C = dyn_cast_or_null<Constant>(V);
1690  if (C)
1691    SimplifiedValues[&CB] = C;
1692  return C;
1693}
1694
1695bool CallAnalyzer::visitBitCast(BitCastInst &I) {
1696  // Propagate constants through bitcasts.
1697  if (simplifyInstruction(I))
1698    return true;
1699
1700  // Track base/offsets through casts
1701  std::pair<Value *, APInt> BaseAndOffset =
1702      ConstantOffsetPtrs.lookup(I.getOperand(0));
1703  // Casts don't change the offset, just wrap it up.
1704  if (BaseAndOffset.first)
1705    ConstantOffsetPtrs[&I] = BaseAndOffset;
1706
1707  // Also look for SROA candidates here.
1708  if (auto *SROAArg = getSROAArgForValueOrNull(I.getOperand(0)))
1709    SROAArgValues[&I] = SROAArg;
1710
1711  // Bitcasts are always zero cost.
1712  return true;
1713}
1714
1715bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
1716  // Propagate constants through ptrtoint.
1717  if (simplifyInstruction(I))
1718    return true;
1719
1720  // Track base/offset pairs when converted to a plain integer provided the
1721  // integer is large enough to represent the pointer.
1722  unsigned IntegerSize = I.getType()->getScalarSizeInBits();
1723  unsigned AS = I.getOperand(0)->getType()->getPointerAddressSpace();
1724  if (IntegerSize == DL.getPointerSizeInBits(AS)) {
1725    std::pair<Value *, APInt> BaseAndOffset =
1726        ConstantOffsetPtrs.lookup(I.getOperand(0));
1727    if (BaseAndOffset.first)
1728      ConstantOffsetPtrs[&I] = BaseAndOffset;
1729  }
1730
1731  // This is really weird. Technically, ptrtoint will disable SROA. However,
1732  // unless that ptrtoint is *used* somewhere in the live basic blocks after
1733  // inlining, it will be nuked, and SROA should proceed. All of the uses which
1734  // would block SROA would also block SROA if applied directly to a pointer,
1735  // and so we can just add the integer in here. The only places where SROA is
1736  // preserved either cannot fire on an integer, or won't in-and-of themselves
1737  // disable SROA (ext) w/o some later use that we would see and disable.
1738  if (auto *SROAArg = getSROAArgForValueOrNull(I.getOperand(0)))
1739    SROAArgValues[&I] = SROAArg;
1740
1741  return TTI.getInstructionCost(&I, TargetTransformInfo::TCK_SizeAndLatency) ==
1742         TargetTransformInfo::TCC_Free;
1743}
1744
1745bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
1746  // Propagate constants through ptrtoint.
1747  if (simplifyInstruction(I))
1748    return true;
1749
1750  // Track base/offset pairs when round-tripped through a pointer without
1751  // modifications provided the integer is not too large.
1752  Value *Op = I.getOperand(0);
1753  unsigned IntegerSize = Op->getType()->getScalarSizeInBits();
1754  if (IntegerSize <= DL.getPointerTypeSizeInBits(I.getType())) {
1755    std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Op);
1756    if (BaseAndOffset.first)
1757      ConstantOffsetPtrs[&I] = BaseAndOffset;
1758  }
1759
1760  // "Propagate" SROA here in the same manner as we do for ptrtoint above.
1761  if (auto *SROAArg = getSROAArgForValueOrNull(Op))
1762    SROAArgValues[&I] = SROAArg;
1763
1764  return TTI.getInstructionCost(&I, TargetTransformInfo::TCK_SizeAndLatency) ==
1765         TargetTransformInfo::TCC_Free;
1766}
1767
1768bool CallAnalyzer::visitCastInst(CastInst &I) {
1769  // Propagate constants through casts.
1770  if (simplifyInstruction(I))
1771    return true;
1772
1773  // Disable SROA in the face of arbitrary casts we don't explicitly list
1774  // elsewhere.
1775  disableSROA(I.getOperand(0));
1776
1777  // If this is a floating-point cast, and the target says this operation
1778  // is expensive, this may eventually become a library call. Treat the cost
1779  // as such.
1780  switch (I.getOpcode()) {
1781  case Instruction::FPTrunc:
1782  case Instruction::FPExt:
1783  case Instruction::UIToFP:
1784  case Instruction::SIToFP:
1785  case Instruction::FPToUI:
1786  case Instruction::FPToSI:
1787    if (TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive)
1788      onCallPenalty();
1789    break;
1790  default:
1791    break;
1792  }
1793
1794  return TTI.getInstructionCost(&I, TargetTransformInfo::TCK_SizeAndLatency) ==
1795         TargetTransformInfo::TCC_Free;
1796}
1797
1798bool CallAnalyzer::paramHasAttr(Argument *A, Attribute::AttrKind Attr) {
1799  return CandidateCall.paramHasAttr(A->getArgNo(), Attr);
1800}
1801
1802bool CallAnalyzer::isKnownNonNullInCallee(Value *V) {
1803  // Does the *call site* have the NonNull attribute set on an argument?  We
1804  // use the attribute on the call site to memoize any analysis done in the
1805  // caller. This will also trip if the callee function has a non-null
1806  // parameter attribute, but that's a less interesting case because hopefully
1807  // the callee would already have been simplified based on that.
1808  if (Argument *A = dyn_cast<Argument>(V))
1809    if (paramHasAttr(A, Attribute::NonNull))
1810      return true;
1811
1812  // Is this an alloca in the caller?  This is distinct from the attribute case
1813  // above because attributes aren't updated within the inliner itself and we
1814  // always want to catch the alloca derived case.
1815  if (isAllocaDerivedArg(V))
1816    // We can actually predict the result of comparisons between an
1817    // alloca-derived value and null. Note that this fires regardless of
1818    // SROA firing.
1819    return true;
1820
1821  return false;
1822}
1823
1824bool CallAnalyzer::allowSizeGrowth(CallBase &Call) {
1825  // If the normal destination of the invoke or the parent block of the call
1826  // site is unreachable-terminated, there is little point in inlining this
1827  // unless there is literally zero cost.
1828  // FIXME: Note that it is possible that an unreachable-terminated block has a
1829  // hot entry. For example, in below scenario inlining hot_call_X() may be
1830  // beneficial :
1831  // main() {
1832  //   hot_call_1();
1833  //   ...
1834  //   hot_call_N()
1835  //   exit(0);
1836  // }
1837  // For now, we are not handling this corner case here as it is rare in real
1838  // code. In future, we should elaborate this based on BPI and BFI in more
1839  // general threshold adjusting heuristics in updateThreshold().
1840  if (InvokeInst *II = dyn_cast<InvokeInst>(&Call)) {
1841    if (isa<UnreachableInst>(II->getNormalDest()->getTerminator()))
1842      return false;
1843  } else if (isa<UnreachableInst>(Call.getParent()->getTerminator()))
1844    return false;
1845
1846  return true;
1847}
1848
1849bool InlineCostCallAnalyzer::isColdCallSite(CallBase &Call,
1850                                            BlockFrequencyInfo *CallerBFI) {
1851  // If global profile summary is available, then callsite's coldness is
1852  // determined based on that.
1853  if (PSI && PSI->hasProfileSummary())
1854    return PSI->isColdCallSite(Call, CallerBFI);
1855
1856  // Otherwise we need BFI to be available.
1857  if (!CallerBFI)
1858    return false;
1859
1860  // Determine if the callsite is cold relative to caller's entry. We could
1861  // potentially cache the computation of scaled entry frequency, but the added
1862  // complexity is not worth it unless this scaling shows up high in the
1863  // profiles.
1864  const BranchProbability ColdProb(ColdCallSiteRelFreq, 100);
1865  auto CallSiteBB = Call.getParent();
1866  auto CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB);
1867  auto CallerEntryFreq =
1868      CallerBFI->getBlockFreq(&(Call.getCaller()->getEntryBlock()));
1869  return CallSiteFreq < CallerEntryFreq * ColdProb;
1870}
1871
1872std::optional<int>
1873InlineCostCallAnalyzer::getHotCallSiteThreshold(CallBase &Call,
1874                                                BlockFrequencyInfo *CallerBFI) {
1875
1876  // If global profile summary is available, then callsite's hotness is
1877  // determined based on that.
1878  if (PSI && PSI->hasProfileSummary() && PSI->isHotCallSite(Call, CallerBFI))
1879    return Params.HotCallSiteThreshold;
1880
1881  // Otherwise we need BFI to be available and to have a locally hot callsite
1882  // threshold.
1883  if (!CallerBFI || !Params.LocallyHotCallSiteThreshold)
1884    return std::nullopt;
1885
1886  // Determine if the callsite is hot relative to caller's entry. We could
1887  // potentially cache the computation of scaled entry frequency, but the added
1888  // complexity is not worth it unless this scaling shows up high in the
1889  // profiles.
1890  const BasicBlock *CallSiteBB = Call.getParent();
1891  BlockFrequency CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB);
1892  BlockFrequency CallerEntryFreq = CallerBFI->getEntryFreq();
1893  std::optional<BlockFrequency> Limit = CallerEntryFreq.mul(HotCallSiteRelFreq);
1894  if (Limit && CallSiteFreq >= *Limit)
1895    return Params.LocallyHotCallSiteThreshold;
1896
1897  // Otherwise treat it normally.
1898  return std::nullopt;
1899}
1900
1901void InlineCostCallAnalyzer::updateThreshold(CallBase &Call, Function &Callee) {
1902  // If no size growth is allowed for this inlining, set Threshold to 0.
1903  if (!allowSizeGrowth(Call)) {
1904    Threshold = 0;
1905    return;
1906  }
1907
1908  Function *Caller = Call.getCaller();
1909
1910  // return min(A, B) if B is valid.
1911  auto MinIfValid = [](int A, std::optional<int> B) {
1912    return B ? std::min(A, *B) : A;
1913  };
1914
1915  // return max(A, B) if B is valid.
1916  auto MaxIfValid = [](int A, std::optional<int> B) {
1917    return B ? std::max(A, *B) : A;
1918  };
1919
1920  // Various bonus percentages. These are multiplied by Threshold to get the
1921  // bonus values.
1922  // SingleBBBonus: This bonus is applied if the callee has a single reachable
1923  // basic block at the given callsite context. This is speculatively applied
1924  // and withdrawn if more than one basic block is seen.
1925  //
1926  // LstCallToStaticBonus: This large bonus is applied to ensure the inlining
1927  // of the last call to a static function as inlining such functions is
1928  // guaranteed to reduce code size.
1929  //
1930  // These bonus percentages may be set to 0 based on properties of the caller
1931  // and the callsite.
1932  int SingleBBBonusPercent = 50;
1933  int VectorBonusPercent = TTI.getInlinerVectorBonusPercent();
1934  int LastCallToStaticBonus = InlineConstants::LastCallToStaticBonus;
1935
1936  // Lambda to set all the above bonus and bonus percentages to 0.
1937  auto DisallowAllBonuses = [&]() {
1938    SingleBBBonusPercent = 0;
1939    VectorBonusPercent = 0;
1940    LastCallToStaticBonus = 0;
1941  };
1942
1943  // Use the OptMinSizeThreshold or OptSizeThreshold knob if they are available
1944  // and reduce the threshold if the caller has the necessary attribute.
1945  if (Caller->hasMinSize()) {
1946    Threshold = MinIfValid(Threshold, Params.OptMinSizeThreshold);
1947    // For minsize, we want to disable the single BB bonus and the vector
1948    // bonuses, but not the last-call-to-static bonus. Inlining the last call to
1949    // a static function will, at the minimum, eliminate the parameter setup and
1950    // call/return instructions.
1951    SingleBBBonusPercent = 0;
1952    VectorBonusPercent = 0;
1953  } else if (Caller->hasOptSize())
1954    Threshold = MinIfValid(Threshold, Params.OptSizeThreshold);
1955
1956  // Adjust the threshold based on inlinehint attribute and profile based
1957  // hotness information if the caller does not have MinSize attribute.
1958  if (!Caller->hasMinSize()) {
1959    if (Callee.hasFnAttribute(Attribute::InlineHint))
1960      Threshold = MaxIfValid(Threshold, Params.HintThreshold);
1961
1962    // FIXME: After switching to the new passmanager, simplify the logic below
1963    // by checking only the callsite hotness/coldness as we will reliably
1964    // have local profile information.
1965    //
1966    // Callsite hotness and coldness can be determined if sample profile is
1967    // used (which adds hotness metadata to calls) or if caller's
1968    // BlockFrequencyInfo is available.
1969    BlockFrequencyInfo *CallerBFI = GetBFI ? &(GetBFI(*Caller)) : nullptr;
1970    auto HotCallSiteThreshold = getHotCallSiteThreshold(Call, CallerBFI);
1971    if (!Caller->hasOptSize() && HotCallSiteThreshold) {
1972      LLVM_DEBUG(dbgs() << "Hot callsite.\n");
1973      // FIXME: This should update the threshold only if it exceeds the
1974      // current threshold, but AutoFDO + ThinLTO currently relies on this
1975      // behavior to prevent inlining of hot callsites during ThinLTO
1976      // compile phase.
1977      Threshold = *HotCallSiteThreshold;
1978    } else if (isColdCallSite(Call, CallerBFI)) {
1979      LLVM_DEBUG(dbgs() << "Cold callsite.\n");
1980      // Do not apply bonuses for a cold callsite including the
1981      // LastCallToStatic bonus. While this bonus might result in code size
1982      // reduction, it can cause the size of a non-cold caller to increase
1983      // preventing it from being inlined.
1984      DisallowAllBonuses();
1985      Threshold = MinIfValid(Threshold, Params.ColdCallSiteThreshold);
1986    } else if (PSI) {
1987      // Use callee's global profile information only if we have no way of
1988      // determining this via callsite information.
1989      if (PSI->isFunctionEntryHot(&Callee)) {
1990        LLVM_DEBUG(dbgs() << "Hot callee.\n");
1991        // If callsite hotness can not be determined, we may still know
1992        // that the callee is hot and treat it as a weaker hint for threshold
1993        // increase.
1994        Threshold = MaxIfValid(Threshold, Params.HintThreshold);
1995      } else if (PSI->isFunctionEntryCold(&Callee)) {
1996        LLVM_DEBUG(dbgs() << "Cold callee.\n");
1997        // Do not apply bonuses for a cold callee including the
1998        // LastCallToStatic bonus. While this bonus might result in code size
1999        // reduction, it can cause the size of a non-cold caller to increase
2000        // preventing it from being inlined.
2001        DisallowAllBonuses();
2002        Threshold = MinIfValid(Threshold, Params.ColdThreshold);
2003      }
2004    }
2005  }
2006
2007  Threshold += TTI.adjustInliningThreshold(&Call);
2008
2009  // Finally, take the target-specific inlining threshold multiplier into
2010  // account.
2011  Threshold *= TTI.getInliningThresholdMultiplier();
2012
2013  SingleBBBonus = Threshold * SingleBBBonusPercent / 100;
2014  VectorBonus = Threshold * VectorBonusPercent / 100;
2015
2016  // If there is only one call of the function, and it has internal linkage,
2017  // the cost of inlining it drops dramatically. It may seem odd to update
2018  // Cost in updateThreshold, but the bonus depends on the logic in this method.
2019  if (isSoleCallToLocalFunction(Call, F)) {
2020    Cost -= LastCallToStaticBonus;
2021    StaticBonusApplied = LastCallToStaticBonus;
2022  }
2023}
2024
2025bool CallAnalyzer::visitCmpInst(CmpInst &I) {
2026  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
2027  // First try to handle simplified comparisons.
2028  if (simplifyInstruction(I))
2029    return true;
2030
2031  if (I.getOpcode() == Instruction::FCmp)
2032    return false;
2033
2034  // Otherwise look for a comparison between constant offset pointers with
2035  // a common base.
2036  Value *LHSBase, *RHSBase;
2037  APInt LHSOffset, RHSOffset;
2038  std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
2039  if (LHSBase) {
2040    std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
2041    if (RHSBase && LHSBase == RHSBase) {
2042      // We have common bases, fold the icmp to a constant based on the
2043      // offsets.
2044      Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
2045      Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
2046      if (Constant *C = ConstantExpr::getICmp(I.getPredicate(), CLHS, CRHS)) {
2047        SimplifiedValues[&I] = C;
2048        ++NumConstantPtrCmps;
2049        return true;
2050      }
2051    }
2052  }
2053
2054  auto isImplicitNullCheckCmp = [](const CmpInst &I) {
2055    for (auto *User : I.users())
2056      if (auto *Instr = dyn_cast<Instruction>(User))
2057        if (!Instr->getMetadata(LLVMContext::MD_make_implicit))
2058          return false;
2059    return true;
2060  };
2061
2062  // If the comparison is an equality comparison with null, we can simplify it
2063  // if we know the value (argument) can't be null
2064  if (I.isEquality() && isa<ConstantPointerNull>(I.getOperand(1))) {
2065    if (isKnownNonNullInCallee(I.getOperand(0))) {
2066      bool IsNotEqual = I.getPredicate() == CmpInst::ICMP_NE;
2067      SimplifiedValues[&I] = IsNotEqual ? ConstantInt::getTrue(I.getType())
2068                                        : ConstantInt::getFalse(I.getType());
2069      return true;
2070    }
2071    // Implicit null checks act as unconditional branches and their comparisons
2072    // should be treated as simplified and free of cost.
2073    if (isImplicitNullCheckCmp(I))
2074      return true;
2075  }
2076  return handleSROA(I.getOperand(0), isa<ConstantPointerNull>(I.getOperand(1)));
2077}
2078
2079bool CallAnalyzer::visitSub(BinaryOperator &I) {
2080  // Try to handle a special case: we can fold computing the difference of two
2081  // constant-related pointers.
2082  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
2083  Value *LHSBase, *RHSBase;
2084  APInt LHSOffset, RHSOffset;
2085  std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
2086  if (LHSBase) {
2087    std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
2088    if (RHSBase && LHSBase == RHSBase) {
2089      // We have common bases, fold the subtract to a constant based on the
2090      // offsets.
2091      Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
2092      Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
2093      if (Constant *C = ConstantExpr::getSub(CLHS, CRHS)) {
2094        SimplifiedValues[&I] = C;
2095        ++NumConstantPtrDiffs;
2096        return true;
2097      }
2098    }
2099  }
2100
2101  // Otherwise, fall back to the generic logic for simplifying and handling
2102  // instructions.
2103  return Base::visitSub(I);
2104}
2105
2106bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) {
2107  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
2108  Constant *CLHS = dyn_cast<Constant>(LHS);
2109  if (!CLHS)
2110    CLHS = SimplifiedValues.lookup(LHS);
2111  Constant *CRHS = dyn_cast<Constant>(RHS);
2112  if (!CRHS)
2113    CRHS = SimplifiedValues.lookup(RHS);
2114
2115  Value *SimpleV = nullptr;
2116  if (auto FI = dyn_cast<FPMathOperator>(&I))
2117    SimpleV = simplifyBinOp(I.getOpcode(), CLHS ? CLHS : LHS, CRHS ? CRHS : RHS,
2118                            FI->getFastMathFlags(), DL);
2119  else
2120    SimpleV =
2121        simplifyBinOp(I.getOpcode(), CLHS ? CLHS : LHS, CRHS ? CRHS : RHS, DL);
2122
2123  if (Constant *C = dyn_cast_or_null<Constant>(SimpleV))
2124    SimplifiedValues[&I] = C;
2125
2126  if (SimpleV)
2127    return true;
2128
2129  // Disable any SROA on arguments to arbitrary, unsimplified binary operators.
2130  disableSROA(LHS);
2131  disableSROA(RHS);
2132
2133  // If the instruction is floating point, and the target says this operation
2134  // is expensive, this may eventually become a library call. Treat the cost
2135  // as such. Unless it's fneg which can be implemented with an xor.
2136  using namespace llvm::PatternMatch;
2137  if (I.getType()->isFloatingPointTy() &&
2138      TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive &&
2139      !match(&I, m_FNeg(m_Value())))
2140    onCallPenalty();
2141
2142  return false;
2143}
2144
2145bool CallAnalyzer::visitFNeg(UnaryOperator &I) {
2146  Value *Op = I.getOperand(0);
2147  Constant *COp = dyn_cast<Constant>(Op);
2148  if (!COp)
2149    COp = SimplifiedValues.lookup(Op);
2150
2151  Value *SimpleV = simplifyFNegInst(
2152      COp ? COp : Op, cast<FPMathOperator>(I).getFastMathFlags(), DL);
2153
2154  if (Constant *C = dyn_cast_or_null<Constant>(SimpleV))
2155    SimplifiedValues[&I] = C;
2156
2157  if (SimpleV)
2158    return true;
2159
2160  // Disable any SROA on arguments to arbitrary, unsimplified fneg.
2161  disableSROA(Op);
2162
2163  return false;
2164}
2165
2166bool CallAnalyzer::visitLoad(LoadInst &I) {
2167  if (handleSROA(I.getPointerOperand(), I.isSimple()))
2168    return true;
2169
2170  // If the data is already loaded from this address and hasn't been clobbered
2171  // by any stores or calls, this load is likely to be redundant and can be
2172  // eliminated.
2173  if (EnableLoadElimination &&
2174      !LoadAddrSet.insert(I.getPointerOperand()).second && I.isUnordered()) {
2175    onLoadEliminationOpportunity();
2176    return true;
2177  }
2178
2179  onMemAccess();
2180  return false;
2181}
2182
2183bool CallAnalyzer::visitStore(StoreInst &I) {
2184  if (handleSROA(I.getPointerOperand(), I.isSimple()))
2185    return true;
2186
2187  // The store can potentially clobber loads and prevent repeated loads from
2188  // being eliminated.
2189  // FIXME:
2190  // 1. We can probably keep an initial set of eliminatable loads substracted
2191  // from the cost even when we finally see a store. We just need to disable
2192  // *further* accumulation of elimination savings.
2193  // 2. We should probably at some point thread MemorySSA for the callee into
2194  // this and then use that to actually compute *really* precise savings.
2195  disableLoadElimination();
2196
2197  onMemAccess();
2198  return false;
2199}
2200
2201bool CallAnalyzer::visitExtractValue(ExtractValueInst &I) {
2202  // Constant folding for extract value is trivial.
2203  if (simplifyInstruction(I))
2204    return true;
2205
2206  // SROA can't look through these, but they may be free.
2207  return Base::visitExtractValue(I);
2208}
2209
2210bool CallAnalyzer::visitInsertValue(InsertValueInst &I) {
2211  // Constant folding for insert value is trivial.
2212  if (simplifyInstruction(I))
2213    return true;
2214
2215  // SROA can't look through these, but they may be free.
2216  return Base::visitInsertValue(I);
2217}
2218
2219/// Try to simplify a call site.
2220///
2221/// Takes a concrete function and callsite and tries to actually simplify it by
2222/// analyzing the arguments and call itself with instsimplify. Returns true if
2223/// it has simplified the callsite to some other entity (a constant), making it
2224/// free.
2225bool CallAnalyzer::simplifyCallSite(Function *F, CallBase &Call) {
2226  // FIXME: Using the instsimplify logic directly for this is inefficient
2227  // because we have to continually rebuild the argument list even when no
2228  // simplifications can be performed. Until that is fixed with remapping
2229  // inside of instsimplify, directly constant fold calls here.
2230  if (!canConstantFoldCallTo(&Call, F))
2231    return false;
2232
2233  // Try to re-map the arguments to constants.
2234  SmallVector<Constant *, 4> ConstantArgs;
2235  ConstantArgs.reserve(Call.arg_size());
2236  for (Value *I : Call.args()) {
2237    Constant *C = dyn_cast<Constant>(I);
2238    if (!C)
2239      C = dyn_cast_or_null<Constant>(SimplifiedValues.lookup(I));
2240    if (!C)
2241      return false; // This argument doesn't map to a constant.
2242
2243    ConstantArgs.push_back(C);
2244  }
2245  if (Constant *C = ConstantFoldCall(&Call, F, ConstantArgs)) {
2246    SimplifiedValues[&Call] = C;
2247    return true;
2248  }
2249
2250  return false;
2251}
2252
2253bool CallAnalyzer::visitCallBase(CallBase &Call) {
2254  if (!onCallBaseVisitStart(Call))
2255    return true;
2256
2257  if (Call.hasFnAttr(Attribute::ReturnsTwice) &&
2258      !F.hasFnAttribute(Attribute::ReturnsTwice)) {
2259    // This aborts the entire analysis.
2260    ExposesReturnsTwice = true;
2261    return false;
2262  }
2263  if (isa<CallInst>(Call) && cast<CallInst>(Call).cannotDuplicate())
2264    ContainsNoDuplicateCall = true;
2265
2266  Function *F = Call.getCalledFunction();
2267  bool IsIndirectCall = !F;
2268  if (IsIndirectCall) {
2269    // Check if this happens to be an indirect function call to a known function
2270    // in this inline context. If not, we've done all we can.
2271    Value *Callee = Call.getCalledOperand();
2272    F = dyn_cast_or_null<Function>(SimplifiedValues.lookup(Callee));
2273    if (!F || F->getFunctionType() != Call.getFunctionType()) {
2274      onCallArgumentSetup(Call);
2275
2276      if (!Call.onlyReadsMemory())
2277        disableLoadElimination();
2278      return Base::visitCallBase(Call);
2279    }
2280  }
2281
2282  assert(F && "Expected a call to a known function");
2283
2284  // When we have a concrete function, first try to simplify it directly.
2285  if (simplifyCallSite(F, Call))
2286    return true;
2287
2288  // Next check if it is an intrinsic we know about.
2289  // FIXME: Lift this into part of the InstVisitor.
2290  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&Call)) {
2291    switch (II->getIntrinsicID()) {
2292    default:
2293      if (!Call.onlyReadsMemory() && !isAssumeLikeIntrinsic(II))
2294        disableLoadElimination();
2295      return Base::visitCallBase(Call);
2296
2297    case Intrinsic::load_relative:
2298      onLoadRelativeIntrinsic();
2299      return false;
2300
2301    case Intrinsic::memset:
2302    case Intrinsic::memcpy:
2303    case Intrinsic::memmove:
2304      disableLoadElimination();
2305      // SROA can usually chew through these intrinsics, but they aren't free.
2306      return false;
2307    case Intrinsic::icall_branch_funnel:
2308    case Intrinsic::localescape:
2309      HasUninlineableIntrinsic = true;
2310      return false;
2311    case Intrinsic::vastart:
2312      InitsVargArgs = true;
2313      return false;
2314    case Intrinsic::launder_invariant_group:
2315    case Intrinsic::strip_invariant_group:
2316      if (auto *SROAArg = getSROAArgForValueOrNull(II->getOperand(0)))
2317        SROAArgValues[II] = SROAArg;
2318      return true;
2319    case Intrinsic::is_constant:
2320      return simplifyIntrinsicCallIsConstant(Call);
2321    case Intrinsic::objectsize:
2322      return simplifyIntrinsicCallObjectSize(Call);
2323    }
2324  }
2325
2326  if (F == Call.getFunction()) {
2327    // This flag will fully abort the analysis, so don't bother with anything
2328    // else.
2329    IsRecursiveCall = true;
2330    if (!AllowRecursiveCall)
2331      return false;
2332  }
2333
2334  if (TTI.isLoweredToCall(F)) {
2335    onLoweredCall(F, Call, IsIndirectCall);
2336  }
2337
2338  if (!(Call.onlyReadsMemory() || (IsIndirectCall && F->onlyReadsMemory())))
2339    disableLoadElimination();
2340  return Base::visitCallBase(Call);
2341}
2342
2343bool CallAnalyzer::visitReturnInst(ReturnInst &RI) {
2344  // At least one return instruction will be free after inlining.
2345  bool Free = !HasReturn;
2346  HasReturn = true;
2347  return Free;
2348}
2349
2350bool CallAnalyzer::visitBranchInst(BranchInst &BI) {
2351  // We model unconditional branches as essentially free -- they really
2352  // shouldn't exist at all, but handling them makes the behavior of the
2353  // inliner more regular and predictable. Interestingly, conditional branches
2354  // which will fold away are also free.
2355  return BI.isUnconditional() || isa<ConstantInt>(BI.getCondition()) ||
2356         BI.getMetadata(LLVMContext::MD_make_implicit) ||
2357         isa_and_nonnull<ConstantInt>(
2358             SimplifiedValues.lookup(BI.getCondition()));
2359}
2360
2361bool CallAnalyzer::visitSelectInst(SelectInst &SI) {
2362  bool CheckSROA = SI.getType()->isPointerTy();
2363  Value *TrueVal = SI.getTrueValue();
2364  Value *FalseVal = SI.getFalseValue();
2365
2366  Constant *TrueC = dyn_cast<Constant>(TrueVal);
2367  if (!TrueC)
2368    TrueC = SimplifiedValues.lookup(TrueVal);
2369  Constant *FalseC = dyn_cast<Constant>(FalseVal);
2370  if (!FalseC)
2371    FalseC = SimplifiedValues.lookup(FalseVal);
2372  Constant *CondC =
2373      dyn_cast_or_null<Constant>(SimplifiedValues.lookup(SI.getCondition()));
2374
2375  if (!CondC) {
2376    // Select C, X, X => X
2377    if (TrueC == FalseC && TrueC) {
2378      SimplifiedValues[&SI] = TrueC;
2379      return true;
2380    }
2381
2382    if (!CheckSROA)
2383      return Base::visitSelectInst(SI);
2384
2385    std::pair<Value *, APInt> TrueBaseAndOffset =
2386        ConstantOffsetPtrs.lookup(TrueVal);
2387    std::pair<Value *, APInt> FalseBaseAndOffset =
2388        ConstantOffsetPtrs.lookup(FalseVal);
2389    if (TrueBaseAndOffset == FalseBaseAndOffset && TrueBaseAndOffset.first) {
2390      ConstantOffsetPtrs[&SI] = TrueBaseAndOffset;
2391
2392      if (auto *SROAArg = getSROAArgForValueOrNull(TrueVal))
2393        SROAArgValues[&SI] = SROAArg;
2394      return true;
2395    }
2396
2397    return Base::visitSelectInst(SI);
2398  }
2399
2400  // Select condition is a constant.
2401  Value *SelectedV = CondC->isAllOnesValue()  ? TrueVal
2402                     : (CondC->isNullValue()) ? FalseVal
2403                                              : nullptr;
2404  if (!SelectedV) {
2405    // Condition is a vector constant that is not all 1s or all 0s.  If all
2406    // operands are constants, ConstantFoldSelectInstruction() can handle the
2407    // cases such as select vectors.
2408    if (TrueC && FalseC) {
2409      if (auto *C = ConstantFoldSelectInstruction(CondC, TrueC, FalseC)) {
2410        SimplifiedValues[&SI] = C;
2411        return true;
2412      }
2413    }
2414    return Base::visitSelectInst(SI);
2415  }
2416
2417  // Condition is either all 1s or all 0s. SI can be simplified.
2418  if (Constant *SelectedC = dyn_cast<Constant>(SelectedV)) {
2419    SimplifiedValues[&SI] = SelectedC;
2420    return true;
2421  }
2422
2423  if (!CheckSROA)
2424    return true;
2425
2426  std::pair<Value *, APInt> BaseAndOffset =
2427      ConstantOffsetPtrs.lookup(SelectedV);
2428  if (BaseAndOffset.first) {
2429    ConstantOffsetPtrs[&SI] = BaseAndOffset;
2430
2431    if (auto *SROAArg = getSROAArgForValueOrNull(SelectedV))
2432      SROAArgValues[&SI] = SROAArg;
2433  }
2434
2435  return true;
2436}
2437
2438bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) {
2439  // We model unconditional switches as free, see the comments on handling
2440  // branches.
2441  if (isa<ConstantInt>(SI.getCondition()))
2442    return true;
2443  if (Value *V = SimplifiedValues.lookup(SI.getCondition()))
2444    if (isa<ConstantInt>(V))
2445      return true;
2446
2447  // Assume the most general case where the switch is lowered into
2448  // either a jump table, bit test, or a balanced binary tree consisting of
2449  // case clusters without merging adjacent clusters with the same
2450  // destination. We do not consider the switches that are lowered with a mix
2451  // of jump table/bit test/binary search tree. The cost of the switch is
2452  // proportional to the size of the tree or the size of jump table range.
2453  //
2454  // NB: We convert large switches which are just used to initialize large phi
2455  // nodes to lookup tables instead in simplifycfg, so this shouldn't prevent
2456  // inlining those. It will prevent inlining in cases where the optimization
2457  // does not (yet) fire.
2458
2459  unsigned JumpTableSize = 0;
2460  BlockFrequencyInfo *BFI = GetBFI ? &(GetBFI(F)) : nullptr;
2461  unsigned NumCaseCluster =
2462      TTI.getEstimatedNumberOfCaseClusters(SI, JumpTableSize, PSI, BFI);
2463
2464  onFinalizeSwitch(JumpTableSize, NumCaseCluster);
2465  return false;
2466}
2467
2468bool CallAnalyzer::visitIndirectBrInst(IndirectBrInst &IBI) {
2469  // We never want to inline functions that contain an indirectbr.  This is
2470  // incorrect because all the blockaddress's (in static global initializers
2471  // for example) would be referring to the original function, and this
2472  // indirect jump would jump from the inlined copy of the function into the
2473  // original function which is extremely undefined behavior.
2474  // FIXME: This logic isn't really right; we can safely inline functions with
2475  // indirectbr's as long as no other function or global references the
2476  // blockaddress of a block within the current function.
2477  HasIndirectBr = true;
2478  return false;
2479}
2480
2481bool CallAnalyzer::visitResumeInst(ResumeInst &RI) {
2482  // FIXME: It's not clear that a single instruction is an accurate model for
2483  // the inline cost of a resume instruction.
2484  return false;
2485}
2486
2487bool CallAnalyzer::visitCleanupReturnInst(CleanupReturnInst &CRI) {
2488  // FIXME: It's not clear that a single instruction is an accurate model for
2489  // the inline cost of a cleanupret instruction.
2490  return false;
2491}
2492
2493bool CallAnalyzer::visitCatchReturnInst(CatchReturnInst &CRI) {
2494  // FIXME: It's not clear that a single instruction is an accurate model for
2495  // the inline cost of a catchret instruction.
2496  return false;
2497}
2498
2499bool CallAnalyzer::visitUnreachableInst(UnreachableInst &I) {
2500  // FIXME: It might be reasonably to discount the cost of instructions leading
2501  // to unreachable as they have the lowest possible impact on both runtime and
2502  // code size.
2503  return true; // No actual code is needed for unreachable.
2504}
2505
2506bool CallAnalyzer::visitInstruction(Instruction &I) {
2507  // Some instructions are free. All of the free intrinsics can also be
2508  // handled by SROA, etc.
2509  if (TTI.getInstructionCost(&I, TargetTransformInfo::TCK_SizeAndLatency) ==
2510      TargetTransformInfo::TCC_Free)
2511    return true;
2512
2513  // We found something we don't understand or can't handle. Mark any SROA-able
2514  // values in the operand list as no longer viable.
2515  for (const Use &Op : I.operands())
2516    disableSROA(Op);
2517
2518  return false;
2519}
2520
2521/// Analyze a basic block for its contribution to the inline cost.
2522///
2523/// This method walks the analyzer over every instruction in the given basic
2524/// block and accounts for their cost during inlining at this callsite. It
2525/// aborts early if the threshold has been exceeded or an impossible to inline
2526/// construct has been detected. It returns false if inlining is no longer
2527/// viable, and true if inlining remains viable.
2528InlineResult
2529CallAnalyzer::analyzeBlock(BasicBlock *BB,
2530                           SmallPtrSetImpl<const Value *> &EphValues) {
2531  for (Instruction &I : *BB) {
2532    // FIXME: Currently, the number of instructions in a function regardless of
2533    // our ability to simplify them during inline to constants or dead code,
2534    // are actually used by the vector bonus heuristic. As long as that's true,
2535    // we have to special case debug intrinsics here to prevent differences in
2536    // inlining due to debug symbols. Eventually, the number of unsimplified
2537    // instructions shouldn't factor into the cost computation, but until then,
2538    // hack around it here.
2539    // Similarly, skip pseudo-probes.
2540    if (I.isDebugOrPseudoInst())
2541      continue;
2542
2543    // Skip ephemeral values.
2544    if (EphValues.count(&I))
2545      continue;
2546
2547    ++NumInstructions;
2548    if (isa<ExtractElementInst>(I) || I.getType()->isVectorTy())
2549      ++NumVectorInstructions;
2550
2551    // If the instruction simplified to a constant, there is no cost to this
2552    // instruction. Visit the instructions using our InstVisitor to account for
2553    // all of the per-instruction logic. The visit tree returns true if we
2554    // consumed the instruction in any way, and false if the instruction's base
2555    // cost should count against inlining.
2556    onInstructionAnalysisStart(&I);
2557
2558    if (Base::visit(&I))
2559      ++NumInstructionsSimplified;
2560    else
2561      onMissedSimplification();
2562
2563    onInstructionAnalysisFinish(&I);
2564    using namespace ore;
2565    // If the visit this instruction detected an uninlinable pattern, abort.
2566    InlineResult IR = InlineResult::success();
2567    if (IsRecursiveCall && !AllowRecursiveCall)
2568      IR = InlineResult::failure("recursive");
2569    else if (ExposesReturnsTwice)
2570      IR = InlineResult::failure("exposes returns twice");
2571    else if (HasDynamicAlloca)
2572      IR = InlineResult::failure("dynamic alloca");
2573    else if (HasIndirectBr)
2574      IR = InlineResult::failure("indirect branch");
2575    else if (HasUninlineableIntrinsic)
2576      IR = InlineResult::failure("uninlinable intrinsic");
2577    else if (InitsVargArgs)
2578      IR = InlineResult::failure("varargs");
2579    if (!IR.isSuccess()) {
2580      if (ORE)
2581        ORE->emit([&]() {
2582          return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
2583                                          &CandidateCall)
2584                 << NV("Callee", &F) << " has uninlinable pattern ("
2585                 << NV("InlineResult", IR.getFailureReason())
2586                 << ") and cost is not fully computed";
2587        });
2588      return IR;
2589    }
2590
2591    // If the caller is a recursive function then we don't want to inline
2592    // functions which allocate a lot of stack space because it would increase
2593    // the caller stack usage dramatically.
2594    if (IsCallerRecursive && AllocatedSize > RecurStackSizeThreshold) {
2595      auto IR =
2596          InlineResult::failure("recursive and allocates too much stack space");
2597      if (ORE)
2598        ORE->emit([&]() {
2599          return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
2600                                          &CandidateCall)
2601                 << NV("Callee", &F) << " is "
2602                 << NV("InlineResult", IR.getFailureReason())
2603                 << ". Cost is not fully computed";
2604        });
2605      return IR;
2606    }
2607
2608    if (shouldStop())
2609      return InlineResult::failure(
2610          "Call site analysis is not favorable to inlining.");
2611  }
2612
2613  return InlineResult::success();
2614}
2615
2616/// Compute the base pointer and cumulative constant offsets for V.
2617///
2618/// This strips all constant offsets off of V, leaving it the base pointer, and
2619/// accumulates the total constant offset applied in the returned constant. It
2620/// returns 0 if V is not a pointer, and returns the constant '0' if there are
2621/// no constant offsets applied.
2622ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) {
2623  if (!V->getType()->isPointerTy())
2624    return nullptr;
2625
2626  unsigned AS = V->getType()->getPointerAddressSpace();
2627  unsigned IntPtrWidth = DL.getIndexSizeInBits(AS);
2628  APInt Offset = APInt::getZero(IntPtrWidth);
2629
2630  // Even though we don't look through PHI nodes, we could be called on an
2631  // instruction in an unreachable block, which may be on a cycle.
2632  SmallPtrSet<Value *, 4> Visited;
2633  Visited.insert(V);
2634  do {
2635    if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
2636      if (!GEP->isInBounds() || !accumulateGEPOffset(*GEP, Offset))
2637        return nullptr;
2638      V = GEP->getPointerOperand();
2639    } else if (Operator::getOpcode(V) == Instruction::BitCast) {
2640      V = cast<Operator>(V)->getOperand(0);
2641    } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
2642      if (GA->isInterposable())
2643        break;
2644      V = GA->getAliasee();
2645    } else {
2646      break;
2647    }
2648    assert(V->getType()->isPointerTy() && "Unexpected operand type!");
2649  } while (Visited.insert(V).second);
2650
2651  Type *IdxPtrTy = DL.getIndexType(V->getType());
2652  return cast<ConstantInt>(ConstantInt::get(IdxPtrTy, Offset));
2653}
2654
2655/// Find dead blocks due to deleted CFG edges during inlining.
2656///
2657/// If we know the successor of the current block, \p CurrBB, has to be \p
2658/// NextBB, the other successors of \p CurrBB are dead if these successors have
2659/// no live incoming CFG edges.  If one block is found to be dead, we can
2660/// continue growing the dead block list by checking the successors of the dead
2661/// blocks to see if all their incoming edges are dead or not.
2662void CallAnalyzer::findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB) {
2663  auto IsEdgeDead = [&](BasicBlock *Pred, BasicBlock *Succ) {
2664    // A CFG edge is dead if the predecessor is dead or the predecessor has a
2665    // known successor which is not the one under exam.
2666    return (DeadBlocks.count(Pred) ||
2667            (KnownSuccessors[Pred] && KnownSuccessors[Pred] != Succ));
2668  };
2669
2670  auto IsNewlyDead = [&](BasicBlock *BB) {
2671    // If all the edges to a block are dead, the block is also dead.
2672    return (!DeadBlocks.count(BB) &&
2673            llvm::all_of(predecessors(BB),
2674                         [&](BasicBlock *P) { return IsEdgeDead(P, BB); }));
2675  };
2676
2677  for (BasicBlock *Succ : successors(CurrBB)) {
2678    if (Succ == NextBB || !IsNewlyDead(Succ))
2679      continue;
2680    SmallVector<BasicBlock *, 4> NewDead;
2681    NewDead.push_back(Succ);
2682    while (!NewDead.empty()) {
2683      BasicBlock *Dead = NewDead.pop_back_val();
2684      if (DeadBlocks.insert(Dead).second)
2685        // Continue growing the dead block lists.
2686        for (BasicBlock *S : successors(Dead))
2687          if (IsNewlyDead(S))
2688            NewDead.push_back(S);
2689    }
2690  }
2691}
2692
2693/// Analyze a call site for potential inlining.
2694///
2695/// Returns true if inlining this call is viable, and false if it is not
2696/// viable. It computes the cost and adjusts the threshold based on numerous
2697/// factors and heuristics. If this method returns false but the computed cost
2698/// is below the computed threshold, then inlining was forcibly disabled by
2699/// some artifact of the routine.
2700InlineResult CallAnalyzer::analyze() {
2701  ++NumCallsAnalyzed;
2702
2703  auto Result = onAnalysisStart();
2704  if (!Result.isSuccess())
2705    return Result;
2706
2707  if (F.empty())
2708    return InlineResult::success();
2709
2710  Function *Caller = CandidateCall.getFunction();
2711  // Check if the caller function is recursive itself.
2712  for (User *U : Caller->users()) {
2713    CallBase *Call = dyn_cast<CallBase>(U);
2714    if (Call && Call->getFunction() == Caller) {
2715      IsCallerRecursive = true;
2716      break;
2717    }
2718  }
2719
2720  // Populate our simplified values by mapping from function arguments to call
2721  // arguments with known important simplifications.
2722  auto CAI = CandidateCall.arg_begin();
2723  for (Argument &FAI : F.args()) {
2724    assert(CAI != CandidateCall.arg_end());
2725    if (Constant *C = dyn_cast<Constant>(CAI))
2726      SimplifiedValues[&FAI] = C;
2727
2728    Value *PtrArg = *CAI;
2729    if (ConstantInt *C = stripAndComputeInBoundsConstantOffsets(PtrArg)) {
2730      ConstantOffsetPtrs[&FAI] = std::make_pair(PtrArg, C->getValue());
2731
2732      // We can SROA any pointer arguments derived from alloca instructions.
2733      if (auto *SROAArg = dyn_cast<AllocaInst>(PtrArg)) {
2734        SROAArgValues[&FAI] = SROAArg;
2735        onInitializeSROAArg(SROAArg);
2736        EnabledSROAAllocas.insert(SROAArg);
2737      }
2738    }
2739    ++CAI;
2740  }
2741  NumConstantArgs = SimplifiedValues.size();
2742  NumConstantOffsetPtrArgs = ConstantOffsetPtrs.size();
2743  NumAllocaArgs = SROAArgValues.size();
2744
2745  // FIXME: If a caller has multiple calls to a callee, we end up recomputing
2746  // the ephemeral values multiple times (and they're completely determined by
2747  // the callee, so this is purely duplicate work).
2748  SmallPtrSet<const Value *, 32> EphValues;
2749  CodeMetrics::collectEphemeralValues(&F, &GetAssumptionCache(F), EphValues);
2750
2751  // The worklist of live basic blocks in the callee *after* inlining. We avoid
2752  // adding basic blocks of the callee which can be proven to be dead for this
2753  // particular call site in order to get more accurate cost estimates. This
2754  // requires a somewhat heavyweight iteration pattern: we need to walk the
2755  // basic blocks in a breadth-first order as we insert live successors. To
2756  // accomplish this, prioritizing for small iterations because we exit after
2757  // crossing our threshold, we use a small-size optimized SetVector.
2758  typedef SmallSetVector<BasicBlock *, 16> BBSetVector;
2759  BBSetVector BBWorklist;
2760  BBWorklist.insert(&F.getEntryBlock());
2761
2762  // Note that we *must not* cache the size, this loop grows the worklist.
2763  for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
2764    if (shouldStop())
2765      break;
2766
2767    BasicBlock *BB = BBWorklist[Idx];
2768    if (BB->empty())
2769      continue;
2770
2771    onBlockStart(BB);
2772
2773    // Disallow inlining a blockaddress with uses other than strictly callbr.
2774    // A blockaddress only has defined behavior for an indirect branch in the
2775    // same function, and we do not currently support inlining indirect
2776    // branches.  But, the inliner may not see an indirect branch that ends up
2777    // being dead code at a particular call site. If the blockaddress escapes
2778    // the function, e.g., via a global variable, inlining may lead to an
2779    // invalid cross-function reference.
2780    // FIXME: pr/39560: continue relaxing this overt restriction.
2781    if (BB->hasAddressTaken())
2782      for (User *U : BlockAddress::get(&*BB)->users())
2783        if (!isa<CallBrInst>(*U))
2784          return InlineResult::failure("blockaddress used outside of callbr");
2785
2786    // Analyze the cost of this block. If we blow through the threshold, this
2787    // returns false, and we can bail on out.
2788    InlineResult IR = analyzeBlock(BB, EphValues);
2789    if (!IR.isSuccess())
2790      return IR;
2791
2792    Instruction *TI = BB->getTerminator();
2793
2794    // Add in the live successors by first checking whether we have terminator
2795    // that may be simplified based on the values simplified by this call.
2796    if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
2797      if (BI->isConditional()) {
2798        Value *Cond = BI->getCondition();
2799        if (ConstantInt *SimpleCond =
2800                dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
2801          BasicBlock *NextBB = BI->getSuccessor(SimpleCond->isZero() ? 1 : 0);
2802          BBWorklist.insert(NextBB);
2803          KnownSuccessors[BB] = NextBB;
2804          findDeadBlocks(BB, NextBB);
2805          continue;
2806        }
2807      }
2808    } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
2809      Value *Cond = SI->getCondition();
2810      if (ConstantInt *SimpleCond =
2811              dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
2812        BasicBlock *NextBB = SI->findCaseValue(SimpleCond)->getCaseSuccessor();
2813        BBWorklist.insert(NextBB);
2814        KnownSuccessors[BB] = NextBB;
2815        findDeadBlocks(BB, NextBB);
2816        continue;
2817      }
2818    }
2819
2820    // If we're unable to select a particular successor, just count all of
2821    // them.
2822    for (unsigned TIdx = 0, TSize = TI->getNumSuccessors(); TIdx != TSize;
2823         ++TIdx)
2824      BBWorklist.insert(TI->getSuccessor(TIdx));
2825
2826    onBlockAnalyzed(BB);
2827  }
2828
2829  // If this is a noduplicate call, we can still inline as long as
2830  // inlining this would cause the removal of the caller (so the instruction
2831  // is not actually duplicated, just moved).
2832  if (!isSoleCallToLocalFunction(CandidateCall, F) && ContainsNoDuplicateCall)
2833    return InlineResult::failure("noduplicate");
2834
2835  // If the callee's stack size exceeds the user-specified threshold,
2836  // do not let it be inlined.
2837  // The command line option overrides a limit set in the function attributes.
2838  size_t FinalStackSizeThreshold = StackSizeThreshold;
2839  if (!StackSizeThreshold.getNumOccurrences())
2840    if (std::optional<int> AttrMaxStackSize = getStringFnAttrAsInt(
2841            Caller, InlineConstants::MaxInlineStackSizeAttributeName))
2842      FinalStackSizeThreshold = *AttrMaxStackSize;
2843  if (AllocatedSize > FinalStackSizeThreshold)
2844    return InlineResult::failure("stacksize");
2845
2846  return finalizeAnalysis();
2847}
2848
2849void InlineCostCallAnalyzer::print(raw_ostream &OS) {
2850#define DEBUG_PRINT_STAT(x) OS << "      " #x ": " << x << "\n"
2851  if (PrintInstructionComments)
2852    F.print(OS, &Writer);
2853  DEBUG_PRINT_STAT(NumConstantArgs);
2854  DEBUG_PRINT_STAT(NumConstantOffsetPtrArgs);
2855  DEBUG_PRINT_STAT(NumAllocaArgs);
2856  DEBUG_PRINT_STAT(NumConstantPtrCmps);
2857  DEBUG_PRINT_STAT(NumConstantPtrDiffs);
2858  DEBUG_PRINT_STAT(NumInstructionsSimplified);
2859  DEBUG_PRINT_STAT(NumInstructions);
2860  DEBUG_PRINT_STAT(SROACostSavings);
2861  DEBUG_PRINT_STAT(SROACostSavingsLost);
2862  DEBUG_PRINT_STAT(LoadEliminationCost);
2863  DEBUG_PRINT_STAT(ContainsNoDuplicateCall);
2864  DEBUG_PRINT_STAT(Cost);
2865  DEBUG_PRINT_STAT(Threshold);
2866#undef DEBUG_PRINT_STAT
2867}
2868
2869#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2870/// Dump stats about this call's analysis.
2871LLVM_DUMP_METHOD void InlineCostCallAnalyzer::dump() { print(dbgs()); }
2872#endif
2873
2874/// Test that there are no attribute conflicts between Caller and Callee
2875///        that prevent inlining.
2876static bool functionsHaveCompatibleAttributes(
2877    Function *Caller, Function *Callee, TargetTransformInfo &TTI,
2878    function_ref<const TargetLibraryInfo &(Function &)> &GetTLI) {
2879  // Note that CalleeTLI must be a copy not a reference. The legacy pass manager
2880  // caches the most recently created TLI in the TargetLibraryInfoWrapperPass
2881  // object, and always returns the same object (which is overwritten on each
2882  // GetTLI call). Therefore we copy the first result.
2883  auto CalleeTLI = GetTLI(*Callee);
2884  return (IgnoreTTIInlineCompatible ||
2885          TTI.areInlineCompatible(Caller, Callee)) &&
2886         GetTLI(*Caller).areInlineCompatible(CalleeTLI,
2887                                             InlineCallerSupersetNoBuiltin) &&
2888         AttributeFuncs::areInlineCompatible(*Caller, *Callee);
2889}
2890
2891int llvm::getCallsiteCost(const TargetTransformInfo &TTI, const CallBase &Call,
2892                          const DataLayout &DL) {
2893  int64_t Cost = 0;
2894  for (unsigned I = 0, E = Call.arg_size(); I != E; ++I) {
2895    if (Call.isByValArgument(I)) {
2896      // We approximate the number of loads and stores needed by dividing the
2897      // size of the byval type by the target's pointer size.
2898      PointerType *PTy = cast<PointerType>(Call.getArgOperand(I)->getType());
2899      unsigned TypeSize = DL.getTypeSizeInBits(Call.getParamByValType(I));
2900      unsigned AS = PTy->getAddressSpace();
2901      unsigned PointerSize = DL.getPointerSizeInBits(AS);
2902      // Ceiling division.
2903      unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize;
2904
2905      // If it generates more than 8 stores it is likely to be expanded as an
2906      // inline memcpy so we take that as an upper bound. Otherwise we assume
2907      // one load and one store per word copied.
2908      // FIXME: The maxStoresPerMemcpy setting from the target should be used
2909      // here instead of a magic number of 8, but it's not available via
2910      // DataLayout.
2911      NumStores = std::min(NumStores, 8U);
2912
2913      Cost += 2 * NumStores * InstrCost;
2914    } else {
2915      // For non-byval arguments subtract off one instruction per call
2916      // argument.
2917      Cost += InstrCost;
2918    }
2919  }
2920  // The call instruction also disappears after inlining.
2921  Cost += InstrCost;
2922  Cost += TTI.getInlineCallPenalty(Call.getCaller(), Call, CallPenalty);
2923
2924  return std::min<int64_t>(Cost, INT_MAX);
2925}
2926
2927InlineCost llvm::getInlineCost(
2928    CallBase &Call, const InlineParams &Params, TargetTransformInfo &CalleeTTI,
2929    function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
2930    function_ref<const TargetLibraryInfo &(Function &)> GetTLI,
2931    function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
2932    ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
2933  return getInlineCost(Call, Call.getCalledFunction(), Params, CalleeTTI,
2934                       GetAssumptionCache, GetTLI, GetBFI, PSI, ORE);
2935}
2936
2937std::optional<int> llvm::getInliningCostEstimate(
2938    CallBase &Call, TargetTransformInfo &CalleeTTI,
2939    function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
2940    function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
2941    ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
2942  const InlineParams Params = {/* DefaultThreshold*/ 0,
2943                               /*HintThreshold*/ {},
2944                               /*ColdThreshold*/ {},
2945                               /*OptSizeThreshold*/ {},
2946                               /*OptMinSizeThreshold*/ {},
2947                               /*HotCallSiteThreshold*/ {},
2948                               /*LocallyHotCallSiteThreshold*/ {},
2949                               /*ColdCallSiteThreshold*/ {},
2950                               /*ComputeFullInlineCost*/ true,
2951                               /*EnableDeferral*/ true};
2952
2953  InlineCostCallAnalyzer CA(*Call.getCalledFunction(), Call, Params, CalleeTTI,
2954                            GetAssumptionCache, GetBFI, PSI, ORE, true,
2955                            /*IgnoreThreshold*/ true);
2956  auto R = CA.analyze();
2957  if (!R.isSuccess())
2958    return std::nullopt;
2959  return CA.getCost();
2960}
2961
2962std::optional<InlineCostFeatures> llvm::getInliningCostFeatures(
2963    CallBase &Call, TargetTransformInfo &CalleeTTI,
2964    function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
2965    function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
2966    ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
2967  InlineCostFeaturesAnalyzer CFA(CalleeTTI, GetAssumptionCache, GetBFI, PSI,
2968                                 ORE, *Call.getCalledFunction(), Call);
2969  auto R = CFA.analyze();
2970  if (!R.isSuccess())
2971    return std::nullopt;
2972  return CFA.features();
2973}
2974
2975std::optional<InlineResult> llvm::getAttributeBasedInliningDecision(
2976    CallBase &Call, Function *Callee, TargetTransformInfo &CalleeTTI,
2977    function_ref<const TargetLibraryInfo &(Function &)> GetTLI) {
2978
2979  // Cannot inline indirect calls.
2980  if (!Callee)
2981    return InlineResult::failure("indirect call");
2982
2983  // When callee coroutine function is inlined into caller coroutine function
2984  // before coro-split pass,
2985  // coro-early pass can not handle this quiet well.
2986  // So we won't inline the coroutine function if it have not been unsplited
2987  if (Callee->isPresplitCoroutine())
2988    return InlineResult::failure("unsplited coroutine call");
2989
2990  // Never inline calls with byval arguments that does not have the alloca
2991  // address space. Since byval arguments can be replaced with a copy to an
2992  // alloca, the inlined code would need to be adjusted to handle that the
2993  // argument is in the alloca address space (so it is a little bit complicated
2994  // to solve).
2995  unsigned AllocaAS = Callee->getParent()->getDataLayout().getAllocaAddrSpace();
2996  for (unsigned I = 0, E = Call.arg_size(); I != E; ++I)
2997    if (Call.isByValArgument(I)) {
2998      PointerType *PTy = cast<PointerType>(Call.getArgOperand(I)->getType());
2999      if (PTy->getAddressSpace() != AllocaAS)
3000        return InlineResult::failure("byval arguments without alloca"
3001                                     " address space");
3002    }
3003
3004  // Calls to functions with always-inline attributes should be inlined
3005  // whenever possible.
3006  if (Call.hasFnAttr(Attribute::AlwaysInline)) {
3007    if (Call.getAttributes().hasFnAttr(Attribute::NoInline))
3008      return InlineResult::failure("noinline call site attribute");
3009
3010    auto IsViable = isInlineViable(*Callee);
3011    if (IsViable.isSuccess())
3012      return InlineResult::success();
3013    return InlineResult::failure(IsViable.getFailureReason());
3014  }
3015
3016  // Never inline functions with conflicting attributes (unless callee has
3017  // always-inline attribute).
3018  Function *Caller = Call.getCaller();
3019  if (!functionsHaveCompatibleAttributes(Caller, Callee, CalleeTTI, GetTLI))
3020    return InlineResult::failure("conflicting attributes");
3021
3022  // Don't inline this call if the caller has the optnone attribute.
3023  if (Caller->hasOptNone())
3024    return InlineResult::failure("optnone attribute");
3025
3026  // Don't inline a function that treats null pointer as valid into a caller
3027  // that does not have this attribute.
3028  if (!Caller->nullPointerIsDefined() && Callee->nullPointerIsDefined())
3029    return InlineResult::failure("nullptr definitions incompatible");
3030
3031  // Don't inline functions which can be interposed at link-time.
3032  if (Callee->isInterposable())
3033    return InlineResult::failure("interposable");
3034
3035  // Don't inline functions marked noinline.
3036  if (Callee->hasFnAttribute(Attribute::NoInline))
3037    return InlineResult::failure("noinline function attribute");
3038
3039  // Don't inline call sites marked noinline.
3040  if (Call.isNoInline())
3041    return InlineResult::failure("noinline call site attribute");
3042
3043  return std::nullopt;
3044}
3045
3046InlineCost llvm::getInlineCost(
3047    CallBase &Call, Function *Callee, const InlineParams &Params,
3048    TargetTransformInfo &CalleeTTI,
3049    function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
3050    function_ref<const TargetLibraryInfo &(Function &)> GetTLI,
3051    function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
3052    ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
3053
3054  auto UserDecision =
3055      llvm::getAttributeBasedInliningDecision(Call, Callee, CalleeTTI, GetTLI);
3056
3057  if (UserDecision) {
3058    if (UserDecision->isSuccess())
3059      return llvm::InlineCost::getAlways("always inline attribute");
3060    return llvm::InlineCost::getNever(UserDecision->getFailureReason());
3061  }
3062
3063  LLVM_DEBUG(llvm::dbgs() << "      Analyzing call of " << Callee->getName()
3064                          << "... (caller:" << Call.getCaller()->getName()
3065                          << ")\n");
3066
3067  InlineCostCallAnalyzer CA(*Callee, Call, Params, CalleeTTI,
3068                            GetAssumptionCache, GetBFI, PSI, ORE);
3069  InlineResult ShouldInline = CA.analyze();
3070
3071  LLVM_DEBUG(CA.dump());
3072
3073  // Always make cost benefit based decision explicit.
3074  // We use always/never here since threshold is not meaningful,
3075  // as it's not what drives cost-benefit analysis.
3076  if (CA.wasDecidedByCostBenefit()) {
3077    if (ShouldInline.isSuccess())
3078      return InlineCost::getAlways("benefit over cost",
3079                                   CA.getCostBenefitPair());
3080    else
3081      return InlineCost::getNever("cost over benefit", CA.getCostBenefitPair());
3082  }
3083
3084  if (CA.wasDecidedByCostThreshold())
3085    return InlineCost::get(CA.getCost(), CA.getThreshold(),
3086                           CA.getStaticBonusApplied());
3087
3088  // No details on how the decision was made, simply return always or never.
3089  return ShouldInline.isSuccess()
3090             ? InlineCost::getAlways("empty function")
3091             : InlineCost::getNever(ShouldInline.getFailureReason());
3092}
3093
3094InlineResult llvm::isInlineViable(Function &F) {
3095  bool ReturnsTwice = F.hasFnAttribute(Attribute::ReturnsTwice);
3096  for (BasicBlock &BB : F) {
3097    // Disallow inlining of functions which contain indirect branches.
3098    if (isa<IndirectBrInst>(BB.getTerminator()))
3099      return InlineResult::failure("contains indirect branches");
3100
3101    // Disallow inlining of blockaddresses which are used by non-callbr
3102    // instructions.
3103    if (BB.hasAddressTaken())
3104      for (User *U : BlockAddress::get(&BB)->users())
3105        if (!isa<CallBrInst>(*U))
3106          return InlineResult::failure("blockaddress used outside of callbr");
3107
3108    for (auto &II : BB) {
3109      CallBase *Call = dyn_cast<CallBase>(&II);
3110      if (!Call)
3111        continue;
3112
3113      // Disallow recursive calls.
3114      Function *Callee = Call->getCalledFunction();
3115      if (&F == Callee)
3116        return InlineResult::failure("recursive call");
3117
3118      // Disallow calls which expose returns-twice to a function not previously
3119      // attributed as such.
3120      if (!ReturnsTwice && isa<CallInst>(Call) &&
3121          cast<CallInst>(Call)->canReturnTwice())
3122        return InlineResult::failure("exposes returns-twice attribute");
3123
3124      if (Callee)
3125        switch (Callee->getIntrinsicID()) {
3126        default:
3127          break;
3128        case llvm::Intrinsic::icall_branch_funnel:
3129          // Disallow inlining of @llvm.icall.branch.funnel because current
3130          // backend can't separate call targets from call arguments.
3131          return InlineResult::failure(
3132              "disallowed inlining of @llvm.icall.branch.funnel");
3133        case llvm::Intrinsic::localescape:
3134          // Disallow inlining functions that call @llvm.localescape. Doing this
3135          // correctly would require major changes to the inliner.
3136          return InlineResult::failure(
3137              "disallowed inlining of @llvm.localescape");
3138        case llvm::Intrinsic::vastart:
3139          // Disallow inlining of functions that initialize VarArgs with
3140          // va_start.
3141          return InlineResult::failure(
3142              "contains VarArgs initialized with va_start");
3143        }
3144    }
3145  }
3146
3147  return InlineResult::success();
3148}
3149
3150// APIs to create InlineParams based on command line flags and/or other
3151// parameters.
3152
3153InlineParams llvm::getInlineParams(int Threshold) {
3154  InlineParams Params;
3155
3156  // This field is the threshold to use for a callee by default. This is
3157  // derived from one or more of:
3158  //  * optimization or size-optimization levels,
3159  //  * a value passed to createFunctionInliningPass function, or
3160  //  * the -inline-threshold flag.
3161  //  If the -inline-threshold flag is explicitly specified, that is used
3162  //  irrespective of anything else.
3163  if (InlineThreshold.getNumOccurrences() > 0)
3164    Params.DefaultThreshold = InlineThreshold;
3165  else
3166    Params.DefaultThreshold = Threshold;
3167
3168  // Set the HintThreshold knob from the -inlinehint-threshold.
3169  Params.HintThreshold = HintThreshold;
3170
3171  // Set the HotCallSiteThreshold knob from the -hot-callsite-threshold.
3172  Params.HotCallSiteThreshold = HotCallSiteThreshold;
3173
3174  // If the -locally-hot-callsite-threshold is explicitly specified, use it to
3175  // populate LocallyHotCallSiteThreshold. Later, we populate
3176  // Params.LocallyHotCallSiteThreshold from -locally-hot-callsite-threshold if
3177  // we know that optimization level is O3 (in the getInlineParams variant that
3178  // takes the opt and size levels).
3179  // FIXME: Remove this check (and make the assignment unconditional) after
3180  // addressing size regression issues at O2.
3181  if (LocallyHotCallSiteThreshold.getNumOccurrences() > 0)
3182    Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;
3183
3184  // Set the ColdCallSiteThreshold knob from the
3185  // -inline-cold-callsite-threshold.
3186  Params.ColdCallSiteThreshold = ColdCallSiteThreshold;
3187
3188  // Set the OptMinSizeThreshold and OptSizeThreshold params only if the
3189  // -inlinehint-threshold commandline option is not explicitly given. If that
3190  // option is present, then its value applies even for callees with size and
3191  // minsize attributes.
3192  // If the -inline-threshold is not specified, set the ColdThreshold from the
3193  // -inlinecold-threshold even if it is not explicitly passed. If
3194  // -inline-threshold is specified, then -inlinecold-threshold needs to be
3195  // explicitly specified to set the ColdThreshold knob
3196  if (InlineThreshold.getNumOccurrences() == 0) {
3197    Params.OptMinSizeThreshold = InlineConstants::OptMinSizeThreshold;
3198    Params.OptSizeThreshold = InlineConstants::OptSizeThreshold;
3199    Params.ColdThreshold = ColdThreshold;
3200  } else if (ColdThreshold.getNumOccurrences() > 0) {
3201    Params.ColdThreshold = ColdThreshold;
3202  }
3203  return Params;
3204}
3205
3206InlineParams llvm::getInlineParams() {
3207  return getInlineParams(DefaultThreshold);
3208}
3209
3210// Compute the default threshold for inlining based on the opt level and the
3211// size opt level.
3212static int computeThresholdFromOptLevels(unsigned OptLevel,
3213                                         unsigned SizeOptLevel) {
3214  if (OptLevel > 2)
3215    return InlineConstants::OptAggressiveThreshold;
3216  if (SizeOptLevel == 1) // -Os
3217    return InlineConstants::OptSizeThreshold;
3218  if (SizeOptLevel == 2) // -Oz
3219    return InlineConstants::OptMinSizeThreshold;
3220  return DefaultThreshold;
3221}
3222
3223InlineParams llvm::getInlineParams(unsigned OptLevel, unsigned SizeOptLevel) {
3224  auto Params =
3225      getInlineParams(computeThresholdFromOptLevels(OptLevel, SizeOptLevel));
3226  // At O3, use the value of -locally-hot-callsite-threshold option to populate
3227  // Params.LocallyHotCallSiteThreshold. Below O3, this flag has effect only
3228  // when it is specified explicitly.
3229  if (OptLevel > 2)
3230    Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;
3231  return Params;
3232}
3233
3234PreservedAnalyses
3235InlineCostAnnotationPrinterPass::run(Function &F,
3236                                     FunctionAnalysisManager &FAM) {
3237  PrintInstructionComments = true;
3238  std::function<AssumptionCache &(Function &)> GetAssumptionCache =
3239      [&](Function &F) -> AssumptionCache & {
3240    return FAM.getResult<AssumptionAnalysis>(F);
3241  };
3242  Module *M = F.getParent();
3243  ProfileSummaryInfo PSI(*M);
3244  DataLayout DL(M);
3245  TargetTransformInfo TTI(DL);
3246  // FIXME: Redesign the usage of InlineParams to expand the scope of this pass.
3247  // In the current implementation, the type of InlineParams doesn't matter as
3248  // the pass serves only for verification of inliner's decisions.
3249  // We can add a flag which determines InlineParams for this run. Right now,
3250  // the default InlineParams are used.
3251  const InlineParams Params = llvm::getInlineParams();
3252  for (BasicBlock &BB : F) {
3253    for (Instruction &I : BB) {
3254      if (CallInst *CI = dyn_cast<CallInst>(&I)) {
3255        Function *CalledFunction = CI->getCalledFunction();
3256        if (!CalledFunction || CalledFunction->isDeclaration())
3257          continue;
3258        OptimizationRemarkEmitter ORE(CalledFunction);
3259        InlineCostCallAnalyzer ICCA(*CalledFunction, *CI, Params, TTI,
3260                                    GetAssumptionCache, nullptr, &PSI, &ORE);
3261        ICCA.analyze();
3262        OS << "      Analyzing call of " << CalledFunction->getName()
3263           << "... (caller:" << CI->getCaller()->getName() << ")\n";
3264        ICCA.print(OS);
3265        OS << "\n";
3266      }
3267    }
3268  }
3269  return PreservedAnalyses::all();
3270}
3271