1//===- PartialInlining.cpp - Inline parts of functions --------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass performs partial inlining, typically by inlining an if statement
10// that surrounds the body of the function.
11//
12//===----------------------------------------------------------------------===//
13
14#include "llvm/Transforms/IPO/PartialInlining.h"
15#include "llvm/ADT/DenseMap.h"
16#include "llvm/ADT/DenseSet.h"
17#include "llvm/ADT/None.h"
18#include "llvm/ADT/Optional.h"
19#include "llvm/ADT/STLExtras.h"
20#include "llvm/ADT/SmallVector.h"
21#include "llvm/ADT/Statistic.h"
22#include "llvm/Analysis/BlockFrequencyInfo.h"
23#include "llvm/Analysis/BranchProbabilityInfo.h"
24#include "llvm/Analysis/InlineCost.h"
25#include "llvm/Analysis/LoopInfo.h"
26#include "llvm/Analysis/OptimizationRemarkEmitter.h"
27#include "llvm/Analysis/ProfileSummaryInfo.h"
28#include "llvm/Analysis/TargetLibraryInfo.h"
29#include "llvm/Analysis/TargetTransformInfo.h"
30#include "llvm/IR/Attributes.h"
31#include "llvm/IR/BasicBlock.h"
32#include "llvm/IR/CFG.h"
33#include "llvm/IR/CallSite.h"
34#include "llvm/IR/DebugLoc.h"
35#include "llvm/IR/DiagnosticInfo.h"
36#include "llvm/IR/Dominators.h"
37#include "llvm/IR/Function.h"
38#include "llvm/IR/InstrTypes.h"
39#include "llvm/IR/Instruction.h"
40#include "llvm/IR/Instructions.h"
41#include "llvm/IR/IntrinsicInst.h"
42#include "llvm/IR/Intrinsics.h"
43#include "llvm/IR/Module.h"
44#include "llvm/IR/User.h"
45#include "llvm/InitializePasses.h"
46#include "llvm/Pass.h"
47#include "llvm/Support/BlockFrequency.h"
48#include "llvm/Support/BranchProbability.h"
49#include "llvm/Support/Casting.h"
50#include "llvm/Support/CommandLine.h"
51#include "llvm/Support/ErrorHandling.h"
52#include "llvm/Transforms/IPO.h"
53#include "llvm/Transforms/Utils/Cloning.h"
54#include "llvm/Transforms/Utils/CodeExtractor.h"
55#include "llvm/Transforms/Utils/ValueMapper.h"
56#include <algorithm>
57#include <cassert>
58#include <cstdint>
59#include <functional>
60#include <iterator>
61#include <memory>
62#include <tuple>
63#include <vector>
64
65using namespace llvm;
66
67#define DEBUG_TYPE "partial-inlining"
68
69STATISTIC(NumPartialInlined,
70          "Number of callsites functions partially inlined into.");
71STATISTIC(NumColdOutlinePartialInlined, "Number of times functions with "
72                                        "cold outlined regions were partially "
73                                        "inlined into its caller(s).");
74STATISTIC(NumColdRegionsFound,
75           "Number of cold single entry/exit regions found.");
76STATISTIC(NumColdRegionsOutlined,
77           "Number of cold single entry/exit regions outlined.");
78
79// Command line option to disable partial-inlining. The default is false:
80static cl::opt<bool>
81    DisablePartialInlining("disable-partial-inlining", cl::init(false),
82                           cl::Hidden, cl::desc("Disable partial inlining"));
83// Command line option to disable multi-region partial-inlining. The default is
84// false:
85static cl::opt<bool> DisableMultiRegionPartialInline(
86    "disable-mr-partial-inlining", cl::init(false), cl::Hidden,
87    cl::desc("Disable multi-region partial inlining"));
88
89// Command line option to force outlining in regions with live exit variables.
90// The default is false:
91static cl::opt<bool>
92    ForceLiveExit("pi-force-live-exit-outline", cl::init(false), cl::Hidden,
93               cl::desc("Force outline regions with live exits"));
94
95// Command line option to enable marking outline functions with Cold Calling
96// Convention. The default is false:
97static cl::opt<bool>
98    MarkOutlinedColdCC("pi-mark-coldcc", cl::init(false), cl::Hidden,
99                       cl::desc("Mark outline function calls with ColdCC"));
100
101#ifndef NDEBUG
102// Command line option to debug partial-inlining. The default is none:
103static cl::opt<bool> TracePartialInlining("trace-partial-inlining",
104                                          cl::init(false), cl::Hidden,
105                                          cl::desc("Trace partial inlining."));
106#endif
107
108// This is an option used by testing:
109static cl::opt<bool> SkipCostAnalysis("skip-partial-inlining-cost-analysis",
110                                      cl::init(false), cl::ZeroOrMore,
111                                      cl::ReallyHidden,
112                                      cl::desc("Skip Cost Analysis"));
113// Used to determine if a cold region is worth outlining based on
114// its inlining cost compared to the original function.  Default is set at 10%.
115// ie. if the cold region reduces the inlining cost of the original function by
116// at least 10%.
117static cl::opt<float> MinRegionSizeRatio(
118    "min-region-size-ratio", cl::init(0.1), cl::Hidden,
119    cl::desc("Minimum ratio comparing relative sizes of each "
120             "outline candidate and original function"));
121// Used to tune the minimum number of execution counts needed in the predecessor
122// block to the cold edge. ie. confidence interval.
123static cl::opt<unsigned>
124    MinBlockCounterExecution("min-block-execution", cl::init(100), cl::Hidden,
125                             cl::desc("Minimum block executions to consider "
126                                      "its BranchProbabilityInfo valid"));
127// Used to determine when an edge is considered cold. Default is set to 10%. ie.
128// if the branch probability is 10% or less, then it is deemed as 'cold'.
129static cl::opt<float> ColdBranchRatio(
130    "cold-branch-ratio", cl::init(0.1), cl::Hidden,
131    cl::desc("Minimum BranchProbability to consider a region cold."));
132
133static cl::opt<unsigned> MaxNumInlineBlocks(
134    "max-num-inline-blocks", cl::init(5), cl::Hidden,
135    cl::desc("Max number of blocks to be partially inlined"));
136
137// Command line option to set the maximum number of partial inlining allowed
138// for the module. The default value of -1 means no limit.
139static cl::opt<int> MaxNumPartialInlining(
140    "max-partial-inlining", cl::init(-1), cl::Hidden, cl::ZeroOrMore,
141    cl::desc("Max number of partial inlining. The default is unlimited"));
142
143// Used only when PGO or user annotated branch data is absent. It is
144// the least value that is used to weigh the outline region. If BFI
145// produces larger value, the BFI value will be used.
146static cl::opt<int>
147    OutlineRegionFreqPercent("outline-region-freq-percent", cl::init(75),
148                             cl::Hidden, cl::ZeroOrMore,
149                             cl::desc("Relative frequency of outline region to "
150                                      "the entry block"));
151
152static cl::opt<unsigned> ExtraOutliningPenalty(
153    "partial-inlining-extra-penalty", cl::init(0), cl::Hidden,
154    cl::desc("A debug option to add additional penalty to the computed one."));
155
156namespace {
157
158struct FunctionOutliningInfo {
159  FunctionOutliningInfo() = default;
160
161  // Returns the number of blocks to be inlined including all blocks
162  // in Entries and one return block.
163  unsigned GetNumInlinedBlocks() const { return Entries.size() + 1; }
164
165  // A set of blocks including the function entry that guard
166  // the region to be outlined.
167  SmallVector<BasicBlock *, 4> Entries;
168
169  // The return block that is not included in the outlined region.
170  BasicBlock *ReturnBlock = nullptr;
171
172  // The dominating block of the region to be outlined.
173  BasicBlock *NonReturnBlock = nullptr;
174
175  // The set of blocks in Entries that that are predecessors to ReturnBlock
176  SmallVector<BasicBlock *, 4> ReturnBlockPreds;
177};
178
179struct FunctionOutliningMultiRegionInfo {
180  FunctionOutliningMultiRegionInfo()
181      : ORI() {}
182
183  // Container for outline regions
184  struct OutlineRegionInfo {
185    OutlineRegionInfo(ArrayRef<BasicBlock *> Region,
186                      BasicBlock *EntryBlock, BasicBlock *ExitBlock,
187                      BasicBlock *ReturnBlock)
188        : Region(Region.begin(), Region.end()), EntryBlock(EntryBlock),
189          ExitBlock(ExitBlock), ReturnBlock(ReturnBlock) {}
190    SmallVector<BasicBlock *, 8> Region;
191    BasicBlock *EntryBlock;
192    BasicBlock *ExitBlock;
193    BasicBlock *ReturnBlock;
194  };
195
196  SmallVector<OutlineRegionInfo, 4> ORI;
197};
198
199struct PartialInlinerImpl {
200
201  PartialInlinerImpl(
202      std::function<AssumptionCache &(Function &)> *GetAC,
203      function_ref<AssumptionCache *(Function &)> LookupAC,
204      std::function<TargetTransformInfo &(Function &)> *GTTI,
205      Optional<function_ref<BlockFrequencyInfo &(Function &)>> GBFI,
206      ProfileSummaryInfo *ProfSI)
207      : GetAssumptionCache(GetAC), LookupAssumptionCache(LookupAC),
208        GetTTI(GTTI), GetBFI(GBFI), PSI(ProfSI) {}
209
210  bool run(Module &M);
211  // Main part of the transformation that calls helper functions to find
212  // outlining candidates, clone & outline the function, and attempt to
213  // partially inline the resulting function. Returns true if
214  // inlining was successful, false otherwise.  Also returns the outline
215  // function (only if we partially inlined early returns) as there is a
216  // possibility to further "peel" early return statements that were left in the
217  // outline function due to code size.
218  std::pair<bool, Function *> unswitchFunction(Function *F);
219
220  // This class speculatively clones the function to be partial inlined.
221  // At the end of partial inlining, the remaining callsites to the cloned
222  // function that are not partially inlined will be fixed up to reference
223  // the original function, and the cloned function will be erased.
224  struct FunctionCloner {
225    // Two constructors, one for single region outlining, the other for
226    // multi-region outlining.
227    FunctionCloner(Function *F, FunctionOutliningInfo *OI,
228                   OptimizationRemarkEmitter &ORE,
229                   function_ref<AssumptionCache *(Function &)> LookupAC);
230    FunctionCloner(Function *F, FunctionOutliningMultiRegionInfo *OMRI,
231                   OptimizationRemarkEmitter &ORE,
232                   function_ref<AssumptionCache *(Function &)> LookupAC);
233    ~FunctionCloner();
234
235    // Prepare for function outlining: making sure there is only
236    // one incoming edge from the extracted/outlined region to
237    // the return block.
238    void NormalizeReturnBlock();
239
240    // Do function outlining for cold regions.
241    bool doMultiRegionFunctionOutlining();
242    // Do function outlining for region after early return block(s).
243    // NOTE: For vararg functions that do the vararg handling in the outlined
244    //       function, we temporarily generate IR that does not properly
245    //       forward varargs to the outlined function. Calling InlineFunction
246    //       will update calls to the outlined functions to properly forward
247    //       the varargs.
248    Function *doSingleRegionFunctionOutlining();
249
250    Function *OrigFunc = nullptr;
251    Function *ClonedFunc = nullptr;
252
253    typedef std::pair<Function *, BasicBlock *> FuncBodyCallerPair;
254    // Keep track of Outlined Functions and the basic block they're called from.
255    SmallVector<FuncBodyCallerPair, 4> OutlinedFunctions;
256
257    // ClonedFunc is inlined in one of its callers after function
258    // outlining.
259    bool IsFunctionInlined = false;
260    // The cost of the region to be outlined.
261    int OutlinedRegionCost = 0;
262    // ClonedOI is specific to outlining non-early return blocks.
263    std::unique_ptr<FunctionOutliningInfo> ClonedOI = nullptr;
264    // ClonedOMRI is specific to outlining cold regions.
265    std::unique_ptr<FunctionOutliningMultiRegionInfo> ClonedOMRI = nullptr;
266    std::unique_ptr<BlockFrequencyInfo> ClonedFuncBFI = nullptr;
267    OptimizationRemarkEmitter &ORE;
268    function_ref<AssumptionCache *(Function &)> LookupAC;
269  };
270
271private:
272  int NumPartialInlining = 0;
273  std::function<AssumptionCache &(Function &)> *GetAssumptionCache;
274  function_ref<AssumptionCache *(Function &)> LookupAssumptionCache;
275  std::function<TargetTransformInfo &(Function &)> *GetTTI;
276  Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI;
277  ProfileSummaryInfo *PSI;
278
279  // Return the frequency of the OutlininingBB relative to F's entry point.
280  // The result is no larger than 1 and is represented using BP.
281  // (Note that the outlined region's 'head' block can only have incoming
282  // edges from the guarding entry blocks).
283  BranchProbability getOutliningCallBBRelativeFreq(FunctionCloner &Cloner);
284
285  // Return true if the callee of CS should be partially inlined with
286  // profit.
287  bool shouldPartialInline(CallSite CS, FunctionCloner &Cloner,
288                           BlockFrequency WeightedOutliningRcost,
289                           OptimizationRemarkEmitter &ORE);
290
291  // Try to inline DuplicateFunction (cloned from F with call to
292  // the OutlinedFunction into its callers. Return true
293  // if there is any successful inlining.
294  bool tryPartialInline(FunctionCloner &Cloner);
295
296  // Compute the mapping from use site of DuplicationFunction to the enclosing
297  // BB's profile count.
298  void computeCallsiteToProfCountMap(Function *DuplicateFunction,
299                                     DenseMap<User *, uint64_t> &SiteCountMap);
300
301  bool IsLimitReached() {
302    return (MaxNumPartialInlining != -1 &&
303            NumPartialInlining >= MaxNumPartialInlining);
304  }
305
306  static CallSite getCallSite(User *U) {
307    CallSite CS;
308    if (CallInst *CI = dyn_cast<CallInst>(U))
309      CS = CallSite(CI);
310    else if (InvokeInst *II = dyn_cast<InvokeInst>(U))
311      CS = CallSite(II);
312    else
313      llvm_unreachable("All uses must be calls");
314    return CS;
315  }
316
317  static CallSite getOneCallSiteTo(Function *F) {
318    User *User = *F->user_begin();
319    return getCallSite(User);
320  }
321
322  std::tuple<DebugLoc, BasicBlock *> getOneDebugLoc(Function *F) {
323    CallSite CS = getOneCallSiteTo(F);
324    DebugLoc DLoc = CS.getInstruction()->getDebugLoc();
325    BasicBlock *Block = CS.getParent();
326    return std::make_tuple(DLoc, Block);
327  }
328
329  // Returns the costs associated with function outlining:
330  // - The first value is the non-weighted runtime cost for making the call
331  //   to the outlined function, including the addtional  setup cost in the
332  //    outlined function itself;
333  // - The second value is the estimated size of the new call sequence in
334  //   basic block Cloner.OutliningCallBB;
335  std::tuple<int, int> computeOutliningCosts(FunctionCloner &Cloner);
336
337  // Compute the 'InlineCost' of block BB. InlineCost is a proxy used to
338  // approximate both the size and runtime cost (Note that in the current
339  // inline cost analysis, there is no clear distinction there either).
340  static int computeBBInlineCost(BasicBlock *BB);
341
342  std::unique_ptr<FunctionOutliningInfo> computeOutliningInfo(Function *F);
343  std::unique_ptr<FunctionOutliningMultiRegionInfo>
344  computeOutliningColdRegionsInfo(Function *F, OptimizationRemarkEmitter &ORE);
345};
346
347struct PartialInlinerLegacyPass : public ModulePass {
348  static char ID; // Pass identification, replacement for typeid
349
350  PartialInlinerLegacyPass() : ModulePass(ID) {
351    initializePartialInlinerLegacyPassPass(*PassRegistry::getPassRegistry());
352  }
353
354  void getAnalysisUsage(AnalysisUsage &AU) const override {
355    AU.addRequired<AssumptionCacheTracker>();
356    AU.addRequired<ProfileSummaryInfoWrapperPass>();
357    AU.addRequired<TargetTransformInfoWrapperPass>();
358  }
359
360  bool runOnModule(Module &M) override {
361    if (skipModule(M))
362      return false;
363
364    AssumptionCacheTracker *ACT = &getAnalysis<AssumptionCacheTracker>();
365    TargetTransformInfoWrapperPass *TTIWP =
366        &getAnalysis<TargetTransformInfoWrapperPass>();
367    ProfileSummaryInfo *PSI =
368        &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
369
370    std::function<AssumptionCache &(Function &)> GetAssumptionCache =
371        [&ACT](Function &F) -> AssumptionCache & {
372      return ACT->getAssumptionCache(F);
373    };
374
375    auto LookupAssumptionCache = [ACT](Function &F) -> AssumptionCache * {
376      return ACT->lookupAssumptionCache(F);
377    };
378
379    std::function<TargetTransformInfo &(Function &)> GetTTI =
380        [&TTIWP](Function &F) -> TargetTransformInfo & {
381      return TTIWP->getTTI(F);
382    };
383
384    return PartialInlinerImpl(&GetAssumptionCache, LookupAssumptionCache,
385                              &GetTTI, NoneType::None, PSI)
386        .run(M);
387  }
388};
389
390} // end anonymous namespace
391
392std::unique_ptr<FunctionOutliningMultiRegionInfo>
393PartialInlinerImpl::computeOutliningColdRegionsInfo(Function *F,
394                                                    OptimizationRemarkEmitter &ORE) {
395  BasicBlock *EntryBlock = &F->front();
396
397  DominatorTree DT(*F);
398  LoopInfo LI(DT);
399  BranchProbabilityInfo BPI(*F, LI);
400  std::unique_ptr<BlockFrequencyInfo> ScopedBFI;
401  BlockFrequencyInfo *BFI;
402  if (!GetBFI) {
403    ScopedBFI.reset(new BlockFrequencyInfo(*F, BPI, LI));
404    BFI = ScopedBFI.get();
405  } else
406    BFI = &(*GetBFI)(*F);
407
408  // Return if we don't have profiling information.
409  if (!PSI->hasInstrumentationProfile())
410    return std::unique_ptr<FunctionOutliningMultiRegionInfo>();
411
412  std::unique_ptr<FunctionOutliningMultiRegionInfo> OutliningInfo =
413      std::make_unique<FunctionOutliningMultiRegionInfo>();
414
415  auto IsSingleEntry = [](SmallVectorImpl<BasicBlock *> &BlockList) {
416    BasicBlock *Dom = BlockList.front();
417    return BlockList.size() > 1 && Dom->hasNPredecessors(1);
418  };
419
420  auto IsSingleExit =
421      [&ORE](SmallVectorImpl<BasicBlock *> &BlockList) -> BasicBlock * {
422    BasicBlock *ExitBlock = nullptr;
423    for (auto *Block : BlockList) {
424      for (auto SI = succ_begin(Block); SI != succ_end(Block); ++SI) {
425        if (!is_contained(BlockList, *SI)) {
426          if (ExitBlock) {
427            ORE.emit([&]() {
428              return OptimizationRemarkMissed(DEBUG_TYPE, "MultiExitRegion",
429                                              &SI->front())
430                     << "Region dominated by "
431                     << ore::NV("Block", BlockList.front()->getName())
432                     << " has more than one region exit edge.";
433            });
434            return nullptr;
435          } else
436            ExitBlock = Block;
437        }
438      }
439    }
440    return ExitBlock;
441  };
442
443  auto BBProfileCount = [BFI](BasicBlock *BB) {
444    return BFI->getBlockProfileCount(BB)
445               ? BFI->getBlockProfileCount(BB).getValue()
446               : 0;
447  };
448
449  // Use the same computeBBInlineCost function to compute the cost savings of
450  // the outlining the candidate region.
451  int OverallFunctionCost = 0;
452  for (auto &BB : *F)
453    OverallFunctionCost += computeBBInlineCost(&BB);
454
455#ifndef NDEBUG
456  if (TracePartialInlining)
457    dbgs() << "OverallFunctionCost = " << OverallFunctionCost << "\n";
458#endif
459  int MinOutlineRegionCost =
460      static_cast<int>(OverallFunctionCost * MinRegionSizeRatio);
461  BranchProbability MinBranchProbability(
462      static_cast<int>(ColdBranchRatio * MinBlockCounterExecution),
463      MinBlockCounterExecution);
464  bool ColdCandidateFound = false;
465  BasicBlock *CurrEntry = EntryBlock;
466  std::vector<BasicBlock *> DFS;
467  DenseMap<BasicBlock *, bool> VisitedMap;
468  DFS.push_back(CurrEntry);
469  VisitedMap[CurrEntry] = true;
470  // Use Depth First Search on the basic blocks to find CFG edges that are
471  // considered cold.
472  // Cold regions considered must also have its inline cost compared to the
473  // overall inline cost of the original function.  The region is outlined only
474  // if it reduced the inline cost of the function by 'MinOutlineRegionCost' or
475  // more.
476  while (!DFS.empty()) {
477    auto *thisBB = DFS.back();
478    DFS.pop_back();
479    // Only consider regions with predecessor blocks that are considered
480    // not-cold (default: part of the top 99.99% of all block counters)
481    // AND greater than our minimum block execution count (default: 100).
482    if (PSI->isColdBlock(thisBB, BFI) ||
483        BBProfileCount(thisBB) < MinBlockCounterExecution)
484      continue;
485    for (auto SI = succ_begin(thisBB); SI != succ_end(thisBB); ++SI) {
486      if (VisitedMap[*SI])
487        continue;
488      VisitedMap[*SI] = true;
489      DFS.push_back(*SI);
490      // If branch isn't cold, we skip to the next one.
491      BranchProbability SuccProb = BPI.getEdgeProbability(thisBB, *SI);
492      if (SuccProb > MinBranchProbability)
493        continue;
494#ifndef NDEBUG
495      if (TracePartialInlining) {
496        dbgs() << "Found cold edge: " << thisBB->getName() << "->"
497               << (*SI)->getName() << "\nBranch Probability = " << SuccProb
498               << "\n";
499      }
500#endif
501      SmallVector<BasicBlock *, 8> DominateVector;
502      DT.getDescendants(*SI, DominateVector);
503      // We can only outline single entry regions (for now).
504      if (!IsSingleEntry(DominateVector))
505        continue;
506      BasicBlock *ExitBlock = nullptr;
507      // We can only outline single exit regions (for now).
508      if (!(ExitBlock = IsSingleExit(DominateVector)))
509        continue;
510      int OutlineRegionCost = 0;
511      for (auto *BB : DominateVector)
512        OutlineRegionCost += computeBBInlineCost(BB);
513
514#ifndef NDEBUG
515      if (TracePartialInlining)
516        dbgs() << "OutlineRegionCost = " << OutlineRegionCost << "\n";
517#endif
518
519      if (OutlineRegionCost < MinOutlineRegionCost) {
520        ORE.emit([&]() {
521          return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly",
522                                            &SI->front())
523                 << ore::NV("Callee", F) << " inline cost-savings smaller than "
524                 << ore::NV("Cost", MinOutlineRegionCost);
525        });
526        continue;
527      }
528      // For now, ignore blocks that belong to a SISE region that is a
529      // candidate for outlining.  In the future, we may want to look
530      // at inner regions because the outer region may have live-exit
531      // variables.
532      for (auto *BB : DominateVector)
533        VisitedMap[BB] = true;
534      // ReturnBlock here means the block after the outline call
535      BasicBlock *ReturnBlock = ExitBlock->getSingleSuccessor();
536      // assert(ReturnBlock && "ReturnBlock is NULL somehow!");
537      FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegInfo(
538          DominateVector, DominateVector.front(), ExitBlock, ReturnBlock);
539      OutliningInfo->ORI.push_back(RegInfo);
540#ifndef NDEBUG
541      if (TracePartialInlining) {
542        dbgs() << "Found Cold Candidate starting at block: "
543               << DominateVector.front()->getName() << "\n";
544      }
545#endif
546      ColdCandidateFound = true;
547      NumColdRegionsFound++;
548    }
549  }
550  if (ColdCandidateFound)
551    return OutliningInfo;
552  else
553    return std::unique_ptr<FunctionOutliningMultiRegionInfo>();
554}
555
556std::unique_ptr<FunctionOutliningInfo>
557PartialInlinerImpl::computeOutliningInfo(Function *F) {
558  BasicBlock *EntryBlock = &F->front();
559  BranchInst *BR = dyn_cast<BranchInst>(EntryBlock->getTerminator());
560  if (!BR || BR->isUnconditional())
561    return std::unique_ptr<FunctionOutliningInfo>();
562
563  // Returns true if Succ is BB's successor
564  auto IsSuccessor = [](BasicBlock *Succ, BasicBlock *BB) {
565    return is_contained(successors(BB), Succ);
566  };
567
568  auto IsReturnBlock = [](BasicBlock *BB) {
569    Instruction *TI = BB->getTerminator();
570    return isa<ReturnInst>(TI);
571  };
572
573  auto GetReturnBlock = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
574    if (IsReturnBlock(Succ1))
575      return std::make_tuple(Succ1, Succ2);
576    if (IsReturnBlock(Succ2))
577      return std::make_tuple(Succ2, Succ1);
578
579    return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
580  };
581
582  // Detect a triangular shape:
583  auto GetCommonSucc = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
584    if (IsSuccessor(Succ1, Succ2))
585      return std::make_tuple(Succ1, Succ2);
586    if (IsSuccessor(Succ2, Succ1))
587      return std::make_tuple(Succ2, Succ1);
588
589    return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
590  };
591
592  std::unique_ptr<FunctionOutliningInfo> OutliningInfo =
593      std::make_unique<FunctionOutliningInfo>();
594
595  BasicBlock *CurrEntry = EntryBlock;
596  bool CandidateFound = false;
597  do {
598    // The number of blocks to be inlined has already reached
599    // the limit. When MaxNumInlineBlocks is set to 0 or 1, this
600    // disables partial inlining for the function.
601    if (OutliningInfo->GetNumInlinedBlocks() >= MaxNumInlineBlocks)
602      break;
603
604    if (succ_size(CurrEntry) != 2)
605      break;
606
607    BasicBlock *Succ1 = *succ_begin(CurrEntry);
608    BasicBlock *Succ2 = *(succ_begin(CurrEntry) + 1);
609
610    BasicBlock *ReturnBlock, *NonReturnBlock;
611    std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);
612
613    if (ReturnBlock) {
614      OutliningInfo->Entries.push_back(CurrEntry);
615      OutliningInfo->ReturnBlock = ReturnBlock;
616      OutliningInfo->NonReturnBlock = NonReturnBlock;
617      CandidateFound = true;
618      break;
619    }
620
621    BasicBlock *CommSucc;
622    BasicBlock *OtherSucc;
623    std::tie(CommSucc, OtherSucc) = GetCommonSucc(Succ1, Succ2);
624
625    if (!CommSucc)
626      break;
627
628    OutliningInfo->Entries.push_back(CurrEntry);
629    CurrEntry = OtherSucc;
630  } while (true);
631
632  if (!CandidateFound)
633    return std::unique_ptr<FunctionOutliningInfo>();
634
635  // Do sanity check of the entries: threre should not
636  // be any successors (not in the entry set) other than
637  // {ReturnBlock, NonReturnBlock}
638  assert(OutliningInfo->Entries[0] == &F->front() &&
639         "Function Entry must be the first in Entries vector");
640  DenseSet<BasicBlock *> Entries;
641  for (BasicBlock *E : OutliningInfo->Entries)
642    Entries.insert(E);
643
644  // Returns true of BB has Predecessor which is not
645  // in Entries set.
646  auto HasNonEntryPred = [Entries](BasicBlock *BB) {
647    for (auto Pred : predecessors(BB)) {
648      if (!Entries.count(Pred))
649        return true;
650    }
651    return false;
652  };
653  auto CheckAndNormalizeCandidate =
654      [Entries, HasNonEntryPred](FunctionOutliningInfo *OutliningInfo) {
655        for (BasicBlock *E : OutliningInfo->Entries) {
656          for (auto Succ : successors(E)) {
657            if (Entries.count(Succ))
658              continue;
659            if (Succ == OutliningInfo->ReturnBlock)
660              OutliningInfo->ReturnBlockPreds.push_back(E);
661            else if (Succ != OutliningInfo->NonReturnBlock)
662              return false;
663          }
664          // There should not be any outside incoming edges either:
665          if (HasNonEntryPred(E))
666            return false;
667        }
668        return true;
669      };
670
671  if (!CheckAndNormalizeCandidate(OutliningInfo.get()))
672    return std::unique_ptr<FunctionOutliningInfo>();
673
674  // Now further growing the candidate's inlining region by
675  // peeling off dominating blocks from the outlining region:
676  while (OutliningInfo->GetNumInlinedBlocks() < MaxNumInlineBlocks) {
677    BasicBlock *Cand = OutliningInfo->NonReturnBlock;
678    if (succ_size(Cand) != 2)
679      break;
680
681    if (HasNonEntryPred(Cand))
682      break;
683
684    BasicBlock *Succ1 = *succ_begin(Cand);
685    BasicBlock *Succ2 = *(succ_begin(Cand) + 1);
686
687    BasicBlock *ReturnBlock, *NonReturnBlock;
688    std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);
689    if (!ReturnBlock || ReturnBlock != OutliningInfo->ReturnBlock)
690      break;
691
692    if (NonReturnBlock->getSinglePredecessor() != Cand)
693      break;
694
695    // Now grow and update OutlininigInfo:
696    OutliningInfo->Entries.push_back(Cand);
697    OutliningInfo->NonReturnBlock = NonReturnBlock;
698    OutliningInfo->ReturnBlockPreds.push_back(Cand);
699    Entries.insert(Cand);
700  }
701
702  return OutliningInfo;
703}
704
705// Check if there is PGO data or user annotated branch data:
706static bool hasProfileData(Function *F, FunctionOutliningInfo *OI) {
707  if (F->hasProfileData())
708    return true;
709  // Now check if any of the entry block has MD_prof data:
710  for (auto *E : OI->Entries) {
711    BranchInst *BR = dyn_cast<BranchInst>(E->getTerminator());
712    if (!BR || BR->isUnconditional())
713      continue;
714    uint64_t T, F;
715    if (BR->extractProfMetadata(T, F))
716      return true;
717  }
718  return false;
719}
720
721BranchProbability
722PartialInlinerImpl::getOutliningCallBBRelativeFreq(FunctionCloner &Cloner) {
723  BasicBlock *OutliningCallBB = Cloner.OutlinedFunctions.back().second;
724  auto EntryFreq =
725      Cloner.ClonedFuncBFI->getBlockFreq(&Cloner.ClonedFunc->getEntryBlock());
726  auto OutliningCallFreq =
727      Cloner.ClonedFuncBFI->getBlockFreq(OutliningCallBB);
728  // FIXME Hackery needed because ClonedFuncBFI is based on the function BEFORE
729  // we outlined any regions, so we may encounter situations where the
730  // OutliningCallFreq is *slightly* bigger than the EntryFreq.
731  if (OutliningCallFreq.getFrequency() > EntryFreq.getFrequency()) {
732    OutliningCallFreq = EntryFreq;
733  }
734  auto OutlineRegionRelFreq = BranchProbability::getBranchProbability(
735      OutliningCallFreq.getFrequency(), EntryFreq.getFrequency());
736
737  if (hasProfileData(Cloner.OrigFunc, Cloner.ClonedOI.get()))
738    return OutlineRegionRelFreq;
739
740  // When profile data is not available, we need to be conservative in
741  // estimating the overall savings. Static branch prediction can usually
742  // guess the branch direction right (taken/non-taken), but the guessed
743  // branch probability is usually not biased enough. In case when the
744  // outlined region is predicted to be likely, its probability needs
745  // to be made higher (more biased) to not under-estimate the cost of
746  // function outlining. On the other hand, if the outlined region
747  // is predicted to be less likely, the predicted probablity is usually
748  // higher than the actual. For instance, the actual probability of the
749  // less likely target is only 5%, but the guessed probablity can be
750  // 40%. In the latter case, there is no need for further adjustement.
751  // FIXME: add an option for this.
752  if (OutlineRegionRelFreq < BranchProbability(45, 100))
753    return OutlineRegionRelFreq;
754
755  OutlineRegionRelFreq = std::max(
756      OutlineRegionRelFreq, BranchProbability(OutlineRegionFreqPercent, 100));
757
758  return OutlineRegionRelFreq;
759}
760
761bool PartialInlinerImpl::shouldPartialInline(
762    CallSite CS, FunctionCloner &Cloner,
763    BlockFrequency WeightedOutliningRcost,
764    OptimizationRemarkEmitter &ORE) {
765  using namespace ore;
766
767  Instruction *Call = CS.getInstruction();
768  Function *Callee = CS.getCalledFunction();
769  assert(Callee == Cloner.ClonedFunc);
770
771  if (SkipCostAnalysis)
772    return isInlineViable(*Callee);
773
774  Function *Caller = CS.getCaller();
775  auto &CalleeTTI = (*GetTTI)(*Callee);
776  bool RemarksEnabled =
777      Callee->getContext().getDiagHandlerPtr()->isMissedOptRemarkEnabled(
778          DEBUG_TYPE);
779  assert(Call && "invalid callsite for partial inline");
780  InlineCost IC = getInlineCost(cast<CallBase>(*Call), getInlineParams(),
781                                CalleeTTI, *GetAssumptionCache, GetBFI, PSI,
782                                RemarksEnabled ? &ORE : nullptr);
783
784  if (IC.isAlways()) {
785    ORE.emit([&]() {
786      return OptimizationRemarkAnalysis(DEBUG_TYPE, "AlwaysInline", Call)
787             << NV("Callee", Cloner.OrigFunc)
788             << " should always be fully inlined, not partially";
789    });
790    return false;
791  }
792
793  if (IC.isNever()) {
794    ORE.emit([&]() {
795      return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", Call)
796             << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
797             << NV("Caller", Caller)
798             << " because it should never be inlined (cost=never)";
799    });
800    return false;
801  }
802
803  if (!IC) {
804    ORE.emit([&]() {
805      return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", Call)
806             << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
807             << NV("Caller", Caller) << " because too costly to inline (cost="
808             << NV("Cost", IC.getCost()) << ", threshold="
809             << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")";
810    });
811    return false;
812  }
813  const DataLayout &DL = Caller->getParent()->getDataLayout();
814
815  // The savings of eliminating the call:
816  int NonWeightedSavings = getCallsiteCost(cast<CallBase>(*Call), DL);
817  BlockFrequency NormWeightedSavings(NonWeightedSavings);
818
819  // Weighted saving is smaller than weighted cost, return false
820  if (NormWeightedSavings < WeightedOutliningRcost) {
821    ORE.emit([&]() {
822      return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutliningCallcostTooHigh",
823                                        Call)
824             << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
825             << NV("Caller", Caller) << " runtime overhead (overhead="
826             << NV("Overhead", (unsigned)WeightedOutliningRcost.getFrequency())
827             << ", savings="
828             << NV("Savings", (unsigned)NormWeightedSavings.getFrequency())
829             << ")"
830             << " of making the outlined call is too high";
831    });
832
833    return false;
834  }
835
836  ORE.emit([&]() {
837    return OptimizationRemarkAnalysis(DEBUG_TYPE, "CanBePartiallyInlined", Call)
838           << NV("Callee", Cloner.OrigFunc) << " can be partially inlined into "
839           << NV("Caller", Caller) << " with cost=" << NV("Cost", IC.getCost())
840           << " (threshold="
841           << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")";
842  });
843  return true;
844}
845
846// TODO: Ideally  we should share Inliner's InlineCost Analysis code.
847// For now use a simplified version. The returned 'InlineCost' will be used
848// to esimate the size cost as well as runtime cost of the BB.
849int PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB) {
850  int InlineCost = 0;
851  const DataLayout &DL = BB->getParent()->getParent()->getDataLayout();
852  for (Instruction &I : BB->instructionsWithoutDebug()) {
853    // Skip free instructions.
854    switch (I.getOpcode()) {
855    case Instruction::BitCast:
856    case Instruction::PtrToInt:
857    case Instruction::IntToPtr:
858    case Instruction::Alloca:
859    case Instruction::PHI:
860      continue;
861    case Instruction::GetElementPtr:
862      if (cast<GetElementPtrInst>(&I)->hasAllZeroIndices())
863        continue;
864      break;
865    default:
866      break;
867    }
868
869    if (I.isLifetimeStartOrEnd())
870      continue;
871
872    if (CallInst *CI = dyn_cast<CallInst>(&I)) {
873      InlineCost += getCallsiteCost(*CI, DL);
874      continue;
875    }
876
877    if (InvokeInst *II = dyn_cast<InvokeInst>(&I)) {
878      InlineCost += getCallsiteCost(*II, DL);
879      continue;
880    }
881
882    if (SwitchInst *SI = dyn_cast<SwitchInst>(&I)) {
883      InlineCost += (SI->getNumCases() + 1) * InlineConstants::InstrCost;
884      continue;
885    }
886    InlineCost += InlineConstants::InstrCost;
887  }
888  return InlineCost;
889}
890
891std::tuple<int, int>
892PartialInlinerImpl::computeOutliningCosts(FunctionCloner &Cloner) {
893  int OutliningFuncCallCost = 0, OutlinedFunctionCost = 0;
894  for (auto FuncBBPair : Cloner.OutlinedFunctions) {
895    Function *OutlinedFunc = FuncBBPair.first;
896    BasicBlock* OutliningCallBB = FuncBBPair.second;
897    // Now compute the cost of the call sequence to the outlined function
898    // 'OutlinedFunction' in BB 'OutliningCallBB':
899    OutliningFuncCallCost += computeBBInlineCost(OutliningCallBB);
900
901    // Now compute the cost of the extracted/outlined function itself:
902    for (BasicBlock &BB : *OutlinedFunc)
903      OutlinedFunctionCost += computeBBInlineCost(&BB);
904  }
905  assert(OutlinedFunctionCost >= Cloner.OutlinedRegionCost &&
906         "Outlined function cost should be no less than the outlined region");
907
908  // The code extractor introduces a new root and exit stub blocks with
909  // additional unconditional branches. Those branches will be eliminated
910  // later with bb layout. The cost should be adjusted accordingly:
911  OutlinedFunctionCost -=
912      2 * InlineConstants::InstrCost * Cloner.OutlinedFunctions.size();
913
914  int OutliningRuntimeOverhead =
915      OutliningFuncCallCost +
916      (OutlinedFunctionCost - Cloner.OutlinedRegionCost) +
917      ExtraOutliningPenalty;
918
919  return std::make_tuple(OutliningFuncCallCost, OutliningRuntimeOverhead);
920}
921
922// Create the callsite to profile count map which is
923// used to update the original function's entry count,
924// after the function is partially inlined into the callsite.
925void PartialInlinerImpl::computeCallsiteToProfCountMap(
926    Function *DuplicateFunction,
927    DenseMap<User *, uint64_t> &CallSiteToProfCountMap) {
928  std::vector<User *> Users(DuplicateFunction->user_begin(),
929                            DuplicateFunction->user_end());
930  Function *CurrentCaller = nullptr;
931  std::unique_ptr<BlockFrequencyInfo> TempBFI;
932  BlockFrequencyInfo *CurrentCallerBFI = nullptr;
933
934  auto ComputeCurrBFI = [&,this](Function *Caller) {
935      // For the old pass manager:
936      if (!GetBFI) {
937        DominatorTree DT(*Caller);
938        LoopInfo LI(DT);
939        BranchProbabilityInfo BPI(*Caller, LI);
940        TempBFI.reset(new BlockFrequencyInfo(*Caller, BPI, LI));
941        CurrentCallerBFI = TempBFI.get();
942      } else {
943        // New pass manager:
944        CurrentCallerBFI = &(*GetBFI)(*Caller);
945      }
946  };
947
948  for (User *User : Users) {
949    CallSite CS = getCallSite(User);
950    Function *Caller = CS.getCaller();
951    if (CurrentCaller != Caller) {
952      CurrentCaller = Caller;
953      ComputeCurrBFI(Caller);
954    } else {
955      assert(CurrentCallerBFI && "CallerBFI is not set");
956    }
957    BasicBlock *CallBB = CS.getInstruction()->getParent();
958    auto Count = CurrentCallerBFI->getBlockProfileCount(CallBB);
959    if (Count)
960      CallSiteToProfCountMap[User] = *Count;
961    else
962      CallSiteToProfCountMap[User] = 0;
963  }
964}
965
966PartialInlinerImpl::FunctionCloner::FunctionCloner(
967    Function *F, FunctionOutliningInfo *OI, OptimizationRemarkEmitter &ORE,
968    function_ref<AssumptionCache *(Function &)> LookupAC)
969    : OrigFunc(F), ORE(ORE), LookupAC(LookupAC) {
970  ClonedOI = std::make_unique<FunctionOutliningInfo>();
971
972  // Clone the function, so that we can hack away on it.
973  ValueToValueMapTy VMap;
974  ClonedFunc = CloneFunction(F, VMap);
975
976  ClonedOI->ReturnBlock = cast<BasicBlock>(VMap[OI->ReturnBlock]);
977  ClonedOI->NonReturnBlock = cast<BasicBlock>(VMap[OI->NonReturnBlock]);
978  for (BasicBlock *BB : OI->Entries) {
979    ClonedOI->Entries.push_back(cast<BasicBlock>(VMap[BB]));
980  }
981  for (BasicBlock *E : OI->ReturnBlockPreds) {
982    BasicBlock *NewE = cast<BasicBlock>(VMap[E]);
983    ClonedOI->ReturnBlockPreds.push_back(NewE);
984  }
985  // Go ahead and update all uses to the duplicate, so that we can just
986  // use the inliner functionality when we're done hacking.
987  F->replaceAllUsesWith(ClonedFunc);
988}
989
990PartialInlinerImpl::FunctionCloner::FunctionCloner(
991    Function *F, FunctionOutliningMultiRegionInfo *OI,
992    OptimizationRemarkEmitter &ORE,
993    function_ref<AssumptionCache *(Function &)> LookupAC)
994    : OrigFunc(F), ORE(ORE), LookupAC(LookupAC) {
995  ClonedOMRI = std::make_unique<FunctionOutliningMultiRegionInfo>();
996
997  // Clone the function, so that we can hack away on it.
998  ValueToValueMapTy VMap;
999  ClonedFunc = CloneFunction(F, VMap);
1000
1001  // Go through all Outline Candidate Regions and update all BasicBlock
1002  // information.
1003  for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo :
1004       OI->ORI) {
1005    SmallVector<BasicBlock *, 8> Region;
1006    for (BasicBlock *BB : RegionInfo.Region) {
1007      Region.push_back(cast<BasicBlock>(VMap[BB]));
1008    }
1009    BasicBlock *NewEntryBlock = cast<BasicBlock>(VMap[RegionInfo.EntryBlock]);
1010    BasicBlock *NewExitBlock = cast<BasicBlock>(VMap[RegionInfo.ExitBlock]);
1011    BasicBlock *NewReturnBlock = nullptr;
1012    if (RegionInfo.ReturnBlock)
1013      NewReturnBlock = cast<BasicBlock>(VMap[RegionInfo.ReturnBlock]);
1014    FunctionOutliningMultiRegionInfo::OutlineRegionInfo MappedRegionInfo(
1015        Region, NewEntryBlock, NewExitBlock, NewReturnBlock);
1016    ClonedOMRI->ORI.push_back(MappedRegionInfo);
1017  }
1018  // Go ahead and update all uses to the duplicate, so that we can just
1019  // use the inliner functionality when we're done hacking.
1020  F->replaceAllUsesWith(ClonedFunc);
1021}
1022
1023void PartialInlinerImpl::FunctionCloner::NormalizeReturnBlock() {
1024  auto getFirstPHI = [](BasicBlock *BB) {
1025    BasicBlock::iterator I = BB->begin();
1026    PHINode *FirstPhi = nullptr;
1027    while (I != BB->end()) {
1028      PHINode *Phi = dyn_cast<PHINode>(I);
1029      if (!Phi)
1030        break;
1031      if (!FirstPhi) {
1032        FirstPhi = Phi;
1033        break;
1034      }
1035    }
1036    return FirstPhi;
1037  };
1038
1039  // Shouldn't need to normalize PHIs if we're not outlining non-early return
1040  // blocks.
1041  if (!ClonedOI)
1042    return;
1043
1044  // Special hackery is needed with PHI nodes that have inputs from more than
1045  // one extracted block.  For simplicity, just split the PHIs into a two-level
1046  // sequence of PHIs, some of which will go in the extracted region, and some
1047  // of which will go outside.
1048  BasicBlock *PreReturn = ClonedOI->ReturnBlock;
1049  // only split block when necessary:
1050  PHINode *FirstPhi = getFirstPHI(PreReturn);
1051  unsigned NumPredsFromEntries = ClonedOI->ReturnBlockPreds.size();
1052
1053  if (!FirstPhi || FirstPhi->getNumIncomingValues() <= NumPredsFromEntries + 1)
1054    return;
1055
1056  auto IsTrivialPhi = [](PHINode *PN) -> Value * {
1057    Value *CommonValue = PN->getIncomingValue(0);
1058    if (all_of(PN->incoming_values(),
1059               [&](Value *V) { return V == CommonValue; }))
1060      return CommonValue;
1061    return nullptr;
1062  };
1063
1064  ClonedOI->ReturnBlock = ClonedOI->ReturnBlock->splitBasicBlock(
1065      ClonedOI->ReturnBlock->getFirstNonPHI()->getIterator());
1066  BasicBlock::iterator I = PreReturn->begin();
1067  Instruction *Ins = &ClonedOI->ReturnBlock->front();
1068  SmallVector<Instruction *, 4> DeadPhis;
1069  while (I != PreReturn->end()) {
1070    PHINode *OldPhi = dyn_cast<PHINode>(I);
1071    if (!OldPhi)
1072      break;
1073
1074    PHINode *RetPhi =
1075        PHINode::Create(OldPhi->getType(), NumPredsFromEntries + 1, "", Ins);
1076    OldPhi->replaceAllUsesWith(RetPhi);
1077    Ins = ClonedOI->ReturnBlock->getFirstNonPHI();
1078
1079    RetPhi->addIncoming(&*I, PreReturn);
1080    for (BasicBlock *E : ClonedOI->ReturnBlockPreds) {
1081      RetPhi->addIncoming(OldPhi->getIncomingValueForBlock(E), E);
1082      OldPhi->removeIncomingValue(E);
1083    }
1084
1085    // After incoming values splitting, the old phi may become trivial.
1086    // Keeping the trivial phi can introduce definition inside the outline
1087    // region which is live-out, causing necessary overhead (load, store
1088    // arg passing etc).
1089    if (auto *OldPhiVal = IsTrivialPhi(OldPhi)) {
1090      OldPhi->replaceAllUsesWith(OldPhiVal);
1091      DeadPhis.push_back(OldPhi);
1092    }
1093    ++I;
1094  }
1095  for (auto *DP : DeadPhis)
1096    DP->eraseFromParent();
1097
1098  for (auto E : ClonedOI->ReturnBlockPreds) {
1099    E->getTerminator()->replaceUsesOfWith(PreReturn, ClonedOI->ReturnBlock);
1100  }
1101}
1102
1103bool PartialInlinerImpl::FunctionCloner::doMultiRegionFunctionOutlining() {
1104
1105  auto ComputeRegionCost = [](SmallVectorImpl<BasicBlock *> &Region) {
1106    int Cost = 0;
1107    for (BasicBlock* BB : Region)
1108      Cost += computeBBInlineCost(BB);
1109    return Cost;
1110  };
1111
1112  assert(ClonedOMRI && "Expecting OutlineInfo for multi region outline");
1113
1114  if (ClonedOMRI->ORI.empty())
1115    return false;
1116
1117  // The CodeExtractor needs a dominator tree.
1118  DominatorTree DT;
1119  DT.recalculate(*ClonedFunc);
1120
1121  // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
1122  LoopInfo LI(DT);
1123  BranchProbabilityInfo BPI(*ClonedFunc, LI);
1124  ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI));
1125
1126  // Cache and recycle the CodeExtractor analysis to avoid O(n^2) compile-time.
1127  CodeExtractorAnalysisCache CEAC(*ClonedFunc);
1128
1129  SetVector<Value *> Inputs, Outputs, Sinks;
1130  for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo :
1131       ClonedOMRI->ORI) {
1132    int CurrentOutlinedRegionCost = ComputeRegionCost(RegionInfo.Region);
1133
1134    CodeExtractor CE(RegionInfo.Region, &DT, /*AggregateArgs*/ false,
1135                     ClonedFuncBFI.get(), &BPI,
1136                     LookupAC(*RegionInfo.EntryBlock->getParent()),
1137                     /* AllowVarargs */ false);
1138
1139    CE.findInputsOutputs(Inputs, Outputs, Sinks);
1140
1141#ifndef NDEBUG
1142    if (TracePartialInlining) {
1143      dbgs() << "inputs: " << Inputs.size() << "\n";
1144      dbgs() << "outputs: " << Outputs.size() << "\n";
1145      for (Value *value : Inputs)
1146        dbgs() << "value used in func: " << *value << "\n";
1147      for (Value *output : Outputs)
1148        dbgs() << "instr used in func: " << *output << "\n";
1149    }
1150#endif
1151    // Do not extract regions that have live exit variables.
1152    if (Outputs.size() > 0 && !ForceLiveExit)
1153      continue;
1154
1155    Function *OutlinedFunc = CE.extractCodeRegion(CEAC);
1156
1157    if (OutlinedFunc) {
1158      CallSite OCS = PartialInlinerImpl::getOneCallSiteTo(OutlinedFunc);
1159      BasicBlock *OutliningCallBB = OCS.getInstruction()->getParent();
1160      assert(OutliningCallBB->getParent() == ClonedFunc);
1161      OutlinedFunctions.push_back(std::make_pair(OutlinedFunc,OutliningCallBB));
1162      NumColdRegionsOutlined++;
1163      OutlinedRegionCost += CurrentOutlinedRegionCost;
1164
1165      if (MarkOutlinedColdCC) {
1166        OutlinedFunc->setCallingConv(CallingConv::Cold);
1167        OCS.setCallingConv(CallingConv::Cold);
1168      }
1169    } else
1170      ORE.emit([&]() {
1171        return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
1172                                        &RegionInfo.Region.front()->front())
1173               << "Failed to extract region at block "
1174               << ore::NV("Block", RegionInfo.Region.front());
1175      });
1176  }
1177
1178  return !OutlinedFunctions.empty();
1179}
1180
1181Function *
1182PartialInlinerImpl::FunctionCloner::doSingleRegionFunctionOutlining() {
1183  // Returns true if the block is to be partial inlined into the caller
1184  // (i.e. not to be extracted to the out of line function)
1185  auto ToBeInlined = [&, this](BasicBlock *BB) {
1186    return BB == ClonedOI->ReturnBlock ||
1187           (std::find(ClonedOI->Entries.begin(), ClonedOI->Entries.end(), BB) !=
1188            ClonedOI->Entries.end());
1189  };
1190
1191  assert(ClonedOI && "Expecting OutlineInfo for single region outline");
1192  // The CodeExtractor needs a dominator tree.
1193  DominatorTree DT;
1194  DT.recalculate(*ClonedFunc);
1195
1196  // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
1197  LoopInfo LI(DT);
1198  BranchProbabilityInfo BPI(*ClonedFunc, LI);
1199  ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI));
1200
1201  // Gather up the blocks that we're going to extract.
1202  std::vector<BasicBlock *> ToExtract;
1203  ToExtract.push_back(ClonedOI->NonReturnBlock);
1204  OutlinedRegionCost +=
1205      PartialInlinerImpl::computeBBInlineCost(ClonedOI->NonReturnBlock);
1206  for (BasicBlock &BB : *ClonedFunc)
1207    if (!ToBeInlined(&BB) && &BB != ClonedOI->NonReturnBlock) {
1208      ToExtract.push_back(&BB);
1209      // FIXME: the code extractor may hoist/sink more code
1210      // into the outlined function which may make the outlining
1211      // overhead (the difference of the outlined function cost
1212      // and OutliningRegionCost) look larger.
1213      OutlinedRegionCost += computeBBInlineCost(&BB);
1214    }
1215
1216  // Extract the body of the if.
1217  CodeExtractorAnalysisCache CEAC(*ClonedFunc);
1218  Function *OutlinedFunc =
1219      CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false,
1220                    ClonedFuncBFI.get(), &BPI, LookupAC(*ClonedFunc),
1221                    /* AllowVarargs */ true)
1222          .extractCodeRegion(CEAC);
1223
1224  if (OutlinedFunc) {
1225    BasicBlock *OutliningCallBB =
1226        PartialInlinerImpl::getOneCallSiteTo(OutlinedFunc)
1227            .getInstruction()
1228            ->getParent();
1229    assert(OutliningCallBB->getParent() == ClonedFunc);
1230    OutlinedFunctions.push_back(std::make_pair(OutlinedFunc, OutliningCallBB));
1231  } else
1232    ORE.emit([&]() {
1233      return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
1234                                      &ToExtract.front()->front())
1235             << "Failed to extract region at block "
1236             << ore::NV("Block", ToExtract.front());
1237    });
1238
1239  return OutlinedFunc;
1240}
1241
1242PartialInlinerImpl::FunctionCloner::~FunctionCloner() {
1243  // Ditch the duplicate, since we're done with it, and rewrite all remaining
1244  // users (function pointers, etc.) back to the original function.
1245  ClonedFunc->replaceAllUsesWith(OrigFunc);
1246  ClonedFunc->eraseFromParent();
1247  if (!IsFunctionInlined) {
1248    // Remove each function that was speculatively created if there is no
1249    // reference.
1250    for (auto FuncBBPair : OutlinedFunctions) {
1251      Function *Func = FuncBBPair.first;
1252      Func->eraseFromParent();
1253    }
1254  }
1255}
1256
1257std::pair<bool, Function *> PartialInlinerImpl::unswitchFunction(Function *F) {
1258
1259  if (F->hasAddressTaken())
1260    return {false, nullptr};
1261
1262  // Let inliner handle it
1263  if (F->hasFnAttribute(Attribute::AlwaysInline))
1264    return {false, nullptr};
1265
1266  if (F->hasFnAttribute(Attribute::NoInline))
1267    return {false, nullptr};
1268
1269  if (PSI->isFunctionEntryCold(F))
1270    return {false, nullptr};
1271
1272  if (F->users().empty())
1273    return {false, nullptr};
1274
1275  OptimizationRemarkEmitter ORE(F);
1276
1277  // Only try to outline cold regions if we have a profile summary, which
1278  // implies we have profiling information.
1279  if (PSI->hasProfileSummary() && F->hasProfileData() &&
1280      !DisableMultiRegionPartialInline) {
1281    std::unique_ptr<FunctionOutliningMultiRegionInfo> OMRI =
1282        computeOutliningColdRegionsInfo(F, ORE);
1283    if (OMRI) {
1284      FunctionCloner Cloner(F, OMRI.get(), ORE, LookupAssumptionCache);
1285
1286#ifndef NDEBUG
1287      if (TracePartialInlining) {
1288        dbgs() << "HotCountThreshold = " << PSI->getHotCountThreshold() << "\n";
1289        dbgs() << "ColdCountThreshold = " << PSI->getColdCountThreshold()
1290               << "\n";
1291      }
1292#endif
1293      bool DidOutline = Cloner.doMultiRegionFunctionOutlining();
1294
1295      if (DidOutline) {
1296#ifndef NDEBUG
1297        if (TracePartialInlining) {
1298          dbgs() << ">>>>>> Outlined (Cloned) Function >>>>>>\n";
1299          Cloner.ClonedFunc->print(dbgs());
1300          dbgs() << "<<<<<< Outlined (Cloned) Function <<<<<<\n";
1301        }
1302#endif
1303
1304        if (tryPartialInline(Cloner))
1305          return {true, nullptr};
1306      }
1307    }
1308  }
1309
1310  // Fall-thru to regular partial inlining if we:
1311  //    i) can't find any cold regions to outline, or
1312  //   ii) can't inline the outlined function anywhere.
1313  std::unique_ptr<FunctionOutliningInfo> OI = computeOutliningInfo(F);
1314  if (!OI)
1315    return {false, nullptr};
1316
1317  FunctionCloner Cloner(F, OI.get(), ORE, LookupAssumptionCache);
1318  Cloner.NormalizeReturnBlock();
1319
1320  Function *OutlinedFunction = Cloner.doSingleRegionFunctionOutlining();
1321
1322  if (!OutlinedFunction)
1323    return {false, nullptr};
1324
1325  bool AnyInline = tryPartialInline(Cloner);
1326
1327  if (AnyInline)
1328    return {true, OutlinedFunction};
1329
1330  return {false, nullptr};
1331}
1332
1333bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) {
1334  if (Cloner.OutlinedFunctions.empty())
1335    return false;
1336
1337  int SizeCost = 0;
1338  BlockFrequency WeightedRcost;
1339  int NonWeightedRcost;
1340  std::tie(SizeCost, NonWeightedRcost) = computeOutliningCosts(Cloner);
1341
1342  // Only calculate RelativeToEntryFreq when we are doing single region
1343  // outlining.
1344  BranchProbability RelativeToEntryFreq;
1345  if (Cloner.ClonedOI) {
1346    RelativeToEntryFreq = getOutliningCallBBRelativeFreq(Cloner);
1347  } else
1348    // RelativeToEntryFreq doesn't make sense when we have more than one
1349    // outlined call because each call will have a different relative frequency
1350    // to the entry block.  We can consider using the average, but the
1351    // usefulness of that information is questionable. For now, assume we never
1352    // execute the calls to outlined functions.
1353    RelativeToEntryFreq = BranchProbability(0, 1);
1354
1355  WeightedRcost = BlockFrequency(NonWeightedRcost) * RelativeToEntryFreq;
1356
1357  // The call sequence(s) to the outlined function(s) are larger than the sum of
1358  // the original outlined region size(s), it does not increase the chances of
1359  // inlining the function with outlining (The inliner uses the size increase to
1360  // model the cost of inlining a callee).
1361  if (!SkipCostAnalysis && Cloner.OutlinedRegionCost < SizeCost) {
1362    OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc);
1363    DebugLoc DLoc;
1364    BasicBlock *Block;
1365    std::tie(DLoc, Block) = getOneDebugLoc(Cloner.ClonedFunc);
1366    OrigFuncORE.emit([&]() {
1367      return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutlineRegionTooSmall",
1368                                        DLoc, Block)
1369             << ore::NV("Function", Cloner.OrigFunc)
1370             << " not partially inlined into callers (Original Size = "
1371             << ore::NV("OutlinedRegionOriginalSize", Cloner.OutlinedRegionCost)
1372             << ", Size of call sequence to outlined function = "
1373             << ore::NV("NewSize", SizeCost) << ")";
1374    });
1375    return false;
1376  }
1377
1378  assert(Cloner.OrigFunc->users().empty() &&
1379         "F's users should all be replaced!");
1380
1381  std::vector<User *> Users(Cloner.ClonedFunc->user_begin(),
1382                            Cloner.ClonedFunc->user_end());
1383
1384  DenseMap<User *, uint64_t> CallSiteToProfCountMap;
1385  auto CalleeEntryCount = Cloner.OrigFunc->getEntryCount();
1386  if (CalleeEntryCount)
1387    computeCallsiteToProfCountMap(Cloner.ClonedFunc, CallSiteToProfCountMap);
1388
1389  uint64_t CalleeEntryCountV =
1390      (CalleeEntryCount ? CalleeEntryCount.getCount() : 0);
1391
1392  bool AnyInline = false;
1393  for (User *User : Users) {
1394    CallSite CS = getCallSite(User);
1395
1396    if (IsLimitReached())
1397      continue;
1398
1399    OptimizationRemarkEmitter CallerORE(CS.getCaller());
1400    if (!shouldPartialInline(CS, Cloner, WeightedRcost, CallerORE))
1401      continue;
1402
1403    // Construct remark before doing the inlining, as after successful inlining
1404    // the callsite is removed.
1405    OptimizationRemark OR(DEBUG_TYPE, "PartiallyInlined", CS.getInstruction());
1406    OR << ore::NV("Callee", Cloner.OrigFunc) << " partially inlined into "
1407       << ore::NV("Caller", CS.getCaller());
1408
1409    InlineFunctionInfo IFI(nullptr, GetAssumptionCache, PSI);
1410    // We can only forward varargs when we outlined a single region, else we
1411    // bail on vararg functions.
1412    if (!InlineFunction(CS, IFI, nullptr, true,
1413                        (Cloner.ClonedOI ? Cloner.OutlinedFunctions.back().first
1414                                         : nullptr)))
1415      continue;
1416
1417    CallerORE.emit(OR);
1418
1419    // Now update the entry count:
1420    if (CalleeEntryCountV && CallSiteToProfCountMap.count(User)) {
1421      uint64_t CallSiteCount = CallSiteToProfCountMap[User];
1422      CalleeEntryCountV -= std::min(CalleeEntryCountV, CallSiteCount);
1423    }
1424
1425    AnyInline = true;
1426    NumPartialInlining++;
1427    // Update the stats
1428    if (Cloner.ClonedOI)
1429      NumPartialInlined++;
1430    else
1431      NumColdOutlinePartialInlined++;
1432
1433  }
1434
1435  if (AnyInline) {
1436    Cloner.IsFunctionInlined = true;
1437    if (CalleeEntryCount)
1438      Cloner.OrigFunc->setEntryCount(
1439          CalleeEntryCount.setCount(CalleeEntryCountV));
1440    OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc);
1441    OrigFuncORE.emit([&]() {
1442      return OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", Cloner.OrigFunc)
1443             << "Partially inlined into at least one caller";
1444    });
1445
1446  }
1447
1448  return AnyInline;
1449}
1450
1451bool PartialInlinerImpl::run(Module &M) {
1452  if (DisablePartialInlining)
1453    return false;
1454
1455  std::vector<Function *> Worklist;
1456  Worklist.reserve(M.size());
1457  for (Function &F : M)
1458    if (!F.use_empty() && !F.isDeclaration())
1459      Worklist.push_back(&F);
1460
1461  bool Changed = false;
1462  while (!Worklist.empty()) {
1463    Function *CurrFunc = Worklist.back();
1464    Worklist.pop_back();
1465
1466    if (CurrFunc->use_empty())
1467      continue;
1468
1469    bool Recursive = false;
1470    for (User *U : CurrFunc->users())
1471      if (Instruction *I = dyn_cast<Instruction>(U))
1472        if (I->getParent()->getParent() == CurrFunc) {
1473          Recursive = true;
1474          break;
1475        }
1476    if (Recursive)
1477      continue;
1478
1479    std::pair<bool, Function * > Result = unswitchFunction(CurrFunc);
1480    if (Result.second)
1481      Worklist.push_back(Result.second);
1482    Changed |= Result.first;
1483  }
1484
1485  return Changed;
1486}
1487
1488char PartialInlinerLegacyPass::ID = 0;
1489
1490INITIALIZE_PASS_BEGIN(PartialInlinerLegacyPass, "partial-inliner",
1491                      "Partial Inliner", false, false)
1492INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
1493INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
1494INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
1495INITIALIZE_PASS_END(PartialInlinerLegacyPass, "partial-inliner",
1496                    "Partial Inliner", false, false)
1497
1498ModulePass *llvm::createPartialInliningPass() {
1499  return new PartialInlinerLegacyPass();
1500}
1501
1502PreservedAnalyses PartialInlinerPass::run(Module &M,
1503                                          ModuleAnalysisManager &AM) {
1504  auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
1505
1506  std::function<AssumptionCache &(Function &)> GetAssumptionCache =
1507      [&FAM](Function &F) -> AssumptionCache & {
1508    return FAM.getResult<AssumptionAnalysis>(F);
1509  };
1510
1511  auto LookupAssumptionCache = [&FAM](Function &F) -> AssumptionCache * {
1512    return FAM.getCachedResult<AssumptionAnalysis>(F);
1513  };
1514
1515  std::function<BlockFrequencyInfo &(Function &)> GetBFI =
1516      [&FAM](Function &F) -> BlockFrequencyInfo & {
1517    return FAM.getResult<BlockFrequencyAnalysis>(F);
1518  };
1519
1520  std::function<TargetTransformInfo &(Function &)> GetTTI =
1521      [&FAM](Function &F) -> TargetTransformInfo & {
1522    return FAM.getResult<TargetIRAnalysis>(F);
1523  };
1524
1525  ProfileSummaryInfo *PSI = &AM.getResult<ProfileSummaryAnalysis>(M);
1526
1527  if (PartialInlinerImpl(&GetAssumptionCache, LookupAssumptionCache, &GetTTI,
1528                         {GetBFI}, PSI)
1529          .run(M))
1530    return PreservedAnalyses::none();
1531  return PreservedAnalyses::all();
1532}
1533