1//===- PartialInlining.cpp - Inline parts of functions --------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass performs partial inlining, typically by inlining an if statement
10// that surrounds the body of the function.
11//
12//===----------------------------------------------------------------------===//
13
14#include "llvm/Transforms/IPO/PartialInlining.h"
15#include "llvm/ADT/DenseMap.h"
16#include "llvm/ADT/DenseSet.h"
17#include "llvm/ADT/None.h"
18#include "llvm/ADT/Optional.h"
19#include "llvm/ADT/STLExtras.h"
20#include "llvm/ADT/SmallVector.h"
21#include "llvm/ADT/Statistic.h"
22#include "llvm/Analysis/BlockFrequencyInfo.h"
23#include "llvm/Analysis/BranchProbabilityInfo.h"
24#include "llvm/Analysis/InlineCost.h"
25#include "llvm/Analysis/LoopInfo.h"
26#include "llvm/Analysis/OptimizationRemarkEmitter.h"
27#include "llvm/Analysis/ProfileSummaryInfo.h"
28#include "llvm/Analysis/TargetLibraryInfo.h"
29#include "llvm/Analysis/TargetTransformInfo.h"
30#include "llvm/IR/Attributes.h"
31#include "llvm/IR/BasicBlock.h"
32#include "llvm/IR/CFG.h"
33#include "llvm/IR/DebugLoc.h"
34#include "llvm/IR/DiagnosticInfo.h"
35#include "llvm/IR/Dominators.h"
36#include "llvm/IR/Function.h"
37#include "llvm/IR/InstrTypes.h"
38#include "llvm/IR/Instruction.h"
39#include "llvm/IR/Instructions.h"
40#include "llvm/IR/IntrinsicInst.h"
41#include "llvm/IR/Intrinsics.h"
42#include "llvm/IR/Module.h"
43#include "llvm/IR/User.h"
44#include "llvm/InitializePasses.h"
45#include "llvm/Pass.h"
46#include "llvm/Support/BlockFrequency.h"
47#include "llvm/Support/BranchProbability.h"
48#include "llvm/Support/Casting.h"
49#include "llvm/Support/CommandLine.h"
50#include "llvm/Support/ErrorHandling.h"
51#include "llvm/Transforms/IPO.h"
52#include "llvm/Transforms/Utils/Cloning.h"
53#include "llvm/Transforms/Utils/CodeExtractor.h"
54#include "llvm/Transforms/Utils/ValueMapper.h"
55#include <algorithm>
56#include <cassert>
57#include <cstdint>
58#include <functional>
59#include <iterator>
60#include <memory>
61#include <tuple>
62#include <vector>
63
64using namespace llvm;
65
66#define DEBUG_TYPE "partial-inlining"
67
// Pass-wide counters declared through LLVM's STATISTIC macro; they are
// registered under DEBUG_TYPE ("partial-inlining").

// Call sites that received a partially inlined callee.
STATISTIC(NumPartialInlined,
          "Number of callsites functions partially inlined into.");
// Partial inlinings whose callee had cold regions outlined first.
STATISTIC(NumColdOutlinePartialInlined, "Number of times functions with "
                                        "cold outlined regions were partially "
                                        "inlined into its caller(s).");
// Bumped once per accepted candidate in computeOutliningColdRegionsInfo().
STATISTIC(NumColdRegionsFound,
           "Number of cold single entry/exit regions found.");
// Cold single entry/exit regions that were actually outlined.
STATISTIC(NumColdRegionsOutlined,
           "Number of cold single entry/exit regions outlined.");
77
// Command line option to disable partial inlining altogether. The default is
// false (partial inlining enabled).
static cl::opt<bool>
    DisablePartialInlining("disable-partial-inlining", cl::init(false),
                           cl::Hidden, cl::desc("Disable partial inlining"));
// Command line option to disable multi-region (cold region) partial inlining.
// The default is false.
static cl::opt<bool> DisableMultiRegionPartialInline(
    "disable-mr-partial-inlining", cl::init(false), cl::Hidden,
    cl::desc("Disable multi-region partial inlining"));

// Command line option to force outlining of regions that have live exit
// variables. The default is false.
static cl::opt<bool>
    ForceLiveExit("pi-force-live-exit-outline", cl::init(false), cl::Hidden,
               cl::desc("Force outline regions with live exits"));

// Command line option to enable marking calls to outlined functions with the
// cold calling convention. The default is false.
static cl::opt<bool>
    MarkOutlinedColdCC("pi-mark-coldcc", cl::init(false), cl::Hidden,
                       cl::desc("Mark outline function calls with ColdCC"));

#ifndef NDEBUG
// Command line option to print tracing output from the partial inliner to
// dbgs(). Only available in builds with assertions; the default is false.
static cl::opt<bool> TracePartialInlining("trace-partial-inlining",
                                          cl::init(false), cl::Hidden,
                                          cl::desc("Trace partial inlining."));
#endif

// Testing-only option: when set, shouldPartialInline() skips the cost model
// and only checks that the callee is viable for inlining.
static cl::opt<bool> SkipCostAnalysis("skip-partial-inlining-cost-analysis",
                                      cl::init(false), cl::ZeroOrMore,
                                      cl::ReallyHidden,
                                      cl::desc("Skip Cost Analysis"));
// Used to determine if a cold region is worth outlining based on its inline
// cost compared to that of the whole original function. The default is 10%,
// i.e. a candidate region is rejected unless its cost is at least 10% of the
// overall function cost.
static cl::opt<float> MinRegionSizeRatio(
    "min-region-size-ratio", cl::init(0.1), cl::Hidden,
    cl::desc("Minimum ratio comparing relative sizes of each "
             "outline candidate and original function"));
// Used to tune the minimum profile count required of the predecessor block of
// a cold edge before its branch probabilities are trusted (a confidence
// threshold). The default is 100 executions.
static cl::opt<unsigned>
    MinBlockCounterExecution("min-block-execution", cl::init(100), cl::Hidden,
                             cl::desc("Minimum block executions to consider "
                                      "its BranchProbabilityInfo valid"));
// Used to determine when an edge is considered cold. The default is 10%, i.e.
// an edge whose branch probability is 10% or less is deemed 'cold'. Note that
// this acts as an upper bound on the probability of a cold edge.
static cl::opt<float> ColdBranchRatio(
    "cold-branch-ratio", cl::init(0.1), cl::Hidden,
    cl::desc("Minimum BranchProbability to consider a region cold."));

// Upper bound on the number of blocks (guarding entries plus the return
// block) that single-region partial inlining keeps inline. The default is 5.
static cl::opt<unsigned> MaxNumInlineBlocks(
    "max-num-inline-blocks", cl::init(5), cl::Hidden,
    cl::desc("Max number of blocks to be partially inlined"));

// Command line option to set the maximum number of partial inlining allowed
// for the module. The default value of -1 means no limit.
static cl::opt<int> MaxNumPartialInlining(
    "max-partial-inlining", cl::init(-1), cl::Hidden, cl::ZeroOrMore,
    cl::desc("Max number of partial inlining. The default is unlimited"));

// Used only when PGO or user annotated branch data is absent. It is
// the least value (in percent of the entry block frequency) that is used to
// weigh an outline region predicted as likely. If BFI produces a larger
// value, the BFI value will be used. The default is 75.
static cl::opt<int>
    OutlineRegionFreqPercent("outline-region-freq-percent", cl::init(75),
                             cl::Hidden, cl::ZeroOrMore,
                             cl::desc("Relative frequency of outline region to "
                                      "the entry block"));

// Debugging knob: extra penalty added on top of the computed outlining
// penalty. The default is 0 (no extra penalty).
static cl::opt<unsigned> ExtraOutliningPenalty(
    "partial-inlining-extra-penalty", cl::init(0), cl::Hidden,
    cl::desc("A debug option to add additional penalty to the computed one."));
154
155namespace {
156
157struct FunctionOutliningInfo {
158  FunctionOutliningInfo() = default;
159
160  // Returns the number of blocks to be inlined including all blocks
161  // in Entries and one return block.
162  unsigned GetNumInlinedBlocks() const { return Entries.size() + 1; }
163
164  // A set of blocks including the function entry that guard
165  // the region to be outlined.
166  SmallVector<BasicBlock *, 4> Entries;
167
168  // The return block that is not included in the outlined region.
169  BasicBlock *ReturnBlock = nullptr;
170
171  // The dominating block of the region to be outlined.
172  BasicBlock *NonReturnBlock = nullptr;
173
174  // The set of blocks in Entries that that are predecessors to ReturnBlock
175  SmallVector<BasicBlock *, 4> ReturnBlockPreds;
176};
177
178struct FunctionOutliningMultiRegionInfo {
179  FunctionOutliningMultiRegionInfo()
180      : ORI() {}
181
182  // Container for outline regions
183  struct OutlineRegionInfo {
184    OutlineRegionInfo(ArrayRef<BasicBlock *> Region,
185                      BasicBlock *EntryBlock, BasicBlock *ExitBlock,
186                      BasicBlock *ReturnBlock)
187        : Region(Region.begin(), Region.end()), EntryBlock(EntryBlock),
188          ExitBlock(ExitBlock), ReturnBlock(ReturnBlock) {}
189    SmallVector<BasicBlock *, 8> Region;
190    BasicBlock *EntryBlock;
191    BasicBlock *ExitBlock;
192    BasicBlock *ReturnBlock;
193  };
194
195  SmallVector<OutlineRegionInfo, 4> ORI;
196};
197
198struct PartialInlinerImpl {
199
200  PartialInlinerImpl(
201      function_ref<AssumptionCache &(Function &)> GetAC,
202      function_ref<AssumptionCache *(Function &)> LookupAC,
203      function_ref<TargetTransformInfo &(Function &)> GTTI,
204      function_ref<const TargetLibraryInfo &(Function &)> GTLI,
205      ProfileSummaryInfo &ProfSI,
206      function_ref<BlockFrequencyInfo &(Function &)> GBFI = nullptr)
207      : GetAssumptionCache(GetAC), LookupAssumptionCache(LookupAC),
208        GetTTI(GTTI), GetBFI(GBFI), GetTLI(GTLI), PSI(ProfSI) {}
209
210  bool run(Module &M);
211  // Main part of the transformation that calls helper functions to find
212  // outlining candidates, clone & outline the function, and attempt to
213  // partially inline the resulting function. Returns true if
214  // inlining was successful, false otherwise.  Also returns the outline
215  // function (only if we partially inlined early returns) as there is a
216  // possibility to further "peel" early return statements that were left in the
217  // outline function due to code size.
218  std::pair<bool, Function *> unswitchFunction(Function *F);
219
220  // This class speculatively clones the function to be partial inlined.
221  // At the end of partial inlining, the remaining callsites to the cloned
222  // function that are not partially inlined will be fixed up to reference
223  // the original function, and the cloned function will be erased.
224  struct FunctionCloner {
225    // Two constructors, one for single region outlining, the other for
226    // multi-region outlining.
227    FunctionCloner(Function *F, FunctionOutliningInfo *OI,
228                   OptimizationRemarkEmitter &ORE,
229                   function_ref<AssumptionCache *(Function &)> LookupAC);
230    FunctionCloner(Function *F, FunctionOutliningMultiRegionInfo *OMRI,
231                   OptimizationRemarkEmitter &ORE,
232                   function_ref<AssumptionCache *(Function &)> LookupAC);
233    ~FunctionCloner();
234
235    // Prepare for function outlining: making sure there is only
236    // one incoming edge from the extracted/outlined region to
237    // the return block.
238    void NormalizeReturnBlock();
239
240    // Do function outlining for cold regions.
241    bool doMultiRegionFunctionOutlining();
242    // Do function outlining for region after early return block(s).
243    // NOTE: For vararg functions that do the vararg handling in the outlined
244    //       function, we temporarily generate IR that does not properly
245    //       forward varargs to the outlined function. Calling InlineFunction
246    //       will update calls to the outlined functions to properly forward
247    //       the varargs.
248    Function *doSingleRegionFunctionOutlining();
249
250    Function *OrigFunc = nullptr;
251    Function *ClonedFunc = nullptr;
252
253    typedef std::pair<Function *, BasicBlock *> FuncBodyCallerPair;
254    // Keep track of Outlined Functions and the basic block they're called from.
255    SmallVector<FuncBodyCallerPair, 4> OutlinedFunctions;
256
257    // ClonedFunc is inlined in one of its callers after function
258    // outlining.
259    bool IsFunctionInlined = false;
260    // The cost of the region to be outlined.
261    int OutlinedRegionCost = 0;
262    // ClonedOI is specific to outlining non-early return blocks.
263    std::unique_ptr<FunctionOutliningInfo> ClonedOI = nullptr;
264    // ClonedOMRI is specific to outlining cold regions.
265    std::unique_ptr<FunctionOutliningMultiRegionInfo> ClonedOMRI = nullptr;
266    std::unique_ptr<BlockFrequencyInfo> ClonedFuncBFI = nullptr;
267    OptimizationRemarkEmitter &ORE;
268    function_ref<AssumptionCache *(Function &)> LookupAC;
269  };
270
271private:
272  int NumPartialInlining = 0;
273  function_ref<AssumptionCache &(Function &)> GetAssumptionCache;
274  function_ref<AssumptionCache *(Function &)> LookupAssumptionCache;
275  function_ref<TargetTransformInfo &(Function &)> GetTTI;
276  function_ref<BlockFrequencyInfo &(Function &)> GetBFI;
277  function_ref<const TargetLibraryInfo &(Function &)> GetTLI;
278  ProfileSummaryInfo &PSI;
279
280  // Return the frequency of the OutlininingBB relative to F's entry point.
281  // The result is no larger than 1 and is represented using BP.
282  // (Note that the outlined region's 'head' block can only have incoming
283  // edges from the guarding entry blocks).
284  BranchProbability getOutliningCallBBRelativeFreq(FunctionCloner &Cloner);
285
286  // Return true if the callee of CB should be partially inlined with
287  // profit.
288  bool shouldPartialInline(CallBase &CB, FunctionCloner &Cloner,
289                           BlockFrequency WeightedOutliningRcost,
290                           OptimizationRemarkEmitter &ORE);
291
292  // Try to inline DuplicateFunction (cloned from F with call to
293  // the OutlinedFunction into its callers. Return true
294  // if there is any successful inlining.
295  bool tryPartialInline(FunctionCloner &Cloner);
296
297  // Compute the mapping from use site of DuplicationFunction to the enclosing
298  // BB's profile count.
299  void computeCallsiteToProfCountMap(Function *DuplicateFunction,
300                                     DenseMap<User *, uint64_t> &SiteCountMap);
301
302  bool IsLimitReached() {
303    return (MaxNumPartialInlining != -1 &&
304            NumPartialInlining >= MaxNumPartialInlining);
305  }
306
307  static CallBase *getSupportedCallBase(User *U) {
308    if (isa<CallInst>(U) || isa<InvokeInst>(U))
309      return cast<CallBase>(U);
310    llvm_unreachable("All uses must be calls");
311    return nullptr;
312  }
313
314  static CallBase *getOneCallSiteTo(Function *F) {
315    User *User = *F->user_begin();
316    return getSupportedCallBase(User);
317  }
318
319  std::tuple<DebugLoc, BasicBlock *> getOneDebugLoc(Function *F) {
320    CallBase *CB = getOneCallSiteTo(F);
321    DebugLoc DLoc = CB->getDebugLoc();
322    BasicBlock *Block = CB->getParent();
323    return std::make_tuple(DLoc, Block);
324  }
325
326  // Returns the costs associated with function outlining:
327  // - The first value is the non-weighted runtime cost for making the call
328  //   to the outlined function, including the addtional  setup cost in the
329  //    outlined function itself;
330  // - The second value is the estimated size of the new call sequence in
331  //   basic block Cloner.OutliningCallBB;
332  std::tuple<int, int> computeOutliningCosts(FunctionCloner &Cloner);
333
334  // Compute the 'InlineCost' of block BB. InlineCost is a proxy used to
335  // approximate both the size and runtime cost (Note that in the current
336  // inline cost analysis, there is no clear distinction there either).
337  static int computeBBInlineCost(BasicBlock *BB);
338
339  std::unique_ptr<FunctionOutliningInfo> computeOutliningInfo(Function *F);
340  std::unique_ptr<FunctionOutliningMultiRegionInfo>
341  computeOutliningColdRegionsInfo(Function *F, OptimizationRemarkEmitter &ORE);
342};
343
344struct PartialInlinerLegacyPass : public ModulePass {
345  static char ID; // Pass identification, replacement for typeid
346
347  PartialInlinerLegacyPass() : ModulePass(ID) {
348    initializePartialInlinerLegacyPassPass(*PassRegistry::getPassRegistry());
349  }
350
351  void getAnalysisUsage(AnalysisUsage &AU) const override {
352    AU.addRequired<AssumptionCacheTracker>();
353    AU.addRequired<ProfileSummaryInfoWrapperPass>();
354    AU.addRequired<TargetTransformInfoWrapperPass>();
355    AU.addRequired<TargetLibraryInfoWrapperPass>();
356  }
357
358  bool runOnModule(Module &M) override {
359    if (skipModule(M))
360      return false;
361
362    AssumptionCacheTracker *ACT = &getAnalysis<AssumptionCacheTracker>();
363    TargetTransformInfoWrapperPass *TTIWP =
364        &getAnalysis<TargetTransformInfoWrapperPass>();
365    ProfileSummaryInfo &PSI =
366        getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
367
368    auto GetAssumptionCache = [&ACT](Function &F) -> AssumptionCache & {
369      return ACT->getAssumptionCache(F);
370    };
371
372    auto LookupAssumptionCache = [ACT](Function &F) -> AssumptionCache * {
373      return ACT->lookupAssumptionCache(F);
374    };
375
376    auto GetTTI = [&TTIWP](Function &F) -> TargetTransformInfo & {
377      return TTIWP->getTTI(F);
378    };
379
380    auto GetTLI = [this](Function &F) -> TargetLibraryInfo & {
381      return this->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
382    };
383
384    return PartialInlinerImpl(GetAssumptionCache, LookupAssumptionCache, GetTTI,
385                              GetTLI, PSI)
386        .run(M);
387  }
388};
389
390} // end anonymous namespace
391
392std::unique_ptr<FunctionOutliningMultiRegionInfo>
393PartialInlinerImpl::computeOutliningColdRegionsInfo(Function *F,
394                                                    OptimizationRemarkEmitter &ORE) {
395  BasicBlock *EntryBlock = &F->front();
396
397  DominatorTree DT(*F);
398  LoopInfo LI(DT);
399  BranchProbabilityInfo BPI(*F, LI);
400  std::unique_ptr<BlockFrequencyInfo> ScopedBFI;
401  BlockFrequencyInfo *BFI;
402  if (!GetBFI) {
403    ScopedBFI.reset(new BlockFrequencyInfo(*F, BPI, LI));
404    BFI = ScopedBFI.get();
405  } else
406    BFI = &(GetBFI(*F));
407
408  // Return if we don't have profiling information.
409  if (!PSI.hasInstrumentationProfile())
410    return std::unique_ptr<FunctionOutliningMultiRegionInfo>();
411
412  std::unique_ptr<FunctionOutliningMultiRegionInfo> OutliningInfo =
413      std::make_unique<FunctionOutliningMultiRegionInfo>();
414
415  auto IsSingleEntry = [](SmallVectorImpl<BasicBlock *> &BlockList) {
416    BasicBlock *Dom = BlockList.front();
417    return BlockList.size() > 1 && Dom->hasNPredecessors(1);
418  };
419
420  auto IsSingleExit =
421      [&ORE](SmallVectorImpl<BasicBlock *> &BlockList) -> BasicBlock * {
422    BasicBlock *ExitBlock = nullptr;
423    for (auto *Block : BlockList) {
424      for (auto SI = succ_begin(Block); SI != succ_end(Block); ++SI) {
425        if (!is_contained(BlockList, *SI)) {
426          if (ExitBlock) {
427            ORE.emit([&]() {
428              return OptimizationRemarkMissed(DEBUG_TYPE, "MultiExitRegion",
429                                              &SI->front())
430                     << "Region dominated by "
431                     << ore::NV("Block", BlockList.front()->getName())
432                     << " has more than one region exit edge.";
433            });
434            return nullptr;
435          } else
436            ExitBlock = Block;
437        }
438      }
439    }
440    return ExitBlock;
441  };
442
443  auto BBProfileCount = [BFI](BasicBlock *BB) {
444    return BFI->getBlockProfileCount(BB)
445               ? BFI->getBlockProfileCount(BB).getValue()
446               : 0;
447  };
448
449  // Use the same computeBBInlineCost function to compute the cost savings of
450  // the outlining the candidate region.
451  int OverallFunctionCost = 0;
452  for (auto &BB : *F)
453    OverallFunctionCost += computeBBInlineCost(&BB);
454
455#ifndef NDEBUG
456  if (TracePartialInlining)
457    dbgs() << "OverallFunctionCost = " << OverallFunctionCost << "\n";
458#endif
459  int MinOutlineRegionCost =
460      static_cast<int>(OverallFunctionCost * MinRegionSizeRatio);
461  BranchProbability MinBranchProbability(
462      static_cast<int>(ColdBranchRatio * MinBlockCounterExecution),
463      MinBlockCounterExecution);
464  bool ColdCandidateFound = false;
465  BasicBlock *CurrEntry = EntryBlock;
466  std::vector<BasicBlock *> DFS;
467  DenseMap<BasicBlock *, bool> VisitedMap;
468  DFS.push_back(CurrEntry);
469  VisitedMap[CurrEntry] = true;
470  // Use Depth First Search on the basic blocks to find CFG edges that are
471  // considered cold.
472  // Cold regions considered must also have its inline cost compared to the
473  // overall inline cost of the original function.  The region is outlined only
474  // if it reduced the inline cost of the function by 'MinOutlineRegionCost' or
475  // more.
476  while (!DFS.empty()) {
477    auto *thisBB = DFS.back();
478    DFS.pop_back();
479    // Only consider regions with predecessor blocks that are considered
480    // not-cold (default: part of the top 99.99% of all block counters)
481    // AND greater than our minimum block execution count (default: 100).
482    if (PSI.isColdBlock(thisBB, BFI) ||
483        BBProfileCount(thisBB) < MinBlockCounterExecution)
484      continue;
485    for (auto SI = succ_begin(thisBB); SI != succ_end(thisBB); ++SI) {
486      if (VisitedMap[*SI])
487        continue;
488      VisitedMap[*SI] = true;
489      DFS.push_back(*SI);
490      // If branch isn't cold, we skip to the next one.
491      BranchProbability SuccProb = BPI.getEdgeProbability(thisBB, *SI);
492      if (SuccProb > MinBranchProbability)
493        continue;
494#ifndef NDEBUG
495      if (TracePartialInlining) {
496        dbgs() << "Found cold edge: " << thisBB->getName() << "->"
497               << (*SI)->getName() << "\nBranch Probability = " << SuccProb
498               << "\n";
499      }
500#endif
501      SmallVector<BasicBlock *, 8> DominateVector;
502      DT.getDescendants(*SI, DominateVector);
503      // We can only outline single entry regions (for now).
504      if (!IsSingleEntry(DominateVector))
505        continue;
506      BasicBlock *ExitBlock = nullptr;
507      // We can only outline single exit regions (for now).
508      if (!(ExitBlock = IsSingleExit(DominateVector)))
509        continue;
510      int OutlineRegionCost = 0;
511      for (auto *BB : DominateVector)
512        OutlineRegionCost += computeBBInlineCost(BB);
513
514#ifndef NDEBUG
515      if (TracePartialInlining)
516        dbgs() << "OutlineRegionCost = " << OutlineRegionCost << "\n";
517#endif
518
519      if (OutlineRegionCost < MinOutlineRegionCost) {
520        ORE.emit([&]() {
521          return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly",
522                                            &SI->front())
523                 << ore::NV("Callee", F) << " inline cost-savings smaller than "
524                 << ore::NV("Cost", MinOutlineRegionCost);
525        });
526        continue;
527      }
528      // For now, ignore blocks that belong to a SISE region that is a
529      // candidate for outlining.  In the future, we may want to look
530      // at inner regions because the outer region may have live-exit
531      // variables.
532      for (auto *BB : DominateVector)
533        VisitedMap[BB] = true;
534      // ReturnBlock here means the block after the outline call
535      BasicBlock *ReturnBlock = ExitBlock->getSingleSuccessor();
536      // assert(ReturnBlock && "ReturnBlock is NULL somehow!");
537      FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegInfo(
538          DominateVector, DominateVector.front(), ExitBlock, ReturnBlock);
539      OutliningInfo->ORI.push_back(RegInfo);
540#ifndef NDEBUG
541      if (TracePartialInlining) {
542        dbgs() << "Found Cold Candidate starting at block: "
543               << DominateVector.front()->getName() << "\n";
544      }
545#endif
546      ColdCandidateFound = true;
547      NumColdRegionsFound++;
548    }
549  }
550  if (ColdCandidateFound)
551    return OutliningInfo;
552  else
553    return std::unique_ptr<FunctionOutliningMultiRegionInfo>();
554}
555
556std::unique_ptr<FunctionOutliningInfo>
557PartialInlinerImpl::computeOutliningInfo(Function *F) {
558  BasicBlock *EntryBlock = &F->front();
559  BranchInst *BR = dyn_cast<BranchInst>(EntryBlock->getTerminator());
560  if (!BR || BR->isUnconditional())
561    return std::unique_ptr<FunctionOutliningInfo>();
562
563  // Returns true if Succ is BB's successor
564  auto IsSuccessor = [](BasicBlock *Succ, BasicBlock *BB) {
565    return is_contained(successors(BB), Succ);
566  };
567
568  auto IsReturnBlock = [](BasicBlock *BB) {
569    Instruction *TI = BB->getTerminator();
570    return isa<ReturnInst>(TI);
571  };
572
573  auto GetReturnBlock = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
574    if (IsReturnBlock(Succ1))
575      return std::make_tuple(Succ1, Succ2);
576    if (IsReturnBlock(Succ2))
577      return std::make_tuple(Succ2, Succ1);
578
579    return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
580  };
581
582  // Detect a triangular shape:
583  auto GetCommonSucc = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
584    if (IsSuccessor(Succ1, Succ2))
585      return std::make_tuple(Succ1, Succ2);
586    if (IsSuccessor(Succ2, Succ1))
587      return std::make_tuple(Succ2, Succ1);
588
589    return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
590  };
591
592  std::unique_ptr<FunctionOutliningInfo> OutliningInfo =
593      std::make_unique<FunctionOutliningInfo>();
594
595  BasicBlock *CurrEntry = EntryBlock;
596  bool CandidateFound = false;
597  do {
598    // The number of blocks to be inlined has already reached
599    // the limit. When MaxNumInlineBlocks is set to 0 or 1, this
600    // disables partial inlining for the function.
601    if (OutliningInfo->GetNumInlinedBlocks() >= MaxNumInlineBlocks)
602      break;
603
604    if (succ_size(CurrEntry) != 2)
605      break;
606
607    BasicBlock *Succ1 = *succ_begin(CurrEntry);
608    BasicBlock *Succ2 = *(succ_begin(CurrEntry) + 1);
609
610    BasicBlock *ReturnBlock, *NonReturnBlock;
611    std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);
612
613    if (ReturnBlock) {
614      OutliningInfo->Entries.push_back(CurrEntry);
615      OutliningInfo->ReturnBlock = ReturnBlock;
616      OutliningInfo->NonReturnBlock = NonReturnBlock;
617      CandidateFound = true;
618      break;
619    }
620
621    BasicBlock *CommSucc;
622    BasicBlock *OtherSucc;
623    std::tie(CommSucc, OtherSucc) = GetCommonSucc(Succ1, Succ2);
624
625    if (!CommSucc)
626      break;
627
628    OutliningInfo->Entries.push_back(CurrEntry);
629    CurrEntry = OtherSucc;
630  } while (true);
631
632  if (!CandidateFound)
633    return std::unique_ptr<FunctionOutliningInfo>();
634
635  // Do sanity check of the entries: threre should not
636  // be any successors (not in the entry set) other than
637  // {ReturnBlock, NonReturnBlock}
638  assert(OutliningInfo->Entries[0] == &F->front() &&
639         "Function Entry must be the first in Entries vector");
640  DenseSet<BasicBlock *> Entries;
641  for (BasicBlock *E : OutliningInfo->Entries)
642    Entries.insert(E);
643
644  // Returns true of BB has Predecessor which is not
645  // in Entries set.
646  auto HasNonEntryPred = [Entries](BasicBlock *BB) {
647    for (auto Pred : predecessors(BB)) {
648      if (!Entries.count(Pred))
649        return true;
650    }
651    return false;
652  };
653  auto CheckAndNormalizeCandidate =
654      [Entries, HasNonEntryPred](FunctionOutliningInfo *OutliningInfo) {
655        for (BasicBlock *E : OutliningInfo->Entries) {
656          for (auto Succ : successors(E)) {
657            if (Entries.count(Succ))
658              continue;
659            if (Succ == OutliningInfo->ReturnBlock)
660              OutliningInfo->ReturnBlockPreds.push_back(E);
661            else if (Succ != OutliningInfo->NonReturnBlock)
662              return false;
663          }
664          // There should not be any outside incoming edges either:
665          if (HasNonEntryPred(E))
666            return false;
667        }
668        return true;
669      };
670
671  if (!CheckAndNormalizeCandidate(OutliningInfo.get()))
672    return std::unique_ptr<FunctionOutliningInfo>();
673
674  // Now further growing the candidate's inlining region by
675  // peeling off dominating blocks from the outlining region:
676  while (OutliningInfo->GetNumInlinedBlocks() < MaxNumInlineBlocks) {
677    BasicBlock *Cand = OutliningInfo->NonReturnBlock;
678    if (succ_size(Cand) != 2)
679      break;
680
681    if (HasNonEntryPred(Cand))
682      break;
683
684    BasicBlock *Succ1 = *succ_begin(Cand);
685    BasicBlock *Succ2 = *(succ_begin(Cand) + 1);
686
687    BasicBlock *ReturnBlock, *NonReturnBlock;
688    std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);
689    if (!ReturnBlock || ReturnBlock != OutliningInfo->ReturnBlock)
690      break;
691
692    if (NonReturnBlock->getSinglePredecessor() != Cand)
693      break;
694
695    // Now grow and update OutlininigInfo:
696    OutliningInfo->Entries.push_back(Cand);
697    OutliningInfo->NonReturnBlock = NonReturnBlock;
698    OutliningInfo->ReturnBlockPreds.push_back(Cand);
699    Entries.insert(Cand);
700  }
701
702  return OutliningInfo;
703}
704
705// Check if there is PGO data or user annotated branch data:
706static bool hasProfileData(Function *F, FunctionOutliningInfo *OI) {
707  if (F->hasProfileData())
708    return true;
709  // Now check if any of the entry block has MD_prof data:
710  for (auto *E : OI->Entries) {
711    BranchInst *BR = dyn_cast<BranchInst>(E->getTerminator());
712    if (!BR || BR->isUnconditional())
713      continue;
714    uint64_t T, F;
715    if (BR->extractProfMetadata(T, F))
716      return true;
717  }
718  return false;
719}
720
721BranchProbability
722PartialInlinerImpl::getOutliningCallBBRelativeFreq(FunctionCloner &Cloner) {
723  BasicBlock *OutliningCallBB = Cloner.OutlinedFunctions.back().second;
724  auto EntryFreq =
725      Cloner.ClonedFuncBFI->getBlockFreq(&Cloner.ClonedFunc->getEntryBlock());
726  auto OutliningCallFreq =
727      Cloner.ClonedFuncBFI->getBlockFreq(OutliningCallBB);
728  // FIXME Hackery needed because ClonedFuncBFI is based on the function BEFORE
729  // we outlined any regions, so we may encounter situations where the
730  // OutliningCallFreq is *slightly* bigger than the EntryFreq.
731  if (OutliningCallFreq.getFrequency() > EntryFreq.getFrequency()) {
732    OutliningCallFreq = EntryFreq;
733  }
734  auto OutlineRegionRelFreq = BranchProbability::getBranchProbability(
735      OutliningCallFreq.getFrequency(), EntryFreq.getFrequency());
736
737  if (hasProfileData(Cloner.OrigFunc, Cloner.ClonedOI.get()))
738    return OutlineRegionRelFreq;
739
740  // When profile data is not available, we need to be conservative in
741  // estimating the overall savings. Static branch prediction can usually
742  // guess the branch direction right (taken/non-taken), but the guessed
743  // branch probability is usually not biased enough. In case when the
744  // outlined region is predicted to be likely, its probability needs
745  // to be made higher (more biased) to not under-estimate the cost of
746  // function outlining. On the other hand, if the outlined region
747  // is predicted to be less likely, the predicted probablity is usually
748  // higher than the actual. For instance, the actual probability of the
749  // less likely target is only 5%, but the guessed probablity can be
750  // 40%. In the latter case, there is no need for further adjustement.
751  // FIXME: add an option for this.
752  if (OutlineRegionRelFreq < BranchProbability(45, 100))
753    return OutlineRegionRelFreq;
754
755  OutlineRegionRelFreq = std::max(
756      OutlineRegionRelFreq, BranchProbability(OutlineRegionFreqPercent, 100));
757
758  return OutlineRegionRelFreq;
759}
760
761bool PartialInlinerImpl::shouldPartialInline(
762    CallBase &CB, FunctionCloner &Cloner, BlockFrequency WeightedOutliningRcost,
763    OptimizationRemarkEmitter &ORE) {
764  using namespace ore;
765
766  Function *Callee = CB.getCalledFunction();
767  assert(Callee == Cloner.ClonedFunc);
768
769  if (SkipCostAnalysis)
770    return isInlineViable(*Callee).isSuccess();
771
772  Function *Caller = CB.getCaller();
773  auto &CalleeTTI = GetTTI(*Callee);
774  bool RemarksEnabled =
775      Callee->getContext().getDiagHandlerPtr()->isMissedOptRemarkEnabled(
776          DEBUG_TYPE);
777  InlineCost IC =
778      getInlineCost(CB, getInlineParams(), CalleeTTI, GetAssumptionCache,
779                    GetTLI, GetBFI, &PSI, RemarksEnabled ? &ORE : nullptr);
780
781  if (IC.isAlways()) {
782    ORE.emit([&]() {
783      return OptimizationRemarkAnalysis(DEBUG_TYPE, "AlwaysInline", &CB)
784             << NV("Callee", Cloner.OrigFunc)
785             << " should always be fully inlined, not partially";
786    });
787    return false;
788  }
789
790  if (IC.isNever()) {
791    ORE.emit([&]() {
792      return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", &CB)
793             << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
794             << NV("Caller", Caller)
795             << " because it should never be inlined (cost=never)";
796    });
797    return false;
798  }
799
800  if (!IC) {
801    ORE.emit([&]() {
802      return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", &CB)
803             << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
804             << NV("Caller", Caller) << " because too costly to inline (cost="
805             << NV("Cost", IC.getCost()) << ", threshold="
806             << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")";
807    });
808    return false;
809  }
810  const DataLayout &DL = Caller->getParent()->getDataLayout();
811
812  // The savings of eliminating the call:
813  int NonWeightedSavings = getCallsiteCost(CB, DL);
814  BlockFrequency NormWeightedSavings(NonWeightedSavings);
815
816  // Weighted saving is smaller than weighted cost, return false
817  if (NormWeightedSavings < WeightedOutliningRcost) {
818    ORE.emit([&]() {
819      return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutliningCallcostTooHigh",
820                                        &CB)
821             << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
822             << NV("Caller", Caller) << " runtime overhead (overhead="
823             << NV("Overhead", (unsigned)WeightedOutliningRcost.getFrequency())
824             << ", savings="
825             << NV("Savings", (unsigned)NormWeightedSavings.getFrequency())
826             << ")"
827             << " of making the outlined call is too high";
828    });
829
830    return false;
831  }
832
833  ORE.emit([&]() {
834    return OptimizationRemarkAnalysis(DEBUG_TYPE, "CanBePartiallyInlined", &CB)
835           << NV("Callee", Cloner.OrigFunc) << " can be partially inlined into "
836           << NV("Caller", Caller) << " with cost=" << NV("Cost", IC.getCost())
837           << " (threshold="
838           << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")";
839  });
840  return true;
841}
842
843// TODO: Ideally  we should share Inliner's InlineCost Analysis code.
844// For now use a simplified version. The returned 'InlineCost' will be used
845// to esimate the size cost as well as runtime cost of the BB.
846int PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB) {
847  int InlineCost = 0;
848  const DataLayout &DL = BB->getParent()->getParent()->getDataLayout();
849  for (Instruction &I : BB->instructionsWithoutDebug()) {
850    // Skip free instructions.
851    switch (I.getOpcode()) {
852    case Instruction::BitCast:
853    case Instruction::PtrToInt:
854    case Instruction::IntToPtr:
855    case Instruction::Alloca:
856    case Instruction::PHI:
857      continue;
858    case Instruction::GetElementPtr:
859      if (cast<GetElementPtrInst>(&I)->hasAllZeroIndices())
860        continue;
861      break;
862    default:
863      break;
864    }
865
866    if (I.isLifetimeStartOrEnd())
867      continue;
868
869    if (CallInst *CI = dyn_cast<CallInst>(&I)) {
870      InlineCost += getCallsiteCost(*CI, DL);
871      continue;
872    }
873
874    if (InvokeInst *II = dyn_cast<InvokeInst>(&I)) {
875      InlineCost += getCallsiteCost(*II, DL);
876      continue;
877    }
878
879    if (SwitchInst *SI = dyn_cast<SwitchInst>(&I)) {
880      InlineCost += (SI->getNumCases() + 1) * InlineConstants::InstrCost;
881      continue;
882    }
883    InlineCost += InlineConstants::InstrCost;
884  }
885  return InlineCost;
886}
887
888std::tuple<int, int>
889PartialInlinerImpl::computeOutliningCosts(FunctionCloner &Cloner) {
890  int OutliningFuncCallCost = 0, OutlinedFunctionCost = 0;
891  for (auto FuncBBPair : Cloner.OutlinedFunctions) {
892    Function *OutlinedFunc = FuncBBPair.first;
893    BasicBlock* OutliningCallBB = FuncBBPair.second;
894    // Now compute the cost of the call sequence to the outlined function
895    // 'OutlinedFunction' in BB 'OutliningCallBB':
896    OutliningFuncCallCost += computeBBInlineCost(OutliningCallBB);
897
898    // Now compute the cost of the extracted/outlined function itself:
899    for (BasicBlock &BB : *OutlinedFunc)
900      OutlinedFunctionCost += computeBBInlineCost(&BB);
901  }
902  assert(OutlinedFunctionCost >= Cloner.OutlinedRegionCost &&
903         "Outlined function cost should be no less than the outlined region");
904
905  // The code extractor introduces a new root and exit stub blocks with
906  // additional unconditional branches. Those branches will be eliminated
907  // later with bb layout. The cost should be adjusted accordingly:
908  OutlinedFunctionCost -=
909      2 * InlineConstants::InstrCost * Cloner.OutlinedFunctions.size();
910
911  int OutliningRuntimeOverhead =
912      OutliningFuncCallCost +
913      (OutlinedFunctionCost - Cloner.OutlinedRegionCost) +
914      ExtraOutliningPenalty;
915
916  return std::make_tuple(OutliningFuncCallCost, OutliningRuntimeOverhead);
917}
918
919// Create the callsite to profile count map which is
920// used to update the original function's entry count,
921// after the function is partially inlined into the callsite.
922void PartialInlinerImpl::computeCallsiteToProfCountMap(
923    Function *DuplicateFunction,
924    DenseMap<User *, uint64_t> &CallSiteToProfCountMap) {
925  std::vector<User *> Users(DuplicateFunction->user_begin(),
926                            DuplicateFunction->user_end());
927  Function *CurrentCaller = nullptr;
928  std::unique_ptr<BlockFrequencyInfo> TempBFI;
929  BlockFrequencyInfo *CurrentCallerBFI = nullptr;
930
931  auto ComputeCurrBFI = [&,this](Function *Caller) {
932      // For the old pass manager:
933      if (!GetBFI) {
934        DominatorTree DT(*Caller);
935        LoopInfo LI(DT);
936        BranchProbabilityInfo BPI(*Caller, LI);
937        TempBFI.reset(new BlockFrequencyInfo(*Caller, BPI, LI));
938        CurrentCallerBFI = TempBFI.get();
939      } else {
940        // New pass manager:
941        CurrentCallerBFI = &(GetBFI(*Caller));
942      }
943  };
944
945  for (User *User : Users) {
946    CallBase *CB = getSupportedCallBase(User);
947    Function *Caller = CB->getCaller();
948    if (CurrentCaller != Caller) {
949      CurrentCaller = Caller;
950      ComputeCurrBFI(Caller);
951    } else {
952      assert(CurrentCallerBFI && "CallerBFI is not set");
953    }
954    BasicBlock *CallBB = CB->getParent();
955    auto Count = CurrentCallerBFI->getBlockProfileCount(CallBB);
956    if (Count)
957      CallSiteToProfCountMap[User] = *Count;
958    else
959      CallSiteToProfCountMap[User] = 0;
960  }
961}
962
963PartialInlinerImpl::FunctionCloner::FunctionCloner(
964    Function *F, FunctionOutliningInfo *OI, OptimizationRemarkEmitter &ORE,
965    function_ref<AssumptionCache *(Function &)> LookupAC)
966    : OrigFunc(F), ORE(ORE), LookupAC(LookupAC) {
967  ClonedOI = std::make_unique<FunctionOutliningInfo>();
968
969  // Clone the function, so that we can hack away on it.
970  ValueToValueMapTy VMap;
971  ClonedFunc = CloneFunction(F, VMap);
972
973  ClonedOI->ReturnBlock = cast<BasicBlock>(VMap[OI->ReturnBlock]);
974  ClonedOI->NonReturnBlock = cast<BasicBlock>(VMap[OI->NonReturnBlock]);
975  for (BasicBlock *BB : OI->Entries) {
976    ClonedOI->Entries.push_back(cast<BasicBlock>(VMap[BB]));
977  }
978  for (BasicBlock *E : OI->ReturnBlockPreds) {
979    BasicBlock *NewE = cast<BasicBlock>(VMap[E]);
980    ClonedOI->ReturnBlockPreds.push_back(NewE);
981  }
982  // Go ahead and update all uses to the duplicate, so that we can just
983  // use the inliner functionality when we're done hacking.
984  F->replaceAllUsesWith(ClonedFunc);
985}
986
987PartialInlinerImpl::FunctionCloner::FunctionCloner(
988    Function *F, FunctionOutliningMultiRegionInfo *OI,
989    OptimizationRemarkEmitter &ORE,
990    function_ref<AssumptionCache *(Function &)> LookupAC)
991    : OrigFunc(F), ORE(ORE), LookupAC(LookupAC) {
992  ClonedOMRI = std::make_unique<FunctionOutliningMultiRegionInfo>();
993
994  // Clone the function, so that we can hack away on it.
995  ValueToValueMapTy VMap;
996  ClonedFunc = CloneFunction(F, VMap);
997
998  // Go through all Outline Candidate Regions and update all BasicBlock
999  // information.
1000  for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo :
1001       OI->ORI) {
1002    SmallVector<BasicBlock *, 8> Region;
1003    for (BasicBlock *BB : RegionInfo.Region) {
1004      Region.push_back(cast<BasicBlock>(VMap[BB]));
1005    }
1006    BasicBlock *NewEntryBlock = cast<BasicBlock>(VMap[RegionInfo.EntryBlock]);
1007    BasicBlock *NewExitBlock = cast<BasicBlock>(VMap[RegionInfo.ExitBlock]);
1008    BasicBlock *NewReturnBlock = nullptr;
1009    if (RegionInfo.ReturnBlock)
1010      NewReturnBlock = cast<BasicBlock>(VMap[RegionInfo.ReturnBlock]);
1011    FunctionOutliningMultiRegionInfo::OutlineRegionInfo MappedRegionInfo(
1012        Region, NewEntryBlock, NewExitBlock, NewReturnBlock);
1013    ClonedOMRI->ORI.push_back(MappedRegionInfo);
1014  }
1015  // Go ahead and update all uses to the duplicate, so that we can just
1016  // use the inliner functionality when we're done hacking.
1017  F->replaceAllUsesWith(ClonedFunc);
1018}
1019
1020void PartialInlinerImpl::FunctionCloner::NormalizeReturnBlock() {
1021  auto getFirstPHI = [](BasicBlock *BB) {
1022    BasicBlock::iterator I = BB->begin();
1023    PHINode *FirstPhi = nullptr;
1024    while (I != BB->end()) {
1025      PHINode *Phi = dyn_cast<PHINode>(I);
1026      if (!Phi)
1027        break;
1028      if (!FirstPhi) {
1029        FirstPhi = Phi;
1030        break;
1031      }
1032    }
1033    return FirstPhi;
1034  };
1035
1036  // Shouldn't need to normalize PHIs if we're not outlining non-early return
1037  // blocks.
1038  if (!ClonedOI)
1039    return;
1040
1041  // Special hackery is needed with PHI nodes that have inputs from more than
1042  // one extracted block.  For simplicity, just split the PHIs into a two-level
1043  // sequence of PHIs, some of which will go in the extracted region, and some
1044  // of which will go outside.
1045  BasicBlock *PreReturn = ClonedOI->ReturnBlock;
1046  // only split block when necessary:
1047  PHINode *FirstPhi = getFirstPHI(PreReturn);
1048  unsigned NumPredsFromEntries = ClonedOI->ReturnBlockPreds.size();
1049
1050  if (!FirstPhi || FirstPhi->getNumIncomingValues() <= NumPredsFromEntries + 1)
1051    return;
1052
1053  auto IsTrivialPhi = [](PHINode *PN) -> Value * {
1054    Value *CommonValue = PN->getIncomingValue(0);
1055    if (all_of(PN->incoming_values(),
1056               [&](Value *V) { return V == CommonValue; }))
1057      return CommonValue;
1058    return nullptr;
1059  };
1060
1061  ClonedOI->ReturnBlock = ClonedOI->ReturnBlock->splitBasicBlock(
1062      ClonedOI->ReturnBlock->getFirstNonPHI()->getIterator());
1063  BasicBlock::iterator I = PreReturn->begin();
1064  Instruction *Ins = &ClonedOI->ReturnBlock->front();
1065  SmallVector<Instruction *, 4> DeadPhis;
1066  while (I != PreReturn->end()) {
1067    PHINode *OldPhi = dyn_cast<PHINode>(I);
1068    if (!OldPhi)
1069      break;
1070
1071    PHINode *RetPhi =
1072        PHINode::Create(OldPhi->getType(), NumPredsFromEntries + 1, "", Ins);
1073    OldPhi->replaceAllUsesWith(RetPhi);
1074    Ins = ClonedOI->ReturnBlock->getFirstNonPHI();
1075
1076    RetPhi->addIncoming(&*I, PreReturn);
1077    for (BasicBlock *E : ClonedOI->ReturnBlockPreds) {
1078      RetPhi->addIncoming(OldPhi->getIncomingValueForBlock(E), E);
1079      OldPhi->removeIncomingValue(E);
1080    }
1081
1082    // After incoming values splitting, the old phi may become trivial.
1083    // Keeping the trivial phi can introduce definition inside the outline
1084    // region which is live-out, causing necessary overhead (load, store
1085    // arg passing etc).
1086    if (auto *OldPhiVal = IsTrivialPhi(OldPhi)) {
1087      OldPhi->replaceAllUsesWith(OldPhiVal);
1088      DeadPhis.push_back(OldPhi);
1089    }
1090    ++I;
1091  }
1092  for (auto *DP : DeadPhis)
1093    DP->eraseFromParent();
1094
1095  for (auto E : ClonedOI->ReturnBlockPreds) {
1096    E->getTerminator()->replaceUsesOfWith(PreReturn, ClonedOI->ReturnBlock);
1097  }
1098}
1099
1100bool PartialInlinerImpl::FunctionCloner::doMultiRegionFunctionOutlining() {
1101
1102  auto ComputeRegionCost = [](SmallVectorImpl<BasicBlock *> &Region) {
1103    int Cost = 0;
1104    for (BasicBlock* BB : Region)
1105      Cost += computeBBInlineCost(BB);
1106    return Cost;
1107  };
1108
1109  assert(ClonedOMRI && "Expecting OutlineInfo for multi region outline");
1110
1111  if (ClonedOMRI->ORI.empty())
1112    return false;
1113
1114  // The CodeExtractor needs a dominator tree.
1115  DominatorTree DT;
1116  DT.recalculate(*ClonedFunc);
1117
1118  // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
1119  LoopInfo LI(DT);
1120  BranchProbabilityInfo BPI(*ClonedFunc, LI);
1121  ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI));
1122
1123  // Cache and recycle the CodeExtractor analysis to avoid O(n^2) compile-time.
1124  CodeExtractorAnalysisCache CEAC(*ClonedFunc);
1125
1126  SetVector<Value *> Inputs, Outputs, Sinks;
1127  for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo :
1128       ClonedOMRI->ORI) {
1129    int CurrentOutlinedRegionCost = ComputeRegionCost(RegionInfo.Region);
1130
1131    CodeExtractor CE(RegionInfo.Region, &DT, /*AggregateArgs*/ false,
1132                     ClonedFuncBFI.get(), &BPI,
1133                     LookupAC(*RegionInfo.EntryBlock->getParent()),
1134                     /* AllowVarargs */ false);
1135
1136    CE.findInputsOutputs(Inputs, Outputs, Sinks);
1137
1138#ifndef NDEBUG
1139    if (TracePartialInlining) {
1140      dbgs() << "inputs: " << Inputs.size() << "\n";
1141      dbgs() << "outputs: " << Outputs.size() << "\n";
1142      for (Value *value : Inputs)
1143        dbgs() << "value used in func: " << *value << "\n";
1144      for (Value *output : Outputs)
1145        dbgs() << "instr used in func: " << *output << "\n";
1146    }
1147#endif
1148    // Do not extract regions that have live exit variables.
1149    if (Outputs.size() > 0 && !ForceLiveExit)
1150      continue;
1151
1152    Function *OutlinedFunc = CE.extractCodeRegion(CEAC);
1153
1154    if (OutlinedFunc) {
1155      CallBase *OCS = PartialInlinerImpl::getOneCallSiteTo(OutlinedFunc);
1156      BasicBlock *OutliningCallBB = OCS->getParent();
1157      assert(OutliningCallBB->getParent() == ClonedFunc);
1158      OutlinedFunctions.push_back(std::make_pair(OutlinedFunc,OutliningCallBB));
1159      NumColdRegionsOutlined++;
1160      OutlinedRegionCost += CurrentOutlinedRegionCost;
1161
1162      if (MarkOutlinedColdCC) {
1163        OutlinedFunc->setCallingConv(CallingConv::Cold);
1164        OCS->setCallingConv(CallingConv::Cold);
1165      }
1166    } else
1167      ORE.emit([&]() {
1168        return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
1169                                        &RegionInfo.Region.front()->front())
1170               << "Failed to extract region at block "
1171               << ore::NV("Block", RegionInfo.Region.front());
1172      });
1173  }
1174
1175  return !OutlinedFunctions.empty();
1176}
1177
1178Function *
1179PartialInlinerImpl::FunctionCloner::doSingleRegionFunctionOutlining() {
1180  // Returns true if the block is to be partial inlined into the caller
1181  // (i.e. not to be extracted to the out of line function)
1182  auto ToBeInlined = [&, this](BasicBlock *BB) {
1183    return BB == ClonedOI->ReturnBlock ||
1184           (std::find(ClonedOI->Entries.begin(), ClonedOI->Entries.end(), BB) !=
1185            ClonedOI->Entries.end());
1186  };
1187
1188  assert(ClonedOI && "Expecting OutlineInfo for single region outline");
1189  // The CodeExtractor needs a dominator tree.
1190  DominatorTree DT;
1191  DT.recalculate(*ClonedFunc);
1192
1193  // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
1194  LoopInfo LI(DT);
1195  BranchProbabilityInfo BPI(*ClonedFunc, LI);
1196  ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI));
1197
1198  // Gather up the blocks that we're going to extract.
1199  std::vector<BasicBlock *> ToExtract;
1200  ToExtract.push_back(ClonedOI->NonReturnBlock);
1201  OutlinedRegionCost +=
1202      PartialInlinerImpl::computeBBInlineCost(ClonedOI->NonReturnBlock);
1203  for (BasicBlock &BB : *ClonedFunc)
1204    if (!ToBeInlined(&BB) && &BB != ClonedOI->NonReturnBlock) {
1205      ToExtract.push_back(&BB);
1206      // FIXME: the code extractor may hoist/sink more code
1207      // into the outlined function which may make the outlining
1208      // overhead (the difference of the outlined function cost
1209      // and OutliningRegionCost) look larger.
1210      OutlinedRegionCost += computeBBInlineCost(&BB);
1211    }
1212
1213  // Extract the body of the if.
1214  CodeExtractorAnalysisCache CEAC(*ClonedFunc);
1215  Function *OutlinedFunc =
1216      CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false,
1217                    ClonedFuncBFI.get(), &BPI, LookupAC(*ClonedFunc),
1218                    /* AllowVarargs */ true)
1219          .extractCodeRegion(CEAC);
1220
1221  if (OutlinedFunc) {
1222    BasicBlock *OutliningCallBB =
1223        PartialInlinerImpl::getOneCallSiteTo(OutlinedFunc)
1224            ->getParent();
1225    assert(OutliningCallBB->getParent() == ClonedFunc);
1226    OutlinedFunctions.push_back(std::make_pair(OutlinedFunc, OutliningCallBB));
1227  } else
1228    ORE.emit([&]() {
1229      return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
1230                                      &ToExtract.front()->front())
1231             << "Failed to extract region at block "
1232             << ore::NV("Block", ToExtract.front());
1233    });
1234
1235  return OutlinedFunc;
1236}
1237
1238PartialInlinerImpl::FunctionCloner::~FunctionCloner() {
1239  // Ditch the duplicate, since we're done with it, and rewrite all remaining
1240  // users (function pointers, etc.) back to the original function.
1241  ClonedFunc->replaceAllUsesWith(OrigFunc);
1242  ClonedFunc->eraseFromParent();
1243  if (!IsFunctionInlined) {
1244    // Remove each function that was speculatively created if there is no
1245    // reference.
1246    for (auto FuncBBPair : OutlinedFunctions) {
1247      Function *Func = FuncBBPair.first;
1248      Func->eraseFromParent();
1249    }
1250  }
1251}
1252
1253std::pair<bool, Function *> PartialInlinerImpl::unswitchFunction(Function *F) {
1254
1255  if (F->hasAddressTaken())
1256    return {false, nullptr};
1257
1258  // Let inliner handle it
1259  if (F->hasFnAttribute(Attribute::AlwaysInline))
1260    return {false, nullptr};
1261
1262  if (F->hasFnAttribute(Attribute::NoInline))
1263    return {false, nullptr};
1264
1265  if (PSI.isFunctionEntryCold(F))
1266    return {false, nullptr};
1267
1268  if (F->users().empty())
1269    return {false, nullptr};
1270
1271  OptimizationRemarkEmitter ORE(F);
1272
1273  // Only try to outline cold regions if we have a profile summary, which
1274  // implies we have profiling information.
1275  if (PSI.hasProfileSummary() && F->hasProfileData() &&
1276      !DisableMultiRegionPartialInline) {
1277    std::unique_ptr<FunctionOutliningMultiRegionInfo> OMRI =
1278        computeOutliningColdRegionsInfo(F, ORE);
1279    if (OMRI) {
1280      FunctionCloner Cloner(F, OMRI.get(), ORE, LookupAssumptionCache);
1281
1282#ifndef NDEBUG
1283      if (TracePartialInlining) {
1284        dbgs() << "HotCountThreshold = " << PSI.getHotCountThreshold() << "\n";
1285        dbgs() << "ColdCountThreshold = " << PSI.getColdCountThreshold()
1286               << "\n";
1287      }
1288#endif
1289      bool DidOutline = Cloner.doMultiRegionFunctionOutlining();
1290
1291      if (DidOutline) {
1292#ifndef NDEBUG
1293        if (TracePartialInlining) {
1294          dbgs() << ">>>>>> Outlined (Cloned) Function >>>>>>\n";
1295          Cloner.ClonedFunc->print(dbgs());
1296          dbgs() << "<<<<<< Outlined (Cloned) Function <<<<<<\n";
1297        }
1298#endif
1299
1300        if (tryPartialInline(Cloner))
1301          return {true, nullptr};
1302      }
1303    }
1304  }
1305
1306  // Fall-thru to regular partial inlining if we:
1307  //    i) can't find any cold regions to outline, or
1308  //   ii) can't inline the outlined function anywhere.
1309  std::unique_ptr<FunctionOutliningInfo> OI = computeOutliningInfo(F);
1310  if (!OI)
1311    return {false, nullptr};
1312
1313  FunctionCloner Cloner(F, OI.get(), ORE, LookupAssumptionCache);
1314  Cloner.NormalizeReturnBlock();
1315
1316  Function *OutlinedFunction = Cloner.doSingleRegionFunctionOutlining();
1317
1318  if (!OutlinedFunction)
1319    return {false, nullptr};
1320
1321  bool AnyInline = tryPartialInline(Cloner);
1322
1323  if (AnyInline)
1324    return {true, OutlinedFunction};
1325
1326  return {false, nullptr};
1327}
1328
1329bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) {
1330  if (Cloner.OutlinedFunctions.empty())
1331    return false;
1332
1333  int SizeCost = 0;
1334  BlockFrequency WeightedRcost;
1335  int NonWeightedRcost;
1336  std::tie(SizeCost, NonWeightedRcost) = computeOutliningCosts(Cloner);
1337
1338  // Only calculate RelativeToEntryFreq when we are doing single region
1339  // outlining.
1340  BranchProbability RelativeToEntryFreq;
1341  if (Cloner.ClonedOI) {
1342    RelativeToEntryFreq = getOutliningCallBBRelativeFreq(Cloner);
1343  } else
1344    // RelativeToEntryFreq doesn't make sense when we have more than one
1345    // outlined call because each call will have a different relative frequency
1346    // to the entry block.  We can consider using the average, but the
1347    // usefulness of that information is questionable. For now, assume we never
1348    // execute the calls to outlined functions.
1349    RelativeToEntryFreq = BranchProbability(0, 1);
1350
1351  WeightedRcost = BlockFrequency(NonWeightedRcost) * RelativeToEntryFreq;
1352
1353  // The call sequence(s) to the outlined function(s) are larger than the sum of
1354  // the original outlined region size(s), it does not increase the chances of
1355  // inlining the function with outlining (The inliner uses the size increase to
1356  // model the cost of inlining a callee).
1357  if (!SkipCostAnalysis && Cloner.OutlinedRegionCost < SizeCost) {
1358    OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc);
1359    DebugLoc DLoc;
1360    BasicBlock *Block;
1361    std::tie(DLoc, Block) = getOneDebugLoc(Cloner.ClonedFunc);
1362    OrigFuncORE.emit([&]() {
1363      return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutlineRegionTooSmall",
1364                                        DLoc, Block)
1365             << ore::NV("Function", Cloner.OrigFunc)
1366             << " not partially inlined into callers (Original Size = "
1367             << ore::NV("OutlinedRegionOriginalSize", Cloner.OutlinedRegionCost)
1368             << ", Size of call sequence to outlined function = "
1369             << ore::NV("NewSize", SizeCost) << ")";
1370    });
1371    return false;
1372  }
1373
1374  assert(Cloner.OrigFunc->users().empty() &&
1375         "F's users should all be replaced!");
1376
1377  std::vector<User *> Users(Cloner.ClonedFunc->user_begin(),
1378                            Cloner.ClonedFunc->user_end());
1379
1380  DenseMap<User *, uint64_t> CallSiteToProfCountMap;
1381  auto CalleeEntryCount = Cloner.OrigFunc->getEntryCount();
1382  if (CalleeEntryCount)
1383    computeCallsiteToProfCountMap(Cloner.ClonedFunc, CallSiteToProfCountMap);
1384
1385  uint64_t CalleeEntryCountV =
1386      (CalleeEntryCount ? CalleeEntryCount.getCount() : 0);
1387
1388  bool AnyInline = false;
1389  for (User *User : Users) {
1390    CallBase *CB = getSupportedCallBase(User);
1391
1392    if (IsLimitReached())
1393      continue;
1394
1395    OptimizationRemarkEmitter CallerORE(CB->getCaller());
1396    if (!shouldPartialInline(*CB, Cloner, WeightedRcost, CallerORE))
1397      continue;
1398
1399    // Construct remark before doing the inlining, as after successful inlining
1400    // the callsite is removed.
1401    OptimizationRemark OR(DEBUG_TYPE, "PartiallyInlined", CB);
1402    OR << ore::NV("Callee", Cloner.OrigFunc) << " partially inlined into "
1403       << ore::NV("Caller", CB->getCaller());
1404
1405    InlineFunctionInfo IFI(nullptr, GetAssumptionCache, &PSI);
1406    // We can only forward varargs when we outlined a single region, else we
1407    // bail on vararg functions.
1408    if (!InlineFunction(*CB, IFI, nullptr, true,
1409                        (Cloner.ClonedOI ? Cloner.OutlinedFunctions.back().first
1410                                         : nullptr))
1411             .isSuccess())
1412      continue;
1413
1414    CallerORE.emit(OR);
1415
1416    // Now update the entry count:
1417    if (CalleeEntryCountV && CallSiteToProfCountMap.count(User)) {
1418      uint64_t CallSiteCount = CallSiteToProfCountMap[User];
1419      CalleeEntryCountV -= std::min(CalleeEntryCountV, CallSiteCount);
1420    }
1421
1422    AnyInline = true;
1423    NumPartialInlining++;
1424    // Update the stats
1425    if (Cloner.ClonedOI)
1426      NumPartialInlined++;
1427    else
1428      NumColdOutlinePartialInlined++;
1429
1430  }
1431
1432  if (AnyInline) {
1433    Cloner.IsFunctionInlined = true;
1434    if (CalleeEntryCount)
1435      Cloner.OrigFunc->setEntryCount(
1436          CalleeEntryCount.setCount(CalleeEntryCountV));
1437    OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc);
1438    OrigFuncORE.emit([&]() {
1439      return OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", Cloner.OrigFunc)
1440             << "Partially inlined into at least one caller";
1441    });
1442
1443  }
1444
1445  return AnyInline;
1446}
1447
1448bool PartialInlinerImpl::run(Module &M) {
1449  if (DisablePartialInlining)
1450    return false;
1451
1452  std::vector<Function *> Worklist;
1453  Worklist.reserve(M.size());
1454  for (Function &F : M)
1455    if (!F.use_empty() && !F.isDeclaration())
1456      Worklist.push_back(&F);
1457
1458  bool Changed = false;
1459  while (!Worklist.empty()) {
1460    Function *CurrFunc = Worklist.back();
1461    Worklist.pop_back();
1462
1463    if (CurrFunc->use_empty())
1464      continue;
1465
1466    bool Recursive = false;
1467    for (User *U : CurrFunc->users())
1468      if (Instruction *I = dyn_cast<Instruction>(U))
1469        if (I->getParent()->getParent() == CurrFunc) {
1470          Recursive = true;
1471          break;
1472        }
1473    if (Recursive)
1474      continue;
1475
1476    std::pair<bool, Function * > Result = unswitchFunction(CurrFunc);
1477    if (Result.second)
1478      Worklist.push_back(Result.second);
1479    Changed |= Result.first;
1480  }
1481
1482  return Changed;
1483}
1484
1485char PartialInlinerLegacyPass::ID = 0;
1486
1487INITIALIZE_PASS_BEGIN(PartialInlinerLegacyPass, "partial-inliner",
1488                      "Partial Inliner", false, false)
1489INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
1490INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
1491INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
1492INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
1493INITIALIZE_PASS_END(PartialInlinerLegacyPass, "partial-inliner",
1494                    "Partial Inliner", false, false)
1495
1496ModulePass *llvm::createPartialInliningPass() {
1497  return new PartialInlinerLegacyPass();
1498}
1499
1500PreservedAnalyses PartialInlinerPass::run(Module &M,
1501                                          ModuleAnalysisManager &AM) {
1502  auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
1503
1504  auto GetAssumptionCache = [&FAM](Function &F) -> AssumptionCache & {
1505    return FAM.getResult<AssumptionAnalysis>(F);
1506  };
1507
1508  auto LookupAssumptionCache = [&FAM](Function &F) -> AssumptionCache * {
1509    return FAM.getCachedResult<AssumptionAnalysis>(F);
1510  };
1511
1512  auto GetBFI = [&FAM](Function &F) -> BlockFrequencyInfo & {
1513    return FAM.getResult<BlockFrequencyAnalysis>(F);
1514  };
1515
1516  auto GetTTI = [&FAM](Function &F) -> TargetTransformInfo & {
1517    return FAM.getResult<TargetIRAnalysis>(F);
1518  };
1519
1520  auto GetTLI = [&FAM](Function &F) -> TargetLibraryInfo & {
1521    return FAM.getResult<TargetLibraryAnalysis>(F);
1522  };
1523
1524  ProfileSummaryInfo &PSI = AM.getResult<ProfileSummaryAnalysis>(M);
1525
1526  if (PartialInlinerImpl(GetAssumptionCache, LookupAssumptionCache, GetTTI,
1527                         GetTLI, PSI, GetBFI)
1528          .run(M))
1529    return PreservedAnalyses::none();
1530  return PreservedAnalyses::all();
1531}
1532