1343171Sdim//===- HotColdSplitting.cpp -- Outline Cold Regions -------------*- C++ -*-===//
2343171Sdim//
3353358Sdim// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4353358Sdim// See https://llvm.org/LICENSE.txt for license information.
5353358Sdim// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6343171Sdim//
7343171Sdim//===----------------------------------------------------------------------===//
8353358Sdim///
9353358Sdim/// \file
10353358Sdim/// The goal of hot/cold splitting is to improve the memory locality of code.
11353358Sdim/// The splitting pass does this by identifying cold blocks and moving them into
12353358Sdim/// separate functions.
13353358Sdim///
14353358Sdim/// When the splitting pass finds a cold block (referred to as "the sink"), it
15353358Sdim/// grows a maximal cold region around that block. The maximal region contains
16353358Sdim/// all blocks (post-)dominated by the sink [*]. In theory, these blocks are as
17353358Sdim/// cold as the sink. Once a region is found, it's split out of the original
18353358Sdim/// function provided it's profitable to do so.
19353358Sdim///
20353358Sdim/// [*] In practice, there is some added complexity because some blocks are not
21353358Sdim/// safe to extract.
22353358Sdim///
23353358Sdim/// TODO: Use the PM to get domtrees, and preserve BFI/BPI.
24353358Sdim/// TODO: Reorder outlined functions.
25353358Sdim///
26343171Sdim//===----------------------------------------------------------------------===//
27343171Sdim
28360784Sdim#include "llvm/Transforms/IPO/HotColdSplitting.h"
29343171Sdim#include "llvm/ADT/PostOrderIterator.h"
30343171Sdim#include "llvm/ADT/SmallVector.h"
31343171Sdim#include "llvm/ADT/Statistic.h"
32343171Sdim#include "llvm/Analysis/AliasAnalysis.h"
33343171Sdim#include "llvm/Analysis/BlockFrequencyInfo.h"
34343171Sdim#include "llvm/Analysis/BranchProbabilityInfo.h"
35343171Sdim#include "llvm/Analysis/CFG.h"
36343171Sdim#include "llvm/Analysis/OptimizationRemarkEmitter.h"
37343171Sdim#include "llvm/Analysis/PostDominators.h"
38343171Sdim#include "llvm/Analysis/ProfileSummaryInfo.h"
39343171Sdim#include "llvm/Analysis/TargetTransformInfo.h"
40343171Sdim#include "llvm/IR/BasicBlock.h"
41343171Sdim#include "llvm/IR/CFG.h"
42343171Sdim#include "llvm/IR/CallSite.h"
43343171Sdim#include "llvm/IR/DataLayout.h"
44343171Sdim#include "llvm/IR/DiagnosticInfo.h"
45343171Sdim#include "llvm/IR/Dominators.h"
46343171Sdim#include "llvm/IR/Function.h"
47343171Sdim#include "llvm/IR/Instruction.h"
48343171Sdim#include "llvm/IR/Instructions.h"
49343171Sdim#include "llvm/IR/IntrinsicInst.h"
50343171Sdim#include "llvm/IR/Metadata.h"
51343171Sdim#include "llvm/IR/Module.h"
52343171Sdim#include "llvm/IR/PassManager.h"
53343171Sdim#include "llvm/IR/Type.h"
54343171Sdim#include "llvm/IR/Use.h"
55343171Sdim#include "llvm/IR/User.h"
56343171Sdim#include "llvm/IR/Value.h"
57360784Sdim#include "llvm/InitializePasses.h"
58343171Sdim#include "llvm/Pass.h"
59343171Sdim#include "llvm/Support/BlockFrequency.h"
60343171Sdim#include "llvm/Support/BranchProbability.h"
61360784Sdim#include "llvm/Support/CommandLine.h"
62343171Sdim#include "llvm/Support/Debug.h"
63343171Sdim#include "llvm/Support/raw_ostream.h"
64343171Sdim#include "llvm/Transforms/IPO.h"
65343171Sdim#include "llvm/Transforms/Scalar.h"
66343171Sdim#include "llvm/Transforms/Utils/BasicBlockUtils.h"
67343171Sdim#include "llvm/Transforms/Utils/Cloning.h"
68343171Sdim#include "llvm/Transforms/Utils/CodeExtractor.h"
69343171Sdim#include "llvm/Transforms/Utils/Local.h"
70343171Sdim#include "llvm/Transforms/Utils/ValueMapper.h"
71343171Sdim#include <algorithm>
72343171Sdim#include <cassert>
73343171Sdim
74343171Sdim#define DEBUG_TYPE "hotcoldsplit"
75343171Sdim
76343171SdimSTATISTIC(NumColdRegionsFound, "Number of cold regions found.");
77343171SdimSTATISTIC(NumColdRegionsOutlined, "Number of cold regions outlined.");
78343171Sdim
79343171Sdimusing namespace llvm;
80343171Sdim
81343171Sdimstatic cl::opt<bool> EnableStaticAnalyis("hot-cold-static-analysis",
82343171Sdim                              cl::init(true), cl::Hidden);
83343171Sdim
84343171Sdimstatic cl::opt<int>
85353358Sdim    SplittingThreshold("hotcoldsplit-threshold", cl::init(2), cl::Hidden,
86353358Sdim                       cl::desc("Base penalty for splitting cold code (as a "
87353358Sdim                                "multiple of TCC_Basic)"));
88343171Sdim
89343171Sdimnamespace {
90343171Sdim// Same as blockEndsInUnreachable in CodeGen/BranchFolding.cpp. Do not modify
91343171Sdim// this function unless you modify the MBB version as well.
92343171Sdim//
93343171Sdim/// A no successor, non-return block probably ends in unreachable and is cold.
94343171Sdim/// Also consider a block that ends in an indirect branch to be a return block,
95343171Sdim/// since many targets use plain indirect branches to return.
96343171Sdimbool blockEndsInUnreachable(const BasicBlock &BB) {
97343171Sdim  if (!succ_empty(&BB))
98343171Sdim    return false;
99343171Sdim  if (BB.empty())
100343171Sdim    return true;
101343171Sdim  const Instruction *I = BB.getTerminator();
102343171Sdim  return !(isa<ReturnInst>(I) || isa<IndirectBrInst>(I));
103343171Sdim}
104343171Sdim
105343171Sdimbool unlikelyExecuted(BasicBlock &BB) {
106343171Sdim  // Exception handling blocks are unlikely executed.
107353358Sdim  if (BB.isEHPad() || isa<ResumeInst>(BB.getTerminator()))
108343171Sdim    return true;
109343171Sdim
110353358Sdim  // The block is cold if it calls/invokes a cold function. However, do not
111353358Sdim  // mark sanitizer traps as cold.
112343171Sdim  for (Instruction &I : BB)
113343171Sdim    if (auto CS = CallSite(&I))
114353358Sdim      if (CS.hasFnAttr(Attribute::Cold) && !CS->getMetadata("nosanitize"))
115343171Sdim        return true;
116343171Sdim
117343171Sdim  // The block is cold if it has an unreachable terminator, unless it's
118343171Sdim  // preceded by a call to a (possibly warm) noreturn call (e.g. longjmp).
119343171Sdim  if (blockEndsInUnreachable(BB)) {
120343171Sdim    if (auto *CI =
121343171Sdim            dyn_cast_or_null<CallInst>(BB.getTerminator()->getPrevNode()))
122343171Sdim      if (CI->hasFnAttr(Attribute::NoReturn))
123343171Sdim        return false;
124343171Sdim    return true;
125343171Sdim  }
126343171Sdim
127343171Sdim  return false;
128343171Sdim}
129343171Sdim
130343171Sdim/// Check whether it's safe to outline \p BB.
131343171Sdimstatic bool mayExtractBlock(const BasicBlock &BB) {
132353358Sdim  // EH pads are unsafe to outline because doing so breaks EH type tables. It
133353358Sdim  // follows that invoke instructions cannot be extracted, because CodeExtractor
134353358Sdim  // requires unwind destinations to be within the extraction region.
135353358Sdim  //
136353358Sdim  // Resumes that are not reachable from a cleanup landing pad are considered to
137353358Sdim  // be unreachable. It���s not safe to split them out either.
138353358Sdim  auto Term = BB.getTerminator();
139353358Sdim  return !BB.hasAddressTaken() && !BB.isEHPad() && !isa<InvokeInst>(Term) &&
140353358Sdim         !isa<ResumeInst>(Term);
141343171Sdim}
142343171Sdim
143353358Sdim/// Mark \p F cold. Based on this assumption, also optimize it for minimum size.
144353358Sdim/// If \p UpdateEntryCount is true (set when this is a new split function and
145353358Sdim/// module has profile data), set entry count to 0 to ensure treated as cold.
146353358Sdim/// Return true if the function is changed.
147353358Sdimstatic bool markFunctionCold(Function &F, bool UpdateEntryCount = false) {
148353358Sdim  assert(!F.hasOptNone() && "Can't mark this cold");
149353358Sdim  bool Changed = false;
150353358Sdim  if (!F.hasFnAttribute(Attribute::Cold)) {
151353358Sdim    F.addFnAttr(Attribute::Cold);
152353358Sdim    Changed = true;
153343171Sdim  }
154343171Sdim  if (!F.hasFnAttribute(Attribute::MinSize)) {
155343171Sdim    F.addFnAttr(Attribute::MinSize);
156343171Sdim    Changed = true;
157343171Sdim  }
158353358Sdim  if (UpdateEntryCount) {
159353358Sdim    // Set the entry count to 0 to ensure it is placed in the unlikely text
160353358Sdim    // section when function sections are enabled.
161353358Sdim    F.setEntryCount(0);
162353358Sdim    Changed = true;
163353358Sdim  }
164353358Sdim
165343171Sdim  return Changed;
166343171Sdim}
167343171Sdim
168343171Sdimclass HotColdSplittingLegacyPass : public ModulePass {
169343171Sdimpublic:
170343171Sdim  static char ID;
171343171Sdim  HotColdSplittingLegacyPass() : ModulePass(ID) {
172343171Sdim    initializeHotColdSplittingLegacyPassPass(*PassRegistry::getPassRegistry());
173343171Sdim  }
174343171Sdim
175343171Sdim  void getAnalysisUsage(AnalysisUsage &AU) const override {
176343171Sdim    AU.addRequired<BlockFrequencyInfoWrapperPass>();
177343171Sdim    AU.addRequired<ProfileSummaryInfoWrapperPass>();
178343171Sdim    AU.addRequired<TargetTransformInfoWrapperPass>();
179353358Sdim    AU.addUsedIfAvailable<AssumptionCacheTracker>();
180343171Sdim  }
181343171Sdim
182343171Sdim  bool runOnModule(Module &M) override;
183343171Sdim};
184343171Sdim
185343171Sdim} // end anonymous namespace
186343171Sdim
187353358Sdim/// Check whether \p F is inherently cold.
188353358Sdimbool HotColdSplitting::isFunctionCold(const Function &F) const {
189353358Sdim  if (F.hasFnAttribute(Attribute::Cold))
190353358Sdim    return true;
191343171Sdim
192353358Sdim  if (F.getCallingConv() == CallingConv::Cold)
193353358Sdim    return true;
194343171Sdim
195353358Sdim  if (PSI->isFunctionEntryCold(&F))
196353358Sdim    return true;
197343171Sdim
198353358Sdim  return false;
199353358Sdim}
200343171Sdim
201353358Sdim// Returns false if the function should not be considered for hot-cold split
202353358Sdim// optimization.
203353358Sdimbool HotColdSplitting::shouldOutlineFrom(const Function &F) const {
204343171Sdim  if (F.hasFnAttribute(Attribute::AlwaysInline))
205343171Sdim    return false;
206343171Sdim
207343171Sdim  if (F.hasFnAttribute(Attribute::NoInline))
208343171Sdim    return false;
209343171Sdim
210360784Sdim  // A function marked `noreturn` may contain unreachable terminators: these
211360784Sdim  // should not be considered cold, as the function may be a trampoline.
212360784Sdim  if (F.hasFnAttribute(Attribute::NoReturn))
213360784Sdim    return false;
214360784Sdim
215353358Sdim  if (F.hasFnAttribute(Attribute::SanitizeAddress) ||
216353358Sdim      F.hasFnAttribute(Attribute::SanitizeHWAddress) ||
217353358Sdim      F.hasFnAttribute(Attribute::SanitizeThread) ||
218353358Sdim      F.hasFnAttribute(Attribute::SanitizeMemory))
219343171Sdim    return false;
220343171Sdim
221343171Sdim  return true;
222343171Sdim}
223343171Sdim
224353358Sdim/// Get the benefit score of outlining \p Region.
225353358Sdimstatic int getOutliningBenefit(ArrayRef<BasicBlock *> Region,
226353358Sdim                               TargetTransformInfo &TTI) {
227353358Sdim  // Sum up the code size costs of non-terminator instructions. Tight coupling
228353358Sdim  // with \ref getOutliningPenalty is needed to model the costs of terminators.
229353358Sdim  int Benefit = 0;
230353358Sdim  for (BasicBlock *BB : Region)
231353358Sdim    for (Instruction &I : BB->instructionsWithoutDebug())
232353358Sdim      if (&I != BB->getTerminator())
233353358Sdim        Benefit +=
234353358Sdim            TTI.getInstructionCost(&I, TargetTransformInfo::TCK_CodeSize);
235353358Sdim
236353358Sdim  return Benefit;
237353358Sdim}
238353358Sdim
239353358Sdim/// Get the penalty score for outlining \p Region.
240353358Sdimstatic int getOutliningPenalty(ArrayRef<BasicBlock *> Region,
241353358Sdim                               unsigned NumInputs, unsigned NumOutputs) {
242353358Sdim  int Penalty = SplittingThreshold;
243353358Sdim  LLVM_DEBUG(dbgs() << "Applying penalty for splitting: " << Penalty << "\n");
244353358Sdim
245353358Sdim  // If the splitting threshold is set at or below zero, skip the usual
246353358Sdim  // profitability check.
247353358Sdim  if (SplittingThreshold <= 0)
248353358Sdim    return Penalty;
249353358Sdim
250353358Sdim  // The typical code size cost for materializing an argument for the outlined
251353358Sdim  // call.
252353358Sdim  LLVM_DEBUG(dbgs() << "Applying penalty for: " << NumInputs << " inputs\n");
253353358Sdim  const int CostForArgMaterialization = TargetTransformInfo::TCC_Basic;
254353358Sdim  Penalty += CostForArgMaterialization * NumInputs;
255353358Sdim
256353358Sdim  // The typical code size cost for an output alloca, its associated store, and
257353358Sdim  // its associated reload.
258353358Sdim  LLVM_DEBUG(dbgs() << "Applying penalty for: " << NumOutputs << " outputs\n");
259353358Sdim  const int CostForRegionOutput = 3 * TargetTransformInfo::TCC_Basic;
260353358Sdim  Penalty += CostForRegionOutput * NumOutputs;
261353358Sdim
262353358Sdim  // Find the number of distinct exit blocks for the region. Use a conservative
263353358Sdim  // check to determine whether control returns from the region.
264353358Sdim  bool NoBlocksReturn = true;
265353358Sdim  SmallPtrSet<BasicBlock *, 2> SuccsOutsideRegion;
266353358Sdim  for (BasicBlock *BB : Region) {
267353358Sdim    // If a block has no successors, only assume it does not return if it's
268353358Sdim    // unreachable.
269353358Sdim    if (succ_empty(BB)) {
270353358Sdim      NoBlocksReturn &= isa<UnreachableInst>(BB->getTerminator());
271353358Sdim      continue;
272353358Sdim    }
273353358Sdim
274353358Sdim    for (BasicBlock *SuccBB : successors(BB)) {
275353358Sdim      if (find(Region, SuccBB) == Region.end()) {
276353358Sdim        NoBlocksReturn = false;
277353358Sdim        SuccsOutsideRegion.insert(SuccBB);
278353358Sdim      }
279353358Sdim    }
280353358Sdim  }
281353358Sdim
282353358Sdim  // Apply a `noreturn` bonus.
283353358Sdim  if (NoBlocksReturn) {
284353358Sdim    LLVM_DEBUG(dbgs() << "Applying bonus for: " << Region.size()
285353358Sdim                      << " non-returning terminators\n");
286353358Sdim    Penalty -= Region.size();
287353358Sdim  }
288353358Sdim
289353358Sdim  // Apply a penalty for having more than one successor outside of the region.
290353358Sdim  // This penalty accounts for the switch needed in the caller.
291353358Sdim  if (!SuccsOutsideRegion.empty()) {
292353358Sdim    LLVM_DEBUG(dbgs() << "Applying penalty for: " << SuccsOutsideRegion.size()
293353358Sdim                      << " non-region successors\n");
294353358Sdim    Penalty += (SuccsOutsideRegion.size() - 1) * TargetTransformInfo::TCC_Basic;
295353358Sdim  }
296353358Sdim
297353358Sdim  return Penalty;
298353358Sdim}
299353358Sdim
300360784SdimFunction *HotColdSplitting::extractColdRegion(
301360784Sdim    const BlockSequence &Region, const CodeExtractorAnalysisCache &CEAC,
302360784Sdim    DominatorTree &DT, BlockFrequencyInfo *BFI, TargetTransformInfo &TTI,
303360784Sdim    OptimizationRemarkEmitter &ORE, AssumptionCache *AC, unsigned Count) {
304343171Sdim  assert(!Region.empty());
305343171Sdim
306343171Sdim  // TODO: Pass BFI and BPI to update profile information.
307343171Sdim  CodeExtractor CE(Region, &DT, /* AggregateArgs */ false, /* BFI */ nullptr,
308353358Sdim                   /* BPI */ nullptr, AC, /* AllowVarArgs */ false,
309343171Sdim                   /* AllowAlloca */ false,
310343171Sdim                   /* Suffix */ "cold." + std::to_string(Count));
311343171Sdim
312353358Sdim  // Perform a simple cost/benefit analysis to decide whether or not to permit
313353358Sdim  // splitting.
314343171Sdim  SetVector<Value *> Inputs, Outputs, Sinks;
315343171Sdim  CE.findInputsOutputs(Inputs, Outputs, Sinks);
316353358Sdim  int OutliningBenefit = getOutliningBenefit(Region, TTI);
317353358Sdim  int OutliningPenalty =
318353358Sdim      getOutliningPenalty(Region, Inputs.size(), Outputs.size());
319353358Sdim  LLVM_DEBUG(dbgs() << "Split profitability: benefit = " << OutliningBenefit
320353358Sdim                    << ", penalty = " << OutliningPenalty << "\n");
321353358Sdim  if (OutliningBenefit <= OutliningPenalty)
322343171Sdim    return nullptr;
323343171Sdim
324343171Sdim  Function *OrigF = Region[0]->getParent();
325360784Sdim  if (Function *OutF = CE.extractCodeRegion(CEAC)) {
326343171Sdim    User *U = *OutF->user_begin();
327343171Sdim    CallInst *CI = cast<CallInst>(U);
328343171Sdim    CallSite CS(CI);
329343171Sdim    NumColdRegionsOutlined++;
330343171Sdim    if (TTI.useColdCCForColdCall(*OutF)) {
331343171Sdim      OutF->setCallingConv(CallingConv::Cold);
332343171Sdim      CS.setCallingConv(CallingConv::Cold);
333343171Sdim    }
334343171Sdim    CI->setIsNoInline();
335343171Sdim
336360784Sdim    if (OrigF->hasSection())
337360784Sdim      OutF->setSection(OrigF->getSection());
338360784Sdim
339353358Sdim    markFunctionCold(*OutF, BFI != nullptr);
340343171Sdim
341343171Sdim    LLVM_DEBUG(llvm::dbgs() << "Outlined Region: " << *OutF);
342343171Sdim    ORE.emit([&]() {
343343171Sdim      return OptimizationRemark(DEBUG_TYPE, "HotColdSplit",
344343171Sdim                                &*Region[0]->begin())
345343171Sdim             << ore::NV("Original", OrigF) << " split cold code into "
346343171Sdim             << ore::NV("Split", OutF);
347343171Sdim    });
348343171Sdim    return OutF;
349343171Sdim  }
350343171Sdim
351343171Sdim  ORE.emit([&]() {
352343171Sdim    return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
353343171Sdim                                    &*Region[0]->begin())
354343171Sdim           << "Failed to extract region at block "
355343171Sdim           << ore::NV("Block", Region.front());
356343171Sdim  });
357343171Sdim  return nullptr;
358343171Sdim}
359343171Sdim
360343171Sdim/// A pair of (basic block, score).
361343171Sdimusing BlockTy = std::pair<BasicBlock *, unsigned>;
362343171Sdim
363343171Sdimnamespace {
364343171Sdim/// A maximal outlining region. This contains all blocks post-dominated by a
365343171Sdim/// sink block, the sink block itself, and all blocks dominated by the sink.
366353358Sdim/// If sink-predecessors and sink-successors cannot be extracted in one region,
367353358Sdim/// the static constructor returns a list of suitable extraction regions.
368343171Sdimclass OutliningRegion {
369343171Sdim  /// A list of (block, score) pairs. A block's score is non-zero iff it's a
370343171Sdim  /// viable sub-region entry point. Blocks with higher scores are better entry
371343171Sdim  /// points (i.e. they are more distant ancestors of the sink block).
372343171Sdim  SmallVector<BlockTy, 0> Blocks = {};
373343171Sdim
374343171Sdim  /// The suggested entry point into the region. If the region has multiple
375343171Sdim  /// entry points, all blocks within the region may not be reachable from this
376343171Sdim  /// entry point.
377343171Sdim  BasicBlock *SuggestedEntryPoint = nullptr;
378343171Sdim
379343171Sdim  /// Whether the entire function is cold.
380343171Sdim  bool EntireFunctionCold = false;
381343171Sdim
382343171Sdim  /// If \p BB is a viable entry point, return \p Score. Return 0 otherwise.
383343171Sdim  static unsigned getEntryPointScore(BasicBlock &BB, unsigned Score) {
384353358Sdim    return mayExtractBlock(BB) ? Score : 0;
385343171Sdim  }
386343171Sdim
387343171Sdim  /// These scores should be lower than the score for predecessor blocks,
388343171Sdim  /// because regions starting at predecessor blocks are typically larger.
389343171Sdim  static constexpr unsigned ScoreForSuccBlock = 1;
390343171Sdim  static constexpr unsigned ScoreForSinkBlock = 1;
391343171Sdim
392343171Sdim  OutliningRegion(const OutliningRegion &) = delete;
393343171Sdim  OutliningRegion &operator=(const OutliningRegion &) = delete;
394343171Sdim
395343171Sdimpublic:
396343171Sdim  OutliningRegion() = default;
397343171Sdim  OutliningRegion(OutliningRegion &&) = default;
398343171Sdim  OutliningRegion &operator=(OutliningRegion &&) = default;
399343171Sdim
400353358Sdim  static std::vector<OutliningRegion> create(BasicBlock &SinkBB,
401353358Sdim                                             const DominatorTree &DT,
402353358Sdim                                             const PostDominatorTree &PDT) {
403353358Sdim    std::vector<OutliningRegion> Regions;
404343171Sdim    SmallPtrSet<BasicBlock *, 4> RegionBlocks;
405343171Sdim
406353358Sdim    Regions.emplace_back();
407353358Sdim    OutliningRegion *ColdRegion = &Regions.back();
408353358Sdim
409343171Sdim    auto addBlockToRegion = [&](BasicBlock *BB, unsigned Score) {
410343171Sdim      RegionBlocks.insert(BB);
411353358Sdim      ColdRegion->Blocks.emplace_back(BB, Score);
412343171Sdim    };
413343171Sdim
414343171Sdim    // The ancestor farthest-away from SinkBB, and also post-dominated by it.
415343171Sdim    unsigned SinkScore = getEntryPointScore(SinkBB, ScoreForSinkBlock);
416353358Sdim    ColdRegion->SuggestedEntryPoint = (SinkScore > 0) ? &SinkBB : nullptr;
417343171Sdim    unsigned BestScore = SinkScore;
418343171Sdim
419343171Sdim    // Visit SinkBB's ancestors using inverse DFS.
420343171Sdim    auto PredIt = ++idf_begin(&SinkBB);
421343171Sdim    auto PredEnd = idf_end(&SinkBB);
422343171Sdim    while (PredIt != PredEnd) {
423343171Sdim      BasicBlock &PredBB = **PredIt;
424343171Sdim      bool SinkPostDom = PDT.dominates(&SinkBB, &PredBB);
425343171Sdim
426343171Sdim      // If the predecessor is cold and has no predecessors, the entire
427343171Sdim      // function must be cold.
428343171Sdim      if (SinkPostDom && pred_empty(&PredBB)) {
429353358Sdim        ColdRegion->EntireFunctionCold = true;
430353358Sdim        return Regions;
431343171Sdim      }
432343171Sdim
433343171Sdim      // If SinkBB does not post-dominate a predecessor, do not mark the
434343171Sdim      // predecessor (or any of its predecessors) cold.
435343171Sdim      if (!SinkPostDom || !mayExtractBlock(PredBB)) {
436343171Sdim        PredIt.skipChildren();
437343171Sdim        continue;
438343171Sdim      }
439343171Sdim
440343171Sdim      // Keep track of the post-dominated ancestor farthest away from the sink.
441343171Sdim      // The path length is always >= 2, ensuring that predecessor blocks are
442343171Sdim      // considered as entry points before the sink block.
443343171Sdim      unsigned PredScore = getEntryPointScore(PredBB, PredIt.getPathLength());
444343171Sdim      if (PredScore > BestScore) {
445353358Sdim        ColdRegion->SuggestedEntryPoint = &PredBB;
446343171Sdim        BestScore = PredScore;
447343171Sdim      }
448343171Sdim
449343171Sdim      addBlockToRegion(&PredBB, PredScore);
450343171Sdim      ++PredIt;
451343171Sdim    }
452343171Sdim
453353358Sdim    // If the sink can be added to the cold region, do so. It's considered as
454353358Sdim    // an entry point before any sink-successor blocks.
455353358Sdim    //
456353358Sdim    // Otherwise, split cold sink-successor blocks using a separate region.
457353358Sdim    // This satisfies the requirement that all extraction blocks other than the
458353358Sdim    // first have predecessors within the extraction region.
459353358Sdim    if (mayExtractBlock(SinkBB)) {
460353358Sdim      addBlockToRegion(&SinkBB, SinkScore);
461353358Sdim    } else {
462353358Sdim      Regions.emplace_back();
463353358Sdim      ColdRegion = &Regions.back();
464353358Sdim      BestScore = 0;
465353358Sdim    }
466343171Sdim
467343171Sdim    // Find all successors of SinkBB dominated by SinkBB using DFS.
468343171Sdim    auto SuccIt = ++df_begin(&SinkBB);
469343171Sdim    auto SuccEnd = df_end(&SinkBB);
470343171Sdim    while (SuccIt != SuccEnd) {
471343171Sdim      BasicBlock &SuccBB = **SuccIt;
472343171Sdim      bool SinkDom = DT.dominates(&SinkBB, &SuccBB);
473343171Sdim
474343171Sdim      // Don't allow the backwards & forwards DFSes to mark the same block.
475343171Sdim      bool DuplicateBlock = RegionBlocks.count(&SuccBB);
476343171Sdim
477343171Sdim      // If SinkBB does not dominate a successor, do not mark the successor (or
478343171Sdim      // any of its successors) cold.
479343171Sdim      if (DuplicateBlock || !SinkDom || !mayExtractBlock(SuccBB)) {
480343171Sdim        SuccIt.skipChildren();
481343171Sdim        continue;
482343171Sdim      }
483343171Sdim
484343171Sdim      unsigned SuccScore = getEntryPointScore(SuccBB, ScoreForSuccBlock);
485343171Sdim      if (SuccScore > BestScore) {
486353358Sdim        ColdRegion->SuggestedEntryPoint = &SuccBB;
487343171Sdim        BestScore = SuccScore;
488343171Sdim      }
489343171Sdim
490343171Sdim      addBlockToRegion(&SuccBB, SuccScore);
491343171Sdim      ++SuccIt;
492343171Sdim    }
493343171Sdim
494353358Sdim    return Regions;
495343171Sdim  }
496343171Sdim
497343171Sdim  /// Whether this region has nothing to extract.
498343171Sdim  bool empty() const { return !SuggestedEntryPoint; }
499343171Sdim
500343171Sdim  /// The blocks in this region.
501343171Sdim  ArrayRef<std::pair<BasicBlock *, unsigned>> blocks() const { return Blocks; }
502343171Sdim
503343171Sdim  /// Whether the entire function containing this region is cold.
504343171Sdim  bool isEntireFunctionCold() const { return EntireFunctionCold; }
505343171Sdim
506343171Sdim  /// Remove a sub-region from this region and return it as a block sequence.
507343171Sdim  BlockSequence takeSingleEntrySubRegion(DominatorTree &DT) {
508343171Sdim    assert(!empty() && !isEntireFunctionCold() && "Nothing to extract");
509343171Sdim
510343171Sdim    // Remove blocks dominated by the suggested entry point from this region.
511343171Sdim    // During the removal, identify the next best entry point into the region.
512343171Sdim    // Ensure that the first extracted block is the suggested entry point.
513343171Sdim    BlockSequence SubRegion = {SuggestedEntryPoint};
514343171Sdim    BasicBlock *NextEntryPoint = nullptr;
515343171Sdim    unsigned NextScore = 0;
516343171Sdim    auto RegionEndIt = Blocks.end();
517343171Sdim    auto RegionStartIt = remove_if(Blocks, [&](const BlockTy &Block) {
518343171Sdim      BasicBlock *BB = Block.first;
519343171Sdim      unsigned Score = Block.second;
520343171Sdim      bool InSubRegion =
521343171Sdim          BB == SuggestedEntryPoint || DT.dominates(SuggestedEntryPoint, BB);
522343171Sdim      if (!InSubRegion && Score > NextScore) {
523343171Sdim        NextEntryPoint = BB;
524343171Sdim        NextScore = Score;
525343171Sdim      }
526343171Sdim      if (InSubRegion && BB != SuggestedEntryPoint)
527343171Sdim        SubRegion.push_back(BB);
528343171Sdim      return InSubRegion;
529343171Sdim    });
530343171Sdim    Blocks.erase(RegionStartIt, RegionEndIt);
531343171Sdim
532343171Sdim    // Update the suggested entry point.
533343171Sdim    SuggestedEntryPoint = NextEntryPoint;
534343171Sdim
535343171Sdim    return SubRegion;
536343171Sdim  }
537343171Sdim};
538343171Sdim} // namespace
539343171Sdim
540353358Sdimbool HotColdSplitting::outlineColdRegions(Function &F, bool HasProfileSummary) {
541343171Sdim  bool Changed = false;
542343171Sdim
543343171Sdim  // The set of cold blocks.
544343171Sdim  SmallPtrSet<BasicBlock *, 4> ColdBlocks;
545343171Sdim
546343171Sdim  // The worklist of non-intersecting regions left to outline.
547343171Sdim  SmallVector<OutliningRegion, 2> OutliningWorklist;
548343171Sdim
549343171Sdim  // Set up an RPO traversal. Experimentally, this performs better (outlines
550343171Sdim  // more) than a PO traversal, because we prevent region overlap by keeping
551343171Sdim  // the first region to contain a block.
552343171Sdim  ReversePostOrderTraversal<Function *> RPOT(&F);
553343171Sdim
554353358Sdim  // Calculate domtrees lazily. This reduces compile-time significantly.
555353358Sdim  std::unique_ptr<DominatorTree> DT;
556353358Sdim  std::unique_ptr<PostDominatorTree> PDT;
557353358Sdim
558353358Sdim  // Calculate BFI lazily (it's only used to query ProfileSummaryInfo). This
559353358Sdim  // reduces compile-time significantly. TODO: When we *do* use BFI, we should
560353358Sdim  // be able to salvage its domtrees instead of recomputing them.
561353358Sdim  BlockFrequencyInfo *BFI = nullptr;
562353358Sdim  if (HasProfileSummary)
563353358Sdim    BFI = GetBFI(F);
564353358Sdim
565353358Sdim  TargetTransformInfo &TTI = GetTTI(F);
566353358Sdim  OptimizationRemarkEmitter &ORE = (*GetORE)(F);
567353358Sdim  AssumptionCache *AC = LookupAC(F);
568353358Sdim
569343171Sdim  // Find all cold regions.
570343171Sdim  for (BasicBlock *BB : RPOT) {
571343171Sdim    // This block is already part of some outlining region.
572343171Sdim    if (ColdBlocks.count(BB))
573343171Sdim      continue;
574343171Sdim
575353358Sdim    bool Cold = (BFI && PSI->isColdBlock(BB, BFI)) ||
576343171Sdim                (EnableStaticAnalyis && unlikelyExecuted(*BB));
577343171Sdim    if (!Cold)
578343171Sdim      continue;
579343171Sdim
580343171Sdim    LLVM_DEBUG({
581343171Sdim      dbgs() << "Found a cold block:\n";
582343171Sdim      BB->dump();
583343171Sdim    });
584343171Sdim
585353358Sdim    if (!DT)
586360784Sdim      DT = std::make_unique<DominatorTree>(F);
587353358Sdim    if (!PDT)
588360784Sdim      PDT = std::make_unique<PostDominatorTree>(F);
589343171Sdim
590353358Sdim    auto Regions = OutliningRegion::create(*BB, *DT, *PDT);
591353358Sdim    for (OutliningRegion &Region : Regions) {
592353358Sdim      if (Region.empty())
593353358Sdim        continue;
594343171Sdim
595353358Sdim      if (Region.isEntireFunctionCold()) {
596353358Sdim        LLVM_DEBUG(dbgs() << "Entire function is cold\n");
597353358Sdim        return markFunctionCold(F);
598353358Sdim      }
599343171Sdim
600353358Sdim      // If this outlining region intersects with another, drop the new region.
601353358Sdim      //
602353358Sdim      // TODO: It's theoretically possible to outline more by only keeping the
603353358Sdim      // largest region which contains a block, but the extra bookkeeping to do
604353358Sdim      // this is tricky/expensive.
605353358Sdim      bool RegionsOverlap = any_of(Region.blocks(), [&](const BlockTy &Block) {
606353358Sdim        return !ColdBlocks.insert(Block.first).second;
607353358Sdim      });
608353358Sdim      if (RegionsOverlap)
609353358Sdim        continue;
610353358Sdim
611353358Sdim      OutliningWorklist.emplace_back(std::move(Region));
612353358Sdim      ++NumColdRegionsFound;
613353358Sdim    }
614343171Sdim  }
615343171Sdim
616360784Sdim  if (OutliningWorklist.empty())
617360784Sdim    return Changed;
618360784Sdim
619343171Sdim  // Outline single-entry cold regions, splitting up larger regions as needed.
620343171Sdim  unsigned OutlinedFunctionID = 1;
621360784Sdim  // Cache and recycle the CodeExtractor analysis to avoid O(n^2) compile-time.
622360784Sdim  CodeExtractorAnalysisCache CEAC(F);
623360784Sdim  do {
624343171Sdim    OutliningRegion Region = OutliningWorklist.pop_back_val();
625343171Sdim    assert(!Region.empty() && "Empty outlining region in worklist");
626343171Sdim    do {
627353358Sdim      BlockSequence SubRegion = Region.takeSingleEntrySubRegion(*DT);
628343171Sdim      LLVM_DEBUG({
629343171Sdim        dbgs() << "Hot/cold splitting attempting to outline these blocks:\n";
630343171Sdim        for (BasicBlock *BB : SubRegion)
631343171Sdim          BB->dump();
632343171Sdim      });
633343171Sdim
634360784Sdim      Function *Outlined = extractColdRegion(SubRegion, CEAC, *DT, BFI, TTI,
635360784Sdim                                             ORE, AC, OutlinedFunctionID);
636343171Sdim      if (Outlined) {
637343171Sdim        ++OutlinedFunctionID;
638343171Sdim        Changed = true;
639343171Sdim      }
640343171Sdim    } while (!Region.empty());
641360784Sdim  } while (!OutliningWorklist.empty());
642343171Sdim
643343171Sdim  return Changed;
644343171Sdim}
645343171Sdim
646343171Sdimbool HotColdSplitting::run(Module &M) {
647343171Sdim  bool Changed = false;
648353358Sdim  bool HasProfileSummary = (M.getProfileSummary(/* IsCS */ false) != nullptr);
649353358Sdim  for (auto It = M.begin(), End = M.end(); It != End; ++It) {
650353358Sdim    Function &F = *It;
651353358Sdim
652353358Sdim    // Do not touch declarations.
653353358Sdim    if (F.isDeclaration())
654353358Sdim      continue;
655353358Sdim
656353358Sdim    // Do not modify `optnone` functions.
657353358Sdim    if (F.hasOptNone())
658353358Sdim      continue;
659353358Sdim
660353358Sdim    // Detect inherently cold functions and mark them as such.
661353358Sdim    if (isFunctionCold(F)) {
662353358Sdim      Changed |= markFunctionCold(F);
663353358Sdim      continue;
664353358Sdim    }
665353358Sdim
666343171Sdim    if (!shouldOutlineFrom(F)) {
667343171Sdim      LLVM_DEBUG(llvm::dbgs() << "Skipping " << F.getName() << "\n");
668343171Sdim      continue;
669343171Sdim    }
670353358Sdim
671343171Sdim    LLVM_DEBUG(llvm::dbgs() << "Outlining in " << F.getName() << "\n");
672353358Sdim    Changed |= outlineColdRegions(F, HasProfileSummary);
673343171Sdim  }
674343171Sdim  return Changed;
675343171Sdim}
676343171Sdim
677343171Sdimbool HotColdSplittingLegacyPass::runOnModule(Module &M) {
678343171Sdim  if (skipModule(M))
679343171Sdim    return false;
680343171Sdim  ProfileSummaryInfo *PSI =
681343171Sdim      &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
682343171Sdim  auto GTTI = [this](Function &F) -> TargetTransformInfo & {
683343171Sdim    return this->getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
684343171Sdim  };
685343171Sdim  auto GBFI = [this](Function &F) {
686343171Sdim    return &this->getAnalysis<BlockFrequencyInfoWrapperPass>(F).getBFI();
687343171Sdim  };
688343171Sdim  std::unique_ptr<OptimizationRemarkEmitter> ORE;
689343171Sdim  std::function<OptimizationRemarkEmitter &(Function &)> GetORE =
690343171Sdim      [&ORE](Function &F) -> OptimizationRemarkEmitter & {
691343171Sdim    ORE.reset(new OptimizationRemarkEmitter(&F));
692343171Sdim    return *ORE.get();
693343171Sdim  };
694353358Sdim  auto LookupAC = [this](Function &F) -> AssumptionCache * {
695353358Sdim    if (auto *ACT = getAnalysisIfAvailable<AssumptionCacheTracker>())
696353358Sdim      return ACT->lookupAssumptionCache(F);
697353358Sdim    return nullptr;
698353358Sdim  };
699343171Sdim
700353358Sdim  return HotColdSplitting(PSI, GBFI, GTTI, &GetORE, LookupAC).run(M);
701343171Sdim}
702343171Sdim
703343171SdimPreservedAnalyses
704343171SdimHotColdSplittingPass::run(Module &M, ModuleAnalysisManager &AM) {
705343171Sdim  auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
706343171Sdim
707353358Sdim  auto LookupAC = [&FAM](Function &F) -> AssumptionCache * {
708353358Sdim    return FAM.getCachedResult<AssumptionAnalysis>(F);
709343171Sdim  };
710343171Sdim
711343171Sdim  auto GBFI = [&FAM](Function &F) {
712343171Sdim    return &FAM.getResult<BlockFrequencyAnalysis>(F);
713343171Sdim  };
714343171Sdim
715343171Sdim  std::function<TargetTransformInfo &(Function &)> GTTI =
716343171Sdim      [&FAM](Function &F) -> TargetTransformInfo & {
717343171Sdim    return FAM.getResult<TargetIRAnalysis>(F);
718343171Sdim  };
719343171Sdim
720343171Sdim  std::unique_ptr<OptimizationRemarkEmitter> ORE;
721343171Sdim  std::function<OptimizationRemarkEmitter &(Function &)> GetORE =
722343171Sdim      [&ORE](Function &F) -> OptimizationRemarkEmitter & {
723343171Sdim    ORE.reset(new OptimizationRemarkEmitter(&F));
724343171Sdim    return *ORE.get();
725343171Sdim  };
726343171Sdim
727343171Sdim  ProfileSummaryInfo *PSI = &AM.getResult<ProfileSummaryAnalysis>(M);
728343171Sdim
729353358Sdim  if (HotColdSplitting(PSI, GBFI, GTTI, &GetORE, LookupAC).run(M))
730343171Sdim    return PreservedAnalyses::none();
731343171Sdim  return PreservedAnalyses::all();
732343171Sdim}
733343171Sdim
734343171Sdimchar HotColdSplittingLegacyPass::ID = 0;
735343171SdimINITIALIZE_PASS_BEGIN(HotColdSplittingLegacyPass, "hotcoldsplit",
736343171Sdim                      "Hot Cold Splitting", false, false)
737343171SdimINITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
738343171SdimINITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
739343171SdimINITIALIZE_PASS_END(HotColdSplittingLegacyPass, "hotcoldsplit",
740343171Sdim                    "Hot Cold Splitting", false, false)
741343171Sdim
742343171SdimModulePass *llvm::createHotColdSplittingPass() {
743343171Sdim  return new HotColdSplittingLegacyPass();
744343171Sdim}
745