1//===-- PGOMemOPSizeOpt.cpp - Optimizations based on value profiling ===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements the transformation that optimizes memory intrinsics
10// such as memcpy using the size value profile. When memory intrinsic size
11// value profile metadata is available, a single memory intrinsic is expanded
12// to a sequence of guarded specialized versions that are called with the
13// hottest size(s), for later expansion into more optimal inline sequences.
14//
15//===----------------------------------------------------------------------===//
16
17#include "llvm/ADT/ArrayRef.h"
18#include "llvm/ADT/Statistic.h"
19#include "llvm/ADT/StringRef.h"
20#include "llvm/ADT/Twine.h"
21#include "llvm/Analysis/BlockFrequencyInfo.h"
22#include "llvm/Analysis/DomTreeUpdater.h"
23#include "llvm/Analysis/GlobalsModRef.h"
24#include "llvm/Analysis/OptimizationRemarkEmitter.h"
25#include "llvm/IR/BasicBlock.h"
26#include "llvm/IR/CallSite.h"
27#include "llvm/IR/DerivedTypes.h"
28#include "llvm/IR/Dominators.h"
29#include "llvm/IR/Function.h"
30#include "llvm/IR/IRBuilder.h"
31#include "llvm/IR/InstVisitor.h"
32#include "llvm/IR/InstrTypes.h"
33#include "llvm/IR/Instruction.h"
34#include "llvm/IR/Instructions.h"
35#include "llvm/IR/LLVMContext.h"
36#include "llvm/IR/PassManager.h"
37#include "llvm/IR/Type.h"
38#include "llvm/InitializePasses.h"
39#include "llvm/Pass.h"
40#include "llvm/PassRegistry.h"
41#include "llvm/PassSupport.h"
42#include "llvm/ProfileData/InstrProf.h"
43#include "llvm/Support/Casting.h"
44#include "llvm/Support/CommandLine.h"
45#include "llvm/Support/Debug.h"
46#include "llvm/Support/ErrorHandling.h"
47#include "llvm/Support/MathExtras.h"
48#include "llvm/Transforms/Instrumentation.h"
49#include "llvm/Transforms/Instrumentation/PGOInstrumentation.h"
50#include "llvm/Transforms/Utils/BasicBlockUtils.h"
51#include <cassert>
52#include <cstdint>
53#include <vector>
54
55using namespace llvm;
56
57#define DEBUG_TYPE "pgo-memop-opt"
58
59STATISTIC(NumOfPGOMemOPOpt, "Number of memop intrinsics optimized.");
60STATISTIC(NumOfPGOMemOPAnnotate, "Number of memop intrinsics annotated.");
61
62// The minimum call count to optimize memory intrinsic calls.
63static cl::opt<unsigned>
64    MemOPCountThreshold("pgo-memop-count-threshold", cl::Hidden, cl::ZeroOrMore,
65                        cl::init(1000),
66                        cl::desc("The minimum count to optimize memory "
67                                 "intrinsic calls"));
68
69// Command line option to disable memory intrinsic optimization. The default is
70// false. This is for debug purpose.
71static cl::opt<bool> DisableMemOPOPT("disable-memop-opt", cl::init(false),
72                                     cl::Hidden, cl::desc("Disable optimize"));
73
74// The percent threshold to optimize memory intrinsic calls.
75static cl::opt<unsigned>
76    MemOPPercentThreshold("pgo-memop-percent-threshold", cl::init(40),
77                          cl::Hidden, cl::ZeroOrMore,
78                          cl::desc("The percentage threshold for the "
79                                   "memory intrinsic calls optimization"));
80
81// Maximum number of versions for optimizing memory intrinsic call.
82static cl::opt<unsigned>
83    MemOPMaxVersion("pgo-memop-max-version", cl::init(3), cl::Hidden,
84                    cl::ZeroOrMore,
85                    cl::desc("The max version for the optimized memory "
86                             " intrinsic calls"));
87
88// Scale the counts from the annotation using the BB count value.
89static cl::opt<bool>
90    MemOPScaleCount("pgo-memop-scale-count", cl::init(true), cl::Hidden,
91                    cl::desc("Scale the memop size counts using the basic "
92                             " block count value"));
93
94// This option sets the rangge of precise profile memop sizes.
95extern cl::opt<std::string> MemOPSizeRange;
96
97// This option sets the value that groups large memop sizes
98extern cl::opt<unsigned> MemOPSizeLarge;
99
100namespace {
101class PGOMemOPSizeOptLegacyPass : public FunctionPass {
102public:
103  static char ID;
104
105  PGOMemOPSizeOptLegacyPass() : FunctionPass(ID) {
106    initializePGOMemOPSizeOptLegacyPassPass(*PassRegistry::getPassRegistry());
107  }
108
109  StringRef getPassName() const override { return "PGOMemOPSize"; }
110
111private:
112  bool runOnFunction(Function &F) override;
113  void getAnalysisUsage(AnalysisUsage &AU) const override {
114    AU.addRequired<BlockFrequencyInfoWrapperPass>();
115    AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
116    AU.addPreserved<GlobalsAAWrapperPass>();
117    AU.addPreserved<DominatorTreeWrapperPass>();
118  }
119};
120} // end anonymous namespace
121
122char PGOMemOPSizeOptLegacyPass::ID = 0;
123INITIALIZE_PASS_BEGIN(PGOMemOPSizeOptLegacyPass, "pgo-memop-opt",
124                      "Optimize memory intrinsic using its size value profile",
125                      false, false)
126INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
127INITIALIZE_PASS_END(PGOMemOPSizeOptLegacyPass, "pgo-memop-opt",
128                    "Optimize memory intrinsic using its size value profile",
129                    false, false)
130
131FunctionPass *llvm::createPGOMemOPSizeOptLegacyPass() {
132  return new PGOMemOPSizeOptLegacyPass();
133}
134
135namespace {
136class MemOPSizeOpt : public InstVisitor<MemOPSizeOpt> {
137public:
138  MemOPSizeOpt(Function &Func, BlockFrequencyInfo &BFI,
139               OptimizationRemarkEmitter &ORE, DominatorTree *DT)
140      : Func(Func), BFI(BFI), ORE(ORE), DT(DT), Changed(false) {
141    ValueDataArray =
142        std::make_unique<InstrProfValueData[]>(MemOPMaxVersion + 2);
143    // Get the MemOPSize range information from option MemOPSizeRange,
144    getMemOPSizeRangeFromOption(MemOPSizeRange, PreciseRangeStart,
145                                PreciseRangeLast);
146  }
147  bool isChanged() const { return Changed; }
148  void perform() {
149    WorkList.clear();
150    visit(Func);
151
152    for (auto &MI : WorkList) {
153      ++NumOfPGOMemOPAnnotate;
154      if (perform(MI)) {
155        Changed = true;
156        ++NumOfPGOMemOPOpt;
157        LLVM_DEBUG(dbgs() << "MemOP call: "
158                          << MI->getCalledFunction()->getName()
159                          << "is Transformed.\n");
160      }
161    }
162  }
163
164  void visitMemIntrinsic(MemIntrinsic &MI) {
165    Value *Length = MI.getLength();
166    // Not perform on constant length calls.
167    if (dyn_cast<ConstantInt>(Length))
168      return;
169    WorkList.push_back(&MI);
170  }
171
172private:
173  Function &Func;
174  BlockFrequencyInfo &BFI;
175  OptimizationRemarkEmitter &ORE;
176  DominatorTree *DT;
177  bool Changed;
178  std::vector<MemIntrinsic *> WorkList;
179  // Start of the previse range.
180  int64_t PreciseRangeStart;
181  // Last value of the previse range.
182  int64_t PreciseRangeLast;
183  // The space to read the profile annotation.
184  std::unique_ptr<InstrProfValueData[]> ValueDataArray;
185  bool perform(MemIntrinsic *MI);
186
187  // This kind shows which group the value falls in. For PreciseValue, we have
188  // the profile count for that value. LargeGroup groups the values that are in
189  // range [LargeValue, +inf). NonLargeGroup groups the rest of values.
190  enum MemOPSizeKind { PreciseValue, NonLargeGroup, LargeGroup };
191
192  MemOPSizeKind getMemOPSizeKind(int64_t Value) const {
193    if (Value == MemOPSizeLarge && MemOPSizeLarge != 0)
194      return LargeGroup;
195    if (Value == PreciseRangeLast + 1)
196      return NonLargeGroup;
197    return PreciseValue;
198  }
199};
200
201static const char *getMIName(const MemIntrinsic *MI) {
202  switch (MI->getIntrinsicID()) {
203  case Intrinsic::memcpy:
204    return "memcpy";
205  case Intrinsic::memmove:
206    return "memmove";
207  case Intrinsic::memset:
208    return "memset";
209  default:
210    return "unknown";
211  }
212}
213
214static bool isProfitable(uint64_t Count, uint64_t TotalCount) {
215  assert(Count <= TotalCount);
216  if (Count < MemOPCountThreshold)
217    return false;
218  if (Count < TotalCount * MemOPPercentThreshold / 100)
219    return false;
220  return true;
221}
222
223static inline uint64_t getScaledCount(uint64_t Count, uint64_t Num,
224                                      uint64_t Denom) {
225  if (!MemOPScaleCount)
226    return Count;
227  bool Overflowed;
228  uint64_t ScaleCount = SaturatingMultiply(Count, Num, &Overflowed);
229  return ScaleCount / Denom;
230}
231
232bool MemOPSizeOpt::perform(MemIntrinsic *MI) {
233  assert(MI);
234  if (MI->getIntrinsicID() == Intrinsic::memmove)
235    return false;
236
237  uint32_t NumVals, MaxNumPromotions = MemOPMaxVersion + 2;
238  uint64_t TotalCount;
239  if (!getValueProfDataFromInst(*MI, IPVK_MemOPSize, MaxNumPromotions,
240                                ValueDataArray.get(), NumVals, TotalCount))
241    return false;
242
243  uint64_t ActualCount = TotalCount;
244  uint64_t SavedTotalCount = TotalCount;
245  if (MemOPScaleCount) {
246    auto BBEdgeCount = BFI.getBlockProfileCount(MI->getParent());
247    if (!BBEdgeCount)
248      return false;
249    ActualCount = *BBEdgeCount;
250  }
251
252  ArrayRef<InstrProfValueData> VDs(ValueDataArray.get(), NumVals);
253  LLVM_DEBUG(dbgs() << "Read one memory intrinsic profile with count "
254                    << ActualCount << "\n");
255  LLVM_DEBUG(
256      for (auto &VD
257           : VDs) { dbgs() << "  (" << VD.Value << "," << VD.Count << ")\n"; });
258
259  if (ActualCount < MemOPCountThreshold)
260    return false;
261  // Skip if the total value profiled count is 0, in which case we can't
262  // scale up the counts properly (and there is no profitable transformation).
263  if (TotalCount == 0)
264    return false;
265
266  TotalCount = ActualCount;
267  if (MemOPScaleCount)
268    LLVM_DEBUG(dbgs() << "Scale counts: numerator = " << ActualCount
269                      << " denominator = " << SavedTotalCount << "\n");
270
271  // Keeping track of the count of the default case:
272  uint64_t RemainCount = TotalCount;
273  uint64_t SavedRemainCount = SavedTotalCount;
274  SmallVector<uint64_t, 16> SizeIds;
275  SmallVector<uint64_t, 16> CaseCounts;
276  uint64_t MaxCount = 0;
277  unsigned Version = 0;
278  // Default case is in the front -- save the slot here.
279  CaseCounts.push_back(0);
280  for (auto &VD : VDs) {
281    int64_t V = VD.Value;
282    uint64_t C = VD.Count;
283    if (MemOPScaleCount)
284      C = getScaledCount(C, ActualCount, SavedTotalCount);
285
286    // Only care precise value here.
287    if (getMemOPSizeKind(V) != PreciseValue)
288      continue;
289
290    // ValueCounts are sorted on the count. Break at the first un-profitable
291    // value.
292    if (!isProfitable(C, RemainCount))
293      break;
294
295    SizeIds.push_back(V);
296    CaseCounts.push_back(C);
297    if (C > MaxCount)
298      MaxCount = C;
299
300    assert(RemainCount >= C);
301    RemainCount -= C;
302    assert(SavedRemainCount >= VD.Count);
303    SavedRemainCount -= VD.Count;
304
305    if (++Version > MemOPMaxVersion && MemOPMaxVersion != 0)
306      break;
307  }
308
309  if (Version == 0)
310    return false;
311
312  CaseCounts[0] = RemainCount;
313  if (RemainCount > MaxCount)
314    MaxCount = RemainCount;
315
316  uint64_t SumForOpt = TotalCount - RemainCount;
317
318  LLVM_DEBUG(dbgs() << "Optimize one memory intrinsic call to " << Version
319                    << " Versions (covering " << SumForOpt << " out of "
320                    << TotalCount << ")\n");
321
322  // mem_op(..., size)
323  // ==>
324  // switch (size) {
325  //   case s1:
326  //      mem_op(..., s1);
327  //      goto merge_bb;
328  //   case s2:
329  //      mem_op(..., s2);
330  //      goto merge_bb;
331  //   ...
332  //   default:
333  //      mem_op(..., size);
334  //      goto merge_bb;
335  // }
336  // merge_bb:
337
338  BasicBlock *BB = MI->getParent();
339  LLVM_DEBUG(dbgs() << "\n\n== Basic Block Before ==\n");
340  LLVM_DEBUG(dbgs() << *BB << "\n");
341  auto OrigBBFreq = BFI.getBlockFreq(BB);
342
343  BasicBlock *DefaultBB = SplitBlock(BB, MI, DT);
344  BasicBlock::iterator It(*MI);
345  ++It;
346  assert(It != DefaultBB->end());
347  BasicBlock *MergeBB = SplitBlock(DefaultBB, &(*It), DT);
348  MergeBB->setName("MemOP.Merge");
349  BFI.setBlockFreq(MergeBB, OrigBBFreq.getFrequency());
350  DefaultBB->setName("MemOP.Default");
351
352  DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
353  auto &Ctx = Func.getContext();
354  IRBuilder<> IRB(BB);
355  BB->getTerminator()->eraseFromParent();
356  Value *SizeVar = MI->getLength();
357  SwitchInst *SI = IRB.CreateSwitch(SizeVar, DefaultBB, SizeIds.size());
358
359  // Clear the value profile data.
360  MI->setMetadata(LLVMContext::MD_prof, nullptr);
361  // If all promoted, we don't need the MD.prof metadata.
362  if (SavedRemainCount > 0 || Version != NumVals)
363    // Otherwise we need update with the un-promoted records back.
364    annotateValueSite(*Func.getParent(), *MI, VDs.slice(Version),
365                      SavedRemainCount, IPVK_MemOPSize, NumVals);
366
367  LLVM_DEBUG(dbgs() << "\n\n== Basic Block After==\n");
368
369  std::vector<DominatorTree::UpdateType> Updates;
370  if (DT)
371    Updates.reserve(2 * SizeIds.size());
372
373  for (uint64_t SizeId : SizeIds) {
374    BasicBlock *CaseBB = BasicBlock::Create(
375        Ctx, Twine("MemOP.Case.") + Twine(SizeId), &Func, DefaultBB);
376    Instruction *NewInst = MI->clone();
377    // Fix the argument.
378    auto *MemI = cast<MemIntrinsic>(NewInst);
379    auto *SizeType = dyn_cast<IntegerType>(MemI->getLength()->getType());
380    assert(SizeType && "Expected integer type size argument.");
381    ConstantInt *CaseSizeId = ConstantInt::get(SizeType, SizeId);
382    MemI->setLength(CaseSizeId);
383    CaseBB->getInstList().push_back(NewInst);
384    IRBuilder<> IRBCase(CaseBB);
385    IRBCase.CreateBr(MergeBB);
386    SI->addCase(CaseSizeId, CaseBB);
387    if (DT) {
388      Updates.push_back({DominatorTree::Insert, CaseBB, MergeBB});
389      Updates.push_back({DominatorTree::Insert, BB, CaseBB});
390    }
391    LLVM_DEBUG(dbgs() << *CaseBB << "\n");
392  }
393  DTU.applyUpdates(Updates);
394  Updates.clear();
395
396  setProfMetadata(Func.getParent(), SI, CaseCounts, MaxCount);
397
398  LLVM_DEBUG(dbgs() << *BB << "\n");
399  LLVM_DEBUG(dbgs() << *DefaultBB << "\n");
400  LLVM_DEBUG(dbgs() << *MergeBB << "\n");
401
402  ORE.emit([&]() {
403    using namespace ore;
404    return OptimizationRemark(DEBUG_TYPE, "memopt-opt", MI)
405             << "optimized " << NV("Intrinsic", StringRef(getMIName(MI)))
406             << " with count " << NV("Count", SumForOpt) << " out of "
407             << NV("Total", TotalCount) << " for " << NV("Versions", Version)
408             << " versions";
409  });
410
411  return true;
412}
413} // namespace
414
415static bool PGOMemOPSizeOptImpl(Function &F, BlockFrequencyInfo &BFI,
416                                OptimizationRemarkEmitter &ORE,
417                                DominatorTree *DT) {
418  if (DisableMemOPOPT)
419    return false;
420
421  if (F.hasFnAttribute(Attribute::OptimizeForSize))
422    return false;
423  MemOPSizeOpt MemOPSizeOpt(F, BFI, ORE, DT);
424  MemOPSizeOpt.perform();
425  return MemOPSizeOpt.isChanged();
426}
427
428bool PGOMemOPSizeOptLegacyPass::runOnFunction(Function &F) {
429  BlockFrequencyInfo &BFI =
430      getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
431  auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
432  auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
433  DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr;
434  return PGOMemOPSizeOptImpl(F, BFI, ORE, DT);
435}
436
437namespace llvm {
438char &PGOMemOPSizeOptID = PGOMemOPSizeOptLegacyPass::ID;
439
440PreservedAnalyses PGOMemOPSizeOpt::run(Function &F,
441                                       FunctionAnalysisManager &FAM) {
442  auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(F);
443  auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
444  auto *DT = FAM.getCachedResult<DominatorTreeAnalysis>(F);
445  bool Changed = PGOMemOPSizeOptImpl(F, BFI, ORE, DT);
446  if (!Changed)
447    return PreservedAnalyses::all();
448  auto PA = PreservedAnalyses();
449  PA.preserve<GlobalsAA>();
450  PA.preserve<DominatorTreeAnalysis>();
451  return PA;
452}
453} // namespace llvm
454