1//===-- PGOMemOPSizeOpt.cpp - Optimizations based on value profiling ===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements the transformation that optimizes memory intrinsics
10// such as memcpy using the size value profile. When memory intrinsic size
11// value profile metadata is available, a single memory intrinsic is expanded
12// to a sequence of guarded specialized versions that are called with the
13// hottest size(s), for later expansion into more optimal inline sequences.
14//
15//===----------------------------------------------------------------------===//
16
17#include "llvm/ADT/ArrayRef.h"
18#include "llvm/ADT/Statistic.h"
19#include "llvm/ADT/StringRef.h"
20#include "llvm/ADT/Twine.h"
21#include "llvm/Analysis/BlockFrequencyInfo.h"
22#include "llvm/Analysis/DomTreeUpdater.h"
23#include "llvm/Analysis/OptimizationRemarkEmitter.h"
24#include "llvm/Analysis/TargetLibraryInfo.h"
25#include "llvm/IR/BasicBlock.h"
26#include "llvm/IR/DerivedTypes.h"
27#include "llvm/IR/Dominators.h"
28#include "llvm/IR/Function.h"
29#include "llvm/IR/IRBuilder.h"
30#include "llvm/IR/InstVisitor.h"
31#include "llvm/IR/Instruction.h"
32#include "llvm/IR/Instructions.h"
33#include "llvm/IR/LLVMContext.h"
34#include "llvm/IR/PassManager.h"
35#include "llvm/IR/Type.h"
36#include "llvm/ProfileData/InstrProf.h"
37#define INSTR_PROF_VALUE_PROF_MEMOP_API
38#include "llvm/ProfileData/InstrProfData.inc"
39#include "llvm/Support/Casting.h"
40#include "llvm/Support/CommandLine.h"
41#include "llvm/Support/Debug.h"
42#include "llvm/Support/ErrorHandling.h"
43#include "llvm/Support/MathExtras.h"
44#include "llvm/Transforms/Instrumentation/PGOInstrumentation.h"
45#include "llvm/Transforms/Utils/BasicBlockUtils.h"
46#include <cassert>
47#include <cstdint>
48#include <vector>
49
50using namespace llvm;
51
52#define DEBUG_TYPE "pgo-memop-opt"
53
54STATISTIC(NumOfPGOMemOPOpt, "Number of memop intrinsics optimized.");
55STATISTIC(NumOfPGOMemOPAnnotate, "Number of memop intrinsics annotated.");
56
57// The minimum call count to optimize memory intrinsic calls.
58static cl::opt<unsigned>
59    MemOPCountThreshold("pgo-memop-count-threshold", cl::Hidden, cl::init(1000),
60                        cl::desc("The minimum count to optimize memory "
61                                 "intrinsic calls"));
62
63// Command line option to disable memory intrinsic optimization. The default is
64// false. This is for debug purpose.
65static cl::opt<bool> DisableMemOPOPT("disable-memop-opt", cl::init(false),
66                                     cl::Hidden, cl::desc("Disable optimize"));
67
68// The percent threshold to optimize memory intrinsic calls.
69static cl::opt<unsigned>
70    MemOPPercentThreshold("pgo-memop-percent-threshold", cl::init(40),
71                          cl::Hidden,
72                          cl::desc("The percentage threshold for the "
73                                   "memory intrinsic calls optimization"));
74
75// Maximum number of versions for optimizing memory intrinsic call.
76static cl::opt<unsigned>
77    MemOPMaxVersion("pgo-memop-max-version", cl::init(3), cl::Hidden,
78                    cl::desc("The max version for the optimized memory "
79                             " intrinsic calls"));
80
81// Scale the counts from the annotation using the BB count value.
82static cl::opt<bool>
83    MemOPScaleCount("pgo-memop-scale-count", cl::init(true), cl::Hidden,
84                    cl::desc("Scale the memop size counts using the basic "
85                             " block count value"));
86
87cl::opt<bool>
88    MemOPOptMemcmpBcmp("pgo-memop-optimize-memcmp-bcmp", cl::init(true),
89                       cl::Hidden,
90                       cl::desc("Size-specialize memcmp and bcmp calls"));
91
92static cl::opt<unsigned>
93    MemOpMaxOptSize("memop-value-prof-max-opt-size", cl::Hidden, cl::init(128),
94                    cl::desc("Optimize the memop size <= this value"));
95
96namespace {
97
98static const char *getMIName(const MemIntrinsic *MI) {
99  switch (MI->getIntrinsicID()) {
100  case Intrinsic::memcpy:
101    return "memcpy";
102  case Intrinsic::memmove:
103    return "memmove";
104  case Intrinsic::memset:
105    return "memset";
106  default:
107    return "unknown";
108  }
109}
110
111// A class that abstracts a memop (memcpy, memmove, memset, memcmp and bcmp).
112struct MemOp {
113  Instruction *I;
114  MemOp(MemIntrinsic *MI) : I(MI) {}
115  MemOp(CallInst *CI) : I(CI) {}
116  MemIntrinsic *asMI() { return dyn_cast<MemIntrinsic>(I); }
117  CallInst *asCI() { return cast<CallInst>(I); }
118  MemOp clone() {
119    if (auto MI = asMI())
120      return MemOp(cast<MemIntrinsic>(MI->clone()));
121    return MemOp(cast<CallInst>(asCI()->clone()));
122  }
123  Value *getLength() {
124    if (auto MI = asMI())
125      return MI->getLength();
126    return asCI()->getArgOperand(2);
127  }
128  void setLength(Value *Length) {
129    if (auto MI = asMI())
130      return MI->setLength(Length);
131    asCI()->setArgOperand(2, Length);
132  }
133  StringRef getFuncName() {
134    if (auto MI = asMI())
135      return MI->getCalledFunction()->getName();
136    return asCI()->getCalledFunction()->getName();
137  }
138  bool isMemmove() {
139    if (auto MI = asMI())
140      if (MI->getIntrinsicID() == Intrinsic::memmove)
141        return true;
142    return false;
143  }
144  bool isMemcmp(TargetLibraryInfo &TLI) {
145    LibFunc Func;
146    if (asMI() == nullptr && TLI.getLibFunc(*asCI(), Func) &&
147        Func == LibFunc_memcmp) {
148      return true;
149    }
150    return false;
151  }
152  bool isBcmp(TargetLibraryInfo &TLI) {
153    LibFunc Func;
154    if (asMI() == nullptr && TLI.getLibFunc(*asCI(), Func) &&
155        Func == LibFunc_bcmp) {
156      return true;
157    }
158    return false;
159  }
160  const char *getName(TargetLibraryInfo &TLI) {
161    if (auto MI = asMI())
162      return getMIName(MI);
163    LibFunc Func;
164    if (TLI.getLibFunc(*asCI(), Func)) {
165      if (Func == LibFunc_memcmp)
166        return "memcmp";
167      if (Func == LibFunc_bcmp)
168        return "bcmp";
169    }
170    llvm_unreachable("Must be MemIntrinsic or memcmp/bcmp CallInst");
171    return nullptr;
172  }
173};
174
175class MemOPSizeOpt : public InstVisitor<MemOPSizeOpt> {
176public:
177  MemOPSizeOpt(Function &Func, BlockFrequencyInfo &BFI,
178               OptimizationRemarkEmitter &ORE, DominatorTree *DT,
179               TargetLibraryInfo &TLI)
180      : Func(Func), BFI(BFI), ORE(ORE), DT(DT), TLI(TLI), Changed(false) {
181    ValueDataArray =
182        std::make_unique<InstrProfValueData[]>(INSTR_PROF_NUM_BUCKETS);
183  }
184  bool isChanged() const { return Changed; }
185  void perform() {
186    WorkList.clear();
187    visit(Func);
188
189    for (auto &MO : WorkList) {
190      ++NumOfPGOMemOPAnnotate;
191      if (perform(MO)) {
192        Changed = true;
193        ++NumOfPGOMemOPOpt;
194        LLVM_DEBUG(dbgs() << "MemOP call: " << MO.getFuncName()
195                          << "is Transformed.\n");
196      }
197    }
198  }
199
200  void visitMemIntrinsic(MemIntrinsic &MI) {
201    Value *Length = MI.getLength();
202    // Not perform on constant length calls.
203    if (isa<ConstantInt>(Length))
204      return;
205    WorkList.push_back(MemOp(&MI));
206  }
207
208  void visitCallInst(CallInst &CI) {
209    LibFunc Func;
210    if (TLI.getLibFunc(CI, Func) &&
211        (Func == LibFunc_memcmp || Func == LibFunc_bcmp) &&
212        !isa<ConstantInt>(CI.getArgOperand(2))) {
213      WorkList.push_back(MemOp(&CI));
214    }
215  }
216
217private:
218  Function &Func;
219  BlockFrequencyInfo &BFI;
220  OptimizationRemarkEmitter &ORE;
221  DominatorTree *DT;
222  TargetLibraryInfo &TLI;
223  bool Changed;
224  std::vector<MemOp> WorkList;
225  // The space to read the profile annotation.
226  std::unique_ptr<InstrProfValueData[]> ValueDataArray;
227  bool perform(MemOp MO);
228};
229
230static bool isProfitable(uint64_t Count, uint64_t TotalCount) {
231  assert(Count <= TotalCount);
232  if (Count < MemOPCountThreshold)
233    return false;
234  if (Count < TotalCount * MemOPPercentThreshold / 100)
235    return false;
236  return true;
237}
238
239static inline uint64_t getScaledCount(uint64_t Count, uint64_t Num,
240                                      uint64_t Denom) {
241  if (!MemOPScaleCount)
242    return Count;
243  bool Overflowed;
244  uint64_t ScaleCount = SaturatingMultiply(Count, Num, &Overflowed);
245  return ScaleCount / Denom;
246}
247
248bool MemOPSizeOpt::perform(MemOp MO) {
249  assert(MO.I);
250  if (MO.isMemmove())
251    return false;
252  if (!MemOPOptMemcmpBcmp && (MO.isMemcmp(TLI) || MO.isBcmp(TLI)))
253    return false;
254
255  uint32_t NumVals, MaxNumVals = INSTR_PROF_NUM_BUCKETS;
256  uint64_t TotalCount;
257  if (!getValueProfDataFromInst(*MO.I, IPVK_MemOPSize, MaxNumVals,
258                                ValueDataArray.get(), NumVals, TotalCount))
259    return false;
260
261  uint64_t ActualCount = TotalCount;
262  uint64_t SavedTotalCount = TotalCount;
263  if (MemOPScaleCount) {
264    auto BBEdgeCount = BFI.getBlockProfileCount(MO.I->getParent());
265    if (!BBEdgeCount)
266      return false;
267    ActualCount = *BBEdgeCount;
268  }
269
270  ArrayRef<InstrProfValueData> VDs(ValueDataArray.get(), NumVals);
271  LLVM_DEBUG(dbgs() << "Read one memory intrinsic profile with count "
272                    << ActualCount << "\n");
273  LLVM_DEBUG(
274      for (auto &VD
275           : VDs) { dbgs() << "  (" << VD.Value << "," << VD.Count << ")\n"; });
276
277  if (ActualCount < MemOPCountThreshold)
278    return false;
279  // Skip if the total value profiled count is 0, in which case we can't
280  // scale up the counts properly (and there is no profitable transformation).
281  if (TotalCount == 0)
282    return false;
283
284  TotalCount = ActualCount;
285  if (MemOPScaleCount)
286    LLVM_DEBUG(dbgs() << "Scale counts: numerator = " << ActualCount
287                      << " denominator = " << SavedTotalCount << "\n");
288
289  // Keeping track of the count of the default case:
290  uint64_t RemainCount = TotalCount;
291  uint64_t SavedRemainCount = SavedTotalCount;
292  SmallVector<uint64_t, 16> SizeIds;
293  SmallVector<uint64_t, 16> CaseCounts;
294  SmallDenseSet<uint64_t, 16> SeenSizeId;
295  uint64_t MaxCount = 0;
296  unsigned Version = 0;
297  // Default case is in the front -- save the slot here.
298  CaseCounts.push_back(0);
299  SmallVector<InstrProfValueData, 24> RemainingVDs;
300  for (auto I = VDs.begin(), E = VDs.end(); I != E; ++I) {
301    auto &VD = *I;
302    int64_t V = VD.Value;
303    uint64_t C = VD.Count;
304    if (MemOPScaleCount)
305      C = getScaledCount(C, ActualCount, SavedTotalCount);
306
307    if (!InstrProfIsSingleValRange(V) || V > MemOpMaxOptSize) {
308      RemainingVDs.push_back(VD);
309      continue;
310    }
311
312    // ValueCounts are sorted on the count. Break at the first un-profitable
313    // value.
314    if (!isProfitable(C, RemainCount)) {
315      RemainingVDs.insert(RemainingVDs.end(), I, E);
316      break;
317    }
318
319    if (!SeenSizeId.insert(V).second) {
320      errs() << "warning: Invalid Profile Data in Function " << Func.getName()
321             << ": Two identical values in MemOp value counts.\n";
322      return false;
323    }
324
325    SizeIds.push_back(V);
326    CaseCounts.push_back(C);
327    if (C > MaxCount)
328      MaxCount = C;
329
330    assert(RemainCount >= C);
331    RemainCount -= C;
332    assert(SavedRemainCount >= VD.Count);
333    SavedRemainCount -= VD.Count;
334
335    if (++Version >= MemOPMaxVersion && MemOPMaxVersion != 0) {
336      RemainingVDs.insert(RemainingVDs.end(), I + 1, E);
337      break;
338    }
339  }
340
341  if (Version == 0)
342    return false;
343
344  CaseCounts[0] = RemainCount;
345  if (RemainCount > MaxCount)
346    MaxCount = RemainCount;
347
348  uint64_t SumForOpt = TotalCount - RemainCount;
349
350  LLVM_DEBUG(dbgs() << "Optimize one memory intrinsic call to " << Version
351                    << " Versions (covering " << SumForOpt << " out of "
352                    << TotalCount << ")\n");
353
354  // mem_op(..., size)
355  // ==>
356  // switch (size) {
357  //   case s1:
358  //      mem_op(..., s1);
359  //      goto merge_bb;
360  //   case s2:
361  //      mem_op(..., s2);
362  //      goto merge_bb;
363  //   ...
364  //   default:
365  //      mem_op(..., size);
366  //      goto merge_bb;
367  // }
368  // merge_bb:
369
370  BasicBlock *BB = MO.I->getParent();
371  LLVM_DEBUG(dbgs() << "\n\n== Basic Block Before ==\n");
372  LLVM_DEBUG(dbgs() << *BB << "\n");
373  auto OrigBBFreq = BFI.getBlockFreq(BB);
374
375  BasicBlock *DefaultBB = SplitBlock(BB, MO.I, DT);
376  BasicBlock::iterator It(*MO.I);
377  ++It;
378  assert(It != DefaultBB->end());
379  BasicBlock *MergeBB = SplitBlock(DefaultBB, &(*It), DT);
380  MergeBB->setName("MemOP.Merge");
381  BFI.setBlockFreq(MergeBB, OrigBBFreq);
382  DefaultBB->setName("MemOP.Default");
383
384  DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
385  auto &Ctx = Func.getContext();
386  IRBuilder<> IRB(BB);
387  BB->getTerminator()->eraseFromParent();
388  Value *SizeVar = MO.getLength();
389  SwitchInst *SI = IRB.CreateSwitch(SizeVar, DefaultBB, SizeIds.size());
390  Type *MemOpTy = MO.I->getType();
391  PHINode *PHI = nullptr;
392  if (!MemOpTy->isVoidTy()) {
393    // Insert a phi for the return values at the merge block.
394    IRBuilder<> IRBM(MergeBB->getFirstNonPHI());
395    PHI = IRBM.CreatePHI(MemOpTy, SizeIds.size() + 1, "MemOP.RVMerge");
396    MO.I->replaceAllUsesWith(PHI);
397    PHI->addIncoming(MO.I, DefaultBB);
398  }
399
400  // Clear the value profile data.
401  MO.I->setMetadata(LLVMContext::MD_prof, nullptr);
402  // If all promoted, we don't need the MD.prof metadata.
403  if (SavedRemainCount > 0 || Version != NumVals) {
404    // Otherwise we need update with the un-promoted records back.
405    ArrayRef<InstrProfValueData> RemVDs(RemainingVDs);
406    annotateValueSite(*Func.getParent(), *MO.I, RemVDs, SavedRemainCount,
407                      IPVK_MemOPSize, NumVals);
408  }
409
410  LLVM_DEBUG(dbgs() << "\n\n== Basic Block After==\n");
411
412  std::vector<DominatorTree::UpdateType> Updates;
413  if (DT)
414    Updates.reserve(2 * SizeIds.size());
415
416  for (uint64_t SizeId : SizeIds) {
417    BasicBlock *CaseBB = BasicBlock::Create(
418        Ctx, Twine("MemOP.Case.") + Twine(SizeId), &Func, DefaultBB);
419    MemOp NewMO = MO.clone();
420    // Fix the argument.
421    auto *SizeType = dyn_cast<IntegerType>(NewMO.getLength()->getType());
422    assert(SizeType && "Expected integer type size argument.");
423    ConstantInt *CaseSizeId = ConstantInt::get(SizeType, SizeId);
424    NewMO.setLength(CaseSizeId);
425    NewMO.I->insertInto(CaseBB, CaseBB->end());
426    IRBuilder<> IRBCase(CaseBB);
427    IRBCase.CreateBr(MergeBB);
428    SI->addCase(CaseSizeId, CaseBB);
429    if (!MemOpTy->isVoidTy())
430      PHI->addIncoming(NewMO.I, CaseBB);
431    if (DT) {
432      Updates.push_back({DominatorTree::Insert, CaseBB, MergeBB});
433      Updates.push_back({DominatorTree::Insert, BB, CaseBB});
434    }
435    LLVM_DEBUG(dbgs() << *CaseBB << "\n");
436  }
437  DTU.applyUpdates(Updates);
438  Updates.clear();
439
440  if (MaxCount)
441    setProfMetadata(Func.getParent(), SI, CaseCounts, MaxCount);
442
443  LLVM_DEBUG(dbgs() << *BB << "\n");
444  LLVM_DEBUG(dbgs() << *DefaultBB << "\n");
445  LLVM_DEBUG(dbgs() << *MergeBB << "\n");
446
447  ORE.emit([&]() {
448    using namespace ore;
449    return OptimizationRemark(DEBUG_TYPE, "memopt-opt", MO.I)
450           << "optimized " << NV("Memop", MO.getName(TLI)) << " with count "
451           << NV("Count", SumForOpt) << " out of " << NV("Total", TotalCount)
452           << " for " << NV("Versions", Version) << " versions";
453  });
454
455  return true;
456}
457} // namespace
458
459static bool PGOMemOPSizeOptImpl(Function &F, BlockFrequencyInfo &BFI,
460                                OptimizationRemarkEmitter &ORE,
461                                DominatorTree *DT, TargetLibraryInfo &TLI) {
462  if (DisableMemOPOPT)
463    return false;
464
465  if (F.hasFnAttribute(Attribute::OptimizeForSize))
466    return false;
467  MemOPSizeOpt MemOPSizeOpt(F, BFI, ORE, DT, TLI);
468  MemOPSizeOpt.perform();
469  return MemOPSizeOpt.isChanged();
470}
471
472PreservedAnalyses PGOMemOPSizeOpt::run(Function &F,
473                                       FunctionAnalysisManager &FAM) {
474  auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(F);
475  auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
476  auto *DT = FAM.getCachedResult<DominatorTreeAnalysis>(F);
477  auto &TLI = FAM.getResult<TargetLibraryAnalysis>(F);
478  bool Changed = PGOMemOPSizeOptImpl(F, BFI, ORE, DT, TLI);
479  if (!Changed)
480    return PreservedAnalyses::all();
481  auto PA = PreservedAnalyses();
482  PA.preserve<DominatorTreeAnalysis>();
483  return PA;
484}
485