1//===-- PGOMemOPSizeOpt.cpp - Optimizations based on value profiling ===// 2// 3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4// See https://llvm.org/LICENSE.txt for license information. 5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6// 7//===----------------------------------------------------------------------===// 8// 9// This file implements the transformation that optimizes memory intrinsics 10// such as memcpy using the size value profile. When memory intrinsic size 11// value profile metadata is available, a single memory intrinsic is expanded 12// to a sequence of guarded specialized versions that are called with the 13// hottest size(s), for later expansion into more optimal inline sequences. 14// 15//===----------------------------------------------------------------------===// 16 17#include "llvm/ADT/ArrayRef.h" 18#include "llvm/ADT/Statistic.h" 19#include "llvm/ADT/StringRef.h" 20#include "llvm/ADT/Twine.h" 21#include "llvm/Analysis/BlockFrequencyInfo.h" 22#include "llvm/Analysis/DomTreeUpdater.h" 23#include "llvm/Analysis/GlobalsModRef.h" 24#include "llvm/Analysis/OptimizationRemarkEmitter.h" 25#include "llvm/IR/BasicBlock.h" 26#include "llvm/IR/CallSite.h" 27#include "llvm/IR/DerivedTypes.h" 28#include "llvm/IR/Dominators.h" 29#include "llvm/IR/Function.h" 30#include "llvm/IR/IRBuilder.h" 31#include "llvm/IR/InstVisitor.h" 32#include "llvm/IR/InstrTypes.h" 33#include "llvm/IR/Instruction.h" 34#include "llvm/IR/Instructions.h" 35#include "llvm/IR/LLVMContext.h" 36#include "llvm/IR/PassManager.h" 37#include "llvm/IR/Type.h" 38#include "llvm/InitializePasses.h" 39#include "llvm/Pass.h" 40#include "llvm/PassRegistry.h" 41#include "llvm/PassSupport.h" 42#include "llvm/ProfileData/InstrProf.h" 43#include "llvm/Support/Casting.h" 44#include "llvm/Support/CommandLine.h" 45#include "llvm/Support/Debug.h" 46#include "llvm/Support/ErrorHandling.h" 47#include "llvm/Support/MathExtras.h" 48#include "llvm/Transforms/Instrumentation.h" 49#include "llvm/Transforms/Instrumentation/PGOInstrumentation.h" 50#include "llvm/Transforms/Utils/BasicBlockUtils.h" 51#include <cassert> 52#include <cstdint> 53#include <vector> 54 55using namespace llvm; 56 57#define DEBUG_TYPE "pgo-memop-opt" 58 59STATISTIC(NumOfPGOMemOPOpt, "Number of memop intrinsics optimized."); 60STATISTIC(NumOfPGOMemOPAnnotate, "Number of memop intrinsics annotated."); 61 62// The minimum call count to optimize memory intrinsic calls. 63static cl::opt<unsigned> 64 MemOPCountThreshold("pgo-memop-count-threshold", cl::Hidden, cl::ZeroOrMore, 65 cl::init(1000), 66 cl::desc("The minimum count to optimize memory " 67 "intrinsic calls")); 68 69// Command line option to disable memory intrinsic optimization. The default is 70// false. This is for debug purpose. 71static cl::opt<bool> DisableMemOPOPT("disable-memop-opt", cl::init(false), 72 cl::Hidden, cl::desc("Disable optimize")); 73 74// The percent threshold to optimize memory intrinsic calls. 75static cl::opt<unsigned> 76 MemOPPercentThreshold("pgo-memop-percent-threshold", cl::init(40), 77 cl::Hidden, cl::ZeroOrMore, 78 cl::desc("The percentage threshold for the " 79 "memory intrinsic calls optimization")); 80 81// Maximum number of versions for optimizing memory intrinsic call. 82static cl::opt<unsigned> 83 MemOPMaxVersion("pgo-memop-max-version", cl::init(3), cl::Hidden, 84 cl::ZeroOrMore, 85 cl::desc("The max version for the optimized memory " 86 " intrinsic calls")); 87 88// Scale the counts from the annotation using the BB count value. 89static cl::opt<bool> 90 MemOPScaleCount("pgo-memop-scale-count", cl::init(true), cl::Hidden, 91 cl::desc("Scale the memop size counts using the basic " 92 " block count value")); 93 94// This option sets the rangge of precise profile memop sizes. 95extern cl::opt<std::string> MemOPSizeRange; 96 97// This option sets the value that groups large memop sizes 98extern cl::opt<unsigned> MemOPSizeLarge; 99 100namespace { 101class PGOMemOPSizeOptLegacyPass : public FunctionPass { 102public: 103 static char ID; 104 105 PGOMemOPSizeOptLegacyPass() : FunctionPass(ID) { 106 initializePGOMemOPSizeOptLegacyPassPass(*PassRegistry::getPassRegistry()); 107 } 108 109 StringRef getPassName() const override { return "PGOMemOPSize"; } 110 111private: 112 bool runOnFunction(Function &F) override; 113 void getAnalysisUsage(AnalysisUsage &AU) const override { 114 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 115 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 116 AU.addPreserved<GlobalsAAWrapperPass>(); 117 AU.addPreserved<DominatorTreeWrapperPass>(); 118 } 119}; 120} // end anonymous namespace 121 122char PGOMemOPSizeOptLegacyPass::ID = 0; 123INITIALIZE_PASS_BEGIN(PGOMemOPSizeOptLegacyPass, "pgo-memop-opt", 124 "Optimize memory intrinsic using its size value profile", 125 false, false) 126INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 127INITIALIZE_PASS_END(PGOMemOPSizeOptLegacyPass, "pgo-memop-opt", 128 "Optimize memory intrinsic using its size value profile", 129 false, false) 130 131FunctionPass *llvm::createPGOMemOPSizeOptLegacyPass() { 132 return new PGOMemOPSizeOptLegacyPass(); 133} 134 135namespace { 136class MemOPSizeOpt : public InstVisitor<MemOPSizeOpt> { 137public: 138 MemOPSizeOpt(Function &Func, BlockFrequencyInfo &BFI, 139 OptimizationRemarkEmitter &ORE, DominatorTree *DT) 140 : Func(Func), BFI(BFI), ORE(ORE), DT(DT), Changed(false) { 141 ValueDataArray = 142 std::make_unique<InstrProfValueData[]>(MemOPMaxVersion + 2); 143 // Get the MemOPSize range information from option MemOPSizeRange, 144 getMemOPSizeRangeFromOption(MemOPSizeRange, PreciseRangeStart, 145 PreciseRangeLast); 146 } 147 bool isChanged() const { return Changed; } 148 void perform() { 149 WorkList.clear(); 150 visit(Func); 151 152 for (auto &MI : WorkList) { 153 ++NumOfPGOMemOPAnnotate; 154 if (perform(MI)) { 155 Changed = true; 156 ++NumOfPGOMemOPOpt; 157 LLVM_DEBUG(dbgs() << "MemOP call: " 158 << MI->getCalledFunction()->getName() 159 << "is Transformed.\n"); 160 } 161 } 162 } 163 164 void visitMemIntrinsic(MemIntrinsic &MI) { 165 Value *Length = MI.getLength(); 166 // Not perform on constant length calls. 167 if (dyn_cast<ConstantInt>(Length)) 168 return; 169 WorkList.push_back(&MI); 170 } 171 172private: 173 Function &Func; 174 BlockFrequencyInfo &BFI; 175 OptimizationRemarkEmitter &ORE; 176 DominatorTree *DT; 177 bool Changed; 178 std::vector<MemIntrinsic *> WorkList; 179 // Start of the previse range. 180 int64_t PreciseRangeStart; 181 // Last value of the previse range. 182 int64_t PreciseRangeLast; 183 // The space to read the profile annotation. 184 std::unique_ptr<InstrProfValueData[]> ValueDataArray; 185 bool perform(MemIntrinsic *MI); 186 187 // This kind shows which group the value falls in. For PreciseValue, we have 188 // the profile count for that value. LargeGroup groups the values that are in 189 // range [LargeValue, +inf). NonLargeGroup groups the rest of values. 190 enum MemOPSizeKind { PreciseValue, NonLargeGroup, LargeGroup }; 191 192 MemOPSizeKind getMemOPSizeKind(int64_t Value) const { 193 if (Value == MemOPSizeLarge && MemOPSizeLarge != 0) 194 return LargeGroup; 195 if (Value == PreciseRangeLast + 1) 196 return NonLargeGroup; 197 return PreciseValue; 198 } 199}; 200 201static const char *getMIName(const MemIntrinsic *MI) { 202 switch (MI->getIntrinsicID()) { 203 case Intrinsic::memcpy: 204 return "memcpy"; 205 case Intrinsic::memmove: 206 return "memmove"; 207 case Intrinsic::memset: 208 return "memset"; 209 default: 210 return "unknown"; 211 } 212} 213 214static bool isProfitable(uint64_t Count, uint64_t TotalCount) { 215 assert(Count <= TotalCount); 216 if (Count < MemOPCountThreshold) 217 return false; 218 if (Count < TotalCount * MemOPPercentThreshold / 100) 219 return false; 220 return true; 221} 222 223static inline uint64_t getScaledCount(uint64_t Count, uint64_t Num, 224 uint64_t Denom) { 225 if (!MemOPScaleCount) 226 return Count; 227 bool Overflowed; 228 uint64_t ScaleCount = SaturatingMultiply(Count, Num, &Overflowed); 229 return ScaleCount / Denom; 230} 231 232bool MemOPSizeOpt::perform(MemIntrinsic *MI) { 233 assert(MI); 234 if (MI->getIntrinsicID() == Intrinsic::memmove) 235 return false; 236 237 uint32_t NumVals, MaxNumPromotions = MemOPMaxVersion + 2; 238 uint64_t TotalCount; 239 if (!getValueProfDataFromInst(*MI, IPVK_MemOPSize, MaxNumPromotions, 240 ValueDataArray.get(), NumVals, TotalCount)) 241 return false; 242 243 uint64_t ActualCount = TotalCount; 244 uint64_t SavedTotalCount = TotalCount; 245 if (MemOPScaleCount) { 246 auto BBEdgeCount = BFI.getBlockProfileCount(MI->getParent()); 247 if (!BBEdgeCount) 248 return false; 249 ActualCount = *BBEdgeCount; 250 } 251 252 ArrayRef<InstrProfValueData> VDs(ValueDataArray.get(), NumVals); 253 LLVM_DEBUG(dbgs() << "Read one memory intrinsic profile with count " 254 << ActualCount << "\n"); 255 LLVM_DEBUG( 256 for (auto &VD 257 : VDs) { dbgs() << " (" << VD.Value << "," << VD.Count << ")\n"; }); 258 259 if (ActualCount < MemOPCountThreshold) 260 return false; 261 // Skip if the total value profiled count is 0, in which case we can't 262 // scale up the counts properly (and there is no profitable transformation). 263 if (TotalCount == 0) 264 return false; 265 266 TotalCount = ActualCount; 267 if (MemOPScaleCount) 268 LLVM_DEBUG(dbgs() << "Scale counts: numerator = " << ActualCount 269 << " denominator = " << SavedTotalCount << "\n"); 270 271 // Keeping track of the count of the default case: 272 uint64_t RemainCount = TotalCount; 273 uint64_t SavedRemainCount = SavedTotalCount; 274 SmallVector<uint64_t, 16> SizeIds; 275 SmallVector<uint64_t, 16> CaseCounts; 276 uint64_t MaxCount = 0; 277 unsigned Version = 0; 278 // Default case is in the front -- save the slot here. 279 CaseCounts.push_back(0); 280 for (auto &VD : VDs) { 281 int64_t V = VD.Value; 282 uint64_t C = VD.Count; 283 if (MemOPScaleCount) 284 C = getScaledCount(C, ActualCount, SavedTotalCount); 285 286 // Only care precise value here. 287 if (getMemOPSizeKind(V) != PreciseValue) 288 continue; 289 290 // ValueCounts are sorted on the count. Break at the first un-profitable 291 // value. 292 if (!isProfitable(C, RemainCount)) 293 break; 294 295 SizeIds.push_back(V); 296 CaseCounts.push_back(C); 297 if (C > MaxCount) 298 MaxCount = C; 299 300 assert(RemainCount >= C); 301 RemainCount -= C; 302 assert(SavedRemainCount >= VD.Count); 303 SavedRemainCount -= VD.Count; 304 305 if (++Version > MemOPMaxVersion && MemOPMaxVersion != 0) 306 break; 307 } 308 309 if (Version == 0) 310 return false; 311 312 CaseCounts[0] = RemainCount; 313 if (RemainCount > MaxCount) 314 MaxCount = RemainCount; 315 316 uint64_t SumForOpt = TotalCount - RemainCount; 317 318 LLVM_DEBUG(dbgs() << "Optimize one memory intrinsic call to " << Version 319 << " Versions (covering " << SumForOpt << " out of " 320 << TotalCount << ")\n"); 321 322 // mem_op(..., size) 323 // ==> 324 // switch (size) { 325 // case s1: 326 // mem_op(..., s1); 327 // goto merge_bb; 328 // case s2: 329 // mem_op(..., s2); 330 // goto merge_bb; 331 // ... 332 // default: 333 // mem_op(..., size); 334 // goto merge_bb; 335 // } 336 // merge_bb: 337 338 BasicBlock *BB = MI->getParent(); 339 LLVM_DEBUG(dbgs() << "\n\n== Basic Block Before ==\n"); 340 LLVM_DEBUG(dbgs() << *BB << "\n"); 341 auto OrigBBFreq = BFI.getBlockFreq(BB); 342 343 BasicBlock *DefaultBB = SplitBlock(BB, MI, DT); 344 BasicBlock::iterator It(*MI); 345 ++It; 346 assert(It != DefaultBB->end()); 347 BasicBlock *MergeBB = SplitBlock(DefaultBB, &(*It), DT); 348 MergeBB->setName("MemOP.Merge"); 349 BFI.setBlockFreq(MergeBB, OrigBBFreq.getFrequency()); 350 DefaultBB->setName("MemOP.Default"); 351 352 DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); 353 auto &Ctx = Func.getContext(); 354 IRBuilder<> IRB(BB); 355 BB->getTerminator()->eraseFromParent(); 356 Value *SizeVar = MI->getLength(); 357 SwitchInst *SI = IRB.CreateSwitch(SizeVar, DefaultBB, SizeIds.size()); 358 359 // Clear the value profile data. 360 MI->setMetadata(LLVMContext::MD_prof, nullptr); 361 // If all promoted, we don't need the MD.prof metadata. 362 if (SavedRemainCount > 0 || Version != NumVals) 363 // Otherwise we need update with the un-promoted records back. 364 annotateValueSite(*Func.getParent(), *MI, VDs.slice(Version), 365 SavedRemainCount, IPVK_MemOPSize, NumVals); 366 367 LLVM_DEBUG(dbgs() << "\n\n== Basic Block After==\n"); 368 369 std::vector<DominatorTree::UpdateType> Updates; 370 if (DT) 371 Updates.reserve(2 * SizeIds.size()); 372 373 for (uint64_t SizeId : SizeIds) { 374 BasicBlock *CaseBB = BasicBlock::Create( 375 Ctx, Twine("MemOP.Case.") + Twine(SizeId), &Func, DefaultBB); 376 Instruction *NewInst = MI->clone(); 377 // Fix the argument. 378 auto *MemI = cast<MemIntrinsic>(NewInst); 379 auto *SizeType = dyn_cast<IntegerType>(MemI->getLength()->getType()); 380 assert(SizeType && "Expected integer type size argument."); 381 ConstantInt *CaseSizeId = ConstantInt::get(SizeType, SizeId); 382 MemI->setLength(CaseSizeId); 383 CaseBB->getInstList().push_back(NewInst); 384 IRBuilder<> IRBCase(CaseBB); 385 IRBCase.CreateBr(MergeBB); 386 SI->addCase(CaseSizeId, CaseBB); 387 if (DT) { 388 Updates.push_back({DominatorTree::Insert, CaseBB, MergeBB}); 389 Updates.push_back({DominatorTree::Insert, BB, CaseBB}); 390 } 391 LLVM_DEBUG(dbgs() << *CaseBB << "\n"); 392 } 393 DTU.applyUpdates(Updates); 394 Updates.clear(); 395 396 setProfMetadata(Func.getParent(), SI, CaseCounts, MaxCount); 397 398 LLVM_DEBUG(dbgs() << *BB << "\n"); 399 LLVM_DEBUG(dbgs() << *DefaultBB << "\n"); 400 LLVM_DEBUG(dbgs() << *MergeBB << "\n"); 401 402 ORE.emit([&]() { 403 using namespace ore; 404 return OptimizationRemark(DEBUG_TYPE, "memopt-opt", MI) 405 << "optimized " << NV("Intrinsic", StringRef(getMIName(MI))) 406 << " with count " << NV("Count", SumForOpt) << " out of " 407 << NV("Total", TotalCount) << " for " << NV("Versions", Version) 408 << " versions"; 409 }); 410 411 return true; 412} 413} // namespace 414 415static bool PGOMemOPSizeOptImpl(Function &F, BlockFrequencyInfo &BFI, 416 OptimizationRemarkEmitter &ORE, 417 DominatorTree *DT) { 418 if (DisableMemOPOPT) 419 return false; 420 421 if (F.hasFnAttribute(Attribute::OptimizeForSize)) 422 return false; 423 MemOPSizeOpt MemOPSizeOpt(F, BFI, ORE, DT); 424 MemOPSizeOpt.perform(); 425 return MemOPSizeOpt.isChanged(); 426} 427 428bool PGOMemOPSizeOptLegacyPass::runOnFunction(Function &F) { 429 BlockFrequencyInfo &BFI = 430 getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 431 auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 432 auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>(); 433 DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr; 434 return PGOMemOPSizeOptImpl(F, BFI, ORE, DT); 435} 436 437namespace llvm { 438char &PGOMemOPSizeOptID = PGOMemOPSizeOptLegacyPass::ID; 439 440PreservedAnalyses PGOMemOPSizeOpt::run(Function &F, 441 FunctionAnalysisManager &FAM) { 442 auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(F); 443 auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F); 444 auto *DT = FAM.getCachedResult<DominatorTreeAnalysis>(F); 445 bool Changed = PGOMemOPSizeOptImpl(F, BFI, ORE, DT); 446 if (!Changed) 447 return PreservedAnalyses::all(); 448 auto PA = PreservedAnalyses(); 449 PA.preserve<GlobalsAA>(); 450 PA.preserve<DominatorTreeAnalysis>(); 451 return PA; 452} 453} // namespace llvm 454