1//===- AMDGPUPerfHintAnalysis.cpp - analysis of functions memory traffic --===// 2// 3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4// See https://llvm.org/LICENSE.txt for license information. 5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6// 7//===----------------------------------------------------------------------===// 8// 9/// \file 10/// \brief Analyzes if a function potentially memory bound and if a kernel 11/// kernel may benefit from limiting number of waves to reduce cache thrashing. 12/// 13//===----------------------------------------------------------------------===// 14 15#include "AMDGPU.h" 16#include "AMDGPUPerfHintAnalysis.h" 17#include "Utils/AMDGPUBaseInfo.h" 18#include "llvm/ADT/SmallSet.h" 19#include "llvm/ADT/Statistic.h" 20#include "llvm/Analysis/CallGraph.h" 21#include "llvm/Analysis/ValueTracking.h" 22#include "llvm/CodeGen/TargetLowering.h" 23#include "llvm/CodeGen/TargetPassConfig.h" 24#include "llvm/CodeGen/TargetSubtargetInfo.h" 25#include "llvm/IR/Instructions.h" 26#include "llvm/IR/IntrinsicInst.h" 27#include "llvm/Support/CommandLine.h" 28#include "llvm/Target/TargetMachine.h" 29 30using namespace llvm; 31 32#define DEBUG_TYPE "amdgpu-perf-hint" 33 34static cl::opt<unsigned> 35 MemBoundThresh("amdgpu-membound-threshold", cl::init(50), cl::Hidden, 36 cl::desc("Function mem bound threshold in %")); 37 38static cl::opt<unsigned> 39 LimitWaveThresh("amdgpu-limit-wave-threshold", cl::init(50), cl::Hidden, 40 cl::desc("Kernel limit wave threshold in %")); 41 42static cl::opt<unsigned> 43 IAWeight("amdgpu-indirect-access-weight", cl::init(1000), cl::Hidden, 44 cl::desc("Indirect access memory instruction weight")); 45 46static cl::opt<unsigned> 47 LSWeight("amdgpu-large-stride-weight", cl::init(1000), cl::Hidden, 48 cl::desc("Large stride memory access weight")); 49 50static cl::opt<unsigned> 51 LargeStrideThresh("amdgpu-large-stride-threshold", cl::init(64), cl::Hidden, 52 cl::desc("Large stride memory access threshold")); 53 54STATISTIC(NumMemBound, "Number of functions marked as memory bound"); 55STATISTIC(NumLimitWave, "Number of functions marked as needing limit wave"); 56 57char llvm::AMDGPUPerfHintAnalysis::ID = 0; 58char &llvm::AMDGPUPerfHintAnalysisID = AMDGPUPerfHintAnalysis::ID; 59 60INITIALIZE_PASS(AMDGPUPerfHintAnalysis, DEBUG_TYPE, 61 "Analysis if a function is memory bound", true, true) 62 63namespace { 64 65struct AMDGPUPerfHint { 66 friend AMDGPUPerfHintAnalysis; 67 68public: 69 AMDGPUPerfHint(AMDGPUPerfHintAnalysis::FuncInfoMap &FIM_, 70 const TargetLowering *TLI_) 71 : FIM(FIM_), DL(nullptr), TLI(TLI_) {} 72 73 bool runOnFunction(Function &F); 74 75private: 76 struct MemAccessInfo { 77 const Value *V = nullptr; 78 const Value *Base = nullptr; 79 int64_t Offset = 0; 80 MemAccessInfo() = default; 81 bool isLargeStride(MemAccessInfo &Reference) const; 82#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 83 Printable print() const { 84 return Printable([this](raw_ostream &OS) { 85 OS << "Value: " << *V << '\n' 86 << "Base: " << *Base << " Offset: " << Offset << '\n'; 87 }); 88 } 89#endif 90 }; 91 92 MemAccessInfo makeMemAccessInfo(Instruction *) const; 93 94 MemAccessInfo LastAccess; // Last memory access info 95 96 AMDGPUPerfHintAnalysis::FuncInfoMap &FIM; 97 98 const DataLayout *DL; 99 100 const TargetLowering *TLI; 101 102 AMDGPUPerfHintAnalysis::FuncInfo *visit(const Function &F); 103 static bool isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &F); 104 static bool needLimitWave(const AMDGPUPerfHintAnalysis::FuncInfo &F); 105 106 bool isIndirectAccess(const Instruction *Inst) const; 107 108 /// Check if the instruction is large stride. 109 /// The purpose is to identify memory access pattern like: 110 /// x = a[i]; 111 /// y = a[i+1000]; 112 /// z = a[i+2000]; 113 /// In the above example, the second and third memory access will be marked 114 /// large stride memory access. 115 bool isLargeStride(const Instruction *Inst); 116 117 bool isGlobalAddr(const Value *V) const; 118 bool isLocalAddr(const Value *V) const; 119 bool isGlobalLoadUsedInBB(const Instruction &) const; 120}; 121 122static std::pair<const Value *, const Type *> getMemoryInstrPtrAndType( 123 const Instruction *Inst) { 124 if (auto LI = dyn_cast<LoadInst>(Inst)) 125 return {LI->getPointerOperand(), LI->getType()}; 126 if (auto SI = dyn_cast<StoreInst>(Inst)) 127 return {SI->getPointerOperand(), SI->getValueOperand()->getType()}; 128 if (auto AI = dyn_cast<AtomicCmpXchgInst>(Inst)) 129 return {AI->getPointerOperand(), AI->getCompareOperand()->getType()}; 130 if (auto AI = dyn_cast<AtomicRMWInst>(Inst)) 131 return {AI->getPointerOperand(), AI->getValOperand()->getType()}; 132 if (auto MI = dyn_cast<AnyMemIntrinsic>(Inst)) 133 return {MI->getRawDest(), Type::getInt8Ty(MI->getContext())}; 134 135 return {nullptr, nullptr}; 136} 137 138bool AMDGPUPerfHint::isIndirectAccess(const Instruction *Inst) const { 139 LLVM_DEBUG(dbgs() << "[isIndirectAccess] " << *Inst << '\n'); 140 SmallSet<const Value *, 32> WorkSet; 141 SmallSet<const Value *, 32> Visited; 142 if (const Value *MO = getMemoryInstrPtrAndType(Inst).first) { 143 if (isGlobalAddr(MO)) 144 WorkSet.insert(MO); 145 } 146 147 while (!WorkSet.empty()) { 148 const Value *V = *WorkSet.begin(); 149 WorkSet.erase(*WorkSet.begin()); 150 if (!Visited.insert(V).second) 151 continue; 152 LLVM_DEBUG(dbgs() << " check: " << *V << '\n'); 153 154 if (auto LD = dyn_cast<LoadInst>(V)) { 155 auto M = LD->getPointerOperand(); 156 if (isGlobalAddr(M)) { 157 LLVM_DEBUG(dbgs() << " is IA\n"); 158 return true; 159 } 160 continue; 161 } 162 163 if (auto GEP = dyn_cast<GetElementPtrInst>(V)) { 164 auto P = GEP->getPointerOperand(); 165 WorkSet.insert(P); 166 for (unsigned I = 1, E = GEP->getNumIndices() + 1; I != E; ++I) 167 WorkSet.insert(GEP->getOperand(I)); 168 continue; 169 } 170 171 if (auto U = dyn_cast<UnaryInstruction>(V)) { 172 WorkSet.insert(U->getOperand(0)); 173 continue; 174 } 175 176 if (auto BO = dyn_cast<BinaryOperator>(V)) { 177 WorkSet.insert(BO->getOperand(0)); 178 WorkSet.insert(BO->getOperand(1)); 179 continue; 180 } 181 182 if (auto S = dyn_cast<SelectInst>(V)) { 183 WorkSet.insert(S->getFalseValue()); 184 WorkSet.insert(S->getTrueValue()); 185 continue; 186 } 187 188 if (auto E = dyn_cast<ExtractElementInst>(V)) { 189 WorkSet.insert(E->getVectorOperand()); 190 continue; 191 } 192 193 LLVM_DEBUG(dbgs() << " dropped\n"); 194 } 195 196 LLVM_DEBUG(dbgs() << " is not IA\n"); 197 return false; 198} 199 200// Returns true if the global load `I` is used in its own basic block. 201bool AMDGPUPerfHint::isGlobalLoadUsedInBB(const Instruction &I) const { 202 const auto *Ld = dyn_cast<LoadInst>(&I); 203 if (!Ld) 204 return false; 205 if (!isGlobalAddr(Ld->getPointerOperand())) 206 return false; 207 208 for (const User *Usr : Ld->users()) { 209 if (const Instruction *UsrInst = dyn_cast<Instruction>(Usr)) { 210 if (UsrInst->getParent() == I.getParent()) 211 return true; 212 } 213 } 214 215 return false; 216} 217 218AMDGPUPerfHintAnalysis::FuncInfo *AMDGPUPerfHint::visit(const Function &F) { 219 AMDGPUPerfHintAnalysis::FuncInfo &FI = FIM[&F]; 220 221 LLVM_DEBUG(dbgs() << "[AMDGPUPerfHint] process " << F.getName() << '\n'); 222 223 for (auto &B : F) { 224 LastAccess = MemAccessInfo(); 225 unsigned UsedGlobalLoadsInBB = 0; 226 for (auto &I : B) { 227 if (const Type *Ty = getMemoryInstrPtrAndType(&I).second) { 228 unsigned Size = divideCeil(Ty->getPrimitiveSizeInBits(), 32); 229 // TODO: Check if the global load and its user are close to each other 230 // instead (Or do this analysis in GCNSchedStrategy?). 231 if (isGlobalLoadUsedInBB(I)) 232 UsedGlobalLoadsInBB += Size; 233 if (isIndirectAccess(&I)) 234 FI.IAMInstCost += Size; 235 if (isLargeStride(&I)) 236 FI.LSMInstCost += Size; 237 FI.MemInstCost += Size; 238 FI.InstCost += Size; 239 continue; 240 } 241 if (auto *CB = dyn_cast<CallBase>(&I)) { 242 Function *Callee = CB->getCalledFunction(); 243 if (!Callee || Callee->isDeclaration()) { 244 ++FI.InstCost; 245 continue; 246 } 247 if (&F == Callee) // Handle immediate recursion 248 continue; 249 250 auto Loc = FIM.find(Callee); 251 if (Loc == FIM.end()) 252 continue; 253 254 FI.MemInstCost += Loc->second.MemInstCost; 255 FI.InstCost += Loc->second.InstCost; 256 FI.IAMInstCost += Loc->second.IAMInstCost; 257 FI.LSMInstCost += Loc->second.LSMInstCost; 258 } else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) { 259 TargetLoweringBase::AddrMode AM; 260 auto *Ptr = GetPointerBaseWithConstantOffset(GEP, AM.BaseOffs, *DL); 261 AM.BaseGV = dyn_cast_or_null<GlobalValue>(const_cast<Value *>(Ptr)); 262 AM.HasBaseReg = !AM.BaseGV; 263 if (TLI->isLegalAddressingMode(*DL, AM, GEP->getResultElementType(), 264 GEP->getPointerAddressSpace())) 265 // Offset will likely be folded into load or store 266 continue; 267 ++FI.InstCost; 268 } else { 269 ++FI.InstCost; 270 } 271 } 272 273 if (!FI.HasDenseGlobalMemAcc) { 274 unsigned GlobalMemAccPercentage = UsedGlobalLoadsInBB * 100 / B.size(); 275 if (GlobalMemAccPercentage > 50) { 276 LLVM_DEBUG(dbgs() << "[HasDenseGlobalMemAcc] Set to true since " 277 << B.getName() << " has " << GlobalMemAccPercentage 278 << "% global memory access\n"); 279 FI.HasDenseGlobalMemAcc = true; 280 } 281 } 282 } 283 284 return &FI; 285} 286 287bool AMDGPUPerfHint::runOnFunction(Function &F) { 288 const Module &M = *F.getParent(); 289 DL = &M.getDataLayout(); 290 291 if (F.hasFnAttribute("amdgpu-wave-limiter") && 292 F.hasFnAttribute("amdgpu-memory-bound")) 293 return false; 294 295 const AMDGPUPerfHintAnalysis::FuncInfo *Info = visit(F); 296 297 LLVM_DEBUG(dbgs() << F.getName() << " MemInst cost: " << Info->MemInstCost 298 << '\n' 299 << " IAMInst cost: " << Info->IAMInstCost << '\n' 300 << " LSMInst cost: " << Info->LSMInstCost << '\n' 301 << " TotalInst cost: " << Info->InstCost << '\n'); 302 303 bool Changed = false; 304 305 if (isMemBound(*Info)) { 306 LLVM_DEBUG(dbgs() << F.getName() << " is memory bound\n"); 307 NumMemBound++; 308 F.addFnAttr("amdgpu-memory-bound", "true"); 309 Changed = true; 310 } 311 312 if (AMDGPU::isEntryFunctionCC(F.getCallingConv()) && needLimitWave(*Info)) { 313 LLVM_DEBUG(dbgs() << F.getName() << " needs limit wave\n"); 314 NumLimitWave++; 315 F.addFnAttr("amdgpu-wave-limiter", "true"); 316 Changed = true; 317 } 318 319 return Changed; 320} 321 322bool AMDGPUPerfHint::isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &FI) { 323 // Reverting optimal scheduling in favour of occupancy with basic block(s) 324 // having dense global memory access can potentially hurt performance. 325 if (FI.HasDenseGlobalMemAcc) 326 return true; 327 328 return FI.MemInstCost * 100 / FI.InstCost > MemBoundThresh; 329} 330 331bool AMDGPUPerfHint::needLimitWave(const AMDGPUPerfHintAnalysis::FuncInfo &FI) { 332 return ((FI.MemInstCost + FI.IAMInstCost * IAWeight + 333 FI.LSMInstCost * LSWeight) * 100 / FI.InstCost) > LimitWaveThresh; 334} 335 336bool AMDGPUPerfHint::isGlobalAddr(const Value *V) const { 337 if (auto PT = dyn_cast<PointerType>(V->getType())) { 338 unsigned As = PT->getAddressSpace(); 339 // Flat likely points to global too. 340 return As == AMDGPUAS::GLOBAL_ADDRESS || As == AMDGPUAS::FLAT_ADDRESS; 341 } 342 return false; 343} 344 345bool AMDGPUPerfHint::isLocalAddr(const Value *V) const { 346 if (auto PT = dyn_cast<PointerType>(V->getType())) 347 return PT->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS; 348 return false; 349} 350 351bool AMDGPUPerfHint::isLargeStride(const Instruction *Inst) { 352 LLVM_DEBUG(dbgs() << "[isLargeStride] " << *Inst << '\n'); 353 354 MemAccessInfo MAI = makeMemAccessInfo(const_cast<Instruction *>(Inst)); 355 bool IsLargeStride = MAI.isLargeStride(LastAccess); 356 if (MAI.Base) 357 LastAccess = std::move(MAI); 358 359 return IsLargeStride; 360} 361 362AMDGPUPerfHint::MemAccessInfo 363AMDGPUPerfHint::makeMemAccessInfo(Instruction *Inst) const { 364 MemAccessInfo MAI; 365 const Value *MO = getMemoryInstrPtrAndType(Inst).first; 366 367 LLVM_DEBUG(dbgs() << "[isLargeStride] MO: " << *MO << '\n'); 368 // Do not treat local-addr memory access as large stride. 369 if (isLocalAddr(MO)) 370 return MAI; 371 372 MAI.V = MO; 373 MAI.Base = GetPointerBaseWithConstantOffset(MO, MAI.Offset, *DL); 374 return MAI; 375} 376 377bool AMDGPUPerfHint::MemAccessInfo::isLargeStride( 378 MemAccessInfo &Reference) const { 379 380 if (!Base || !Reference.Base || Base != Reference.Base) 381 return false; 382 383 uint64_t Diff = Offset > Reference.Offset ? Offset - Reference.Offset 384 : Reference.Offset - Offset; 385 bool Result = Diff > LargeStrideThresh; 386 LLVM_DEBUG(dbgs() << "[isLargeStride compare]\n" 387 << print() << "<=>\n" 388 << Reference.print() << "Result:" << Result << '\n'); 389 return Result; 390} 391} // namespace 392 393bool AMDGPUPerfHintAnalysis::runOnSCC(CallGraphSCC &SCC) { 394 auto *TPC = getAnalysisIfAvailable<TargetPassConfig>(); 395 if (!TPC) 396 return false; 397 398 const TargetMachine &TM = TPC->getTM<TargetMachine>(); 399 400 bool Changed = false; 401 for (CallGraphNode *I : SCC) { 402 Function *F = I->getFunction(); 403 if (!F || F->isDeclaration()) 404 continue; 405 406 const TargetSubtargetInfo *ST = TM.getSubtargetImpl(*F); 407 AMDGPUPerfHint Analyzer(FIM, ST->getTargetLowering()); 408 409 if (Analyzer.runOnFunction(*F)) 410 Changed = true; 411 } 412 413 return Changed; 414} 415 416bool AMDGPUPerfHintAnalysis::isMemoryBound(const Function *F) const { 417 auto FI = FIM.find(F); 418 if (FI == FIM.end()) 419 return false; 420 421 return AMDGPUPerfHint::isMemBound(FI->second); 422} 423 424bool AMDGPUPerfHintAnalysis::needsWaveLimiter(const Function *F) const { 425 auto FI = FIM.find(F); 426 if (FI == FIM.end()) 427 return false; 428 429 return AMDGPUPerfHint::needLimitWave(FI->second); 430} 431