//===- AMDGPUPerfHintAnalysis.cpp - analysis of functions memory traffic --===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Analyzes if a function is potentially memory bound and if a kernel
/// may benefit from limiting the number of waves to reduce cache thrashing.
///
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUPerfHintAnalysis.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/ValueMap.h"
#include "llvm/Support/CommandLine.h"

using namespace llvm;

#define DEBUG_TYPE "amdgpu-perf-hint"

// Percentage of memory instructions over total instructions above which a
// function is marked memory bound (see isMemBound).
static cl::opt<unsigned>
    MemBoundThresh("amdgpu-membound-threshold", cl::init(50), cl::Hidden,
                   cl::desc("Function mem bound threshold in %"));

// Percentage of the *weighted* memory instruction count over total
// instructions above which a kernel gets the wave limiter (see needLimitWave).
static cl::opt<unsigned>
    LimitWaveThresh("amdgpu-limit-wave-threshold", cl::init(50), cl::Hidden,
                    cl::desc("Kernel limit wave threshold in %"));

// Weight applied to each indirect-access memory instruction in the
// needLimitWave computation.
static cl::opt<unsigned>
    IAWeight("amdgpu-indirect-access-weight", cl::init(1000), cl::Hidden,
             cl::desc("Indirect access memory instruction weight"));

// Weight applied to each large-stride memory instruction in the
// needLimitWave computation.
static cl::opt<unsigned>
    LSWeight("amdgpu-large-stride-weight", cl::init(1000), cl::Hidden,
             cl::desc("Large stride memory access weight"));

// Minimum byte distance between two accesses off the same base pointer for
// the later access to be classified as large stride.
static cl::opt<unsigned>
    LargeStrideThresh("amdgpu-large-stride-threshold", cl::init(64), cl::Hidden,
                      cl::desc("Large stride memory access threshold"));

STATISTIC(NumMemBound, "Number of functions marked as memory bound");
STATISTIC(NumLimitWave, "Number of functions marked as needing limit wave");

char llvm::AMDGPUPerfHintAnalysis::ID = 0;
char &llvm::AMDGPUPerfHintAnalysisID = AMDGPUPerfHintAnalysis::ID;

INITIALIZE_PASS(AMDGPUPerfHintAnalysis, DEBUG_TYPE,
                "Analysis if a function is memory bound", true, true)

namespace {

/// Per-function worker. Walks one function's instructions, accumulates the
/// counts in the shared FuncInfoMap, and attaches the "amdgpu-memory-bound"
/// and/or "amdgpu-wave-limiter" attributes when the heuristics fire.
struct AMDGPUPerfHint {
  friend AMDGPUPerfHintAnalysis;

public:
  AMDGPUPerfHint(AMDGPUPerfHintAnalysis::FuncInfoMap &FIM_,
                 const TargetLowering *TLI_)
      : FIM(FIM_), DL(nullptr), TLI(TLI_) {}

  bool runOnFunction(Function &F);

private:
  /// A memory access decomposed into (pointer value, base pointer, constant
  /// byte offset from that base); used to compare consecutive accesses for
  /// the large-stride heuristic. Default-constructed state (null Base) means
  /// "no usable reference access".
  struct MemAccessInfo {
    const Value *V;
    const Value *Base;
    int64_t Offset;
    MemAccessInfo() : V(nullptr), Base(nullptr), Offset(0) {}
    bool isLargeStride(MemAccessInfo &Reference) const;
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    Printable print() const {
      return Printable([this](raw_ostream &OS) {
        OS << "Value: " << *V << '\n'
           << "Base: " << *Base << " Offset: " << Offset << '\n';
      });
    }
#endif
  };

  MemAccessInfo makeMemAccessInfo(Instruction *) const;

  MemAccessInfo LastAccess; // Last memory access info

  AMDGPUPerfHintAnalysis::FuncInfoMap &FIM;

  const DataLayout *DL;

  const TargetLowering *TLI;

  AMDGPUPerfHintAnalysis::FuncInfo *visit(const Function &F);
  static bool isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &F);
  static bool needLimitWave(const AMDGPUPerfHintAnalysis::FuncInfo &F);

  bool isIndirectAccess(const Instruction *Inst) const;

  /// Check if the instruction is large stride.
  /// The purpose is to identify memory access pattern like:
  /// x = a[i];
  /// y = a[i+1000];
  /// z = a[i+2000];
  /// In the above example, the second and third memory access will be marked
  /// large stride memory access.
  bool isLargeStride(const Instruction *Inst);

  bool isGlobalAddr(const Value *V) const;
  bool isLocalAddr(const Value *V) const;
  bool isConstantAddr(const Value *V) const;
};

/// Return the pointer operand if \p Inst accesses memory (load, store,
/// atomic cmpxchg/rmw, or the destination of a mem intrinsic), otherwise
/// nullptr. Used both as a "is this a memory instruction" predicate and to
/// obtain the address being accessed.
static const Value *getMemoryInstrPtr(const Instruction *Inst) {
  if (auto LI = dyn_cast<LoadInst>(Inst)) {
    return LI->getPointerOperand();
  }
  if (auto SI = dyn_cast<StoreInst>(Inst)) {
    return SI->getPointerOperand();
  }
  if (auto AI = dyn_cast<AtomicCmpXchgInst>(Inst)) {
    return AI->getPointerOperand();
  }
  if (auto AI = dyn_cast<AtomicRMWInst>(Inst)) {
    return AI->getPointerOperand();
  }
  if (auto MI = dyn_cast<AnyMemIntrinsic>(Inst)) {
    return MI->getRawDest();
  }

  return nullptr;
}

/// Return true if \p Inst is a global (or flat) memory access whose address
/// is itself derived - possibly through GEPs, casts, arithmetic, selects, or
/// vector extracts - from a value loaded out of global/local/constant memory,
/// i.e. a pointer-chasing ("indirect") access.
bool AMDGPUPerfHint::isIndirectAccess(const Instruction *Inst) const {
  LLVM_DEBUG(dbgs() << "[isIndirectAccess] " << *Inst << '\n');
  SmallSet<const Value *, 32> WorkSet;
  SmallSet<const Value *, 32> Visited;
  // Only global/flat accesses can be indirect for this heuristic; seed the
  // backward walk with the accessed pointer.
  if (const Value *MO = getMemoryInstrPtr(Inst)) {
    if (isGlobalAddr(MO))
      WorkSet.insert(MO);
  }

  // Backward dataflow walk over the address computation. Visited guards
  // against cycles (e.g. through PHI-free expression sharing).
  while (!WorkSet.empty()) {
    const Value *V = *WorkSet.begin();
    WorkSet.erase(*WorkSet.begin());
    if (!Visited.insert(V).second)
      continue;
    LLVM_DEBUG(dbgs() << " check: " << *V << '\n');

    // Reached a load: the address depends on loaded data -> indirect.
    if (auto LD = dyn_cast<LoadInst>(V)) {
      auto M = LD->getPointerOperand();
      if (isGlobalAddr(M) || isLocalAddr(M) || isConstantAddr(M)) {
        LLVM_DEBUG(dbgs() << " is IA\n");
        return true;
      }
      continue;
    }

    // GEP: trace both the base pointer and every index operand.
    if (auto GEP = dyn_cast<GetElementPtrInst>(V)) {
      auto P = GEP->getPointerOperand();
      WorkSet.insert(P);
      for (unsigned I = 1, E = GEP->getNumIndices() + 1; I != E; ++I)
        WorkSet.insert(GEP->getOperand(I));
      continue;
    }

    // Casts and other single-operand instructions: look through.
    if (auto U = dyn_cast<UnaryInstruction>(V)) {
      WorkSet.insert(U->getOperand(0));
      continue;
    }

    if (auto BO = dyn_cast<BinaryOperator>(V)) {
      WorkSet.insert(BO->getOperand(0));
      WorkSet.insert(BO->getOperand(1));
      continue;
    }

    if (auto S = dyn_cast<SelectInst>(V)) {
      WorkSet.insert(S->getFalseValue());
      WorkSet.insert(S->getTrueValue());
      continue;
    }

    if (auto E = dyn_cast<ExtractElementInst>(V)) {
      WorkSet.insert(E->getVectorOperand());
      continue;
    }

    // Anything else (arguments, constants, PHIs, ...) ends the walk on this
    // path without proving indirection.
    LLVM_DEBUG(dbgs() << " dropped\n");
  }

  LLVM_DEBUG(dbgs() << " is not IA\n");
  return false;
}

/// Compute (and cache in FIM) the instruction statistics for \p F.
/// Statistics of callees already present in FIM are folded into the caller;
/// since this runs as a call-graph-SCC pass, callees are normally analyzed
/// before their callers.
AMDGPUPerfHintAnalysis::FuncInfo *AMDGPUPerfHint::visit(const Function &F) {
  AMDGPUPerfHintAnalysis::FuncInfo &FI = FIM[&F];

  LLVM_DEBUG(dbgs() << "[AMDGPUPerfHint] process " << F.getName() << '\n');

  for (auto &B : F) {
    // Large-stride detection compares only accesses within one basic block.
    LastAccess = MemAccessInfo();
    for (auto &I : B) {
      if (getMemoryInstrPtr(&I)) {
        if (isIndirectAccess(&I))
          ++FI.IAMInstCount;
        if (isLargeStride(&I))
          ++FI.LSMInstCount;
        ++FI.MemInstCount;
        ++FI.InstCount;
        continue;
      }
      CallSite CS(const_cast<Instruction *>(&I));
      if (CS) {
        Function *Callee = CS.getCalledFunction();
        if (!Callee || Callee->isDeclaration()) {
          // Unknown or external callee: count the call as one instruction.
          ++FI.InstCount;
          continue;
        }
        if (&F == Callee) // Handle immediate recursion
          continue;

        auto Loc = FIM.find(Callee);
        // Callee not yet analyzed (e.g. mutual recursion inside this SCC):
        // skip rather than recurse.
        if (Loc == FIM.end())
          continue;

        // Fold the callee's counts into the caller, approximating inlining.
        FI.MemInstCount += Loc->second.MemInstCount;
        FI.InstCount += Loc->second.InstCount;
        FI.IAMInstCount += Loc->second.IAMInstCount;
        FI.LSMInstCount += Loc->second.LSMInstCount;
      } else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
        TargetLoweringBase::AddrMode AM;
        auto *Ptr = GetPointerBaseWithConstantOffset(GEP, AM.BaseOffs, *DL);
        AM.BaseGV = dyn_cast_or_null<GlobalValue>(const_cast<Value *>(Ptr));
        AM.HasBaseReg = !AM.BaseGV;
        if (TLI->isLegalAddressingMode(*DL, AM, GEP->getResultElementType(),
                                       GEP->getPointerAddressSpace()))
          // Offset will likely be folded into load or store
          continue;
        ++FI.InstCount;
      } else {
        ++FI.InstCount;
      }
    }
  }

  return &FI;
}

/// Analyze \p F and attach "amdgpu-memory-bound" / "amdgpu-wave-limiter"
/// attributes when the corresponding heuristics trigger. Returns false only
/// when both attributes are already present and the analysis is skipped.
bool AMDGPUPerfHint::runOnFunction(Function &F) {
  const Module &M = *F.getParent();
  DL = &M.getDataLayout();

  // Both hints already set (e.g. by frontend or a previous run): nothing to
  // compute.
  if (F.hasFnAttribute("amdgpu-wave-limiter") &&
      F.hasFnAttribute("amdgpu-memory-bound"))
    return false;

  const AMDGPUPerfHintAnalysis::FuncInfo *Info = visit(F);

  LLVM_DEBUG(dbgs() << F.getName() << " MemInst: " << Info->MemInstCount
                    << '\n'
                    << " IAMInst: " << Info->IAMInstCount << '\n'
                    << " LSMInst: " << Info->LSMInstCount << '\n'
                    << " TotalInst: " << Info->InstCount << '\n');

  if (isMemBound(*Info)) {
    LLVM_DEBUG(dbgs() << F.getName() << " is memory bound\n");
    NumMemBound++;
    F.addFnAttr("amdgpu-memory-bound", "true");
  }

  // Wave limiting is only meaningful for kernel entry points.
  if (AMDGPU::isEntryFunctionCC(F.getCallingConv()) && needLimitWave(*Info)) {
    LLVM_DEBUG(dbgs() << F.getName() << " needs limit wave\n");
    NumLimitWave++;
    F.addFnAttr("amdgpu-wave-limiter", "true");
  }

  return true;
}

/// True if memory instructions make up more than MemBoundThresh percent of
/// the (approximate) total instruction count.
bool AMDGPUPerfHint::isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &FI) {
  return FI.MemInstCount * 100 / FI.InstCount > MemBoundThresh;
}

/// True if the weighted memory instruction ratio - indirect and large-stride
/// accesses scaled by IAWeight/LSWeight - exceeds LimitWaveThresh percent.
bool AMDGPUPerfHint::needLimitWave(const AMDGPUPerfHintAnalysis::FuncInfo &FI) {
  return ((FI.MemInstCount + FI.IAMInstCount * IAWeight +
           FI.LSMInstCount * LSWeight) *
          100 / FI.InstCount) > LimitWaveThresh;
}

/// True if \p V is a pointer into the global or flat address space.
bool AMDGPUPerfHint::isGlobalAddr(const Value *V) const {
  if (auto PT = dyn_cast<PointerType>(V->getType())) {
    unsigned As = PT->getAddressSpace();
    // Flat likely points to global too.
    return As == AMDGPUAS::GLOBAL_ADDRESS || As == AMDGPUAS::FLAT_ADDRESS;
  }
  return false;
}

/// True if \p V is a pointer into the local (LDS) address space.
bool AMDGPUPerfHint::isLocalAddr(const Value *V) const {
  if (auto PT = dyn_cast<PointerType>(V->getType()))
    return PT->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
  return false;
}

/// Compare \p Inst's access against the previous access in this block
/// (LastAccess) and report whether it is a large-stride access. Also updates
/// LastAccess to this access when its base pointer could be decomposed, so
/// the next access in the block is compared against it.
bool AMDGPUPerfHint::isLargeStride(const Instruction *Inst) {
  LLVM_DEBUG(dbgs() << "[isLargeStride] " << *Inst << '\n');

  MemAccessInfo MAI = makeMemAccessInfo(const_cast<Instruction *>(Inst));
  bool IsLargeStride = MAI.isLargeStride(LastAccess);
  if (MAI.Base)
    LastAccess = std::move(MAI);

  return IsLargeStride;
}

/// Decompose the pointer operand of memory instruction \p Inst into a base
/// pointer plus constant byte offset. Returns an empty MemAccessInfo (null
/// Base) for local-address accesses, which are excluded from the heuristic.
AMDGPUPerfHint::MemAccessInfo
AMDGPUPerfHint::makeMemAccessInfo(Instruction *Inst) const {
  MemAccessInfo MAI;
  const Value *MO = getMemoryInstrPtr(Inst);

  LLVM_DEBUG(dbgs() << "[isLargeStride] MO: " << *MO << '\n');
  // Do not treat local-addr memory access as large stride.
  if (isLocalAddr(MO))
    return MAI;

  MAI.V = MO;
  MAI.Base = GetPointerBaseWithConstantOffset(MO, MAI.Offset, *DL);
  return MAI;
}

/// True if \p V is a pointer into a constant address space (either flavor).
bool AMDGPUPerfHint::isConstantAddr(const Value *V) const {
  if (auto PT = dyn_cast<PointerType>(V->getType())) {
    unsigned As = PT->getAddressSpace();
    return As == AMDGPUAS::CONSTANT_ADDRESS ||
           As == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
  }
  return false;
}

/// True if this access and \p Reference share the same (non-null) base
/// pointer and their constant offsets differ by more than LargeStrideThresh
/// bytes.
bool AMDGPUPerfHint::MemAccessInfo::isLargeStride(
    MemAccessInfo &Reference) const {

  if (!Base || !Reference.Base || Base != Reference.Base)
    return false;

  // Absolute offset difference, computed in unsigned space to avoid signed
  // overflow on extreme offsets.
  uint64_t Diff = Offset > Reference.Offset ? Offset - Reference.Offset
                                            : Reference.Offset - Offset;
  bool Result = Diff > LargeStrideThresh;
  LLVM_DEBUG(dbgs() << "[isLargeStride compare]\n"
                    << print() << "<=>\n"
                    << Reference.print() << "Result:" << Result << '\n');
  return Result;
}
} // namespace

/// SCC driver: run the per-function analysis on every defined function in
/// the SCC, using each function's own subtarget lowering info.
bool AMDGPUPerfHintAnalysis::runOnSCC(CallGraphSCC &SCC) {
  // The target machine is needed for subtarget/lowering queries; bail out
  // (changing nothing) when it is unavailable.
  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;

  const TargetMachine &TM = TPC->getTM<TargetMachine>();

  bool Changed = false;
  for (CallGraphNode *I : SCC) {
    Function *F = I->getFunction();
    if (!F || F->isDeclaration())
      continue;

    // Subtarget (and thus addressing-mode legality) can differ per function.
    const TargetSubtargetInfo *ST = TM.getSubtargetImpl(*F);
    AMDGPUPerfHint Analyzer(FIM, ST->getTargetLowering());

    if (Analyzer.runOnFunction(*F))
      Changed = true;
  }

  return Changed;
}

/// Query: was \p F found to be memory bound? False when \p F was never
/// analyzed.
bool AMDGPUPerfHintAnalysis::isMemoryBound(const Function *F) const {
  auto FI = FIM.find(F);
  if (FI == FIM.end())
    return false;

  return AMDGPUPerfHint::isMemBound(FI->second);
}

/// Query: does kernel \p F need the wave limiter? False when \p F was never
/// analyzed.
bool AMDGPUPerfHintAnalysis::needsWaveLimiter(const Function *F) const {
  auto FI = FIM.find(F);
  if (FI == FIM.end())
    return false;

  return AMDGPUPerfHint::needLimitWave(FI->second);
}