//===- AMDGPUPerfHintAnalysis.cpp - analysis of functions memory traffic --===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Analyzes if a function is potentially memory bound and if a kernel
/// may benefit from limiting the number of waves to reduce cache thrashing.
///
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUPerfHintAnalysis.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/ValueMap.h"
#include "llvm/Support/CommandLine.h"

using namespace llvm;

#define DEBUG_TYPE "amdgpu-perf-hint"

static cl::opt<unsigned>
    MemBoundThresh("amdgpu-membound-threshold", cl::init(50), cl::Hidden,
                   cl::desc("Function mem bound threshold in %"));

static cl::opt<unsigned>
    LimitWaveThresh("amdgpu-limit-wave-threshold", cl::init(50), cl::Hidden,
                    cl::desc("Kernel limit wave threshold in %"));

static cl::opt<unsigned>
    IAWeight("amdgpu-indirect-access-weight", cl::init(1000), cl::Hidden,
             cl::desc("Indirect access memory instruction weight"));

static cl::opt<unsigned>
    LSWeight("amdgpu-large-stride-weight", cl::init(1000), cl::Hidden,
             cl::desc("Large stride memory access weight"));

static cl::opt<unsigned>
    LargeStrideThresh("amdgpu-large-stride-threshold", cl::init(64), cl::Hidden,
                      cl::desc("Large stride memory access threshold"));

STATISTIC(NumMemBound, "Number of functions marked as memory bound");
STATISTIC(NumLimitWave, "Number of functions marked as needing limit wave");

char llvm::AMDGPUPerfHintAnalysis::ID = 0;
char &llvm::AMDGPUPerfHintAnalysisID = AMDGPUPerfHintAnalysis::ID;

INITIALIZE_PASS(AMDGPUPerfHintAnalysis, DEBUG_TYPE,
                "Analysis if a function is memory bound", true, true)

namespace {

struct AMDGPUPerfHint {
  friend AMDGPUPerfHintAnalysis;

public:
  AMDGPUPerfHint(AMDGPUPerfHintAnalysis::FuncInfoMap &FIM_,
                 const TargetLowering *TLI_)
      : FIM(FIM_), DL(nullptr), TLI(TLI_) {}

  bool runOnFunction(Function &F);

private:
  struct MemAccessInfo {
    const Value *V;
    const Value *Base;
    int64_t Offset;
    MemAccessInfo() : V(nullptr), Base(nullptr), Offset(0) {}
    bool isLargeStride(MemAccessInfo &Reference) const;
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    Printable print() const {
      return Printable([this](raw_ostream &OS) {
        OS << "Value: " << *V << '\n'
           << "Base: " << *Base << " Offset: " << Offset << '\n';
      });
    }
#endif
  };

  MemAccessInfo makeMemAccessInfo(Instruction *) const;

  MemAccessInfo LastAccess; // Last memory access info

  AMDGPUPerfHintAnalysis::FuncInfoMap &FIM;

  const DataLayout *DL;

  const TargetLowering *TLI;

  AMDGPUPerfHintAnalysis::FuncInfo *visit(const Function &F);
  static bool isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &F);
  static bool needLimitWave(const AMDGPUPerfHintAnalysis::FuncInfo &F);

  bool isIndirectAccess(const Instruction *Inst) const;

  /// Check if the instruction is a large-stride memory access.
  /// The purpose is to identify memory access patterns like:
  /// x = a[i];
  /// y = a[i+1000];
  /// z = a[i+2000];
  /// In the above example, the second and third memory accesses will be
  /// marked as large-stride accesses.
  bool isLargeStride(const Instruction *Inst);

  bool isGlobalAddr(const Value *V) const;
  bool isLocalAddr(const Value *V) const;
  bool isConstantAddr(const Value *V) const;
};

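// Return the pointer operand of a memory-accessing instruction, or nullptr
// if Inst does not access memory.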
static const Value *getMemoryInstrPtr(const Instruction *Inst) {
  if (auto LI = dyn_cast<LoadInst>(Inst)) {
    return LI->getPointerOperand();
  }
  if (auto SI = dyn_cast<StoreInst>(Inst)) {
    return SI->getPointerOperand();
  }
  if (auto AI = dyn_cast<AtomicCmpXchgInst>(Inst)) {
    return AI->getPointerOperand();
  }
  if (auto AI = dyn_cast<AtomicRMWInst>(Inst)) {
    return AI->getPointerOperand();
  }
  if (auto MI = dyn_cast<AnyMemIntrinsic>(Inst)) {
    return MI->getRawDest();
  }

  return nullptr;
}

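// Decide whether a global memory access is "indirect": walk the address
// computation backwards through a worklist, and report true if any value
// feeding the address is itself loaded from memory.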
bool AMDGPUPerfHint::isIndirectAccess(const Instruction *Inst) const {
  LLVM_DEBUG(dbgs() << "[isIndirectAccess] " << *Inst << '\n');
  SmallSet<const Value *, 32> WorkSet;
  SmallSet<const Value *, 32> Visited;
  if (const Value *MO = getMemoryInstrPtr(Inst)) {
    if (isGlobalAddr(MO))
      WorkSet.insert(MO);
  }

  while (!WorkSet.empty()) {
    const Value *V = *WorkSet.begin();
    WorkSet.erase(*WorkSet.begin());
    if (!Visited.insert(V).second)
      continue;
    LLVM_DEBUG(dbgs() << "  check: " << *V << '\n');

    if (auto LD = dyn_cast<LoadInst>(V)) {
      auto M = LD->getPointerOperand();
      if (isGlobalAddr(M) || isLocalAddr(M) || isConstantAddr(M)) {
        LLVM_DEBUG(dbgs() << "    is IA\n");
        return true;
      }
      continue;
    }

    if (auto GEP = dyn_cast<GetElementPtrInst>(V)) {
      auto P = GEP->getPointerOperand();
      WorkSet.insert(P);
      for (unsigned I = 1, E = GEP->getNumIndices() + 1; I != E; ++I)
        WorkSet.insert(GEP->getOperand(I));
      continue;
    }

    if (auto U = dyn_cast<UnaryInstruction>(V)) {
      WorkSet.insert(U->getOperand(0));
      continue;
    }

    if (auto BO = dyn_cast<BinaryOperator>(V)) {
      WorkSet.insert(BO->getOperand(0));
      WorkSet.insert(BO->getOperand(1));
      continue;
    }

    if (auto S = dyn_cast<SelectInst>(V)) {
      WorkSet.insert(S->getFalseValue());
      WorkSet.insert(S->getTrueValue());
      continue;
    }

    if (auto E = dyn_cast<ExtractElementInst>(V)) {
      WorkSet.insert(E->getVectorOperand());
      continue;
    }

    LLVM_DEBUG(dbgs() << "    dropped\n");
  }

  LLVM_DEBUG(dbgs() << "  is not IA\n");
  return false;
}

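// Collect per-function statistics (memory, indirect-access, large-stride and
// total instruction counts) and cache them in FIM.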
AMDGPUPerfHintAnalysis::FuncInfo *AMDGPUPerfHint::visit(const Function &F) {
  AMDGPUPerfHintAnalysis::FuncInfo &FI = FIM[&F];

  LLVM_DEBUG(dbgs() << "[AMDGPUPerfHint] process " << F.getName() << '\n');

  for (auto &B : F) {
    LastAccess = MemAccessInfo();
    for (auto &I : B) {
      if (getMemoryInstrPtr(&I)) {
        if (isIndirectAccess(&I))
          ++FI.IAMInstCount;
        if (isLargeStride(&I))
          ++FI.LSMInstCount;
        ++FI.MemInstCount;
        ++FI.InstCount;
        continue;
      }
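      // For calls, fold the callee's cached counts into the caller. The
      // bottom-up SCC walk in runOnSCC visits callees before callers, so a
      // defined callee is normally already in FIM.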
      CallSite CS(const_cast<Instruction *>(&I));
      if (CS) {
        Function *Callee = CS.getCalledFunction();
        if (!Callee || Callee->isDeclaration()) {
          ++FI.InstCount;
          continue;
        }
        if (&F == Callee) // Handle immediate recursion
          continue;

        auto Loc = FIM.find(Callee);
        if (Loc == FIM.end())
          continue;

        FI.MemInstCount += Loc->second.MemInstCount;
        FI.InstCount += Loc->second.InstCount;
        FI.IAMInstCount += Loc->second.IAMInstCount;
        FI.LSMInstCount += Loc->second.LSMInstCount;
      } else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
        TargetLoweringBase::AddrMode AM;
        auto *Ptr = GetPointerBaseWithConstantOffset(GEP, AM.BaseOffs, *DL);
        AM.BaseGV = dyn_cast_or_null<GlobalValue>(const_cast<Value *>(Ptr));
        AM.HasBaseReg = !AM.BaseGV;
        if (TLI->isLegalAddressingMode(*DL, AM, GEP->getResultElementType(),
                                       GEP->getPointerAddressSpace()))
          // Offset will likely be folded into load or store
          continue;
        ++FI.InstCount;
      } else {
        ++FI.InstCount;
      }
    }
  }

  return &FI;
}

bool AMDGPUPerfHint::runOnFunction(Function &F) {
  const Module &M = *F.getParent();
  DL = &M.getDataLayout();

  if (F.hasFnAttribute("amdgpu-wave-limiter") &&
      F.hasFnAttribute("amdgpu-memory-bound"))
    return false;

  const AMDGPUPerfHintAnalysis::FuncInfo *Info = visit(F);

  LLVM_DEBUG(dbgs() << F.getName() << " MemInst: " << Info->MemInstCount
                    << '\n'
                    << " IAMInst: " << Info->IAMInstCount << '\n'
                    << " LSMInst: " << Info->LSMInstCount << '\n'
                    << " TotalInst: " << Info->InstCount << '\n');

  if (isMemBound(*Info)) {
    LLVM_DEBUG(dbgs() << F.getName() << " is memory bound\n");
    NumMemBound++;
    F.addFnAttr("amdgpu-memory-bound", "true");
  }

  if (AMDGPU::isEntryFunctionCC(F.getCallingConv()) && needLimitWave(*Info)) {
    LLVM_DEBUG(dbgs() << F.getName() << " needs limit wave\n");
    NumLimitWave++;
    F.addFnAttr("amdgpu-wave-limiter", "true");
  }

  return true;
}

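// A function is considered memory bound when more than MemBoundThresh percent
// of its counted instructions access memory. needLimitWave additionally
// weights indirect-access and large-stride accesses by IAWeight/LSWeight
// before comparing against LimitWaveThresh.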
bool AMDGPUPerfHint::isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &FI) {
  return FI.MemInstCount * 100 / FI.InstCount > MemBoundThresh;
}

bool AMDGPUPerfHint::needLimitWave(const AMDGPUPerfHintAnalysis::FuncInfo &FI) {
  return ((FI.MemInstCount + FI.IAMInstCount * IAWeight +
           FI.LSMInstCount * LSWeight) *
          100 / FI.InstCount) > LimitWaveThresh;
}

bool AMDGPUPerfHint::isGlobalAddr(const Value *V) const {
  if (auto PT = dyn_cast<PointerType>(V->getType())) {
    unsigned As = PT->getAddressSpace();
    // Flat likely points to global too.
    return As == AMDGPUAS::GLOBAL_ADDRESS || As == AMDGPUAS::FLAT_ADDRESS;
  }
  return false;
}

bool AMDGPUPerfHint::isLocalAddr(const Value *V) const {
  if (auto PT = dyn_cast<PointerType>(V->getType()))
    return PT->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
  return false;
}

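// Compare this access against the previous one seen in the same basic block;
// LastAccess is reset at each block boundary in visit().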
bool AMDGPUPerfHint::isLargeStride(const Instruction *Inst) {
  LLVM_DEBUG(dbgs() << "[isLargeStride] " << *Inst << '\n');

  MemAccessInfo MAI = makeMemAccessInfo(const_cast<Instruction *>(Inst));
  bool IsLargeStride = MAI.isLargeStride(LastAccess);
  if (MAI.Base)
    LastAccess = std::move(MAI);

  return IsLargeStride;
}

AMDGPUPerfHint::MemAccessInfo
AMDGPUPerfHint::makeMemAccessInfo(Instruction *Inst) const {
  MemAccessInfo MAI;
  const Value *MO = getMemoryInstrPtr(Inst);

  LLVM_DEBUG(dbgs() << "[isLargeStride] MO: " << *MO << '\n');
  // Do not treat local-addr memory access as large stride.
  if (isLocalAddr(MO))
    return MAI;

  MAI.V = MO;
  MAI.Base = GetPointerBaseWithConstantOffset(MO, MAI.Offset, *DL);
  return MAI;
}

bool AMDGPUPerfHint::isConstantAddr(const Value *V) const {
  if (auto PT = dyn_cast<PointerType>(V->getType())) {
    unsigned As = PT->getAddressSpace();
    return As == AMDGPUAS::CONSTANT_ADDRESS ||
           As == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
  }
  return false;
}

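// Two accesses form a large stride only when they share the same pointer base
// and their constant offsets differ by more than LargeStrideThresh bytes.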
bool AMDGPUPerfHint::MemAccessInfo::isLargeStride(
    MemAccessInfo &Reference) const {

  if (!Base || !Reference.Base || Base != Reference.Base)
    return false;

  uint64_t Diff = Offset > Reference.Offset ? Offset - Reference.Offset
                                            : Reference.Offset - Offset;
  bool Result = Diff > LargeStrideThresh;
  LLVM_DEBUG(dbgs() << "[isLargeStride compare]\n"
                    << print() << "<=>\n"
                    << Reference.print() << "Result:" << Result << '\n');
  return Result;
}
} // namespace

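// Process each function in the SCC. CallGraphSCCPass visits SCCs bottom-up,
// so callee summaries are available in FIM when their callers are analyzed.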
bool AMDGPUPerfHintAnalysis::runOnSCC(CallGraphSCC &SCC) {
  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;

  const TargetMachine &TM = TPC->getTM<TargetMachine>();

  bool Changed = false;
  for (CallGraphNode *I : SCC) {
    Function *F = I->getFunction();
    if (!F || F->isDeclaration())
      continue;

    const TargetSubtargetInfo *ST = TM.getSubtargetImpl(*F);
    AMDGPUPerfHint Analyzer(FIM, ST->getTargetLowering());

    if (Analyzer.runOnFunction(*F))
      Changed = true;
  }

  return Changed;
}

bool AMDGPUPerfHintAnalysis::isMemoryBound(const Function *F) const {
  auto FI = FIM.find(F);
  if (FI == FIM.end())
    return false;

  return AMDGPUPerfHint::isMemBound(FI->second);
}

bool AMDGPUPerfHintAnalysis::needsWaveLimiter(const Function *F) const {
  auto FI = FIM.find(F);
  if (FI == FIM.end())
    return false;

  return AMDGPUPerfHint::needLimitWave(FI->second);
}