//===- AMDGPUAnnotateKernelFeaturesPass.cpp -------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass adds target attributes to functions that use intrinsics
/// which will impact calling convention lowering.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/CallGraphSCCPass.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Target/TargetMachine.h"

#define DEBUG_TYPE "amdgpu-annotate-kernel-features"

using namespace llvm;

namespace {

class AMDGPUAnnotateKernelFeatures : public CallGraphSCCPass {
private:
  const TargetMachine *TM = nullptr;
  SmallVector<CallGraphNode*, 8> NodeList;

  bool addFeatureAttributes(Function &F);
  bool processUniformWorkGroupAttribute();
  bool propagateUniformWorkGroupAttribute(Function &Caller, Function &Callee);

public:
  static char ID;

  AMDGPUAnnotateKernelFeatures() : CallGraphSCCPass(ID) {}

  bool doInitialization(CallGraph &CG) override;
  bool runOnSCC(CallGraphSCC &SCC) override;

  StringRef getPassName() const override {
    return "AMDGPU Annotate Kernel Features";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesAll();
    CallGraphSCCPass::getAnalysisUsage(AU);
  }

  static bool visitConstantExpr(const ConstantExpr *CE);
  static bool visitConstantExprsRecursively(
    const Constant *EntryC,
    SmallPtrSet<const Constant *, 8> &ConstantExprVisited, bool IsFunc,
    bool HasApertureRegs);
};

} // end anonymous namespace

char AMDGPUAnnotateKernelFeatures::ID = 0;

char &llvm::AMDGPUAnnotateKernelFeaturesID = AMDGPUAnnotateKernelFeatures::ID;

INITIALIZE_PASS(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE,
                "Add AMDGPU function attributes", false, false)

// The queue ptr is only needed when casting to flat, not from it.
static bool castRequiresQueuePtr(unsigned SrcAS) {
  return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
}

static bool castRequiresQueuePtr(const AddrSpaceCastInst *ASC) {
  return castRequiresQueuePtr(ASC->getSrcAddressSpace());
}

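// Returns true if the constant is a global in the local or region address
// space.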
static bool isDSAddress(const Constant *C) {
  const GlobalValue *GV = dyn_cast<GlobalValue>(C);
  if (!GV)
    return false;
  unsigned AS = GV->getAddressSpace();
  return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS;
}

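// Returns true if the constant expression is an addrspacecast whose source
// address space requires the queue pointer.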
bool AMDGPUAnnotateKernelFeatures::visitConstantExpr(const ConstantExpr *CE) {
  if (CE->getOpcode() == Instruction::AddrSpaceCast) {
    unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace();
    return castRequiresQueuePtr(SrcAS);
  }

  return false;
}

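// Walks the operands of EntryC, returning true if any visited constant is a
// DS global in a non-entry function, or an addrspacecast that requires the
// queue pointer when aperture registers are unavailable.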
bool AMDGPUAnnotateKernelFeatures::visitConstantExprsRecursively(
  const Constant *EntryC,
  SmallPtrSet<const Constant *, 8> &ConstantExprVisited,
  bool IsFunc, bool HasApertureRegs) {

  if (!ConstantExprVisited.insert(EntryC).second)
    return false;

  SmallVector<const Constant *, 16> Stack;
  Stack.push_back(EntryC);

  while (!Stack.empty()) {
    const Constant *C = Stack.pop_back_val();

    // We need to trap on DS globals in non-entry functions.
    if (IsFunc && isDSAddress(C))
      return true;

    // Check this constant expression.
    if (const auto *CE = dyn_cast<ConstantExpr>(C)) {
      if (!HasApertureRegs && visitConstantExpr(CE))
        return true;
    }

    // Visit all sub-expressions.
    for (const Use &U : C->operands()) {
      const auto *OpC = dyn_cast<Constant>(U);
      if (!OpC)
        continue;

      if (!ConstantExprVisited.insert(OpC).second)
        continue;

      Stack.push_back(OpC);
    }
  }

  return false;
}

// We do not need to note the x workitem or workgroup id because they are
// always initialized.
//
// TODO: We should not add the attributes if the known compile time workgroup
// size is 1 for y/z.
static StringRef intrinsicToAttrName(Intrinsic::ID ID,
                                     bool &NonKernelOnly,
                                     bool &IsQueuePtr) {
  switch (ID) {
  case Intrinsic::amdgcn_workitem_id_x:
    NonKernelOnly = true;
    return "amdgpu-work-item-id-x";
  case Intrinsic::amdgcn_workgroup_id_x:
    NonKernelOnly = true;
    return "amdgpu-work-group-id-x";
  case Intrinsic::amdgcn_workitem_id_y:
  case Intrinsic::r600_read_tidig_y:
    return "amdgpu-work-item-id-y";
  case Intrinsic::amdgcn_workitem_id_z:
  case Intrinsic::r600_read_tidig_z:
    return "amdgpu-work-item-id-z";
  case Intrinsic::amdgcn_workgroup_id_y:
  case Intrinsic::r600_read_tgid_y:
    return "amdgpu-work-group-id-y";
  case Intrinsic::amdgcn_workgroup_id_z:
  case Intrinsic::r600_read_tgid_z:
    return "amdgpu-work-group-id-z";
  case Intrinsic::amdgcn_dispatch_ptr:
    return "amdgpu-dispatch-ptr";
  case Intrinsic::amdgcn_dispatch_id:
    return "amdgpu-dispatch-id";
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    return "amdgpu-kernarg-segment-ptr";
  case Intrinsic::amdgcn_implicitarg_ptr:
    return "amdgpu-implicitarg-ptr";
  case Intrinsic::amdgcn_queue_ptr:
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private:
    // TODO: Does not require queue ptr on gfx9+
  case Intrinsic::trap:
  case Intrinsic::debugtrap:
    IsQueuePtr = true;
    return "amdgpu-queue-ptr";
  default:
    return "";
  }
}

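// If the callee carries the given attribute, add it to the parent as well.
// Returns true if the attribute was present on the callee.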
static bool handleAttr(Function &Parent, const Function &Callee,
                       StringRef Name) {
  if (Callee.hasFnAttribute(Name)) {
    Parent.addFnAttr(Name);
    return true;
  }
  return false;
}

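// Propagates the input-related attributes of a callee to its caller, and
// records whether the caller now needs the queue pointer.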
static void copyFeaturesToFunction(Function &Parent, const Function &Callee,
                                   bool &NeedQueuePtr) {
  // The X ids are unnecessarily propagated to kernels.
  static constexpr StringLiteral AttrNames[] = {
      "amdgpu-work-item-id-x",      "amdgpu-work-item-id-y",
      "amdgpu-work-item-id-z",      "amdgpu-work-group-id-x",
      "amdgpu-work-group-id-y",     "amdgpu-work-group-id-z",
      "amdgpu-dispatch-ptr",        "amdgpu-dispatch-id",
      "amdgpu-implicitarg-ptr"};

  if (handleAttr(Parent, Callee, "amdgpu-queue-ptr"))
    NeedQueuePtr = true;

  for (StringRef AttrName : AttrNames)
    handleAttr(Parent, Callee, AttrName);
}

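// Propagates "uniform-work-group-size" from callers to their callees for all
// nodes collected in NodeList.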
bool AMDGPUAnnotateKernelFeatures::processUniformWorkGroupAttribute() {
  bool Changed = false;

  for (auto *Node : reverse(NodeList)) {
    Function *Caller = Node->getFunction();

    for (auto I : *Node) {
      Function *Callee = std::get<1>(I)->getFunction();
      if (Callee)
        Changed |= propagateUniformWorkGroupAttribute(*Caller, *Callee);
    }
  }

  return Changed;
}

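// Reconciles "uniform-work-group-size" between a caller/callee pair: the
// attribute defaults to "false" when missing or when the callee is not
// exactly defined, and "true" is only forwarded from caller to callee.
// Returns true if any attribute was set.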
bool AMDGPUAnnotateKernelFeatures::propagateUniformWorkGroupAttribute(
       Function &Caller, Function &Callee) {

  // Check for an externally defined function.
  if (!Callee.hasExactDefinition()) {
    Callee.addFnAttr("uniform-work-group-size", "false");
    if (!Caller.hasFnAttribute("uniform-work-group-size"))
      Caller.addFnAttr("uniform-work-group-size", "false");

    return true;
  }
  // Check if the Caller has the attribute.
  if (Caller.hasFnAttribute("uniform-work-group-size")) {
    // Check if the value of the attribute is true.
    if (Caller.getFnAttribute("uniform-work-group-size")
        .getValueAsString().equals("true")) {
      // Propagate the attribute to the Callee, if it does not already have it.
      if (!Callee.hasFnAttribute("uniform-work-group-size")) {
        Callee.addFnAttr("uniform-work-group-size", "true");
        return true;
      }
    } else {
      Callee.addFnAttr("uniform-work-group-size", "false");
      return true;
    }
  } else {
    // If the attribute is absent, set it to false on both functions.
    Caller.addFnAttr("uniform-work-group-size", "false");
    Callee.addFnAttr("uniform-work-group-size", "false");
    return true;
  }
  return false;
}

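// Scans the body of F for calls, intrinsics, addrspacecasts, stack objects,
// and constant expressions that imply input requirements, and adds the
// corresponding "amdgpu-*" attributes to F.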
bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
  const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
  bool HasApertureRegs = ST.hasApertureRegs();
  SmallPtrSet<const Constant *, 8> ConstantExprVisited;

  bool HaveStackObjects = false;
  bool Changed = false;
  bool NeedQueuePtr = false;
  bool HaveCall = false;
  bool IsFunc = !AMDGPU::isEntryFunctionCC(F.getCallingConv());

  for (BasicBlock &BB : F) {
    for (Instruction &I : BB) {
      if (isa<AllocaInst>(I)) {
        HaveStackObjects = true;
        continue;
      }

      if (auto *CB = dyn_cast<CallBase>(&I)) {
        const Function *Callee =
            dyn_cast<Function>(CB->getCalledOperand()->stripPointerCasts());

        // TODO: Do something with indirect calls.
        if (!Callee) {
          if (!CB->isInlineAsm())
            HaveCall = true;
          continue;
        }

        Intrinsic::ID IID = Callee->getIntrinsicID();
        if (IID == Intrinsic::not_intrinsic) {
          HaveCall = true;
          copyFeaturesToFunction(F, *Callee, NeedQueuePtr);
          Changed = true;
        } else {
          bool NonKernelOnly = false;

          if (!IsFunc && IID == Intrinsic::amdgcn_kernarg_segment_ptr) {
            F.addFnAttr("amdgpu-kernarg-segment-ptr");
          } else {
            StringRef AttrName = intrinsicToAttrName(IID, NonKernelOnly,
                                                     NeedQueuePtr);
            if (!AttrName.empty() && (IsFunc || !NonKernelOnly)) {
              F.addFnAttr(AttrName);
              Changed = true;
            }
          }
        }
      }

      if (NeedQueuePtr || (!IsFunc && HasApertureRegs))
        continue;

      if (const AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(&I)) {
        if (!HasApertureRegs && castRequiresQueuePtr(ASC)) {
          NeedQueuePtr = true;
          continue;
        }
      }

      for (const Use &U : I.operands()) {
        const auto *OpC = dyn_cast<Constant>(U);
        if (!OpC)
          continue;

        if (visitConstantExprsRecursively(OpC, ConstantExprVisited, IsFunc,
                                          HasApertureRegs)) {
          NeedQueuePtr = true;
          break;
        }
      }
    }
  }

  if (NeedQueuePtr) {
    F.addFnAttr("amdgpu-queue-ptr");
    Changed = true;
  }

  // TODO: We could refine this to captured pointers that could possibly be
  // accessed by flat instructions. For now this is mostly a poor way of
  // estimating whether there are calls before argument lowering.
  if (!IsFunc && HaveCall) {
    F.addFnAttr("amdgpu-calls");
    Changed = true;
  }

  if (HaveStackObjects) {
    F.addFnAttr("amdgpu-stack-objects");
    Changed = true;
  }

  return Changed;
}

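// Processes one SCC: collects nodes for uniform-work-group-size propagation
// and adds feature attributes to every defined function.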
bool AMDGPUAnnotateKernelFeatures::runOnSCC(CallGraphSCC &SCC) {
  bool Changed = false;

  for (CallGraphNode *I : SCC) {
    // Build a list of CallGraphNodes, ordered from most uses to least.
    if (I->getNumReferences())
      NodeList.push_back(I);
    else {
      Changed |= processUniformWorkGroupAttribute();
      NodeList.clear();
    }

    Function *F = I->getFunction();
    // Add feature attributes.
    if (!F || F->isDeclaration())
      continue;
    Changed |= addFeatureAttributes(*F);
  }

  return Changed;
}

bool AMDGPUAnnotateKernelFeatures::doInitialization(CallGraph &CG) {
  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    report_fatal_error("TargetMachine is required");

  TM = &TPC->getTM<TargetMachine>();
  return false;
}

Pass *llvm::createAMDGPUAnnotateKernelFeaturesPass() {
  return new AMDGPUAnnotateKernelFeatures();
}