//===- AMDGPUAnnotateKernelFeaturesPass.cpp -------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass adds target attributes to functions that use intrinsics
/// which will impact calling convention lowering.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/CallGraphSCCPass.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Target/TargetMachine.h"

#define DEBUG_TYPE "amdgpu-annotate-kernel-features"

using namespace llvm;

namespace {

class AMDGPUAnnotateKernelFeatures : public CallGraphSCCPass {
private:
  const TargetMachine *TM = nullptr;
  SmallVector<CallGraphNode*, 8> NodeList;

  bool addFeatureAttributes(Function &F);
  bool processUniformWorkGroupAttribute();
  bool propagateUniformWorkGroupAttribute(Function &Caller, Function &Callee);

public:
  static char ID;

  AMDGPUAnnotateKernelFeatures() : CallGraphSCCPass(ID) {}

  bool doInitialization(CallGraph &CG) override;
  bool runOnSCC(CallGraphSCC &SCC) override;

  StringRef getPassName() const override {
    return "AMDGPU Annotate Kernel Features";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesAll();
    CallGraphSCCPass::getAnalysisUsage(AU);
  }

  static bool visitConstantExpr(const ConstantExpr *CE);
  static bool visitConstantExprsRecursively(
    const Constant *EntryC,
    SmallPtrSet<const Constant *, 8> &ConstantExprVisited);
};

} // end anonymous namespace

char AMDGPUAnnotateKernelFeatures::ID = 0;

char &llvm::AMDGPUAnnotateKernelFeaturesID = AMDGPUAnnotateKernelFeatures::ID;

INITIALIZE_PASS(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE,
                "Add AMDGPU function attributes", false, false)

// The queue ptr is only needed when casting to flat, not from it.
static bool castRequiresQueuePtr(unsigned SrcAS) {
  return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
}

static bool castRequiresQueuePtr(const AddrSpaceCastInst *ASC) {
  return castRequiresQueuePtr(ASC->getSrcAddressSpace());
}

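// Return true if the constant expression is an addrspacecast whose source
// address space requires the queue pointer.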
bool AMDGPUAnnotateKernelFeatures::visitConstantExpr(const ConstantExpr *CE) {
  if (CE->getOpcode() == Instruction::AddrSpaceCast) {
    unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace();
    return castRequiresQueuePtr(SrcAS);
  }

  return false;
}

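// Iteratively visit EntryC and every constant reachable through its operands,
// returning true if any visited constant expression requires the queue
// pointer. ConstantExprVisited is shared across calls so each constant is
// only walked once per function.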
bool AMDGPUAnnotateKernelFeatures::visitConstantExprsRecursively(
  const Constant *EntryC,
  SmallPtrSet<const Constant *, 8> &ConstantExprVisited) {

  if (!ConstantExprVisited.insert(EntryC).second)
    return false;

  SmallVector<const Constant *, 16> Stack;
  Stack.push_back(EntryC);

  while (!Stack.empty()) {
    const Constant *C = Stack.pop_back_val();

    // Check this constant expression.
    if (const auto *CE = dyn_cast<ConstantExpr>(C)) {
      if (visitConstantExpr(CE))
        return true;
    }

    // Visit all sub-expressions.
    for (const Use &U : C->operands()) {
      const auto *OpC = dyn_cast<Constant>(U);
      if (!OpC)
        continue;

      if (!ConstantExprVisited.insert(OpC).second)
        continue;

      Stack.push_back(OpC);
    }
  }

  return false;
}

// We do not need to note the x workitem or workgroup id because they are
// always initialized.
//
// TODO: We should not add the attributes if the known compile time workgroup
// size is 1 for y/z.
static StringRef intrinsicToAttrName(Intrinsic::ID ID,
                                     bool &NonKernelOnly,
                                     bool &IsQueuePtr) {
  switch (ID) {
  case Intrinsic::amdgcn_workitem_id_x:
    NonKernelOnly = true;
    return "amdgpu-work-item-id-x";
  case Intrinsic::amdgcn_workgroup_id_x:
    NonKernelOnly = true;
    return "amdgpu-work-group-id-x";
  case Intrinsic::amdgcn_workitem_id_y:
  case Intrinsic::r600_read_tidig_y:
    return "amdgpu-work-item-id-y";
  case Intrinsic::amdgcn_workitem_id_z:
  case Intrinsic::r600_read_tidig_z:
    return "amdgpu-work-item-id-z";
  case Intrinsic::amdgcn_workgroup_id_y:
  case Intrinsic::r600_read_tgid_y:
    return "amdgpu-work-group-id-y";
  case Intrinsic::amdgcn_workgroup_id_z:
  case Intrinsic::r600_read_tgid_z:
    return "amdgpu-work-group-id-z";
  case Intrinsic::amdgcn_dispatch_ptr:
    return "amdgpu-dispatch-ptr";
  case Intrinsic::amdgcn_dispatch_id:
    return "amdgpu-dispatch-id";
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    return "amdgpu-kernarg-segment-ptr";
  case Intrinsic::amdgcn_implicitarg_ptr:
    return "amdgpu-implicitarg-ptr";
  case Intrinsic::amdgcn_queue_ptr:
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private:
    // TODO: Does not require queue ptr on gfx9+
  case Intrinsic::trap:
  case Intrinsic::debugtrap:
    IsQueuePtr = true;
    return "amdgpu-queue-ptr";
  default:
    return "";
  }
}

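// If the callee carries the attribute Name, set it on the parent as well.
// Returns true if the attribute was copied.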
static bool handleAttr(Function &Parent, const Function &Callee,
                       StringRef Name) {
  if (Callee.hasFnAttribute(Name)) {
    Parent.addFnAttr(Name);
    return true;
  }
  return false;
}

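// Propagate the known ABI input attributes from the callee to its caller,
// recording whether the queue pointer is now required.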
static void copyFeaturesToFunction(Function &Parent, const Function &Callee,
                                   bool &NeedQueuePtr) {
  // Note: the x ids are unnecessarily propagated to kernels here; kernels
  // always have them initialized anyway.
  static constexpr StringLiteral AttrNames[] = {
      "amdgpu-work-item-id-x",      "amdgpu-work-item-id-y",
      "amdgpu-work-item-id-z",      "amdgpu-work-group-id-x",
      "amdgpu-work-group-id-y",     "amdgpu-work-group-id-z",
      "amdgpu-dispatch-ptr",        "amdgpu-dispatch-id",
      "amdgpu-kernarg-segment-ptr", "amdgpu-implicitarg-ptr"};

  if (handleAttr(Parent, Callee, "amdgpu-queue-ptr"))
    NeedQueuePtr = true;

  for (StringRef AttrName : AttrNames)
    handleAttr(Parent, Callee, AttrName);
}

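// Walk the collected call graph nodes bottom-up and propagate the
// uniform-work-group-size attribute across each caller/callee edge.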
bool AMDGPUAnnotateKernelFeatures::processUniformWorkGroupAttribute() {
  bool Changed = false;

  for (auto *Node : reverse(NodeList)) {
    Function *Caller = Node->getFunction();

    for (auto I : *Node) {
      Function *Callee = std::get<1>(I)->getFunction();
      if (Callee)
        Changed |= propagateUniformWorkGroupAttribute(*Caller, *Callee);
    }
  }

  return Changed;
}

bool AMDGPUAnnotateKernelFeatures::propagateUniformWorkGroupAttribute(
       Function &Caller, Function &Callee) {

  // Check for an externally defined function.
  if (!Callee.hasExactDefinition()) {
    Callee.addFnAttr("uniform-work-group-size", "false");
    if (!Caller.hasFnAttribute("uniform-work-group-size"))
      Caller.addFnAttr("uniform-work-group-size", "false");

    return true;
  }
  // Check if the caller has the attribute.
  if (Caller.hasFnAttribute("uniform-work-group-size")) {
    // Check if the value of the attribute is true.
    if (Caller.getFnAttribute("uniform-work-group-size")
        .getValueAsString().equals("true")) {
      // Propagate the attribute to the callee, if it does not have it.
      if (!Callee.hasFnAttribute("uniform-work-group-size")) {
        Callee.addFnAttr("uniform-work-group-size", "true");
        return true;
      }
    } else {
      Callee.addFnAttr("uniform-work-group-size", "false");
      return true;
    }
  } else {
    // If the attribute is absent, set it to false on both functions.
    Caller.addFnAttr("uniform-work-group-size", "false");
    Callee.addFnAttr("uniform-work-group-size", "false");
    return true;
  }
  return false;
}

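// Scan F for intrinsic calls, user calls, and addrspacecasts, and add the
// corresponding target attributes. Returns true if any attribute was added.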
bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
  const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
  bool HasFlat = ST.hasFlatAddressSpace();
  bool HasApertureRegs = ST.hasApertureRegs();
  SmallPtrSet<const Constant *, 8> ConstantExprVisited;

  bool Changed = false;
  bool NeedQueuePtr = false;
  bool HaveCall = false;
  bool IsFunc = !AMDGPU::isEntryFunctionCC(F.getCallingConv());

  for (BasicBlock &BB : F) {
    for (Instruction &I : BB) {
      CallSite CS(&I);
      if (CS) {
        Function *Callee = CS.getCalledFunction();

        // TODO: Do something with indirect calls.
        if (!Callee) {
          if (!CS.isInlineAsm())
            HaveCall = true;
          continue;
        }

        Intrinsic::ID IID = Callee->getIntrinsicID();
        if (IID == Intrinsic::not_intrinsic) {
          HaveCall = true;
          copyFeaturesToFunction(F, *Callee, NeedQueuePtr);
          Changed = true;
        } else {
          bool NonKernelOnly = false;
          StringRef AttrName = intrinsicToAttrName(IID,
                                                   NonKernelOnly, NeedQueuePtr);
          if (!AttrName.empty() && (IsFunc || !NonKernelOnly)) {
            F.addFnAttr(AttrName);
            Changed = true;
          }
        }
      }

      if (NeedQueuePtr || HasApertureRegs)
        continue;

      if (const AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(&I)) {
        if (castRequiresQueuePtr(ASC)) {
          NeedQueuePtr = true;
          continue;
        }
      }

      for (const Use &U : I.operands()) {
        const auto *OpC = dyn_cast<Constant>(U);
        if (!OpC)
          continue;

        if (visitConstantExprsRecursively(OpC, ConstantExprVisited)) {
          NeedQueuePtr = true;
          break;
        }
      }
    }
  }

  if (NeedQueuePtr) {
    F.addFnAttr("amdgpu-queue-ptr");
    Changed = true;
  }

  // TODO: We could refine this to captured pointers that could possibly be
  // accessed by flat instructions. For now this is mostly a poor way of
  // estimating whether there are calls before argument lowering.
  if (HasFlat && !IsFunc && HaveCall) {
    F.addFnAttr("amdgpu-flat-scratch");
    Changed = true;
  }

  return Changed;
}

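// Entry point for each SCC: gather referenced nodes for the bottom-up
// uniform-work-group-size propagation, then annotate each defined function.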
bool AMDGPUAnnotateKernelFeatures::runOnSCC(CallGraphSCC &SCC) {
  bool Changed = false;

  for (CallGraphNode *I : SCC) {
    // Build a list of CallGraphNodes, ordered from most uses to least.
    if (I->getNumReferences())
      NodeList.push_back(I);
    else {
      Changed |= processUniformWorkGroupAttribute();
      NodeList.clear();
    }

    Function *F = I->getFunction();
    // Add feature attributes.
    if (!F || F->isDeclaration())
      continue;
    Changed |= addFeatureAttributes(*F);
  }

  return Changed;
}

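// Cache the TargetMachine from TargetPassConfig; addFeatureAttributes needs it
// to query the subtarget.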
bool AMDGPUAnnotateKernelFeatures::doInitialization(CallGraph &CG) {
  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    report_fatal_error("TargetMachine is required");

  TM = &TPC->getTM<TargetMachine>();
  return false;
}

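// Create an instance of this pass for the legacy pass manager.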
Pass *llvm::createAMDGPUAnnotateKernelFeaturesPass() {
  return new AMDGPUAnnotateKernelFeatures();
}