//===- AMDGPUAnnotateKernelFeaturesPass.cpp -------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass adds target attributes to functions which use intrinsics
/// which will impact calling convention lowering.
//
//===----------------------------------------------------------------------===//
13292915Sdim
14292915Sdim#include "AMDGPU.h"
15321369Sdim#include "AMDGPUSubtarget.h"
16327952Sdim#include "Utils/AMDGPUBaseInfo.h"
17327952Sdim#include "llvm/ADT/SmallPtrSet.h"
18327952Sdim#include "llvm/ADT/SmallVector.h"
19327952Sdim#include "llvm/ADT/StringRef.h"
20314564Sdim#include "llvm/ADT/Triple.h"
21327952Sdim#include "llvm/Analysis/CallGraph.h"
22321369Sdim#include "llvm/Analysis/CallGraphSCCPass.h"
23321369Sdim#include "llvm/CodeGen/TargetPassConfig.h"
24327952Sdim#include "llvm/IR/CallSite.h"
25327952Sdim#include "llvm/IR/Constant.h"
26309124Sdim#include "llvm/IR/Constants.h"
27327952Sdim#include "llvm/IR/Function.h"
28327952Sdim#include "llvm/IR/Instruction.h"
29292915Sdim#include "llvm/IR/Instructions.h"
30327952Sdim#include "llvm/IR/Intrinsics.h"
31292915Sdim#include "llvm/IR/Module.h"
32327952Sdim#include "llvm/IR/Type.h"
33327952Sdim#include "llvm/IR/Use.h"
34327952Sdim#include "llvm/Pass.h"
35327952Sdim#include "llvm/Support/Casting.h"
36327952Sdim#include "llvm/Support/ErrorHandling.h"
37327952Sdim#include "llvm/Target/TargetMachine.h"
38292915Sdim
39292915Sdim#define DEBUG_TYPE "amdgpu-annotate-kernel-features"
40292915Sdim
41292915Sdimusing namespace llvm;
42292915Sdim
43292915Sdimnamespace {
44292915Sdim
// CallGraphSCCPass that annotates functions with target-specific string
// attributes (work-item/group ids, queue ptr, flat scratch, ...) derived
// from the intrinsics and address-space casts they use.
class AMDGPUAnnotateKernelFeatures : public CallGraphSCCPass {
private:
  const TargetMachine *TM = nullptr;
  // Call graph nodes accumulated across SCC visits; runOnSCC flushes this
  // list through processUniformWorkGroupAttribute().
  SmallVector<CallGraphNode*, 8> NodeList;

  bool addFeatureAttributes(Function &F);
  bool processUniformWorkGroupAttribute();
  bool propagateUniformWorkGroupAttribute(Function &Caller, Function &Callee);

public:
  static char ID;

  AMDGPUAnnotateKernelFeatures() : CallGraphSCCPass(ID) {}

  bool doInitialization(CallGraph &CG) override;
  bool runOnSCC(CallGraphSCC &SCC) override;

  StringRef getPassName() const override {
    return "AMDGPU Annotate Kernel Features";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    // Only adds function attributes; invalidates no analyses.
    AU.setPreservesAll();
    CallGraphSCCPass::getAnalysisUsage(AU);
  }

  // True if \p CE is an addrspacecast whose source address space requires
  // the queue pointer.
  static bool visitConstantExpr(const ConstantExpr *CE);
  // Walks \p EntryC and all constant operands reachable from it, returning
  // true if any visited expression requires the queue pointer.
  static bool visitConstantExprsRecursively(
    const Constant *EntryC,
    SmallPtrSet<const Constant *, 8> &ConstantExprVisited);
};
76292915Sdim
77327952Sdim} // end anonymous namespace
78292915Sdim
// Pass identification: the address of ID uniquely identifies the pass.
char AMDGPUAnnotateKernelFeatures::ID = 0;

char &llvm::AMDGPUAnnotateKernelFeaturesID = AMDGPUAnnotateKernelFeatures::ID;

INITIALIZE_PASS(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE,
                "Add AMDGPU function attributes", false, false)
85292915Sdim
86292915Sdim
87309124Sdim// The queue ptr is only needed when casting to flat, not from it.
88344779Sdimstatic bool castRequiresQueuePtr(unsigned SrcAS) {
89344779Sdim  return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
90309124Sdim}
91292915Sdim
92344779Sdimstatic bool castRequiresQueuePtr(const AddrSpaceCastInst *ASC) {
93344779Sdim  return castRequiresQueuePtr(ASC->getSrcAddressSpace());
94309124Sdim}
95309124Sdim
96344779Sdimbool AMDGPUAnnotateKernelFeatures::visitConstantExpr(const ConstantExpr *CE) {
97309124Sdim  if (CE->getOpcode() == Instruction::AddrSpaceCast) {
98309124Sdim    unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace();
99344779Sdim    return castRequiresQueuePtr(SrcAS);
100309124Sdim  }
101309124Sdim
102309124Sdim  return false;
103309124Sdim}
104309124Sdim
105309124Sdimbool AMDGPUAnnotateKernelFeatures::visitConstantExprsRecursively(
106309124Sdim  const Constant *EntryC,
107344779Sdim  SmallPtrSet<const Constant *, 8> &ConstantExprVisited) {
108309124Sdim
109309124Sdim  if (!ConstantExprVisited.insert(EntryC).second)
110309124Sdim    return false;
111309124Sdim
112309124Sdim  SmallVector<const Constant *, 16> Stack;
113309124Sdim  Stack.push_back(EntryC);
114309124Sdim
115309124Sdim  while (!Stack.empty()) {
116309124Sdim    const Constant *C = Stack.pop_back_val();
117309124Sdim
118309124Sdim    // Check this constant expression.
119309124Sdim    if (const auto *CE = dyn_cast<ConstantExpr>(C)) {
120344779Sdim      if (visitConstantExpr(CE))
121309124Sdim        return true;
122309124Sdim    }
123309124Sdim
124309124Sdim    // Visit all sub-expressions.
125309124Sdim    for (const Use &U : C->operands()) {
126309124Sdim      const auto *OpC = dyn_cast<Constant>(U);
127309124Sdim      if (!OpC)
128309124Sdim        continue;
129309124Sdim
130309124Sdim      if (!ConstantExprVisited.insert(OpC).second)
131309124Sdim        continue;
132309124Sdim
133309124Sdim      Stack.push_back(OpC);
134309124Sdim    }
135309124Sdim  }
136309124Sdim
137309124Sdim  return false;
138309124Sdim}
139309124Sdim
140321369Sdim// We do not need to note the x workitem or workgroup id because they are always
141321369Sdim// initialized.
142321369Sdim//
143321369Sdim// TODO: We should not add the attributes if the known compile time workgroup
144321369Sdim// size is 1 for y/z.
145321369Sdimstatic StringRef intrinsicToAttrName(Intrinsic::ID ID,
146321369Sdim                                     bool &NonKernelOnly,
147321369Sdim                                     bool &IsQueuePtr) {
148321369Sdim  switch (ID) {
149321369Sdim  case Intrinsic::amdgcn_workitem_id_x:
150321369Sdim    NonKernelOnly = true;
151321369Sdim    return "amdgpu-work-item-id-x";
152321369Sdim  case Intrinsic::amdgcn_workgroup_id_x:
153321369Sdim    NonKernelOnly = true;
154321369Sdim    return "amdgpu-work-group-id-x";
155321369Sdim  case Intrinsic::amdgcn_workitem_id_y:
156321369Sdim  case Intrinsic::r600_read_tidig_y:
157321369Sdim    return "amdgpu-work-item-id-y";
158321369Sdim  case Intrinsic::amdgcn_workitem_id_z:
159321369Sdim  case Intrinsic::r600_read_tidig_z:
160321369Sdim    return "amdgpu-work-item-id-z";
161321369Sdim  case Intrinsic::amdgcn_workgroup_id_y:
162321369Sdim  case Intrinsic::r600_read_tgid_y:
163321369Sdim    return "amdgpu-work-group-id-y";
164321369Sdim  case Intrinsic::amdgcn_workgroup_id_z:
165321369Sdim  case Intrinsic::r600_read_tgid_z:
166321369Sdim    return "amdgpu-work-group-id-z";
167321369Sdim  case Intrinsic::amdgcn_dispatch_ptr:
168321369Sdim    return "amdgpu-dispatch-ptr";
169321369Sdim  case Intrinsic::amdgcn_dispatch_id:
170321369Sdim    return "amdgpu-dispatch-id";
171321369Sdim  case Intrinsic::amdgcn_kernarg_segment_ptr:
172327952Sdim    return "amdgpu-kernarg-segment-ptr";
173321369Sdim  case Intrinsic::amdgcn_implicitarg_ptr:
174327952Sdim    return "amdgpu-implicitarg-ptr";
175321369Sdim  case Intrinsic::amdgcn_queue_ptr:
176360784Sdim  case Intrinsic::amdgcn_is_shared:
177360784Sdim  case Intrinsic::amdgcn_is_private:
178360784Sdim    // TODO: Does not require queue ptr on gfx9+
179321369Sdim  case Intrinsic::trap:
180321369Sdim  case Intrinsic::debugtrap:
181321369Sdim    IsQueuePtr = true;
182321369Sdim    return "amdgpu-queue-ptr";
183321369Sdim  default:
184321369Sdim    return "";
185321369Sdim  }
186321369Sdim}
187321369Sdim
188321369Sdimstatic bool handleAttr(Function &Parent, const Function &Callee,
189321369Sdim                       StringRef Name) {
190321369Sdim  if (Callee.hasFnAttribute(Name)) {
191321369Sdim    Parent.addFnAttr(Name);
192321369Sdim    return true;
193321369Sdim  }
194321369Sdim  return false;
195321369Sdim}
196321369Sdim
197321369Sdimstatic void copyFeaturesToFunction(Function &Parent, const Function &Callee,
198321369Sdim                                   bool &NeedQueuePtr) {
199321369Sdim  // X ids unnecessarily propagated to kernels.
200360784Sdim  static constexpr StringLiteral AttrNames[] = {
201360784Sdim      "amdgpu-work-item-id-x",      "amdgpu-work-item-id-y",
202360784Sdim      "amdgpu-work-item-id-z",      "amdgpu-work-group-id-x",
203360784Sdim      "amdgpu-work-group-id-y",     "amdgpu-work-group-id-z",
204360784Sdim      "amdgpu-dispatch-ptr",        "amdgpu-dispatch-id",
205360784Sdim      "amdgpu-kernarg-segment-ptr", "amdgpu-implicitarg-ptr"};
206321369Sdim
207321369Sdim  if (handleAttr(Parent, Callee, "amdgpu-queue-ptr"))
208321369Sdim    NeedQueuePtr = true;
209321369Sdim
210321369Sdim  for (StringRef AttrName : AttrNames)
211321369Sdim    handleAttr(Parent, Callee, AttrName);
212321369Sdim}
213321369Sdim
214353358Sdimbool AMDGPUAnnotateKernelFeatures::processUniformWorkGroupAttribute() {
215353358Sdim  bool Changed = false;
216353358Sdim
217353358Sdim  for (auto *Node : reverse(NodeList)) {
218353358Sdim    Function *Caller = Node->getFunction();
219353358Sdim
220353358Sdim    for (auto I : *Node) {
221353358Sdim      Function *Callee = std::get<1>(I)->getFunction();
222353358Sdim      if (Callee)
223353358Sdim        Changed = propagateUniformWorkGroupAttribute(*Caller, *Callee);
224353358Sdim    }
225353358Sdim  }
226353358Sdim
227353358Sdim  return Changed;
228353358Sdim}
229353358Sdim
230353358Sdimbool AMDGPUAnnotateKernelFeatures::propagateUniformWorkGroupAttribute(
231353358Sdim       Function &Caller, Function &Callee) {
232353358Sdim
233353358Sdim  // Check for externally defined function
234353358Sdim  if (!Callee.hasExactDefinition()) {
235353358Sdim    Callee.addFnAttr("uniform-work-group-size", "false");
236353358Sdim    if (!Caller.hasFnAttribute("uniform-work-group-size"))
237353358Sdim      Caller.addFnAttr("uniform-work-group-size", "false");
238353358Sdim
239353358Sdim    return true;
240353358Sdim  }
241353358Sdim  // Check if the Caller has the attribute
242353358Sdim  if (Caller.hasFnAttribute("uniform-work-group-size")) {
243353358Sdim    // Check if the value of the attribute is true
244353358Sdim    if (Caller.getFnAttribute("uniform-work-group-size")
245353358Sdim        .getValueAsString().equals("true")) {
246353358Sdim      // Propagate the attribute to the Callee, if it does not have it
247353358Sdim      if (!Callee.hasFnAttribute("uniform-work-group-size")) {
248353358Sdim        Callee.addFnAttr("uniform-work-group-size", "true");
249353358Sdim        return true;
250353358Sdim      }
251353358Sdim    } else {
252353358Sdim      Callee.addFnAttr("uniform-work-group-size", "false");
253353358Sdim      return true;
254353358Sdim    }
255353358Sdim  } else {
256353358Sdim    // If the attribute is absent, set it as false
257353358Sdim    Caller.addFnAttr("uniform-work-group-size", "false");
258353358Sdim    Callee.addFnAttr("uniform-work-group-size", "false");
259353358Sdim    return true;
260353358Sdim  }
261353358Sdim  return false;
262353358Sdim}
263353358Sdim
// Scan every instruction of \p F and attach the feature attributes implied by
// the intrinsics called, the calls made, and the address-space casts (direct
// or buried inside constant expressions). Returns true if F was modified.
bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
  const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
  bool HasFlat = ST.hasFlatAddressSpace();
  bool HasApertureRegs = ST.hasApertureRegs();
  // Shared across the whole function so each constant is examined once.
  SmallPtrSet<const Constant *, 8> ConstantExprVisited;

  bool Changed = false;
  bool NeedQueuePtr = false;
  bool HaveCall = false;
  // Non-entry functions also need the NonKernelOnly (x id) attributes.
  bool IsFunc = !AMDGPU::isEntryFunctionCC(F.getCallingConv());

  for (BasicBlock &BB : F) {
    for (Instruction &I : BB) {
      CallSite CS(&I);
      if (CS) {
        Function *Callee = CS.getCalledFunction();

        // TODO: Do something with indirect calls.
        if (!Callee) {
          if (!CS.isInlineAsm())
            HaveCall = true;
          continue;
        }

        Intrinsic::ID IID = Callee->getIntrinsicID();
        if (IID == Intrinsic::not_intrinsic) {
          // Real call: inherit whatever the callee already requires.
          HaveCall = true;
          copyFeaturesToFunction(F, *Callee, NeedQueuePtr);
          Changed = true;
        } else {
          bool NonKernelOnly = false;
          StringRef AttrName = intrinsicToAttrName(IID,
                                                   NonKernelOnly, NeedQueuePtr);
          if (!AttrName.empty() && (IsFunc || !NonKernelOnly)) {
            F.addFnAttr(AttrName);
            Changed = true;
          }
        }
      }

      // Cast scanning only matters while the queue ptr is still in question;
      // aperture registers make the queue ptr unnecessary for casts.
      if (NeedQueuePtr || HasApertureRegs)
        continue;

      if (const AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(&I)) {
        if (castRequiresQueuePtr(ASC)) {
          NeedQueuePtr = true;
          continue;
        }
      }

      // Addrspacecasts can also hide inside constant-expression operands.
      for (const Use &U : I.operands()) {
        const auto *OpC = dyn_cast<Constant>(U);
        if (!OpC)
          continue;

        if (visitConstantExprsRecursively(OpC, ConstantExprVisited)) {
          NeedQueuePtr = true;
          break;
        }
      }
    }
  }

  if (NeedQueuePtr) {
    F.addFnAttr("amdgpu-queue-ptr");
    Changed = true;
  }

  // TODO: We could refine this to captured pointers that could possibly be
  // accessed by flat instructions. For now this is mostly a poor way of
  // estimating whether there are calls before argument lowering.
  if (HasFlat && !IsFunc && HaveCall) {
    F.addFnAttr("amdgpu-flat-scratch");
    Changed = true;
  }

  return Changed;
}
342292915Sdim
bool AMDGPUAnnotateKernelFeatures::runOnSCC(CallGraphSCC &SCC) {
  bool Changed = false;

  for (CallGraphNode *I : SCC) {
    // Build a list of CallGraphNodes from most number of uses to least
    if (I->getNumReferences())
      NodeList.push_back(I);
    else {
      // A node with no references flushes the accumulated list through the
      // uniform-work-group propagation, then the list restarts.
      // NOTE(review): the propagation's return value is discarded here, and
      // nodes still in NodeList when the last SCC finishes are never
      // processed — presumably roots always arrive reference-free; verify.
      processUniformWorkGroupAttribute();
      NodeList.clear();
    }

    Function *F = I->getFunction();
    // Add feature attributes
    if (!F || F->isDeclaration())
      continue;
    Changed |= addFeatureAttributes(*F);
  }

  return Changed;
}
364292915Sdim
365321369Sdimbool AMDGPUAnnotateKernelFeatures::doInitialization(CallGraph &CG) {
366321369Sdim  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
367321369Sdim  if (!TPC)
368321369Sdim    report_fatal_error("TargetMachine is required");
369292915Sdim
370321369Sdim  TM = &TPC->getTM<TargetMachine>();
371321369Sdim  return false;
372292915Sdim}
373292915Sdim
374321369SdimPass *llvm::createAMDGPUAnnotateKernelFeaturesPass() {
375292915Sdim  return new AMDGPUAnnotateKernelFeatures();
376292915Sdim}
377