1327952Sdim//===- AMDGPUAnnotateKernelFeaturesPass.cpp -------------------------------===// 2292915Sdim// 3353358Sdim// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4353358Sdim// See https://llvm.org/LICENSE.txt for license information. 5353358Sdim// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6292915Sdim// 7292915Sdim//===----------------------------------------------------------------------===// 8292915Sdim// 9292915Sdim/// \file This pass adds target attributes to functions which use intrinsics 10292915Sdim/// which will impact calling convention lowering. 11292915Sdim// 12292915Sdim//===----------------------------------------------------------------------===// 13292915Sdim 14292915Sdim#include "AMDGPU.h" 15321369Sdim#include "AMDGPUSubtarget.h" 16327952Sdim#include "Utils/AMDGPUBaseInfo.h" 17327952Sdim#include "llvm/ADT/SmallPtrSet.h" 18327952Sdim#include "llvm/ADT/SmallVector.h" 19327952Sdim#include "llvm/ADT/StringRef.h" 20314564Sdim#include "llvm/ADT/Triple.h" 21327952Sdim#include "llvm/Analysis/CallGraph.h" 22321369Sdim#include "llvm/Analysis/CallGraphSCCPass.h" 23321369Sdim#include "llvm/CodeGen/TargetPassConfig.h" 24327952Sdim#include "llvm/IR/CallSite.h" 25327952Sdim#include "llvm/IR/Constant.h" 26309124Sdim#include "llvm/IR/Constants.h" 27327952Sdim#include "llvm/IR/Function.h" 28327952Sdim#include "llvm/IR/Instruction.h" 29292915Sdim#include "llvm/IR/Instructions.h" 30327952Sdim#include "llvm/IR/Intrinsics.h" 31292915Sdim#include "llvm/IR/Module.h" 32327952Sdim#include "llvm/IR/Type.h" 33327952Sdim#include "llvm/IR/Use.h" 34327952Sdim#include "llvm/Pass.h" 35327952Sdim#include "llvm/Support/Casting.h" 36327952Sdim#include "llvm/Support/ErrorHandling.h" 37327952Sdim#include "llvm/Target/TargetMachine.h" 38292915Sdim 39292915Sdim#define DEBUG_TYPE "amdgpu-annotate-kernel-features" 40292915Sdim 41292915Sdimusing namespace llvm; 42292915Sdim 43292915Sdimnamespace { 44292915Sdim 45321369Sdimclass AMDGPUAnnotateKernelFeatures : public CallGraphSCCPass { 46292915Sdimprivate: 47321369Sdim const TargetMachine *TM = nullptr; 48353358Sdim SmallVector<CallGraphNode*, 8> NodeList; 49309124Sdim 50321369Sdim bool addFeatureAttributes(Function &F); 51353358Sdim bool processUniformWorkGroupAttribute(); 52353358Sdim bool propagateUniformWorkGroupAttribute(Function &Caller, Function &Callee); 53292915Sdim 54292915Sdimpublic: 55292915Sdim static char ID; 56292915Sdim 57321369Sdim AMDGPUAnnotateKernelFeatures() : CallGraphSCCPass(ID) {} 58321369Sdim 59321369Sdim bool doInitialization(CallGraph &CG) override; 60321369Sdim bool runOnSCC(CallGraphSCC &SCC) override; 61327952Sdim 62314564Sdim StringRef getPassName() const override { 63292915Sdim return "AMDGPU Annotate Kernel Features"; 64292915Sdim } 65292915Sdim 66292915Sdim void getAnalysisUsage(AnalysisUsage &AU) const override { 67292915Sdim AU.setPreservesAll(); 68321369Sdim CallGraphSCCPass::getAnalysisUsage(AU); 69292915Sdim } 70309124Sdim 71344779Sdim static bool visitConstantExpr(const ConstantExpr *CE); 72309124Sdim static bool visitConstantExprsRecursively( 73309124Sdim const Constant *EntryC, 74344779Sdim SmallPtrSet<const Constant *, 8> &ConstantExprVisited); 75292915Sdim}; 76292915Sdim 77327952Sdim} // end anonymous namespace 78292915Sdim 79292915Sdimchar AMDGPUAnnotateKernelFeatures::ID = 0; 80292915Sdim 81292915Sdimchar &llvm::AMDGPUAnnotateKernelFeaturesID = AMDGPUAnnotateKernelFeatures::ID; 82292915Sdim 83309124SdimINITIALIZE_PASS(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE, 84309124Sdim "Add AMDGPU function attributes", false, false) 85292915Sdim 86292915Sdim 87309124Sdim// The queue ptr is only needed when casting to flat, not from it. 88344779Sdimstatic bool castRequiresQueuePtr(unsigned SrcAS) { 89344779Sdim return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS; 90309124Sdim} 91292915Sdim 92344779Sdimstatic bool castRequiresQueuePtr(const AddrSpaceCastInst *ASC) { 93344779Sdim return castRequiresQueuePtr(ASC->getSrcAddressSpace()); 94309124Sdim} 95309124Sdim 96344779Sdimbool AMDGPUAnnotateKernelFeatures::visitConstantExpr(const ConstantExpr *CE) { 97309124Sdim if (CE->getOpcode() == Instruction::AddrSpaceCast) { 98309124Sdim unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace(); 99344779Sdim return castRequiresQueuePtr(SrcAS); 100309124Sdim } 101309124Sdim 102309124Sdim return false; 103309124Sdim} 104309124Sdim 105309124Sdimbool AMDGPUAnnotateKernelFeatures::visitConstantExprsRecursively( 106309124Sdim const Constant *EntryC, 107344779Sdim SmallPtrSet<const Constant *, 8> &ConstantExprVisited) { 108309124Sdim 109309124Sdim if (!ConstantExprVisited.insert(EntryC).second) 110309124Sdim return false; 111309124Sdim 112309124Sdim SmallVector<const Constant *, 16> Stack; 113309124Sdim Stack.push_back(EntryC); 114309124Sdim 115309124Sdim while (!Stack.empty()) { 116309124Sdim const Constant *C = Stack.pop_back_val(); 117309124Sdim 118309124Sdim // Check this constant expression. 119309124Sdim if (const auto *CE = dyn_cast<ConstantExpr>(C)) { 120344779Sdim if (visitConstantExpr(CE)) 121309124Sdim return true; 122309124Sdim } 123309124Sdim 124309124Sdim // Visit all sub-expressions. 125309124Sdim for (const Use &U : C->operands()) { 126309124Sdim const auto *OpC = dyn_cast<Constant>(U); 127309124Sdim if (!OpC) 128309124Sdim continue; 129309124Sdim 130309124Sdim if (!ConstantExprVisited.insert(OpC).second) 131309124Sdim continue; 132309124Sdim 133309124Sdim Stack.push_back(OpC); 134309124Sdim } 135309124Sdim } 136309124Sdim 137309124Sdim return false; 138309124Sdim} 139309124Sdim 140321369Sdim// We do not need to note the x workitem or workgroup id because they are always 141321369Sdim// initialized. 142321369Sdim// 143321369Sdim// TODO: We should not add the attributes if the known compile time workgroup 144321369Sdim// size is 1 for y/z. 145321369Sdimstatic StringRef intrinsicToAttrName(Intrinsic::ID ID, 146321369Sdim bool &NonKernelOnly, 147321369Sdim bool &IsQueuePtr) { 148321369Sdim switch (ID) { 149321369Sdim case Intrinsic::amdgcn_workitem_id_x: 150321369Sdim NonKernelOnly = true; 151321369Sdim return "amdgpu-work-item-id-x"; 152321369Sdim case Intrinsic::amdgcn_workgroup_id_x: 153321369Sdim NonKernelOnly = true; 154321369Sdim return "amdgpu-work-group-id-x"; 155321369Sdim case Intrinsic::amdgcn_workitem_id_y: 156321369Sdim case Intrinsic::r600_read_tidig_y: 157321369Sdim return "amdgpu-work-item-id-y"; 158321369Sdim case Intrinsic::amdgcn_workitem_id_z: 159321369Sdim case Intrinsic::r600_read_tidig_z: 160321369Sdim return "amdgpu-work-item-id-z"; 161321369Sdim case Intrinsic::amdgcn_workgroup_id_y: 162321369Sdim case Intrinsic::r600_read_tgid_y: 163321369Sdim return "amdgpu-work-group-id-y"; 164321369Sdim case Intrinsic::amdgcn_workgroup_id_z: 165321369Sdim case Intrinsic::r600_read_tgid_z: 166321369Sdim return "amdgpu-work-group-id-z"; 167321369Sdim case Intrinsic::amdgcn_dispatch_ptr: 168321369Sdim return "amdgpu-dispatch-ptr"; 169321369Sdim case Intrinsic::amdgcn_dispatch_id: 170321369Sdim return "amdgpu-dispatch-id"; 171321369Sdim case Intrinsic::amdgcn_kernarg_segment_ptr: 172327952Sdim return "amdgpu-kernarg-segment-ptr"; 173321369Sdim case Intrinsic::amdgcn_implicitarg_ptr: 174327952Sdim return "amdgpu-implicitarg-ptr"; 175321369Sdim case Intrinsic::amdgcn_queue_ptr: 176360784Sdim case Intrinsic::amdgcn_is_shared: 177360784Sdim case Intrinsic::amdgcn_is_private: 178360784Sdim // TODO: Does not require queue ptr on gfx9+ 179321369Sdim case Intrinsic::trap: 180321369Sdim case Intrinsic::debugtrap: 181321369Sdim IsQueuePtr = true; 182321369Sdim return "amdgpu-queue-ptr"; 183321369Sdim default: 184321369Sdim return ""; 185321369Sdim } 186321369Sdim} 187321369Sdim 188321369Sdimstatic bool handleAttr(Function &Parent, const Function &Callee, 189321369Sdim StringRef Name) { 190321369Sdim if (Callee.hasFnAttribute(Name)) { 191321369Sdim Parent.addFnAttr(Name); 192321369Sdim return true; 193321369Sdim } 194321369Sdim return false; 195321369Sdim} 196321369Sdim 197321369Sdimstatic void copyFeaturesToFunction(Function &Parent, const Function &Callee, 198321369Sdim bool &NeedQueuePtr) { 199321369Sdim // X ids unnecessarily propagated to kernels. 200360784Sdim static constexpr StringLiteral AttrNames[] = { 201360784Sdim "amdgpu-work-item-id-x", "amdgpu-work-item-id-y", 202360784Sdim "amdgpu-work-item-id-z", "amdgpu-work-group-id-x", 203360784Sdim "amdgpu-work-group-id-y", "amdgpu-work-group-id-z", 204360784Sdim "amdgpu-dispatch-ptr", "amdgpu-dispatch-id", 205360784Sdim "amdgpu-kernarg-segment-ptr", "amdgpu-implicitarg-ptr"}; 206321369Sdim 207321369Sdim if (handleAttr(Parent, Callee, "amdgpu-queue-ptr")) 208321369Sdim NeedQueuePtr = true; 209321369Sdim 210321369Sdim for (StringRef AttrName : AttrNames) 211321369Sdim handleAttr(Parent, Callee, AttrName); 212321369Sdim} 213321369Sdim 214353358Sdimbool AMDGPUAnnotateKernelFeatures::processUniformWorkGroupAttribute() { 215353358Sdim bool Changed = false; 216353358Sdim 217353358Sdim for (auto *Node : reverse(NodeList)) { 218353358Sdim Function *Caller = Node->getFunction(); 219353358Sdim 220353358Sdim for (auto I : *Node) { 221353358Sdim Function *Callee = std::get<1>(I)->getFunction(); 222353358Sdim if (Callee) 223353358Sdim Changed = propagateUniformWorkGroupAttribute(*Caller, *Callee); 224353358Sdim } 225353358Sdim } 226353358Sdim 227353358Sdim return Changed; 228353358Sdim} 229353358Sdim 230353358Sdimbool AMDGPUAnnotateKernelFeatures::propagateUniformWorkGroupAttribute( 231353358Sdim Function &Caller, Function &Callee) { 232353358Sdim 233353358Sdim // Check for externally defined function 234353358Sdim if (!Callee.hasExactDefinition()) { 235353358Sdim Callee.addFnAttr("uniform-work-group-size", "false"); 236353358Sdim if (!Caller.hasFnAttribute("uniform-work-group-size")) 237353358Sdim Caller.addFnAttr("uniform-work-group-size", "false"); 238353358Sdim 239353358Sdim return true; 240353358Sdim } 241353358Sdim // Check if the Caller has the attribute 242353358Sdim if (Caller.hasFnAttribute("uniform-work-group-size")) { 243353358Sdim // Check if the value of the attribute is true 244353358Sdim if (Caller.getFnAttribute("uniform-work-group-size") 245353358Sdim .getValueAsString().equals("true")) { 246353358Sdim // Propagate the attribute to the Callee, if it does not have it 247353358Sdim if (!Callee.hasFnAttribute("uniform-work-group-size")) { 248353358Sdim Callee.addFnAttr("uniform-work-group-size", "true"); 249353358Sdim return true; 250353358Sdim } 251353358Sdim } else { 252353358Sdim Callee.addFnAttr("uniform-work-group-size", "false"); 253353358Sdim return true; 254353358Sdim } 255353358Sdim } else { 256353358Sdim // If the attribute is absent, set it as false 257353358Sdim Caller.addFnAttr("uniform-work-group-size", "false"); 258353358Sdim Callee.addFnAttr("uniform-work-group-size", "false"); 259353358Sdim return true; 260353358Sdim } 261353358Sdim return false; 262353358Sdim} 263353358Sdim 264321369Sdimbool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) { 265341825Sdim const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F); 266321369Sdim bool HasFlat = ST.hasFlatAddressSpace(); 267321369Sdim bool HasApertureRegs = ST.hasApertureRegs(); 268309124Sdim SmallPtrSet<const Constant *, 8> ConstantExprVisited; 269309124Sdim 270321369Sdim bool Changed = false; 271321369Sdim bool NeedQueuePtr = false; 272321369Sdim bool HaveCall = false; 273321369Sdim bool IsFunc = !AMDGPU::isEntryFunctionCC(F.getCallingConv()); 274321369Sdim 275321369Sdim for (BasicBlock &BB : F) { 276321369Sdim for (Instruction &I : BB) { 277321369Sdim CallSite CS(&I); 278321369Sdim if (CS) { 279321369Sdim Function *Callee = CS.getCalledFunction(); 280321369Sdim 281321369Sdim // TODO: Do something with indirect calls. 282321369Sdim if (!Callee) { 283321369Sdim if (!CS.isInlineAsm()) 284321369Sdim HaveCall = true; 285321369Sdim continue; 286321369Sdim } 287321369Sdim 288321369Sdim Intrinsic::ID IID = Callee->getIntrinsicID(); 289321369Sdim if (IID == Intrinsic::not_intrinsic) { 290321369Sdim HaveCall = true; 291321369Sdim copyFeaturesToFunction(F, *Callee, NeedQueuePtr); 292321369Sdim Changed = true; 293321369Sdim } else { 294321369Sdim bool NonKernelOnly = false; 295321369Sdim StringRef AttrName = intrinsicToAttrName(IID, 296321369Sdim NonKernelOnly, NeedQueuePtr); 297321369Sdim if (!AttrName.empty() && (IsFunc || !NonKernelOnly)) { 298321369Sdim F.addFnAttr(AttrName); 299321369Sdim Changed = true; 300321369Sdim } 301321369Sdim } 302321369Sdim } 303321369Sdim 304321369Sdim if (NeedQueuePtr || HasApertureRegs) 305321369Sdim continue; 306321369Sdim 307309124Sdim if (const AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(&I)) { 308344779Sdim if (castRequiresQueuePtr(ASC)) { 309321369Sdim NeedQueuePtr = true; 310321369Sdim continue; 311321369Sdim } 312309124Sdim } 313309124Sdim 314309124Sdim for (const Use &U : I.operands()) { 315309124Sdim const auto *OpC = dyn_cast<Constant>(U); 316309124Sdim if (!OpC) 317309124Sdim continue; 318309124Sdim 319344779Sdim if (visitConstantExprsRecursively(OpC, ConstantExprVisited)) { 320321369Sdim NeedQueuePtr = true; 321321369Sdim break; 322321369Sdim } 323309124Sdim } 324309124Sdim } 325309124Sdim } 326309124Sdim 327321369Sdim if (NeedQueuePtr) { 328321369Sdim F.addFnAttr("amdgpu-queue-ptr"); 329321369Sdim Changed = true; 330292915Sdim } 331292915Sdim 332321369Sdim // TODO: We could refine this to captured pointers that could possibly be 333321369Sdim // accessed by flat instructions. For now this is mostly a poor way of 334321369Sdim // estimating whether there are calls before argument lowering. 335321369Sdim if (HasFlat && !IsFunc && HaveCall) { 336321369Sdim F.addFnAttr("amdgpu-flat-scratch"); 337321369Sdim Changed = true; 338292915Sdim } 339292915Sdim 340292915Sdim return Changed; 341292915Sdim} 342292915Sdim 343321369Sdimbool AMDGPUAnnotateKernelFeatures::runOnSCC(CallGraphSCC &SCC) { 344353358Sdim bool Changed = false; 345292915Sdim 346321369Sdim for (CallGraphNode *I : SCC) { 347353358Sdim // Build a list of CallGraphNodes from most number of uses to least 348353358Sdim if (I->getNumReferences()) 349353358Sdim NodeList.push_back(I); 350353358Sdim else { 351353358Sdim processUniformWorkGroupAttribute(); 352353358Sdim NodeList.clear(); 353353358Sdim } 354353358Sdim 355321369Sdim Function *F = I->getFunction(); 356353358Sdim // Add feature attributes 357321369Sdim if (!F || F->isDeclaration()) 358321369Sdim continue; 359321369Sdim Changed |= addFeatureAttributes(*F); 360321369Sdim } 361309124Sdim 362321369Sdim return Changed; 363321369Sdim} 364292915Sdim 365321369Sdimbool AMDGPUAnnotateKernelFeatures::doInitialization(CallGraph &CG) { 366321369Sdim auto *TPC = getAnalysisIfAvailable<TargetPassConfig>(); 367321369Sdim if (!TPC) 368321369Sdim report_fatal_error("TargetMachine is required"); 369292915Sdim 370321369Sdim TM = &TPC->getTM<TargetMachine>(); 371321369Sdim return false; 372292915Sdim} 373292915Sdim 374321369SdimPass *llvm::createAMDGPUAnnotateKernelFeaturesPass() { 375292915Sdim return new AMDGPUAnnotateKernelFeatures(); 376292915Sdim} 377