NVPTXLowerArgs.cpp revision 360784
1//===-- NVPTXLowerArgs.cpp - Lower arguments ------------------------------===// 2// 3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4// See https://llvm.org/LICENSE.txt for license information. 5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6// 7//===----------------------------------------------------------------------===// 8// 9// 10// Arguments to kernel and device functions are passed via param space, 11// which imposes certain restrictions: 12// http://docs.nvidia.com/cuda/parallel-thread-execution/#state-spaces 13// 14// Kernel parameters are read-only and accessible only via ld.param 15// instruction, directly or via a pointer. Pointers to kernel 16// arguments can't be converted to generic address space. 17// 18// Device function parameters are directly accessible via 19// ld.param/st.param, but taking the address of one returns a pointer 20// to a copy created in local space which *can't* be used with 21// ld.param/st.param. 22// 23// Copying a byval struct into local memory in IR allows us to enforce 24// the param space restrictions, gives the rest of IR a pointer w/o 25// param space restrictions, and gives us an opportunity to eliminate 26// the copy. 27// 28// Pointer arguments to kernel functions need more work to be lowered: 29// 30// 1. Convert non-byval pointer arguments of CUDA kernels to pointers in the 31// global address space. This allows later optimizations to emit 32// ld.global.*/st.global.* for accessing these pointer arguments. For 33// example, 34// 35// define void @foo(float* %input) { 36// %v = load float, float* %input, align 4 37// ... 38// } 39// 40// becomes 41// 42// define void @foo(float* %input) { 43// %input2 = addrspacecast float* %input to float addrspace(1)* 44// %input3 = addrspacecast float addrspace(1)* %input2 to float* 45// %v = load float, float* %input3, align 4 46// ... 47// } 48// 49// Later, NVPTXInferAddressSpaces will optimize it to 50// 51// define void @foo(float* %input) { 52// %input2 = addrspacecast float* %input to float addrspace(1)* 53// %v = load float, float addrspace(1)* %input2, align 4 54// ... 55// } 56// 57// 2. Convert pointers in a byval kernel parameter to pointers in the global 58// address space. As #2, it allows NVPTX to emit more ld/st.global. E.g., 59// 60// struct S { 61// int *x; 62// int *y; 63// }; 64// __global__ void foo(S s) { 65// int *b = s.y; 66// // use b 67// } 68// 69// "b" points to the global address space. In the IR level, 70// 71// define void @foo({i32*, i32*}* byval %input) { 72// %b_ptr = getelementptr {i32*, i32*}, {i32*, i32*}* %input, i64 0, i32 1 73// %b = load i32*, i32** %b_ptr 74// ; use %b 75// } 76// 77// becomes 78// 79// define void @foo({i32*, i32*}* byval %input) { 80// %b_ptr = getelementptr {i32*, i32*}, {i32*, i32*}* %input, i64 0, i32 1 81// %b = load i32*, i32** %b_ptr 82// %b_global = addrspacecast i32* %b to i32 addrspace(1)* 83// %b_generic = addrspacecast i32 addrspace(1)* %b_global to i32* 84// ; use %b_generic 85// } 86// 87// TODO: merge this pass with NVPTXInferAddressSpaces so that other passes don't 88// cancel the addrspacecast pair this pass emits. 89//===----------------------------------------------------------------------===// 90 91#include "NVPTX.h" 92#include "NVPTXTargetMachine.h" 93#include "NVPTXUtilities.h" 94#include "MCTargetDesc/NVPTXBaseInfo.h" 95#include "llvm/Analysis/ValueTracking.h" 96#include "llvm/IR/Function.h" 97#include "llvm/IR/Instructions.h" 98#include "llvm/IR/Module.h" 99#include "llvm/IR/Type.h" 100#include "llvm/Pass.h" 101 102using namespace llvm; 103 104namespace llvm { 105void initializeNVPTXLowerArgsPass(PassRegistry &); 106} 107 108namespace { 109class NVPTXLowerArgs : public FunctionPass { 110 bool runOnFunction(Function &F) override; 111 112 bool runOnKernelFunction(Function &F); 113 bool runOnDeviceFunction(Function &F); 114 115 // handle byval parameters 116 void handleByValParam(Argument *Arg); 117 // Knowing Ptr must point to the global address space, this function 118 // addrspacecasts Ptr to global and then back to generic. This allows 119 // NVPTXInferAddressSpaces to fold the global-to-generic cast into 120 // loads/stores that appear later. 121 void markPointerAsGlobal(Value *Ptr); 122 123public: 124 static char ID; // Pass identification, replacement for typeid 125 NVPTXLowerArgs(const NVPTXTargetMachine *TM = nullptr) 126 : FunctionPass(ID), TM(TM) {} 127 StringRef getPassName() const override { 128 return "Lower pointer arguments of CUDA kernels"; 129 } 130 131private: 132 const NVPTXTargetMachine *TM; 133}; 134} // namespace 135 136char NVPTXLowerArgs::ID = 1; 137 138INITIALIZE_PASS(NVPTXLowerArgs, "nvptx-lower-args", 139 "Lower arguments (NVPTX)", false, false) 140 141// ============================================================================= 142// If the function had a byval struct ptr arg, say foo(%struct.x* byval %d), 143// then add the following instructions to the first basic block: 144// 145// %temp = alloca %struct.x, align 8 146// %tempd = addrspacecast %struct.x* %d to %struct.x addrspace(101)* 147// %tv = load %struct.x addrspace(101)* %tempd 148// store %struct.x %tv, %struct.x* %temp, align 8 149// 150// The above code allocates some space in the stack and copies the incoming 151// struct from param space to local space. 152// Then replace all occurrences of %d by %temp. 153// ============================================================================= 154void NVPTXLowerArgs::handleByValParam(Argument *Arg) { 155 Function *Func = Arg->getParent(); 156 Instruction *FirstInst = &(Func->getEntryBlock().front()); 157 PointerType *PType = dyn_cast<PointerType>(Arg->getType()); 158 159 assert(PType && "Expecting pointer type in handleByValParam"); 160 161 Type *StructType = PType->getElementType(); 162 unsigned AS = Func->getParent()->getDataLayout().getAllocaAddrSpace(); 163 AllocaInst *AllocA = new AllocaInst(StructType, AS, Arg->getName(), FirstInst); 164 // Set the alignment to alignment of the byval parameter. This is because, 165 // later load/stores assume that alignment, and we are going to replace 166 // the use of the byval parameter with this alloca instruction. 167 AllocA->setAlignment(MaybeAlign(Func->getParamAlignment(Arg->getArgNo()))); 168 Arg->replaceAllUsesWith(AllocA); 169 170 Value *ArgInParam = new AddrSpaceCastInst( 171 Arg, PointerType::get(StructType, ADDRESS_SPACE_PARAM), Arg->getName(), 172 FirstInst); 173 LoadInst *LI = 174 new LoadInst(StructType, ArgInParam, Arg->getName(), FirstInst); 175 new StoreInst(LI, AllocA, FirstInst); 176} 177 178void NVPTXLowerArgs::markPointerAsGlobal(Value *Ptr) { 179 if (Ptr->getType()->getPointerAddressSpace() == ADDRESS_SPACE_GLOBAL) 180 return; 181 182 // Deciding where to emit the addrspacecast pair. 183 BasicBlock::iterator InsertPt; 184 if (Argument *Arg = dyn_cast<Argument>(Ptr)) { 185 // Insert at the functon entry if Ptr is an argument. 186 InsertPt = Arg->getParent()->getEntryBlock().begin(); 187 } else { 188 // Insert right after Ptr if Ptr is an instruction. 189 InsertPt = ++cast<Instruction>(Ptr)->getIterator(); 190 assert(InsertPt != InsertPt->getParent()->end() && 191 "We don't call this function with Ptr being a terminator."); 192 } 193 194 Instruction *PtrInGlobal = new AddrSpaceCastInst( 195 Ptr, PointerType::get(Ptr->getType()->getPointerElementType(), 196 ADDRESS_SPACE_GLOBAL), 197 Ptr->getName(), &*InsertPt); 198 Value *PtrInGeneric = new AddrSpaceCastInst(PtrInGlobal, Ptr->getType(), 199 Ptr->getName(), &*InsertPt); 200 // Replace with PtrInGeneric all uses of Ptr except PtrInGlobal. 201 Ptr->replaceAllUsesWith(PtrInGeneric); 202 PtrInGlobal->setOperand(0, Ptr); 203} 204 205// ============================================================================= 206// Main function for this pass. 207// ============================================================================= 208bool NVPTXLowerArgs::runOnKernelFunction(Function &F) { 209 if (TM && TM->getDrvInterface() == NVPTX::CUDA) { 210 // Mark pointers in byval structs as global. 211 for (auto &B : F) { 212 for (auto &I : B) { 213 if (LoadInst *LI = dyn_cast<LoadInst>(&I)) { 214 if (LI->getType()->isPointerTy()) { 215 Value *UO = GetUnderlyingObject(LI->getPointerOperand(), 216 F.getParent()->getDataLayout()); 217 if (Argument *Arg = dyn_cast<Argument>(UO)) { 218 if (Arg->hasByValAttr()) { 219 // LI is a load from a pointer within a byval kernel parameter. 220 markPointerAsGlobal(LI); 221 } 222 } 223 } 224 } 225 } 226 } 227 } 228 229 for (Argument &Arg : F.args()) { 230 if (Arg.getType()->isPointerTy()) { 231 if (Arg.hasByValAttr()) 232 handleByValParam(&Arg); 233 else if (TM && TM->getDrvInterface() == NVPTX::CUDA) 234 markPointerAsGlobal(&Arg); 235 } 236 } 237 return true; 238} 239 240// Device functions only need to copy byval args into local memory. 241bool NVPTXLowerArgs::runOnDeviceFunction(Function &F) { 242 for (Argument &Arg : F.args()) 243 if (Arg.getType()->isPointerTy() && Arg.hasByValAttr()) 244 handleByValParam(&Arg); 245 return true; 246} 247 248bool NVPTXLowerArgs::runOnFunction(Function &F) { 249 return isKernelFunction(F) ? runOnKernelFunction(F) : runOnDeviceFunction(F); 250} 251 252FunctionPass * 253llvm::createNVPTXLowerArgsPass(const NVPTXTargetMachine *TM) { 254 return new NVPTXLowerArgs(TM); 255} 256