1226586Sdim//===----- CGCUDANV.cpp - Interface to NVIDIA CUDA Runtime ----------------===// 2226586Sdim// 3226586Sdim// The LLVM Compiler Infrastructure 4226586Sdim// 5226586Sdim// This file is distributed under the University of Illinois Open Source 6226586Sdim// License. See LICENSE.TXT for details. 7226586Sdim// 8226586Sdim//===----------------------------------------------------------------------===// 9226586Sdim// 10226586Sdim// This provides a class for CUDA code generation targeting the NVIDIA CUDA 11226586Sdim// runtime library. 12226586Sdim// 13226586Sdim//===----------------------------------------------------------------------===// 14226586Sdim 15226586Sdim#include "CGCUDARuntime.h" 16226586Sdim#include "CodeGenFunction.h" 17226586Sdim#include "CodeGenModule.h" 18226586Sdim#include "clang/AST/Decl.h" 19249423Sdim#include "llvm/IR/BasicBlock.h" 20276479Sdim#include "llvm/IR/CallSite.h" 21249423Sdim#include "llvm/IR/Constants.h" 22249423Sdim#include "llvm/IR/DerivedTypes.h" 23226586Sdim 24226586Sdimusing namespace clang; 25226586Sdimusing namespace CodeGen; 26226586Sdim 27226586Sdimnamespace { 28226586Sdim 29226586Sdimclass CGNVCUDARuntime : public CGCUDARuntime { 30226586Sdim 31226586Sdimprivate: 32288943Sdim llvm::Type *IntTy, *SizeTy, *VoidTy; 33288943Sdim llvm::PointerType *CharPtrTy, *VoidPtrTy, *VoidPtrPtrTy; 34226586Sdim 35288943Sdim /// Convenience reference to LLVM Context 36288943Sdim llvm::LLVMContext &Context; 37288943Sdim /// Convenience reference to the current module 38288943Sdim llvm::Module &TheModule; 39288943Sdim /// Keeps track of kernel launch stubs emitted in this module 40288943Sdim llvm::SmallVector<llvm::Function *, 16> EmittedKernels; 41288943Sdim /// Keeps track of variables containing handles of GPU binaries. Populated by 42288943Sdim /// ModuleCtorFunction() and used to create corresponding cleanup calls in 43288943Sdim /// ModuleDtorFunction() 44288943Sdim llvm::SmallVector<llvm::GlobalVariable *, 16> GpuBinaryHandles; 45288943Sdim 46226586Sdim llvm::Constant *getSetupArgumentFn() const; 47226586Sdim llvm::Constant *getLaunchFn() const; 48226586Sdim 49288943Sdim /// Creates a function to register all kernel stubs generated in this module. 50288943Sdim llvm::Function *makeRegisterKernelsFn(); 51288943Sdim 52288943Sdim /// Helper function that generates a constant string and returns a pointer to 53288943Sdim /// the start of the string. The result of this function can be used anywhere 54288943Sdim /// where the C code specifies const char*. 55288943Sdim llvm::Constant *makeConstantString(const std::string &Str, 56288943Sdim const std::string &Name = "", 57288943Sdim unsigned Alignment = 0) { 58288943Sdim llvm::Constant *Zeros[] = {llvm::ConstantInt::get(SizeTy, 0), 59288943Sdim llvm::ConstantInt::get(SizeTy, 0)}; 60296417Sdim auto ConstStr = CGM.GetAddrOfConstantCString(Str, Name.c_str()); 61296417Sdim return llvm::ConstantExpr::getGetElementPtr(ConstStr.getElementType(), 62296417Sdim ConstStr.getPointer(), Zeros); 63288943Sdim } 64288943Sdim 65288943Sdim void emitDeviceStubBody(CodeGenFunction &CGF, FunctionArgList &Args); 66288943Sdim 67226586Sdimpublic: 68226586Sdim CGNVCUDARuntime(CodeGenModule &CGM); 69226586Sdim 70288943Sdim void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) override; 71288943Sdim /// Creates module constructor function 72288943Sdim llvm::Function *makeModuleCtorFunction() override; 73288943Sdim /// Creates module destructor function 74288943Sdim llvm::Function *makeModuleDtorFunction() override; 75226586Sdim}; 76226586Sdim 77226586Sdim} 78226586Sdim 79288943SdimCGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM) 80288943Sdim : CGCUDARuntime(CGM), Context(CGM.getLLVMContext()), 81288943Sdim TheModule(CGM.getModule()) { 82226586Sdim CodeGen::CodeGenTypes &Types = CGM.getTypes(); 83226586Sdim ASTContext &Ctx = CGM.getContext(); 84226586Sdim 85226586Sdim IntTy = Types.ConvertType(Ctx.IntTy); 86226586Sdim SizeTy = Types.ConvertType(Ctx.getSizeType()); 87288943Sdim VoidTy = llvm::Type::getVoidTy(Context); 88226586Sdim 89226586Sdim CharPtrTy = llvm::PointerType::getUnqual(Types.ConvertType(Ctx.CharTy)); 90226586Sdim VoidPtrTy = cast<llvm::PointerType>(Types.ConvertType(Ctx.VoidPtrTy)); 91288943Sdim VoidPtrPtrTy = VoidPtrTy->getPointerTo(); 92226586Sdim} 93226586Sdim 94226586Sdimllvm::Constant *CGNVCUDARuntime::getSetupArgumentFn() const { 95226586Sdim // cudaError_t cudaSetupArgument(void *, size_t, size_t) 96226586Sdim std::vector<llvm::Type*> Params; 97226586Sdim Params.push_back(VoidPtrTy); 98226586Sdim Params.push_back(SizeTy); 99226586Sdim Params.push_back(SizeTy); 100226586Sdim return CGM.CreateRuntimeFunction(llvm::FunctionType::get(IntTy, 101226586Sdim Params, false), 102226586Sdim "cudaSetupArgument"); 103226586Sdim} 104226586Sdim 105226586Sdimllvm::Constant *CGNVCUDARuntime::getLaunchFn() const { 106226586Sdim // cudaError_t cudaLaunch(char *) 107288943Sdim return CGM.CreateRuntimeFunction( 108288943Sdim llvm::FunctionType::get(IntTy, CharPtrTy, false), "cudaLaunch"); 109226586Sdim} 110226586Sdim 111288943Sdimvoid CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF, 112288943Sdim FunctionArgList &Args) { 113288943Sdim EmittedKernels.push_back(CGF.CurFn); 114288943Sdim emitDeviceStubBody(CGF, Args); 115288943Sdim} 116288943Sdim 117288943Sdimvoid CGNVCUDARuntime::emitDeviceStubBody(CodeGenFunction &CGF, 118226586Sdim FunctionArgList &Args) { 119226586Sdim // Build the argument value list and the argument stack struct type. 120249423Sdim SmallVector<llvm::Value *, 16> ArgValues; 121226586Sdim std::vector<llvm::Type *> ArgTypes; 122226586Sdim for (FunctionArgList::const_iterator I = Args.begin(), E = Args.end(); 123226586Sdim I != E; ++I) { 124296417Sdim llvm::Value *V = CGF.GetAddrOfLocalVar(*I).getPointer(); 125226586Sdim ArgValues.push_back(V); 126226586Sdim assert(isa<llvm::PointerType>(V->getType()) && "Arg type not PointerType"); 127226586Sdim ArgTypes.push_back(cast<llvm::PointerType>(V->getType())->getElementType()); 128226586Sdim } 129288943Sdim llvm::StructType *ArgStackTy = llvm::StructType::get(Context, ArgTypes); 130226586Sdim 131226586Sdim llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end"); 132226586Sdim 133226586Sdim // Emit the calls to cudaSetupArgument 134226586Sdim llvm::Constant *cudaSetupArgFn = getSetupArgumentFn(); 135226586Sdim for (unsigned I = 0, E = Args.size(); I != E; ++I) { 136226586Sdim llvm::Value *Args[3]; 137226586Sdim llvm::BasicBlock *NextBlock = CGF.createBasicBlock("setup.next"); 138226586Sdim Args[0] = CGF.Builder.CreatePointerCast(ArgValues[I], VoidPtrTy); 139226586Sdim Args[1] = CGF.Builder.CreateIntCast( 140226586Sdim llvm::ConstantExpr::getSizeOf(ArgTypes[I]), 141226586Sdim SizeTy, false); 142226586Sdim Args[2] = CGF.Builder.CreateIntCast( 143226586Sdim llvm::ConstantExpr::getOffsetOf(ArgStackTy, I), 144226586Sdim SizeTy, false); 145249423Sdim llvm::CallSite CS = CGF.EmitRuntimeCallOrInvoke(cudaSetupArgFn, Args); 146226586Sdim llvm::Constant *Zero = llvm::ConstantInt::get(IntTy, 0); 147226586Sdim llvm::Value *CSZero = CGF.Builder.CreateICmpEQ(CS.getInstruction(), Zero); 148226586Sdim CGF.Builder.CreateCondBr(CSZero, NextBlock, EndBlock); 149226586Sdim CGF.EmitBlock(NextBlock); 150226586Sdim } 151226586Sdim 152226586Sdim // Emit the call to cudaLaunch 153226586Sdim llvm::Constant *cudaLaunchFn = getLaunchFn(); 154226586Sdim llvm::Value *Arg = CGF.Builder.CreatePointerCast(CGF.CurFn, CharPtrTy); 155249423Sdim CGF.EmitRuntimeCallOrInvoke(cudaLaunchFn, Arg); 156226586Sdim CGF.EmitBranch(EndBlock); 157226586Sdim 158226586Sdim CGF.EmitBlock(EndBlock); 159226586Sdim} 160226586Sdim 161288943Sdim/// Creates internal function to register all kernel stubs generated in this 162288943Sdim/// module with the CUDA runtime. 163288943Sdim/// \code 164288943Sdim/// void __cuda_register_kernels(void** GpuBinaryHandle) { 165288943Sdim/// __cudaRegisterFunction(GpuBinaryHandle,Kernel0,...); 166288943Sdim/// ... 167288943Sdim/// __cudaRegisterFunction(GpuBinaryHandle,KernelM,...); 168288943Sdim/// } 169288943Sdim/// \endcode 170288943Sdimllvm::Function *CGNVCUDARuntime::makeRegisterKernelsFn() { 171288943Sdim llvm::Function *RegisterKernelsFunc = llvm::Function::Create( 172288943Sdim llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false), 173288943Sdim llvm::GlobalValue::InternalLinkage, "__cuda_register_kernels", &TheModule); 174288943Sdim llvm::BasicBlock *EntryBB = 175288943Sdim llvm::BasicBlock::Create(Context, "entry", RegisterKernelsFunc); 176296417Sdim CGBuilderTy Builder(CGM, Context); 177288943Sdim Builder.SetInsertPoint(EntryBB); 178288943Sdim 179288943Sdim // void __cudaRegisterFunction(void **, const char *, char *, const char *, 180288943Sdim // int, uint3*, uint3*, dim3*, dim3*, int*) 181288943Sdim std::vector<llvm::Type *> RegisterFuncParams = { 182288943Sdim VoidPtrPtrTy, CharPtrTy, CharPtrTy, CharPtrTy, IntTy, 183288943Sdim VoidPtrTy, VoidPtrTy, VoidPtrTy, VoidPtrTy, IntTy->getPointerTo()}; 184288943Sdim llvm::Constant *RegisterFunc = CGM.CreateRuntimeFunction( 185288943Sdim llvm::FunctionType::get(IntTy, RegisterFuncParams, false), 186288943Sdim "__cudaRegisterFunction"); 187288943Sdim 188288943Sdim // Extract GpuBinaryHandle passed as the first argument passed to 189288943Sdim // __cuda_register_kernels() and generate __cudaRegisterFunction() call for 190288943Sdim // each emitted kernel. 191288943Sdim llvm::Argument &GpuBinaryHandlePtr = *RegisterKernelsFunc->arg_begin(); 192288943Sdim for (llvm::Function *Kernel : EmittedKernels) { 193288943Sdim llvm::Constant *KernelName = makeConstantString(Kernel->getName()); 194288943Sdim llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(VoidPtrTy); 195288943Sdim llvm::Value *args[] = { 196288943Sdim &GpuBinaryHandlePtr, Builder.CreateBitCast(Kernel, VoidPtrTy), 197288943Sdim KernelName, KernelName, llvm::ConstantInt::get(IntTy, -1), NullPtr, 198288943Sdim NullPtr, NullPtr, NullPtr, 199288943Sdim llvm::ConstantPointerNull::get(IntTy->getPointerTo())}; 200288943Sdim Builder.CreateCall(RegisterFunc, args); 201288943Sdim } 202288943Sdim 203288943Sdim Builder.CreateRetVoid(); 204288943Sdim return RegisterKernelsFunc; 205288943Sdim} 206288943Sdim 207288943Sdim/// Creates a global constructor function for the module: 208288943Sdim/// \code 209288943Sdim/// void __cuda_module_ctor(void*) { 210288943Sdim/// Handle0 = __cudaRegisterFatBinary(GpuBinaryBlob0); 211288943Sdim/// __cuda_register_kernels(Handle0); 212288943Sdim/// ... 213288943Sdim/// HandleN = __cudaRegisterFatBinary(GpuBinaryBlobN); 214288943Sdim/// __cuda_register_kernels(HandleN); 215288943Sdim/// } 216288943Sdim/// \endcode 217288943Sdimllvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() { 218288943Sdim // void __cuda_register_kernels(void* handle); 219288943Sdim llvm::Function *RegisterKernelsFunc = makeRegisterKernelsFn(); 220288943Sdim // void ** __cudaRegisterFatBinary(void *); 221288943Sdim llvm::Constant *RegisterFatbinFunc = CGM.CreateRuntimeFunction( 222288943Sdim llvm::FunctionType::get(VoidPtrPtrTy, VoidPtrTy, false), 223288943Sdim "__cudaRegisterFatBinary"); 224288943Sdim // struct { int magic, int version, void * gpu_binary, void * dont_care }; 225288943Sdim llvm::StructType *FatbinWrapperTy = 226288943Sdim llvm::StructType::get(IntTy, IntTy, VoidPtrTy, VoidPtrTy, nullptr); 227288943Sdim 228288943Sdim llvm::Function *ModuleCtorFunc = llvm::Function::Create( 229288943Sdim llvm::FunctionType::get(VoidTy, VoidPtrTy, false), 230288943Sdim llvm::GlobalValue::InternalLinkage, "__cuda_module_ctor", &TheModule); 231288943Sdim llvm::BasicBlock *CtorEntryBB = 232288943Sdim llvm::BasicBlock::Create(Context, "entry", ModuleCtorFunc); 233296417Sdim CGBuilderTy CtorBuilder(CGM, Context); 234288943Sdim 235288943Sdim CtorBuilder.SetInsertPoint(CtorEntryBB); 236288943Sdim 237288943Sdim // For each GPU binary, register it with the CUDA runtime and store returned 238288943Sdim // handle in a global variable and save the handle in GpuBinaryHandles vector 239288943Sdim // to be cleaned up in destructor on exit. Then associate all known kernels 240288943Sdim // with the GPU binary handle so CUDA runtime can figure out what to call on 241288943Sdim // the GPU side. 242288943Sdim for (const std::string &GpuBinaryFileName : 243288943Sdim CGM.getCodeGenOpts().CudaGpuBinaryFileNames) { 244288943Sdim llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> GpuBinaryOrErr = 245288943Sdim llvm::MemoryBuffer::getFileOrSTDIN(GpuBinaryFileName); 246288943Sdim if (std::error_code EC = GpuBinaryOrErr.getError()) { 247288943Sdim CGM.getDiags().Report(diag::err_cannot_open_file) << GpuBinaryFileName 248288943Sdim << EC.message(); 249288943Sdim continue; 250288943Sdim } 251288943Sdim 252288943Sdim // Create initialized wrapper structure that points to the loaded GPU binary 253288943Sdim llvm::Constant *Values[] = { 254288943Sdim llvm::ConstantInt::get(IntTy, 0x466243b1), // Fatbin wrapper magic. 255288943Sdim llvm::ConstantInt::get(IntTy, 1), // Fatbin version. 256288943Sdim makeConstantString(GpuBinaryOrErr.get()->getBuffer(), "", 16), // Data. 257288943Sdim llvm::ConstantPointerNull::get(VoidPtrTy)}; // Unused in fatbin v1. 258288943Sdim llvm::GlobalVariable *FatbinWrapper = new llvm::GlobalVariable( 259288943Sdim TheModule, FatbinWrapperTy, true, llvm::GlobalValue::InternalLinkage, 260288943Sdim llvm::ConstantStruct::get(FatbinWrapperTy, Values), 261288943Sdim "__cuda_fatbin_wrapper"); 262288943Sdim 263288943Sdim // GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper); 264288943Sdim llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall( 265288943Sdim RegisterFatbinFunc, 266288943Sdim CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy)); 267288943Sdim llvm::GlobalVariable *GpuBinaryHandle = new llvm::GlobalVariable( 268288943Sdim TheModule, VoidPtrPtrTy, false, llvm::GlobalValue::InternalLinkage, 269288943Sdim llvm::ConstantPointerNull::get(VoidPtrPtrTy), "__cuda_gpubin_handle"); 270296417Sdim CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle, 271296417Sdim CGM.getPointerAlign()); 272288943Sdim 273288943Sdim // Call __cuda_register_kernels(GpuBinaryHandle); 274288943Sdim CtorBuilder.CreateCall(RegisterKernelsFunc, RegisterFatbinCall); 275288943Sdim 276288943Sdim // Save GpuBinaryHandle so we can unregister it in destructor. 277288943Sdim GpuBinaryHandles.push_back(GpuBinaryHandle); 278288943Sdim } 279288943Sdim 280288943Sdim CtorBuilder.CreateRetVoid(); 281288943Sdim return ModuleCtorFunc; 282288943Sdim} 283288943Sdim 284288943Sdim/// Creates a global destructor function that unregisters all GPU code blobs 285288943Sdim/// registered by constructor. 286288943Sdim/// \code 287288943Sdim/// void __cuda_module_dtor(void*) { 288288943Sdim/// __cudaUnregisterFatBinary(Handle0); 289288943Sdim/// ... 290288943Sdim/// __cudaUnregisterFatBinary(HandleN); 291288943Sdim/// } 292288943Sdim/// \endcode 293288943Sdimllvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() { 294288943Sdim // void __cudaUnregisterFatBinary(void ** handle); 295288943Sdim llvm::Constant *UnregisterFatbinFunc = CGM.CreateRuntimeFunction( 296288943Sdim llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false), 297288943Sdim "__cudaUnregisterFatBinary"); 298288943Sdim 299288943Sdim llvm::Function *ModuleDtorFunc = llvm::Function::Create( 300288943Sdim llvm::FunctionType::get(VoidTy, VoidPtrTy, false), 301288943Sdim llvm::GlobalValue::InternalLinkage, "__cuda_module_dtor", &TheModule); 302288943Sdim llvm::BasicBlock *DtorEntryBB = 303288943Sdim llvm::BasicBlock::Create(Context, "entry", ModuleDtorFunc); 304296417Sdim CGBuilderTy DtorBuilder(CGM, Context); 305288943Sdim DtorBuilder.SetInsertPoint(DtorEntryBB); 306288943Sdim 307288943Sdim for (llvm::GlobalVariable *GpuBinaryHandle : GpuBinaryHandles) { 308296417Sdim auto HandleValue = 309296417Sdim DtorBuilder.CreateAlignedLoad(GpuBinaryHandle, CGM.getPointerAlign()); 310296417Sdim DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue); 311288943Sdim } 312288943Sdim 313288943Sdim DtorBuilder.CreateRetVoid(); 314288943Sdim return ModuleDtorFunc; 315288943Sdim} 316288943Sdim 317226586SdimCGCUDARuntime *CodeGen::CreateNVCUDARuntime(CodeGenModule &CGM) { 318226586Sdim return new CGNVCUDARuntime(CGM); 319226586Sdim} 320