1226586Sdim//===----- CGCUDANV.cpp - Interface to NVIDIA CUDA Runtime ----------------===//
2226586Sdim//
3226586Sdim//                     The LLVM Compiler Infrastructure
4226586Sdim//
5226586Sdim// This file is distributed under the University of Illinois Open Source
6226586Sdim// License. See LICENSE.TXT for details.
7226586Sdim//
8226586Sdim//===----------------------------------------------------------------------===//
9226586Sdim//
10226586Sdim// This provides a class for CUDA code generation targeting the NVIDIA CUDA
11226586Sdim// runtime library.
12226586Sdim//
13226586Sdim//===----------------------------------------------------------------------===//
14226586Sdim
15226586Sdim#include "CGCUDARuntime.h"
16226586Sdim#include "CodeGenFunction.h"
17226586Sdim#include "CodeGenModule.h"
18226586Sdim#include "clang/AST/Decl.h"
19249423Sdim#include "llvm/IR/BasicBlock.h"
20276479Sdim#include "llvm/IR/CallSite.h"
21249423Sdim#include "llvm/IR/Constants.h"
22249423Sdim#include "llvm/IR/DerivedTypes.h"
23226586Sdim
24226586Sdimusing namespace clang;
25226586Sdimusing namespace CodeGen;
26226586Sdim
27226586Sdimnamespace {
28226586Sdim
29226586Sdimclass CGNVCUDARuntime : public CGCUDARuntime {
30226586Sdim
31226586Sdimprivate:
32288943Sdim  llvm::Type *IntTy, *SizeTy, *VoidTy;
33288943Sdim  llvm::PointerType *CharPtrTy, *VoidPtrTy, *VoidPtrPtrTy;
34226586Sdim
35288943Sdim  /// Convenience reference to LLVM Context
36288943Sdim  llvm::LLVMContext &Context;
37288943Sdim  /// Convenience reference to the current module
38288943Sdim  llvm::Module &TheModule;
39288943Sdim  /// Keeps track of kernel launch stubs emitted in this module
40288943Sdim  llvm::SmallVector<llvm::Function *, 16> EmittedKernels;
41288943Sdim  /// Keeps track of variables containing handles of GPU binaries. Populated by
42288943Sdim  /// ModuleCtorFunction() and used to create corresponding cleanup calls in
43288943Sdim  /// ModuleDtorFunction()
44288943Sdim  llvm::SmallVector<llvm::GlobalVariable *, 16> GpuBinaryHandles;
45288943Sdim
46226586Sdim  llvm::Constant *getSetupArgumentFn() const;
47226586Sdim  llvm::Constant *getLaunchFn() const;
48226586Sdim
49288943Sdim  /// Creates a function to register all kernel stubs generated in this module.
50288943Sdim  llvm::Function *makeRegisterKernelsFn();
51288943Sdim
52288943Sdim  /// Helper function that generates a constant string and returns a pointer to
53288943Sdim  /// the start of the string.  The result of this function can be used anywhere
54288943Sdim  /// where the C code specifies const char*.
55288943Sdim  llvm::Constant *makeConstantString(const std::string &Str,
56288943Sdim                                     const std::string &Name = "",
57288943Sdim                                     unsigned Alignment = 0) {
58288943Sdim    llvm::Constant *Zeros[] = {llvm::ConstantInt::get(SizeTy, 0),
59288943Sdim                               llvm::ConstantInt::get(SizeTy, 0)};
60296417Sdim    auto ConstStr = CGM.GetAddrOfConstantCString(Str, Name.c_str());
61296417Sdim    return llvm::ConstantExpr::getGetElementPtr(ConstStr.getElementType(),
62296417Sdim                                                ConstStr.getPointer(), Zeros);
63288943Sdim }
64288943Sdim
65288943Sdim  void emitDeviceStubBody(CodeGenFunction &CGF, FunctionArgList &Args);
66288943Sdim
67226586Sdimpublic:
68226586Sdim  CGNVCUDARuntime(CodeGenModule &CGM);
69226586Sdim
70288943Sdim  void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) override;
71288943Sdim  /// Creates module constructor function
72288943Sdim  llvm::Function *makeModuleCtorFunction() override;
73288943Sdim  /// Creates module destructor function
74288943Sdim  llvm::Function *makeModuleDtorFunction() override;
75226586Sdim};
76226586Sdim
77226586Sdim}
78226586Sdim
79288943SdimCGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM)
80288943Sdim    : CGCUDARuntime(CGM), Context(CGM.getLLVMContext()),
81288943Sdim      TheModule(CGM.getModule()) {
82226586Sdim  CodeGen::CodeGenTypes &Types = CGM.getTypes();
83226586Sdim  ASTContext &Ctx = CGM.getContext();
84226586Sdim
85226586Sdim  IntTy = Types.ConvertType(Ctx.IntTy);
86226586Sdim  SizeTy = Types.ConvertType(Ctx.getSizeType());
87288943Sdim  VoidTy = llvm::Type::getVoidTy(Context);
88226586Sdim
89226586Sdim  CharPtrTy = llvm::PointerType::getUnqual(Types.ConvertType(Ctx.CharTy));
90226586Sdim  VoidPtrTy = cast<llvm::PointerType>(Types.ConvertType(Ctx.VoidPtrTy));
91288943Sdim  VoidPtrPtrTy = VoidPtrTy->getPointerTo();
92226586Sdim}
93226586Sdim
94226586Sdimllvm::Constant *CGNVCUDARuntime::getSetupArgumentFn() const {
95226586Sdim  // cudaError_t cudaSetupArgument(void *, size_t, size_t)
96226586Sdim  std::vector<llvm::Type*> Params;
97226586Sdim  Params.push_back(VoidPtrTy);
98226586Sdim  Params.push_back(SizeTy);
99226586Sdim  Params.push_back(SizeTy);
100226586Sdim  return CGM.CreateRuntimeFunction(llvm::FunctionType::get(IntTy,
101226586Sdim                                                           Params, false),
102226586Sdim                                   "cudaSetupArgument");
103226586Sdim}
104226586Sdim
105226586Sdimllvm::Constant *CGNVCUDARuntime::getLaunchFn() const {
106226586Sdim  // cudaError_t cudaLaunch(char *)
107288943Sdim  return CGM.CreateRuntimeFunction(
108288943Sdim      llvm::FunctionType::get(IntTy, CharPtrTy, false), "cudaLaunch");
109226586Sdim}
110226586Sdim
111288943Sdimvoid CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF,
112288943Sdim                                     FunctionArgList &Args) {
113288943Sdim  EmittedKernels.push_back(CGF.CurFn);
114288943Sdim  emitDeviceStubBody(CGF, Args);
115288943Sdim}
116288943Sdim
117288943Sdimvoid CGNVCUDARuntime::emitDeviceStubBody(CodeGenFunction &CGF,
118226586Sdim                                         FunctionArgList &Args) {
119226586Sdim  // Build the argument value list and the argument stack struct type.
120249423Sdim  SmallVector<llvm::Value *, 16> ArgValues;
121226586Sdim  std::vector<llvm::Type *> ArgTypes;
122226586Sdim  for (FunctionArgList::const_iterator I = Args.begin(), E = Args.end();
123226586Sdim       I != E; ++I) {
124296417Sdim    llvm::Value *V = CGF.GetAddrOfLocalVar(*I).getPointer();
125226586Sdim    ArgValues.push_back(V);
126226586Sdim    assert(isa<llvm::PointerType>(V->getType()) && "Arg type not PointerType");
127226586Sdim    ArgTypes.push_back(cast<llvm::PointerType>(V->getType())->getElementType());
128226586Sdim  }
129288943Sdim  llvm::StructType *ArgStackTy = llvm::StructType::get(Context, ArgTypes);
130226586Sdim
131226586Sdim  llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
132226586Sdim
133226586Sdim  // Emit the calls to cudaSetupArgument
134226586Sdim  llvm::Constant *cudaSetupArgFn = getSetupArgumentFn();
135226586Sdim  for (unsigned I = 0, E = Args.size(); I != E; ++I) {
136226586Sdim    llvm::Value *Args[3];
137226586Sdim    llvm::BasicBlock *NextBlock = CGF.createBasicBlock("setup.next");
138226586Sdim    Args[0] = CGF.Builder.CreatePointerCast(ArgValues[I], VoidPtrTy);
139226586Sdim    Args[1] = CGF.Builder.CreateIntCast(
140226586Sdim        llvm::ConstantExpr::getSizeOf(ArgTypes[I]),
141226586Sdim        SizeTy, false);
142226586Sdim    Args[2] = CGF.Builder.CreateIntCast(
143226586Sdim        llvm::ConstantExpr::getOffsetOf(ArgStackTy, I),
144226586Sdim        SizeTy, false);
145249423Sdim    llvm::CallSite CS = CGF.EmitRuntimeCallOrInvoke(cudaSetupArgFn, Args);
146226586Sdim    llvm::Constant *Zero = llvm::ConstantInt::get(IntTy, 0);
147226586Sdim    llvm::Value *CSZero = CGF.Builder.CreateICmpEQ(CS.getInstruction(), Zero);
148226586Sdim    CGF.Builder.CreateCondBr(CSZero, NextBlock, EndBlock);
149226586Sdim    CGF.EmitBlock(NextBlock);
150226586Sdim  }
151226586Sdim
152226586Sdim  // Emit the call to cudaLaunch
153226586Sdim  llvm::Constant *cudaLaunchFn = getLaunchFn();
154226586Sdim  llvm::Value *Arg = CGF.Builder.CreatePointerCast(CGF.CurFn, CharPtrTy);
155249423Sdim  CGF.EmitRuntimeCallOrInvoke(cudaLaunchFn, Arg);
156226586Sdim  CGF.EmitBranch(EndBlock);
157226586Sdim
158226586Sdim  CGF.EmitBlock(EndBlock);
159226586Sdim}
160226586Sdim
161288943Sdim/// Creates internal function to register all kernel stubs generated in this
162288943Sdim/// module with the CUDA runtime.
163288943Sdim/// \code
164288943Sdim/// void __cuda_register_kernels(void** GpuBinaryHandle) {
165288943Sdim///    __cudaRegisterFunction(GpuBinaryHandle,Kernel0,...);
166288943Sdim///    ...
167288943Sdim///    __cudaRegisterFunction(GpuBinaryHandle,KernelM,...);
168288943Sdim/// }
169288943Sdim/// \endcode
170288943Sdimllvm::Function *CGNVCUDARuntime::makeRegisterKernelsFn() {
171288943Sdim  llvm::Function *RegisterKernelsFunc = llvm::Function::Create(
172288943Sdim      llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false),
173288943Sdim      llvm::GlobalValue::InternalLinkage, "__cuda_register_kernels", &TheModule);
174288943Sdim  llvm::BasicBlock *EntryBB =
175288943Sdim      llvm::BasicBlock::Create(Context, "entry", RegisterKernelsFunc);
176296417Sdim  CGBuilderTy Builder(CGM, Context);
177288943Sdim  Builder.SetInsertPoint(EntryBB);
178288943Sdim
179288943Sdim  // void __cudaRegisterFunction(void **, const char *, char *, const char *,
180288943Sdim  //                             int, uint3*, uint3*, dim3*, dim3*, int*)
181288943Sdim  std::vector<llvm::Type *> RegisterFuncParams = {
182288943Sdim      VoidPtrPtrTy, CharPtrTy, CharPtrTy, CharPtrTy, IntTy,
183288943Sdim      VoidPtrTy,    VoidPtrTy, VoidPtrTy, VoidPtrTy, IntTy->getPointerTo()};
184288943Sdim  llvm::Constant *RegisterFunc = CGM.CreateRuntimeFunction(
185288943Sdim      llvm::FunctionType::get(IntTy, RegisterFuncParams, false),
186288943Sdim      "__cudaRegisterFunction");
187288943Sdim
188288943Sdim  // Extract GpuBinaryHandle passed as the first argument passed to
189288943Sdim  // __cuda_register_kernels() and generate __cudaRegisterFunction() call for
190288943Sdim  // each emitted kernel.
191288943Sdim  llvm::Argument &GpuBinaryHandlePtr = *RegisterKernelsFunc->arg_begin();
192288943Sdim  for (llvm::Function *Kernel : EmittedKernels) {
193288943Sdim    llvm::Constant *KernelName = makeConstantString(Kernel->getName());
194288943Sdim    llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(VoidPtrTy);
195288943Sdim    llvm::Value *args[] = {
196288943Sdim        &GpuBinaryHandlePtr, Builder.CreateBitCast(Kernel, VoidPtrTy),
197288943Sdim        KernelName, KernelName, llvm::ConstantInt::get(IntTy, -1), NullPtr,
198288943Sdim        NullPtr, NullPtr, NullPtr,
199288943Sdim        llvm::ConstantPointerNull::get(IntTy->getPointerTo())};
200288943Sdim    Builder.CreateCall(RegisterFunc, args);
201288943Sdim  }
202288943Sdim
203288943Sdim  Builder.CreateRetVoid();
204288943Sdim  return RegisterKernelsFunc;
205288943Sdim}
206288943Sdim
207288943Sdim/// Creates a global constructor function for the module:
208288943Sdim/// \code
209288943Sdim/// void __cuda_module_ctor(void*) {
210288943Sdim///     Handle0 = __cudaRegisterFatBinary(GpuBinaryBlob0);
211288943Sdim///     __cuda_register_kernels(Handle0);
212288943Sdim///     ...
213288943Sdim///     HandleN = __cudaRegisterFatBinary(GpuBinaryBlobN);
214288943Sdim///     __cuda_register_kernels(HandleN);
215288943Sdim/// }
216288943Sdim/// \endcode
217288943Sdimllvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
218288943Sdim  // void __cuda_register_kernels(void* handle);
219288943Sdim  llvm::Function *RegisterKernelsFunc = makeRegisterKernelsFn();
220288943Sdim  // void ** __cudaRegisterFatBinary(void *);
221288943Sdim  llvm::Constant *RegisterFatbinFunc = CGM.CreateRuntimeFunction(
222288943Sdim      llvm::FunctionType::get(VoidPtrPtrTy, VoidPtrTy, false),
223288943Sdim      "__cudaRegisterFatBinary");
224288943Sdim  // struct { int magic, int version, void * gpu_binary, void * dont_care };
225288943Sdim  llvm::StructType *FatbinWrapperTy =
226288943Sdim      llvm::StructType::get(IntTy, IntTy, VoidPtrTy, VoidPtrTy, nullptr);
227288943Sdim
228288943Sdim  llvm::Function *ModuleCtorFunc = llvm::Function::Create(
229288943Sdim      llvm::FunctionType::get(VoidTy, VoidPtrTy, false),
230288943Sdim      llvm::GlobalValue::InternalLinkage, "__cuda_module_ctor", &TheModule);
231288943Sdim  llvm::BasicBlock *CtorEntryBB =
232288943Sdim      llvm::BasicBlock::Create(Context, "entry", ModuleCtorFunc);
233296417Sdim  CGBuilderTy CtorBuilder(CGM, Context);
234288943Sdim
235288943Sdim  CtorBuilder.SetInsertPoint(CtorEntryBB);
236288943Sdim
237288943Sdim  // For each GPU binary, register it with the CUDA runtime and store returned
238288943Sdim  // handle in a global variable and save the handle in GpuBinaryHandles vector
239288943Sdim  // to be cleaned up in destructor on exit. Then associate all known kernels
240288943Sdim  // with the GPU binary handle so CUDA runtime can figure out what to call on
241288943Sdim  // the GPU side.
242288943Sdim  for (const std::string &GpuBinaryFileName :
243288943Sdim       CGM.getCodeGenOpts().CudaGpuBinaryFileNames) {
244288943Sdim    llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> GpuBinaryOrErr =
245288943Sdim        llvm::MemoryBuffer::getFileOrSTDIN(GpuBinaryFileName);
246288943Sdim    if (std::error_code EC = GpuBinaryOrErr.getError()) {
247288943Sdim      CGM.getDiags().Report(diag::err_cannot_open_file) << GpuBinaryFileName
248288943Sdim                                                        << EC.message();
249288943Sdim      continue;
250288943Sdim    }
251288943Sdim
252288943Sdim    // Create initialized wrapper structure that points to the loaded GPU binary
253288943Sdim    llvm::Constant *Values[] = {
254288943Sdim        llvm::ConstantInt::get(IntTy, 0x466243b1), // Fatbin wrapper magic.
255288943Sdim        llvm::ConstantInt::get(IntTy, 1),          // Fatbin version.
256288943Sdim        makeConstantString(GpuBinaryOrErr.get()->getBuffer(), "", 16), // Data.
257288943Sdim        llvm::ConstantPointerNull::get(VoidPtrTy)}; // Unused in fatbin v1.
258288943Sdim    llvm::GlobalVariable *FatbinWrapper = new llvm::GlobalVariable(
259288943Sdim        TheModule, FatbinWrapperTy, true, llvm::GlobalValue::InternalLinkage,
260288943Sdim        llvm::ConstantStruct::get(FatbinWrapperTy, Values),
261288943Sdim        "__cuda_fatbin_wrapper");
262288943Sdim
263288943Sdim    // GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper);
264288943Sdim    llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
265288943Sdim        RegisterFatbinFunc,
266288943Sdim        CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
267288943Sdim    llvm::GlobalVariable *GpuBinaryHandle = new llvm::GlobalVariable(
268288943Sdim        TheModule, VoidPtrPtrTy, false, llvm::GlobalValue::InternalLinkage,
269288943Sdim        llvm::ConstantPointerNull::get(VoidPtrPtrTy), "__cuda_gpubin_handle");
270296417Sdim    CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle,
271296417Sdim                                   CGM.getPointerAlign());
272288943Sdim
273288943Sdim    // Call __cuda_register_kernels(GpuBinaryHandle);
274288943Sdim    CtorBuilder.CreateCall(RegisterKernelsFunc, RegisterFatbinCall);
275288943Sdim
276288943Sdim    // Save GpuBinaryHandle so we can unregister it in destructor.
277288943Sdim    GpuBinaryHandles.push_back(GpuBinaryHandle);
278288943Sdim  }
279288943Sdim
280288943Sdim  CtorBuilder.CreateRetVoid();
281288943Sdim  return ModuleCtorFunc;
282288943Sdim}
283288943Sdim
284288943Sdim/// Creates a global destructor function that unregisters all GPU code blobs
285288943Sdim/// registered by constructor.
286288943Sdim/// \code
287288943Sdim/// void __cuda_module_dtor(void*) {
288288943Sdim///     __cudaUnregisterFatBinary(Handle0);
289288943Sdim///     ...
290288943Sdim///     __cudaUnregisterFatBinary(HandleN);
291288943Sdim/// }
292288943Sdim/// \endcode
293288943Sdimllvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() {
294288943Sdim  // void __cudaUnregisterFatBinary(void ** handle);
295288943Sdim  llvm::Constant *UnregisterFatbinFunc = CGM.CreateRuntimeFunction(
296288943Sdim      llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false),
297288943Sdim      "__cudaUnregisterFatBinary");
298288943Sdim
299288943Sdim  llvm::Function *ModuleDtorFunc = llvm::Function::Create(
300288943Sdim      llvm::FunctionType::get(VoidTy, VoidPtrTy, false),
301288943Sdim      llvm::GlobalValue::InternalLinkage, "__cuda_module_dtor", &TheModule);
302288943Sdim  llvm::BasicBlock *DtorEntryBB =
303288943Sdim      llvm::BasicBlock::Create(Context, "entry", ModuleDtorFunc);
304296417Sdim  CGBuilderTy DtorBuilder(CGM, Context);
305288943Sdim  DtorBuilder.SetInsertPoint(DtorEntryBB);
306288943Sdim
307288943Sdim  for (llvm::GlobalVariable *GpuBinaryHandle : GpuBinaryHandles) {
308296417Sdim    auto HandleValue =
309296417Sdim      DtorBuilder.CreateAlignedLoad(GpuBinaryHandle, CGM.getPointerAlign());
310296417Sdim    DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
311288943Sdim  }
312288943Sdim
313288943Sdim  DtorBuilder.CreateRetVoid();
314288943Sdim  return ModuleDtorFunc;
315288943Sdim}
316288943Sdim
317226586SdimCGCUDARuntime *CodeGen::CreateNVCUDARuntime(CodeGenModule &CGM) {
318226586Sdim  return new CGNVCUDARuntime(CGM);
319226586Sdim}
320