//===----- CGCUDANV.cpp - Interface to NVIDIA CUDA Runtime ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This provides a class for CUDA code generation targeting the NVIDIA CUDA
// runtime library.
//
//===----------------------------------------------------------------------===//

#include "CGCUDARuntime.h"
#include "CodeGenFunction.h"
#include "CodeGenModule.h"
#include "clang/AST/Decl.h"
#include "clang/Basic/Cuda.h"
#include "clang/CodeGen/CodeGenABITypes.h"
#include "clang/CodeGen/ConstantInitBuilder.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/Support/Format.h"

using namespace clang;
using namespace CodeGen;

namespace {
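// Fatbin wrapper magic numbers expected by the respective runtimes.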
constexpr unsigned CudaFatMagic = 0x466243b1;
constexpr unsigned HIPFatMagic = 0x48495046; // "HIPF"

class CGNVCUDARuntime : public CGCUDARuntime {

private:
  llvm::IntegerType *IntTy, *SizeTy;
  llvm::Type *VoidTy;
  llvm::PointerType *CharPtrTy, *VoidPtrTy, *VoidPtrPtrTy;

  /// Convenience reference to LLVM Context
  llvm::LLVMContext &Context;
  /// Convenience reference to the current module
  llvm::Module &TheModule;
  /// Keeps track of kernel launch stubs emitted in this module
  struct KernelInfo {
    llvm::Function *Kernel;
    const Decl *D;
  };
  llvm::SmallVector<KernelInfo, 16> EmittedKernels;
  struct VarInfo {
    llvm::GlobalVariable *Var;
    const VarDecl *D;
    DeviceVarFlags Flags;
  };
  llvm::SmallVector<VarInfo, 16> DeviceVars;
  /// Keeps track of the variable containing the handle of the GPU binary.
  /// Populated by ModuleCtorFunction() and used to create corresponding
  /// cleanup calls in ModuleDtorFunction().
  llvm::GlobalVariable *GpuBinaryHandle = nullptr;
  /// Whether we generate relocatable device code.
  bool RelocatableDeviceCode;
  /// Mangle context for device.
  std::unique_ptr<MangleContext> DeviceMC;

  llvm::FunctionCallee getSetupArgumentFn() const;
  llvm::FunctionCallee getLaunchFn() const;

  llvm::FunctionType *getRegisterGlobalsFnTy() const;
  llvm::FunctionType *getCallbackFnTy() const;
  llvm::FunctionType *getRegisterLinkedBinaryFnTy() const;
  std::string addPrefixToName(StringRef FuncName) const;
  std::string addUnderscoredPrefixToName(StringRef FuncName) const;

  /// Creates a function to register all kernel stubs generated in this module.
  llvm::Function *makeRegisterGlobalsFn();
  /// Helper function that generates a constant string and returns a pointer to
  /// the start of the string. The result can be used anywhere the C code
  /// expects a const char*.
  llvm::Constant *makeConstantString(const std::string &Str,
                                     const std::string &Name = "",
                                     const std::string &SectionName = "",
                                     unsigned Alignment = 0) {
    llvm::Constant *Zeros[] = {llvm::ConstantInt::get(SizeTy, 0),
                               llvm::ConstantInt::get(SizeTy, 0)};
    auto ConstStr = CGM.GetAddrOfConstantCString(Str, Name.c_str());
    llvm::GlobalVariable *GV =
        cast<llvm::GlobalVariable>(ConstStr.getPointer());
    if (!SectionName.empty()) {
      GV->setSection(SectionName);
      // Mark the address as used, to make sure the section isn't merged away
      // and actually ends up in the object file.
      GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::None);
    }
    if (Alignment)
      GV->setAlignment(llvm::Align(Alignment));

    return llvm::ConstantExpr::getGetElementPtr(ConstStr.getElementType(),
                                                ConstStr.getPointer(), Zeros);
  }

  /// Helper function that generates an empty dummy function returning void.
  llvm::Function *makeDummyFunction(llvm::FunctionType *FnTy) {
    assert(FnTy->getReturnType()->isVoidTy() &&
           "Can only generate dummy functions returning void!");
    llvm::Function *DummyFunc = llvm::Function::Create(
        FnTy, llvm::GlobalValue::InternalLinkage, "dummy", &TheModule);

    llvm::BasicBlock *DummyBlock =
        llvm::BasicBlock::Create(Context, "", DummyFunc);
    CGBuilderTy FuncBuilder(CGM, Context);
    FuncBuilder.SetInsertPoint(DummyBlock);
    FuncBuilder.CreateRetVoid();

    return DummyFunc;
  }

  void emitDeviceStubBodyLegacy(CodeGenFunction &CGF, FunctionArgList &Args);
  void emitDeviceStubBodyNew(CodeGenFunction &CGF, FunctionArgList &Args);
  std::string getDeviceSideName(const NamedDecl *ND) override;

public:
  CGNVCUDARuntime(CodeGenModule &CGM);

  void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) override;
  void registerDeviceVar(const VarDecl *VD, llvm::GlobalVariable &Var,
                         bool Extern, bool Constant) override {
    DeviceVars.push_back({&Var,
                          VD,
                          {DeviceVarFlags::Variable, Extern, Constant,
                           /*Normalized*/ false, /*Type*/ 0}});
  }
  void registerDeviceSurf(const VarDecl *VD, llvm::GlobalVariable &Var,
                          bool Extern, int Type) override {
    DeviceVars.push_back({&Var,
                          VD,
                          {DeviceVarFlags::Surface, Extern, /*Constant*/ false,
                           /*Normalized*/ false, Type}});
  }
  void registerDeviceTex(const VarDecl *VD, llvm::GlobalVariable &Var,
                         bool Extern, int Type, bool Normalized) override {
    DeviceVars.push_back({&Var,
                          VD,
                          {DeviceVarFlags::Texture, Extern, /*Constant*/ false,
                           Normalized, Type}});
  }
  /// Creates the module constructor function.
  llvm::Function *makeModuleCtorFunction() override;
  /// Creates the module destructor function.
  llvm::Function *makeModuleDtorFunction() override;
};

} // end anonymous namespace

std::string CGNVCUDARuntime::addPrefixToName(StringRef FuncName) const {
  if (CGM.getLangOpts().HIP)
    return ((Twine("hip") + Twine(FuncName)).str());
  return ((Twine("cuda") + Twine(FuncName)).str());
}
std::string
CGNVCUDARuntime::addUnderscoredPrefixToName(StringRef FuncName) const {
  if (CGM.getLangOpts().HIP)
    return ((Twine("__hip") + Twine(FuncName)).str());
  return ((Twine("__cuda") + Twine(FuncName)).str());
}

CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM)
    : CGCUDARuntime(CGM), Context(CGM.getLLVMContext()),
      TheModule(CGM.getModule()),
      RelocatableDeviceCode(CGM.getLangOpts().GPURelocatableDeviceCode),
      DeviceMC(CGM.getContext().createMangleContext(
          CGM.getContext().getAuxTargetInfo())) {
  CodeGen::CodeGenTypes &Types = CGM.getTypes();
  ASTContext &Ctx = CGM.getContext();

  IntTy = CGM.IntTy;
  SizeTy = CGM.SizeTy;
  VoidTy = CGM.VoidTy;

  CharPtrTy = llvm::PointerType::getUnqual(Types.ConvertType(Ctx.CharTy));
  VoidPtrTy = cast<llvm::PointerType>(Types.ConvertType(Ctx.VoidPtrTy));
  VoidPtrPtrTy = VoidPtrTy->getPointerTo();
}

llvm::FunctionCallee CGNVCUDARuntime::getSetupArgumentFn() const {
  // cudaError_t cudaSetupArgument(void *, size_t, size_t)
  llvm::Type *Params[] = {VoidPtrTy, SizeTy, SizeTy};
  return CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy, Params, false),
      addPrefixToName("SetupArgument"));
}

llvm::FunctionCallee CGNVCUDARuntime::getLaunchFn() const {
  if (CGM.getLangOpts().HIP) {
    // hipError_t hipLaunchByPtr(char *);
    return CGM.CreateRuntimeFunction(
        llvm::FunctionType::get(IntTy, CharPtrTy, false), "hipLaunchByPtr");
  } else {
    // cudaError_t cudaLaunch(char *);
    return CGM.CreateRuntimeFunction(
        llvm::FunctionType::get(IntTy, CharPtrTy, false), "cudaLaunch");
  }
}

llvm::FunctionType *CGNVCUDARuntime::getRegisterGlobalsFnTy() const {
  return llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false);
}

llvm::FunctionType *CGNVCUDARuntime::getCallbackFnTy() const {
  return llvm::FunctionType::get(VoidTy, VoidPtrTy, false);
}

llvm::FunctionType *CGNVCUDARuntime::getRegisterLinkedBinaryFnTy() const {
  auto CallbackFnTy = getCallbackFnTy();
  auto RegisterGlobalsFnTy = getRegisterGlobalsFnTy();
  llvm::Type *Params[] = {RegisterGlobalsFnTy->getPointerTo(), VoidPtrTy,
                          VoidPtrTy, CallbackFnTy->getPointerTo()};
  return llvm::FunctionType::get(VoidTy, Params, false);
}

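/// Returns the name to use when registering a kernel or variable with the
/// runtime, i.e. the symbol name the declaration has on the device side
/// (e.g. the Itanium-mangled "_Z6kernelPf" for
/// `__global__ void kernel(float *)`).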
std::string CGNVCUDARuntime::getDeviceSideName(const NamedDecl *ND) {
  GlobalDecl GD;
  // ND could be either a kernel or a variable.
  if (auto *FD = dyn_cast<FunctionDecl>(ND))
    GD = GlobalDecl(FD, KernelReferenceKind::Kernel);
  else
    GD = GlobalDecl(ND);
  std::string DeviceSideName;
  if (DeviceMC->shouldMangleDeclName(ND)) {
    SmallString<256> Buffer;
    llvm::raw_svector_ostream Out(Buffer);
    DeviceMC->mangleName(GD, Out);
    DeviceSideName = std::string(Out.str());
  } else
    DeviceSideName = std::string(ND->getIdentifier()->getName());
  return DeviceSideName;
}

void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF,
                                     FunctionArgList &Args) {
  EmittedKernels.push_back({CGF.CurFn, CGF.CurFuncDecl});
  if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
                         CudaFeature::CUDA_USES_NEW_LAUNCH) ||
      (CGF.getLangOpts().HIP && CGF.getLangOpts().HIPUseNewLaunchAPI))
    emitDeviceStubBodyNew(CGF, Args);
  else
    emitDeviceStubBodyLegacy(CGF, Args);
}

// CUDA 9.0+ uses a new way to launch kernels. Parameters are packed in a local
// array and kernels are launched using cudaLaunchKernel().
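//
// For a kernel such as `__global__ void kernel(int a)`, the stub emitted here
// behaves roughly like the following C++ (an illustrative sketch; names are
// hypothetical and the real code is emitted directly as IR):
// \code
// void __stub_kernel(int a) {
//   void *kernel_args[1] = {&a};
//   dim3 grid_dim, block_dim;
//   size_t shmem_size;
//   cudaStream_t stream;
//   __cudaPopCallConfiguration(&grid_dim, &block_dim, &shmem_size, &stream);
//   cudaLaunchKernel((void *)__stub_kernel, grid_dim, block_dim, kernel_args,
//                    shmem_size, stream);
// }
// \endcode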
void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
                                            FunctionArgList &Args) {
  // Build the shadow stack entry at the very start of the function.

  // Calculate the amount of space we will need for all arguments. If we have
  // no args, allocate a single pointer so we still have a valid pointer to the
  // argument array that we can pass to the runtime, even if it will be unused.
  Address KernelArgs = CGF.CreateTempAlloca(
      VoidPtrTy, CharUnits::fromQuantity(16), "kernel_args",
      llvm::ConstantInt::get(SizeTy, std::max<size_t>(1, Args.size())));
  // Store pointers to the arguments in the locally allocated kernel_args.
  for (unsigned i = 0; i < Args.size(); ++i) {
    llvm::Value *VarPtr = CGF.GetAddrOfLocalVar(Args[i]).getPointer();
    llvm::Value *VoidVarPtr = CGF.Builder.CreatePointerCast(VarPtr, VoidPtrTy);
    CGF.Builder.CreateDefaultAlignedStore(
        VoidVarPtr, CGF.Builder.CreateConstGEP1_32(KernelArgs.getPointer(), i));
  }

  llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");

  // Lookup cudaLaunchKernel/hipLaunchKernel function.
  // cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
  //                              void **args, size_t sharedMem,
  //                              cudaStream_t stream);
  // hipError_t hipLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
  //                            void **args, size_t sharedMem,
  //                            hipStream_t stream);
  TranslationUnitDecl *TUDecl = CGM.getContext().getTranslationUnitDecl();
  DeclContext *DC = TranslationUnitDecl::castToDeclContext(TUDecl);
  auto LaunchKernelName = addPrefixToName("LaunchKernel");
  IdentifierInfo &cudaLaunchKernelII =
      CGM.getContext().Idents.get(LaunchKernelName);
  FunctionDecl *cudaLaunchKernelFD = nullptr;
  for (const auto &Result : DC->lookup(&cudaLaunchKernelII)) {
    if (FunctionDecl *FD = dyn_cast<FunctionDecl>(Result))
      cudaLaunchKernelFD = FD;
  }

  if (cudaLaunchKernelFD == nullptr) {
    CGM.Error(CGF.CurFuncDecl->getLocation(),
              "Can't find declaration for " + LaunchKernelName);
    return;
  }
  // Create temporary dim3 grid_dim, block_dim.
  ParmVarDecl *GridDimParam = cudaLaunchKernelFD->getParamDecl(1);
  QualType Dim3Ty = GridDimParam->getType();
  Address GridDim =
      CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "grid_dim");
  Address BlockDim =
      CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "block_dim");
  Address ShmemSize =
      CGF.CreateTempAlloca(SizeTy, CGM.getSizeAlign(), "shmem_size");
  Address Stream =
      CGF.CreateTempAlloca(VoidPtrTy, CGM.getPointerAlign(), "stream");
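  // The kernel-call expression in the caller pushed the launch configuration
  // with __{cuda|hip}PushCallConfiguration(); pop it back out here so it can
  // be forwarded to the actual launch call.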
  llvm::FunctionCallee cudaPopConfigFn = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy,
                              {/*gridDim=*/GridDim.getType(),
                               /*blockDim=*/BlockDim.getType(),
                               /*ShmemSize=*/ShmemSize.getType(),
                               /*Stream=*/Stream.getType()},
                              /*isVarArg=*/false),
      addUnderscoredPrefixToName("PopCallConfiguration"));

  CGF.EmitRuntimeCallOrInvoke(cudaPopConfigFn,
                              {GridDim.getPointer(), BlockDim.getPointer(),
                               ShmemSize.getPointer(), Stream.getPointer()});

  // Emit the call to cudaLaunchKernel.
  llvm::Value *Kernel = CGF.Builder.CreatePointerCast(CGF.CurFn, VoidPtrTy);
  CallArgList LaunchKernelArgs;
  LaunchKernelArgs.add(RValue::get(Kernel),
                       cudaLaunchKernelFD->getParamDecl(0)->getType());
  LaunchKernelArgs.add(RValue::getAggregate(GridDim), Dim3Ty);
  LaunchKernelArgs.add(RValue::getAggregate(BlockDim), Dim3Ty);
  LaunchKernelArgs.add(RValue::get(KernelArgs.getPointer()),
                       cudaLaunchKernelFD->getParamDecl(3)->getType());
  LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(ShmemSize)),
                       cudaLaunchKernelFD->getParamDecl(4)->getType());
  LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(Stream)),
                       cudaLaunchKernelFD->getParamDecl(5)->getType());

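  // Build the call from the declaration we looked up above so the emitted
  // call's IR signature matches the host header's exact prototype.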
  QualType QT = cudaLaunchKernelFD->getType();
  QualType CQT = QT.getCanonicalType();
  llvm::Type *Ty = CGM.getTypes().ConvertType(CQT);
  llvm::FunctionType *FTy = dyn_cast<llvm::FunctionType>(Ty);

  const CGFunctionInfo &FI =
      CGM.getTypes().arrangeFunctionDeclaration(cudaLaunchKernelFD);
  llvm::FunctionCallee cudaLaunchKernelFn =
      CGM.CreateRuntimeFunction(FTy, LaunchKernelName);
  CGF.EmitCall(FI, CGCallee::forDirect(cudaLaunchKernelFn), ReturnValueSlot(),
               LaunchKernelArgs);
  CGF.EmitBranch(EndBlock);

  CGF.EmitBlock(EndBlock);
}

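// In the legacy (pre-CUDA-9.0) launch scheme, the stub emitted below behaves
// roughly like the following C++ (an illustrative sketch; names are
// hypothetical):
// \code
// void __stub_kernel(int a, float b) {
//   if (cudaSetupArgument(&a, sizeof(a), /*offset=*/0) == 0)
//     if (cudaSetupArgument(&b, sizeof(b), /*offset=*/4) == 0)
//       cudaLaunch((char *)__stub_kernel);
// }
// \endcode
// Each argument's offset is its position within a packed, naturally aligned
// parameter buffer, as computed by the loop below.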
void CGNVCUDARuntime::emitDeviceStubBodyLegacy(CodeGenFunction &CGF,
                                               FunctionArgList &Args) {
  // Emit a call to cudaSetupArgument for each arg in Args.
  llvm::FunctionCallee cudaSetupArgFn = getSetupArgumentFn();
  llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
  CharUnits Offset = CharUnits::Zero();
  for (const VarDecl *A : Args) {
    CharUnits TyWidth, TyAlign;
    std::tie(TyWidth, TyAlign) =
        CGM.getContext().getTypeInfoInChars(A->getType());
    Offset = Offset.alignTo(TyAlign);
    llvm::Value *Args[] = {
        CGF.Builder.CreatePointerCast(CGF.GetAddrOfLocalVar(A).getPointer(),
                                      VoidPtrTy),
        llvm::ConstantInt::get(SizeTy, TyWidth.getQuantity()),
        llvm::ConstantInt::get(SizeTy, Offset.getQuantity()),
    };
    llvm::CallBase *CB = CGF.EmitRuntimeCallOrInvoke(cudaSetupArgFn, Args);
    llvm::Constant *Zero = llvm::ConstantInt::get(IntTy, 0);
    llvm::Value *CBZero = CGF.Builder.CreateICmpEQ(CB, Zero);
    llvm::BasicBlock *NextBlock = CGF.createBasicBlock("setup.next");
    CGF.Builder.CreateCondBr(CBZero, NextBlock, EndBlock);
    CGF.EmitBlock(NextBlock);
    Offset += TyWidth;
  }

  // Emit the call to cudaLaunch
  llvm::FunctionCallee cudaLaunchFn = getLaunchFn();
  llvm::Value *Arg = CGF.Builder.CreatePointerCast(CGF.CurFn, CharPtrTy);
  CGF.EmitRuntimeCallOrInvoke(cudaLaunchFn, Arg);
  CGF.EmitBranch(EndBlock);

  CGF.EmitBlock(EndBlock);
}

/// Creates a function that sets up state on the host side for CUDA objects that
/// have a presence on both the host and device sides. Specifically, registers
/// the host side of kernel functions and device global variables with the CUDA
/// runtime.
/// \code
/// void __cuda_register_globals(void** GpuBinaryHandle) {
///    __cudaRegisterFunction(GpuBinaryHandle,Kernel0,...);
///    ...
///    __cudaRegisterFunction(GpuBinaryHandle,KernelM,...);
///    __cudaRegisterVar(GpuBinaryHandle, GlobalVar0, ...);
///    ...
///    __cudaRegisterVar(GpuBinaryHandle, GlobalVarN, ...);
/// }
/// \endcode
llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
  // No need to register anything
  if (EmittedKernels.empty() && DeviceVars.empty())
    return nullptr;

  llvm::Function *RegisterKernelsFunc = llvm::Function::Create(
      getRegisterGlobalsFnTy(), llvm::GlobalValue::InternalLinkage,
      addUnderscoredPrefixToName("_register_globals"), &TheModule);
  llvm::BasicBlock *EntryBB =
      llvm::BasicBlock::Create(Context, "entry", RegisterKernelsFunc);
  CGBuilderTy Builder(CGM, Context);
  Builder.SetInsertPoint(EntryBB);

  // void __cudaRegisterFunction(void **, const char *, char *, const char *,
  //                             int, uint3*, uint3*, dim3*, dim3*, int*)
  llvm::Type *RegisterFuncParams[] = {
      VoidPtrPtrTy, CharPtrTy, CharPtrTy, CharPtrTy, IntTy,
      VoidPtrTy,    VoidPtrTy, VoidPtrTy, VoidPtrTy, IntTy->getPointerTo()};
  llvm::FunctionCallee RegisterFunc = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy, RegisterFuncParams, false),
      addUnderscoredPrefixToName("RegisterFunction"));

  // Extract the GpuBinaryHandle passed as the first argument to
  // __cuda_register_globals() and generate a __cudaRegisterFunction() call for
  // each emitted kernel.
  llvm::Argument &GpuBinaryHandlePtr = *RegisterKernelsFunc->arg_begin();
  for (auto &&I : EmittedKernels) {
    llvm::Constant *KernelName =
        makeConstantString(getDeviceSideName(cast<NamedDecl>(I.D)));
    llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(VoidPtrTy);
    llvm::Value *Args[] = {
        &GpuBinaryHandlePtr,
        Builder.CreateBitCast(I.Kernel, VoidPtrTy),
        KernelName,
        KernelName,
        llvm::ConstantInt::get(IntTy, -1),
        NullPtr,
        NullPtr,
        NullPtr,
        NullPtr,
        llvm::ConstantPointerNull::get(IntTy->getPointerTo())};
    Builder.CreateCall(RegisterFunc, Args);
  }

  llvm::Type *VarSizeTy = IntTy;
  // For HIP or CUDA 9.0+, the device-variable size argument has type `size_t`.
  if (CGM.getLangOpts().HIP ||
      ToCudaVersion(CGM.getTarget().getSDKVersion()) >= CudaVersion::CUDA_90)
    VarSizeTy = SizeTy;

  // void __cudaRegisterVar(void **, char *, char *, const char *,
  //                        int, int, int, int)
  llvm::Type *RegisterVarParams[] = {VoidPtrPtrTy, CharPtrTy, CharPtrTy,
                                     CharPtrTy,    IntTy,     VarSizeTy,
                                     IntTy,        IntTy};
  llvm::FunctionCallee RegisterVar = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidTy, RegisterVarParams, false),
      addUnderscoredPrefixToName("RegisterVar"));
  // void __cudaRegisterSurface(void **, const struct surfaceReference *,
  //                            const void **, const char *, int, int);
  llvm::FunctionCallee RegisterSurf = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(
          VoidTy, {VoidPtrPtrTy, VoidPtrTy, CharPtrTy, CharPtrTy, IntTy, IntTy},
          false),
      addUnderscoredPrefixToName("RegisterSurface"));
  // void __cudaRegisterTexture(void **, const struct textureReference *,
  //                            const void **, const char *, int, int, int)
  llvm::FunctionCallee RegisterTex = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(
          VoidTy,
          {VoidPtrPtrTy, VoidPtrTy, CharPtrTy, CharPtrTy, IntTy, IntTy, IntTy},
          false),
      addUnderscoredPrefixToName("RegisterTexture"));
  for (auto &&Info : DeviceVars) {
    llvm::GlobalVariable *Var = Info.Var;
    llvm::Constant *VarName = makeConstantString(getDeviceSideName(Info.D));
    switch (Info.Flags.getKind()) {
    case DeviceVarFlags::Variable: {
      uint64_t VarSize =
          CGM.getDataLayout().getTypeAllocSize(Var->getValueType());
      llvm::Value *Args[] = {
          &GpuBinaryHandlePtr,
          Builder.CreateBitCast(Var, VoidPtrTy),
          VarName,
          VarName,
          llvm::ConstantInt::get(IntTy, Info.Flags.isExtern()),
          llvm::ConstantInt::get(VarSizeTy, VarSize),
          llvm::ConstantInt::get(IntTy, Info.Flags.isConstant()),
          llvm::ConstantInt::get(IntTy, 0)};
      Builder.CreateCall(RegisterVar, Args);
      break;
    }
    case DeviceVarFlags::Surface:
      Builder.CreateCall(
          RegisterSurf,
          {&GpuBinaryHandlePtr, Builder.CreateBitCast(Var, VoidPtrTy), VarName,
           VarName, llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()),
           llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())});
      break;
    case DeviceVarFlags::Texture:
      Builder.CreateCall(
          RegisterTex,
          {&GpuBinaryHandlePtr, Builder.CreateBitCast(Var, VoidPtrTy), VarName,
           VarName, llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()),
           llvm::ConstantInt::get(IntTy, Info.Flags.isNormalized()),
           llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())});
      break;
    }
  }

  Builder.CreateRetVoid();
  return RegisterKernelsFunc;
}

/// Creates a global constructor function for the module:
///
/// For CUDA:
/// \code
/// void __cuda_module_ctor(void*) {
///     Handle = __cudaRegisterFatBinary(GpuBinaryBlob);
///     __cuda_register_globals(Handle);
/// }
/// \endcode
///
/// For HIP:
/// \code
/// void __hip_module_ctor(void*) {
///     if (__hip_gpubin_handle == 0) {
///         __hip_gpubin_handle = __hipRegisterFatBinary(GpuBinaryBlob);
///         __hip_register_globals(__hip_gpubin_handle);
///     }
/// }
/// \endcode
llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
  bool IsHIP = CGM.getLangOpts().HIP;
  bool IsCUDA = CGM.getLangOpts().CUDA;
  // No need to generate ctors/dtors if there is no GPU binary.
  StringRef CudaGpuBinaryFileName = CGM.getCodeGenOpts().CudaGpuBinaryFileName;
  if (CudaGpuBinaryFileName.empty() && !IsHIP)
    return nullptr;
  if ((IsHIP || (IsCUDA && !RelocatableDeviceCode)) && EmittedKernels.empty() &&
      DeviceVars.empty())
    return nullptr;

  // void __{cuda|hip}_register_globals(void* handle);
  llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn();
  // We always need a function to pass in as callback. Create a dummy
  // implementation if we don't need to register anything.
  if (RelocatableDeviceCode && !RegisterGlobalsFunc)
    RegisterGlobalsFunc = makeDummyFunction(getRegisterGlobalsFnTy());

  // void ** __{cuda|hip}RegisterFatBinary(void *);
  llvm::FunctionCallee RegisterFatbinFunc = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidPtrPtrTy, VoidPtrTy, false),
      addUnderscoredPrefixToName("RegisterFatBinary"));
  // struct { int magic; int version; void *gpu_binary; void *dont_care; };
  llvm::StructType *FatbinWrapperTy =
      llvm::StructType::get(IntTy, IntTy, VoidPtrTy, VoidPtrTy);
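  // In C terms, the wrapper emitted below is roughly (symbol names here are
  // illustrative, not the exact emitted ones):
  // \code
  // struct FatbinWrapper { int magic; int version; void *binary; void *unused; };
  // static const FatbinWrapper fatbin_wrapper
  //     __attribute__((section(".nvFatBinSegment"))) = {CudaFatMagic, 1,
  //                                                     &fatbin_blob, nullptr};
  // \endcode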

  // Register the GPU binary with the CUDA runtime, store the returned handle
  // in a global variable and save a reference in GpuBinaryHandle to be cleaned
  // up in the destructor on exit. Then associate all known kernels with the
  // GPU binary handle so the CUDA runtime can figure out what to call on the
  // GPU side.
  std::unique_ptr<llvm::MemoryBuffer> CudaGpuBinary = nullptr;
  if (!CudaGpuBinaryFileName.empty()) {
    llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> CudaGpuBinaryOrErr =
        llvm::MemoryBuffer::getFileOrSTDIN(CudaGpuBinaryFileName);
    if (std::error_code EC = CudaGpuBinaryOrErr.getError()) {
      CGM.getDiags().Report(diag::err_cannot_open_file)
          << CudaGpuBinaryFileName << EC.message();
      return nullptr;
    }
    CudaGpuBinary = std::move(CudaGpuBinaryOrErr.get());
  }

  llvm::Function *ModuleCtorFunc = llvm::Function::Create(
      llvm::FunctionType::get(VoidTy, VoidPtrTy, false),
      llvm::GlobalValue::InternalLinkage,
      addUnderscoredPrefixToName("_module_ctor"), &TheModule);
  llvm::BasicBlock *CtorEntryBB =
      llvm::BasicBlock::Create(Context, "entry", ModuleCtorFunc);
  CGBuilderTy CtorBuilder(CGM, Context);

  CtorBuilder.SetInsertPoint(CtorEntryBB);

  const char *FatbinConstantName;
  const char *FatbinSectionName;
  const char *ModuleIDSectionName;
  StringRef ModuleIDPrefix;
  llvm::Constant *FatBinStr;
  unsigned FatMagic;
  if (IsHIP) {
    FatbinConstantName = ".hip_fatbin";
    FatbinSectionName = ".hipFatBinSegment";

    ModuleIDSectionName = "__hip_module_id";
    ModuleIDPrefix = "__hip_";

    if (CudaGpuBinary) {
      // If the fatbin is available from early finalization, create a string
      // literal containing the fat binary loaded from the given file.
      FatBinStr = makeConstantString(std::string(CudaGpuBinary->getBuffer()),
                                     "", FatbinConstantName, 8);
    } else {
      // If the fatbin is not available, create an external symbol
      // __hip_fatbin in section .hip_fatbin. The external symbol is supposed
      // to contain the fat binary but will be populated somewhere else,
      // e.g. by lld through a linker script.
      FatBinStr = new llvm::GlobalVariable(
          CGM.getModule(), CGM.Int8Ty,
          /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, nullptr,
          "__hip_fatbin", nullptr,
          llvm::GlobalVariable::NotThreadLocal);
      cast<llvm::GlobalVariable>(FatBinStr)->setSection(FatbinConstantName);
    }

    FatMagic = HIPFatMagic;
  } else {
    if (RelocatableDeviceCode)
      FatbinConstantName = CGM.getTriple().isMacOSX()
                               ? "__NV_CUDA,__nv_relfatbin"
                               : "__nv_relfatbin";
    else
      FatbinConstantName =
          CGM.getTriple().isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin";
    // NVIDIA's cuobjdump looks for fatbins in this section.
    FatbinSectionName =
        CGM.getTriple().isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment";

    ModuleIDSectionName = CGM.getTriple().isMacOSX()
                              ? "__NV_CUDA,__nv_module_id"
                              : "__nv_module_id";
    ModuleIDPrefix = "__nv_";

    // For CUDA, create a string literal containing the fat binary loaded from
    // the given file.
    FatBinStr = makeConstantString(std::string(CudaGpuBinary->getBuffer()), "",
                                   FatbinConstantName, 8);
    FatMagic = CudaFatMagic;
  }

  // Create an initialized wrapper structure that points to the loaded GPU
  // binary.
  ConstantInitBuilder Builder(CGM);
  auto Values = Builder.beginStruct(FatbinWrapperTy);
  // Fatbin wrapper magic.
  Values.addInt(IntTy, FatMagic);
  // Fatbin version.
  Values.addInt(IntTy, 1);
  // Data.
  Values.add(FatBinStr);
  // Unused in fatbin v1.
  Values.add(llvm::ConstantPointerNull::get(VoidPtrTy));
  llvm::GlobalVariable *FatbinWrapper = Values.finishAndCreateGlobal(
      addUnderscoredPrefixToName("_fatbin_wrapper"), CGM.getPointerAlign(),
      /*constant*/ true);
  FatbinWrapper->setSection(FatbinSectionName);

  // There is only one HIP fat binary per linked module; however, there are
  // multiple constructor functions. Make sure the fat binary is registered
  // only once. The constructor functions are executed by the dynamic loader
  // before the program gains control. The dynamic loader cannot execute the
  // constructor functions concurrently since doing that would not guarantee
  // thread safety of the loaded program. Therefore we can assume sequential
  // execution of constructor functions here.
  if (IsHIP) {
    auto Linkage = CudaGpuBinary ? llvm::GlobalValue::InternalLinkage
                                 : llvm::GlobalValue::LinkOnceAnyLinkage;
    llvm::BasicBlock *IfBlock =
        llvm::BasicBlock::Create(Context, "if", ModuleCtorFunc);
    llvm::BasicBlock *ExitBlock =
        llvm::BasicBlock::Create(Context, "exit", ModuleCtorFunc);
    // The name, size, and initialization pattern of this variable are part
    // of the HIP ABI.
    GpuBinaryHandle = new llvm::GlobalVariable(
        TheModule, VoidPtrPtrTy, /*isConstant=*/false, Linkage,
        /*Initializer=*/llvm::ConstantPointerNull::get(VoidPtrPtrTy),
        "__hip_gpubin_handle");
    GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign());
    // Prevent the weak symbol in different shared libraries from being merged.
    if (Linkage != llvm::GlobalValue::InternalLinkage)
      GpuBinaryHandle->setVisibility(llvm::GlobalValue::HiddenVisibility);
    Address GpuBinaryAddr(
        GpuBinaryHandle,
        CharUnits::fromQuantity(GpuBinaryHandle->getAlignment()));
    {
      auto HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
      llvm::Constant *Zero =
          llvm::Constant::getNullValue(HandleValue->getType());
      llvm::Value *EQZero = CtorBuilder.CreateICmpEQ(HandleValue, Zero);
      CtorBuilder.CreateCondBr(EQZero, IfBlock, ExitBlock);
    }
    {
      CtorBuilder.SetInsertPoint(IfBlock);
      // GpuBinaryHandle = __hipRegisterFatBinary(&FatbinWrapper);
      llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
          RegisterFatbinFunc,
          CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
      CtorBuilder.CreateStore(RegisterFatbinCall, GpuBinaryAddr);
      CtorBuilder.CreateBr(ExitBlock);
    }
    {
      CtorBuilder.SetInsertPoint(ExitBlock);
      // Call __hip_register_globals(GpuBinaryHandle);
      if (RegisterGlobalsFunc) {
        auto HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
        CtorBuilder.CreateCall(RegisterGlobalsFunc, HandleValue);
      }
    }
  } else if (!RelocatableDeviceCode) {
    // Register binary with CUDA runtime. This is substantially different in
    // default mode vs. separate compilation!
    // GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper);
    llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
        RegisterFatbinFunc,
        CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
    GpuBinaryHandle = new llvm::GlobalVariable(
        TheModule, VoidPtrPtrTy, false, llvm::GlobalValue::InternalLinkage,
        llvm::ConstantPointerNull::get(VoidPtrPtrTy), "__cuda_gpubin_handle");
    GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign());
    CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle,
                                   CGM.getPointerAlign());

    // Call __cuda_register_globals(GpuBinaryHandle);
    if (RegisterGlobalsFunc)
      CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall);

    // Call __cudaRegisterFatBinaryEnd(Handle) if this CUDA version needs it.
    if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
                           CudaFeature::CUDA_USES_FATBIN_REGISTER_END)) {
      // void __cudaRegisterFatBinaryEnd(void **);
      llvm::FunctionCallee RegisterFatbinEndFunc = CGM.CreateRuntimeFunction(
          llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false),
          "__cudaRegisterFatBinaryEnd");
      CtorBuilder.CreateCall(RegisterFatbinEndFunc, RegisterFatbinCall);
    }
  } else {
    // Generate a unique module ID.
    SmallString<64> ModuleID;
    llvm::raw_svector_ostream OS(ModuleID);
    OS << ModuleIDPrefix << llvm::format("%" PRIx64, FatbinWrapper->getGUID());
    llvm::Constant *ModuleIDConstant = makeConstantString(
        std::string(ModuleID.str()), "", ModuleIDSectionName, 32);

    // Create an alias for the FatbinWrapper that nvcc will look for.
    llvm::GlobalAlias::create(llvm::GlobalValue::ExternalLinkage,
                              Twine("__fatbinwrap") + ModuleID, FatbinWrapper);

    // void __cudaRegisterLinkedBinary%ModuleID%(void (*)(void *), void *,
    // void *, void (*)(void **))
    SmallString<128> RegisterLinkedBinaryName("__cudaRegisterLinkedBinary");
    RegisterLinkedBinaryName += ModuleID;
    llvm::FunctionCallee RegisterLinkedBinaryFunc = CGM.CreateRuntimeFunction(
        getRegisterLinkedBinaryFnTy(), RegisterLinkedBinaryName);

    assert(RegisterGlobalsFunc && "Expecting at least dummy function!");
    llvm::Value *Args[] = {RegisterGlobalsFunc,
                           CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy),
                           ModuleIDConstant,
                           makeDummyFunction(getCallbackFnTy())};
    CtorBuilder.CreateCall(RegisterLinkedBinaryFunc, Args);
  }

  // Create a destructor and register it with atexit() the way NVCC does it.
  // Doing it during the regular destructor phase worked in CUDA before 9.2,
  // but results in a double free in 9.2.
  if (llvm::Function *CleanupFn = makeModuleDtorFunction()) {
    // extern "C" int atexit(void (*f)(void));
    llvm::FunctionType *AtExitTy =
        llvm::FunctionType::get(IntTy, CleanupFn->getType(), false);
    llvm::FunctionCallee AtExitFunc =
        CGM.CreateRuntimeFunction(AtExitTy, "atexit", llvm::AttributeList(),
                                  /*Local=*/true);
    CtorBuilder.CreateCall(AtExitFunc, CleanupFn);
  }

  CtorBuilder.CreateRetVoid();
  return ModuleCtorFunc;
}

/// Creates a global destructor function that unregisters the GPU code blob
/// registered by the constructor.
///
/// For CUDA:
/// \code
/// void __cuda_module_dtor(void*) {
///     __cudaUnregisterFatBinary(Handle);
/// }
/// \endcode
///
/// For HIP:
/// \code
/// void __hip_module_dtor(void*) {
///     if (__hip_gpubin_handle) {
///         __hipUnregisterFatBinary(__hip_gpubin_handle);
///         __hip_gpubin_handle = 0;
///     }
/// }
/// \endcode
llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() {
  // No need for destructor if we don't have a handle to unregister.
  if (!GpuBinaryHandle)
    return nullptr;

  // void __cudaUnregisterFatBinary(void ** handle);
  llvm::FunctionCallee UnregisterFatbinFunc = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false),
      addUnderscoredPrefixToName("UnregisterFatBinary"));

  llvm::Function *ModuleDtorFunc = llvm::Function::Create(
      llvm::FunctionType::get(VoidTy, VoidPtrTy, false),
      llvm::GlobalValue::InternalLinkage,
      addUnderscoredPrefixToName("_module_dtor"), &TheModule);

  llvm::BasicBlock *DtorEntryBB =
      llvm::BasicBlock::Create(Context, "entry", ModuleDtorFunc);
  CGBuilderTy DtorBuilder(CGM, Context);
  DtorBuilder.SetInsertPoint(DtorEntryBB);

  Address GpuBinaryAddr(GpuBinaryHandle, CharUnits::fromQuantity(
                                             GpuBinaryHandle->getAlignment()));
  auto HandleValue = DtorBuilder.CreateLoad(GpuBinaryAddr);
  // There is only one HIP fat binary per linked module; however, there are
  // multiple destructor functions. Make sure the fat binary is unregistered
  // only once.
  if (CGM.getLangOpts().HIP) {
    llvm::BasicBlock *IfBlock =
        llvm::BasicBlock::Create(Context, "if", ModuleDtorFunc);
    llvm::BasicBlock *ExitBlock =
        llvm::BasicBlock::Create(Context, "exit", ModuleDtorFunc);
    llvm::Constant *Zero = llvm::Constant::getNullValue(HandleValue->getType());
    llvm::Value *NEZero = DtorBuilder.CreateICmpNE(HandleValue, Zero);
    DtorBuilder.CreateCondBr(NEZero, IfBlock, ExitBlock);

    DtorBuilder.SetInsertPoint(IfBlock);
    DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
    DtorBuilder.CreateStore(Zero, GpuBinaryAddr);
    DtorBuilder.CreateBr(ExitBlock);

    DtorBuilder.SetInsertPoint(ExitBlock);
  } else {
    DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
  }
  DtorBuilder.CreateRetVoid();
  return ModuleDtorFunc;
}

CGCUDARuntime *CodeGen::CreateNVCUDARuntime(CodeGenModule &CGM) {
  return new CGNVCUDARuntime(CGM);
}