//===---- CGOpenMPRuntimeGPU.cpp - Interface to OpenMP GPU Runtimes ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This provides a generalized class for OpenMP runtime code generation
// specialized by GPU targets NVPTX and AMDGCN.
//
//===----------------------------------------------------------------------===//

#include "CGOpenMPRuntimeGPU.h"
#include "CGOpenMPRuntimeNVPTX.h"
#include "CodeGenFunction.h"
#include "clang/AST/Attr.h"
#include "clang/AST/DeclOpenMP.h"
#include "clang/AST/StmtOpenMP.h"
#include "clang/AST/StmtVisitor.h"
#include "clang/Basic/Cuda.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Frontend/OpenMP/OMPGridValues.h"
#include "llvm/IR/IntrinsicsNVPTX.h"

using namespace clang;
using namespace CodeGen;
using namespace llvm::omp;

namespace {
/// Pre(post)-action for different OpenMP constructs specialized for NVPTX.
class NVPTXActionTy final : public PrePostActionTy {
  llvm::FunctionCallee EnterCallee = nullptr;
  ArrayRef<llvm::Value *> EnterArgs;
  llvm::FunctionCallee ExitCallee = nullptr;
  ArrayRef<llvm::Value *> ExitArgs;
  bool Conditional = false;
  llvm::BasicBlock *ContBlock = nullptr;

public:
  NVPTXActionTy(llvm::FunctionCallee EnterCallee,
                ArrayRef<llvm::Value *> EnterArgs,
                llvm::FunctionCallee ExitCallee,
                ArrayRef<llvm::Value *> ExitArgs, bool Conditional = false)
      : EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee),
        ExitArgs(ExitArgs), Conditional(Conditional) {}
  void Enter(CodeGenFunction &CGF) override {
    llvm::Value *EnterRes = CGF.EmitRuntimeCall(EnterCallee, EnterArgs);
    if (Conditional) {
      llvm::Value *CallBool = CGF.Builder.CreateIsNotNull(EnterRes);
      auto *ThenBlock = CGF.createBasicBlock("omp_if.then");
      ContBlock = CGF.createBasicBlock("omp_if.end");
      // Generate the branch (If-stmt)
      CGF.Builder.CreateCondBr(CallBool, ThenBlock, ContBlock);
      CGF.EmitBlock(ThenBlock);
    }
  }
  void Done(CodeGenFunction &CGF) {
    // Emit the rest of blocks/branches
    CGF.EmitBranch(ContBlock);
    CGF.EmitBlock(ContBlock, true);
  }
  void Exit(CodeGenFunction &CGF) override {
    CGF.EmitRuntimeCall(ExitCallee, ExitArgs);
  }
};

/// A class to track the execution mode when codegening directives within
/// a target region. The appropriate mode (SPMD|NON-SPMD) is set on entry
/// to the target region and used by nested directives such as 'parallel'
/// to emit optimized code.
class ExecutionRuntimeModesRAII {
private:
  CGOpenMPRuntimeGPU::ExecutionMode SavedExecMode =
      CGOpenMPRuntimeGPU::EM_Unknown;
  CGOpenMPRuntimeGPU::ExecutionMode &ExecMode;
  bool SavedRuntimeMode = false;
  bool *RuntimeMode = nullptr;

public:
  /// Constructor for Non-SPMD mode.
  ExecutionRuntimeModesRAII(CGOpenMPRuntimeGPU::ExecutionMode &ExecMode)
      : ExecMode(ExecMode) {
    SavedExecMode = ExecMode;
    ExecMode = CGOpenMPRuntimeGPU::EM_NonSPMD;
  }
  /// Constructor for SPMD mode.
  ExecutionRuntimeModesRAII(CGOpenMPRuntimeGPU::ExecutionMode &ExecMode,
                            bool &RuntimeMode, bool FullRuntimeMode)
      : ExecMode(ExecMode), RuntimeMode(&RuntimeMode) {
    SavedExecMode = ExecMode;
    SavedRuntimeMode = RuntimeMode;
    ExecMode = CGOpenMPRuntimeGPU::EM_SPMD;
    RuntimeMode = FullRuntimeMode;
  }
  ~ExecutionRuntimeModesRAII() {
    ExecMode = SavedExecMode;
    if (RuntimeMode)
      *RuntimeMode = SavedRuntimeMode;
  }
};

/// GPU Configuration:  This information can be derived from CUDA registers,
/// however, providing compile-time constants helps generate more efficient
/// code.  For all practical purposes this is fine because the configuration
/// is the same for all known NVPTX architectures.
enum MachineConfiguration : unsigned {
  /// See "llvm/Frontend/OpenMP/OMPGridValues.h" for various related target
  /// specific Grid Values like GV_Warp_Size, GV_Warp_Size_Log2,
  /// and GV_Warp_Size_Log2_Mask.

  /// Global memory alignment for performance.
  GlobalMemoryAlignment = 128,

  /// Maximal size of the shared memory buffer.
  SharedMemorySize = 128,
};

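/// Return the canonical declaration referenced by a private item expression,
/// stripping away array subscripts and OpenMP array sections down to the base
/// variable or member.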
static const ValueDecl *getPrivateItem(const Expr *RefExpr) {
  RefExpr = RefExpr->IgnoreParens();
  if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(RefExpr)) {
    const Expr *Base = ASE->getBase()->IgnoreParenImpCasts();
    while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
      Base = TempASE->getBase()->IgnoreParenImpCasts();
    RefExpr = Base;
  } else if (auto *OASE = dyn_cast<OMPArraySectionExpr>(RefExpr)) {
    const Expr *Base = OASE->getBase()->IgnoreParenImpCasts();
    while (const auto *TempOASE = dyn_cast<OMPArraySectionExpr>(Base))
      Base = TempOASE->getBase()->IgnoreParenImpCasts();
    while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
      Base = TempASE->getBase()->IgnoreParenImpCasts();
    RefExpr = Base;
  }
  RefExpr = RefExpr->IgnoreParenImpCasts();
  if (const auto *DE = dyn_cast<DeclRefExpr>(RefExpr))
    return cast<ValueDecl>(DE->getDecl()->getCanonicalDecl());
  const auto *ME = cast<MemberExpr>(RefExpr);
  return cast<ValueDecl>(ME->getMemberDecl()->getCanonicalDecl());
}

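/// Build an implicit record ("_globalized_locals_ty") that provides storage
/// for the globalized variables: declarations in EscapedDeclsForTeams become
/// plain fields (propagating any aligned attributes), while the remaining
/// escaped declarations become [BufSize] array fields aligned to at least
/// GlobalMemoryAlignment. Fields are laid out in order of decreasing alignment
/// and recorded in MappedDeclsFields.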
static RecordDecl *buildRecordForGlobalizedVars(
    ASTContext &C, ArrayRef<const ValueDecl *> EscapedDecls,
    ArrayRef<const ValueDecl *> EscapedDeclsForTeams,
    llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
        &MappedDeclsFields, int BufSize) {
  using VarsDataTy = std::pair<CharUnits /*Align*/, const ValueDecl *>;
  if (EscapedDecls.empty() && EscapedDeclsForTeams.empty())
    return nullptr;
  SmallVector<VarsDataTy, 4> GlobalizedVars;
  for (const ValueDecl *D : EscapedDecls)
    GlobalizedVars.emplace_back(
        CharUnits::fromQuantity(std::max(
            C.getDeclAlign(D).getQuantity(),
            static_cast<CharUnits::QuantityType>(GlobalMemoryAlignment))),
        D);
  for (const ValueDecl *D : EscapedDeclsForTeams)
    GlobalizedVars.emplace_back(C.getDeclAlign(D), D);
  llvm::stable_sort(GlobalizedVars, [](VarsDataTy L, VarsDataTy R) {
    return L.first > R.first;
  });

  // Build struct _globalized_locals_ty {
  //         /*  globalized vars  */[WarpSize] align (max(decl_align,
  //         GlobalMemoryAlignment))
  //         /*  globalized vars  */ for EscapedDeclsForTeams
  //       };
  RecordDecl *GlobalizedRD = C.buildImplicitRecord("_globalized_locals_ty");
  GlobalizedRD->startDefinition();
  llvm::SmallPtrSet<const ValueDecl *, 16> SingleEscaped(
      EscapedDeclsForTeams.begin(), EscapedDeclsForTeams.end());
  for (const auto &Pair : GlobalizedVars) {
    const ValueDecl *VD = Pair.second;
    QualType Type = VD->getType();
    if (Type->isLValueReferenceType())
      Type = C.getPointerType(Type.getNonReferenceType());
    else
      Type = Type.getNonReferenceType();
    SourceLocation Loc = VD->getLocation();
    FieldDecl *Field;
    if (SingleEscaped.count(VD)) {
      Field = FieldDecl::Create(
          C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
          C.getTrivialTypeSourceInfo(Type, SourceLocation()),
          /*BW=*/nullptr, /*Mutable=*/false,
          /*InitStyle=*/ICIS_NoInit);
      Field->setAccess(AS_public);
      if (VD->hasAttrs()) {
        for (specific_attr_iterator<AlignedAttr> I(VD->getAttrs().begin()),
             E(VD->getAttrs().end());
             I != E; ++I)
          Field->addAttr(*I);
      }
    } else {
      llvm::APInt ArraySize(32, BufSize);
      Type = C.getConstantArrayType(Type, ArraySize, nullptr, ArrayType::Normal,
                                    0);
      Field = FieldDecl::Create(
          C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
          C.getTrivialTypeSourceInfo(Type, SourceLocation()),
          /*BW=*/nullptr, /*Mutable=*/false,
          /*InitStyle=*/ICIS_NoInit);
      Field->setAccess(AS_public);
      llvm::APInt Align(32, std::max(C.getDeclAlign(VD).getQuantity(),
                                     static_cast<CharUnits::QuantityType>(
                                         GlobalMemoryAlignment)));
      Field->addAttr(AlignedAttr::CreateImplicit(
          C, /*IsAlignmentExpr=*/true,
          IntegerLiteral::Create(C, Align,
                                 C.getIntTypeForBitwidth(32, /*Signed=*/0),
                                 SourceLocation()),
          {}, AttributeCommonInfo::AS_GNU, AlignedAttr::GNU_aligned));
    }
    GlobalizedRD->addDecl(Field);
    MappedDeclsFields.try_emplace(VD, Field);
  }
  GlobalizedRD->completeDefinition();
  return GlobalizedRD;
}

/// Get the list of variables that can escape their declaration context.
class CheckVarsEscapingDeclContext final
    : public ConstStmtVisitor<CheckVarsEscapingDeclContext> {
  CodeGenFunction &CGF;
  llvm::SetVector<const ValueDecl *> EscapedDecls;
  llvm::SetVector<const ValueDecl *> EscapedVariableLengthDecls;
  llvm::SmallPtrSet<const Decl *, 4> EscapedParameters;
  RecordDecl *GlobalizedRD = nullptr;
  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
  bool AllEscaped = false;
  bool IsForCombinedParallelRegion = false;

  void markAsEscaped(const ValueDecl *VD) {
    // Do not globalize declare target variables.
    if (!isa<VarDecl>(VD) ||
        OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD))
      return;
    VD = cast<ValueDecl>(VD->getCanonicalDecl());
    // Use user-specified allocation.
    if (VD->hasAttrs() && VD->hasAttr<OMPAllocateDeclAttr>())
      return;
    // Variables captured by value must be globalized.
    if (auto *CSI = CGF.CapturedStmtInfo) {
      if (const FieldDecl *FD = CSI->lookup(cast<VarDecl>(VD))) {
        // Check if we need to capture the variable that was already captured
        // by value in the outer region.
        if (!IsForCombinedParallelRegion) {
          if (!FD->hasAttrs())
            return;
          const auto *Attr = FD->getAttr<OMPCaptureKindAttr>();
          if (!Attr)
            return;
          if (((Attr->getCaptureKind() != OMPC_map) &&
               !isOpenMPPrivate(Attr->getCaptureKind())) ||
              ((Attr->getCaptureKind() == OMPC_map) &&
               !FD->getType()->isAnyPointerType()))
            return;
        }
        if (!FD->getType()->isReferenceType()) {
          assert(!VD->getType()->isVariablyModifiedType() &&
                 "Parameter captured by value with variably modified type");
          EscapedParameters.insert(VD);
        } else if (!IsForCombinedParallelRegion) {
          return;
        }
      }
    }
    if ((!CGF.CapturedStmtInfo ||
         (IsForCombinedParallelRegion && CGF.CapturedStmtInfo)) &&
        VD->getType()->isReferenceType())
      // Do not globalize variables with reference type.
      return;
    if (VD->getType()->isVariablyModifiedType())
      EscapedVariableLengthDecls.insert(VD);
    else
      EscapedDecls.insert(VD);
  }

  void VisitValueDecl(const ValueDecl *VD) {
    if (VD->getType()->isLValueReferenceType())
      markAsEscaped(VD);
    if (const auto *VarD = dyn_cast<VarDecl>(VD)) {
      if (!isa<ParmVarDecl>(VarD) && VarD->hasInit()) {
        const bool SavedAllEscaped = AllEscaped;
        AllEscaped = VD->getType()->isLValueReferenceType();
        Visit(VarD->getInit());
        AllEscaped = SavedAllEscaped;
      }
    }
  }
  void VisitOpenMPCapturedStmt(const CapturedStmt *S,
                               ArrayRef<OMPClause *> Clauses,
                               bool IsCombinedParallelRegion) {
    if (!S)
      return;
    for (const CapturedStmt::Capture &C : S->captures()) {
      if (C.capturesVariable() && !C.capturesVariableByCopy()) {
        const ValueDecl *VD = C.getCapturedVar();
        bool SavedIsForCombinedParallelRegion = IsForCombinedParallelRegion;
        if (IsCombinedParallelRegion) {
          // Check if the variable is privatized in the combined construct and
          // those private copies must be shared in the inner parallel
          // directive.
          IsForCombinedParallelRegion = false;
          for (const OMPClause *C : Clauses) {
            if (!isOpenMPPrivate(C->getClauseKind()) ||
                C->getClauseKind() == OMPC_reduction ||
                C->getClauseKind() == OMPC_linear ||
                C->getClauseKind() == OMPC_private)
              continue;
            ArrayRef<const Expr *> Vars;
            if (const auto *PC = dyn_cast<OMPFirstprivateClause>(C))
              Vars = PC->getVarRefs();
            else if (const auto *PC = dyn_cast<OMPLastprivateClause>(C))
              Vars = PC->getVarRefs();
            else
              llvm_unreachable("Unexpected clause.");
            for (const auto *E : Vars) {
              const Decl *D =
                  cast<DeclRefExpr>(E)->getDecl()->getCanonicalDecl();
              if (D == VD->getCanonicalDecl()) {
                IsForCombinedParallelRegion = true;
                break;
              }
            }
            if (IsForCombinedParallelRegion)
              break;
          }
        }
        markAsEscaped(VD);
        if (isa<OMPCapturedExprDecl>(VD))
          VisitValueDecl(VD);
        IsForCombinedParallelRegion = SavedIsForCombinedParallelRegion;
      }
    }
  }

  void buildRecordForGlobalizedVars(bool IsInTTDRegion) {
    assert(!GlobalizedRD &&
           "Record for globalized variables is built already.");
    ArrayRef<const ValueDecl *> EscapedDeclsForParallel, EscapedDeclsForTeams;
    unsigned WarpSize = CGF.getTarget().getGridValue(llvm::omp::GV_Warp_Size);
    if (IsInTTDRegion)
      EscapedDeclsForTeams = EscapedDecls.getArrayRef();
    else
      EscapedDeclsForParallel = EscapedDecls.getArrayRef();
    GlobalizedRD = ::buildRecordForGlobalizedVars(
        CGF.getContext(), EscapedDeclsForParallel, EscapedDeclsForTeams,
        MappedDeclsFields, WarpSize);
  }

public:
  CheckVarsEscapingDeclContext(CodeGenFunction &CGF,
                               ArrayRef<const ValueDecl *> TeamsReductions)
      : CGF(CGF), EscapedDecls(TeamsReductions.begin(), TeamsReductions.end()) {
  }
  virtual ~CheckVarsEscapingDeclContext() = default;
  void VisitDeclStmt(const DeclStmt *S) {
    if (!S)
      return;
    for (const Decl *D : S->decls())
      if (const auto *VD = dyn_cast_or_null<ValueDecl>(D))
        VisitValueDecl(VD);
  }
  void VisitOMPExecutableDirective(const OMPExecutableDirective *D) {
    if (!D)
      return;
    if (!D->hasAssociatedStmt())
      return;
    if (const auto *S =
            dyn_cast_or_null<CapturedStmt>(D->getAssociatedStmt())) {
      // Do not analyze directives that do not actually require capturing,
      // like `omp for` or `omp simd` directives.
      llvm::SmallVector<OpenMPDirectiveKind, 4> CaptureRegions;
      getOpenMPCaptureRegions(CaptureRegions, D->getDirectiveKind());
      if (CaptureRegions.size() == 1 && CaptureRegions.back() == OMPD_unknown) {
        VisitStmt(S->getCapturedStmt());
        return;
      }
      VisitOpenMPCapturedStmt(
          S, D->clauses(),
          CaptureRegions.back() == OMPD_parallel &&
              isOpenMPDistributeDirective(D->getDirectiveKind()));
    }
  }
  void VisitCapturedStmt(const CapturedStmt *S) {
    if (!S)
      return;
    for (const CapturedStmt::Capture &C : S->captures()) {
      if (C.capturesVariable() && !C.capturesVariableByCopy()) {
        const ValueDecl *VD = C.getCapturedVar();
        markAsEscaped(VD);
        if (isa<OMPCapturedExprDecl>(VD))
          VisitValueDecl(VD);
      }
    }
  }
  void VisitLambdaExpr(const LambdaExpr *E) {
    if (!E)
      return;
    for (const LambdaCapture &C : E->captures()) {
      if (C.capturesVariable()) {
        if (C.getCaptureKind() == LCK_ByRef) {
          const ValueDecl *VD = C.getCapturedVar();
          markAsEscaped(VD);
          if (E->isInitCapture(&C) || isa<OMPCapturedExprDecl>(VD))
            VisitValueDecl(VD);
        }
      }
    }
  }
  void VisitBlockExpr(const BlockExpr *E) {
    if (!E)
      return;
    for (const BlockDecl::Capture &C : E->getBlockDecl()->captures()) {
      if (C.isByRef()) {
        const VarDecl *VD = C.getVariable();
        markAsEscaped(VD);
        if (isa<OMPCapturedExprDecl>(VD) || VD->isInitCapture())
          VisitValueDecl(VD);
      }
    }
  }
  void VisitCallExpr(const CallExpr *E) {
    if (!E)
      return;
    for (const Expr *Arg : E->arguments()) {
      if (!Arg)
        continue;
      if (Arg->isLValue()) {
        const bool SavedAllEscaped = AllEscaped;
        AllEscaped = true;
        Visit(Arg);
        AllEscaped = SavedAllEscaped;
      } else {
        Visit(Arg);
      }
    }
    Visit(E->getCallee());
  }
  void VisitDeclRefExpr(const DeclRefExpr *E) {
    if (!E)
      return;
    const ValueDecl *VD = E->getDecl();
    if (AllEscaped)
      markAsEscaped(VD);
    if (isa<OMPCapturedExprDecl>(VD))
      VisitValueDecl(VD);
    else if (const auto *VarD = dyn_cast<VarDecl>(VD))
      if (VarD->isInitCapture())
        VisitValueDecl(VD);
  }
  void VisitUnaryOperator(const UnaryOperator *E) {
    if (!E)
      return;
    if (E->getOpcode() == UO_AddrOf) {
      const bool SavedAllEscaped = AllEscaped;
      AllEscaped = true;
      Visit(E->getSubExpr());
      AllEscaped = SavedAllEscaped;
    } else {
      Visit(E->getSubExpr());
    }
  }
  void VisitImplicitCastExpr(const ImplicitCastExpr *E) {
    if (!E)
      return;
    if (E->getCastKind() == CK_ArrayToPointerDecay) {
      const bool SavedAllEscaped = AllEscaped;
      AllEscaped = true;
      Visit(E->getSubExpr());
      AllEscaped = SavedAllEscaped;
    } else {
      Visit(E->getSubExpr());
    }
  }
  void VisitExpr(const Expr *E) {
    if (!E)
      return;
    bool SavedAllEscaped = AllEscaped;
    if (!E->isLValue())
      AllEscaped = false;
    for (const Stmt *Child : E->children())
      if (Child)
        Visit(Child);
    AllEscaped = SavedAllEscaped;
  }
  void VisitStmt(const Stmt *S) {
    if (!S)
      return;
    for (const Stmt *Child : S->children())
      if (Child)
        Visit(Child);
  }

  /// Returns the record that handles all the escaped local variables and is
  /// used instead of their original storage.
  const RecordDecl *getGlobalizedRecord(bool IsInTTDRegion) {
    if (!GlobalizedRD)
      buildRecordForGlobalizedVars(IsInTTDRegion);
    return GlobalizedRD;
  }

  /// Returns the field in the globalized record for the escaped variable.
  const FieldDecl *getFieldForGlobalizedVar(const ValueDecl *VD) const {
    assert(GlobalizedRD &&
           "Record for globalized variables must be generated already.");
    auto I = MappedDeclsFields.find(VD);
    if (I == MappedDeclsFields.end())
      return nullptr;
    return I->getSecond();
  }

  /// Returns the list of the escaped local variables/parameters.
  ArrayRef<const ValueDecl *> getEscapedDecls() const {
    return EscapedDecls.getArrayRef();
  }

  /// Checks if the escaped local variable is actually a parameter passed by
  /// value.
  const llvm::SmallPtrSetImpl<const Decl *> &getEscapedParameters() const {
    return EscapedParameters;
  }

  /// Returns the list of the escaped variables with the variably modified
  /// types.
  ArrayRef<const ValueDecl *> getEscapedVariableLengthDecls() const {
    return EscapedVariableLengthDecls.getArrayRef();
  }
};
} // anonymous namespace

/// Get the id of the warp in the block.
/// We assume that the warp size is 32, which is always the case
/// on the NVPTX device, to generate more efficient code.
static llvm::Value *getNVPTXWarpID(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  unsigned LaneIDBits =
      CGF.getTarget().getGridValue(llvm::omp::GV_Warp_Size_Log2);
  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
  return Bld.CreateAShr(RT.getGPUThreadID(CGF), LaneIDBits, "nvptx_warp_id");
}

/// Get the id of the current lane in the Warp.
/// We assume that the warp size is 32, which is always the case
/// on the NVPTX device, to generate more efficient code.
static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  unsigned LaneIDMask = CGF.getContext().getTargetInfo().getGridValue(
      llvm::omp::GV_Warp_Size_Log2_Mask);
  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
  return Bld.CreateAnd(RT.getGPUThreadID(CGF), Bld.getInt32(LaneIDMask),
                       "nvptx_lane_id");
}

/// Get the value of the thread_limit clause in the teams directive.
/// For the 'generic' execution mode, the runtime encodes thread_limit in
/// the launch parameters, always starting thread_limit+warpSize threads per
/// CTA. The threads in the last warp are reserved for master execution.
/// For the 'spmd' execution mode, all threads in a CTA are part of the team.
static llvm::Value *getThreadLimit(CodeGenFunction &CGF,
                                   bool IsInSPMDExecutionMode = false) {
  CGBuilderTy &Bld = CGF.Builder;
  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
  llvm::Value *ThreadLimit = nullptr;
  if (IsInSPMDExecutionMode)
    ThreadLimit = RT.getGPUNumThreads(CGF);
  else {
    llvm::Value *GPUNumThreads = RT.getGPUNumThreads(CGF);
    llvm::Value *GPUWarpSize = RT.getGPUWarpSize(CGF);
    ThreadLimit = Bld.CreateNUWSub(GPUNumThreads, GPUWarpSize, "thread_limit");
  }
  assert(ThreadLimit != nullptr && "Expected non-null ThreadLimit");
  return ThreadLimit;
}

/// Get the thread id of the OMP master thread.
/// The master thread id is the first thread (lane) of the last warp in the
/// GPU block.  Warp size is assumed to be some power of 2.
/// Thread id is 0 indexed.
/// E.g: If NumThreads is 33, master id is 32.
///      If NumThreads is 64, master id is 32.
///      If NumThreads is 1024, master id is 992.
static llvm::Value *getMasterThreadID(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
  llvm::Value *NumThreads = RT.getGPUNumThreads(CGF);
  // We assume that the warp size is a power of 2.
  llvm::Value *Mask = Bld.CreateNUWSub(RT.getGPUWarpSize(CGF), Bld.getInt32(1));

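  // The master is the first lane of the last warp: round NumThreads - 1 down
  // to a multiple of the warp size.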
  llvm::Value *NumThreadsSubOne = Bld.CreateNUWSub(NumThreads, Bld.getInt32(1));
  return Bld.CreateAnd(NumThreadsSubOne, Bld.CreateNot(Mask), "master_tid");
}

CGOpenMPRuntimeGPU::WorkerFunctionState::WorkerFunctionState(
    CodeGenModule &CGM, SourceLocation Loc)
    : WorkerFn(nullptr), CGFI(CGM.getTypes().arrangeNullaryFunction()),
      Loc(Loc) {
  createWorkerFunction(CGM);
}

void CGOpenMPRuntimeGPU::WorkerFunctionState::createWorkerFunction(
    CodeGenModule &CGM) {
  // Create a worker function with no arguments.

  WorkerFn = llvm::Function::Create(
      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
      /*placeholder=*/"_worker", &CGM.getModule());
  CGM.SetInternalFunctionAttributes(GlobalDecl(), WorkerFn, CGFI);
  WorkerFn->setDoesNotRecurse();
}

CGOpenMPRuntimeGPU::ExecutionMode
CGOpenMPRuntimeGPU::getExecutionMode() const {
  return CurrentExecutionMode;
}

static CGOpenMPRuntimeGPU::DataSharingMode
getDataSharingMode(CodeGenModule &CGM) {
  return CGM.getLangOpts().OpenMPCUDAMode ? CGOpenMPRuntimeGPU::CUDA
                                          : CGOpenMPRuntimeGPU::Generic;
}

/// Check for inner (nested) SPMD construct, if any
static bool hasNestedSPMDDirective(ASTContext &Ctx,
                                   const OMPExecutableDirective &D) {
  const auto *CS = D.getInnermostCapturedStmt();
  const auto *Body =
      CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
  const Stmt *ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);

  if (const auto *NestedDir =
          dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
    OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
    switch (D.getDirectiveKind()) {
    case OMPD_target:
      if (isOpenMPParallelDirective(DKind))
        return true;
      if (DKind == OMPD_teams) {
        Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
            /*IgnoreCaptured=*/true);
        if (!Body)
          return false;
        ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
        if (const auto *NND =
                dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
          DKind = NND->getDirectiveKind();
          if (isOpenMPParallelDirective(DKind))
            return true;
        }
      }
      return false;
    case OMPD_target_teams:
      return isOpenMPParallelDirective(DKind);
    case OMPD_target_simd:
    case OMPD_target_parallel:
    case OMPD_target_parallel_for:
    case OMPD_target_parallel_for_simd:
    case OMPD_target_teams_distribute:
    case OMPD_target_teams_distribute_simd:
    case OMPD_target_teams_distribute_parallel_for:
    case OMPD_target_teams_distribute_parallel_for_simd:
    case OMPD_parallel:
    case OMPD_for:
    case OMPD_parallel_for:
    case OMPD_parallel_master:
    case OMPD_parallel_sections:
    case OMPD_for_simd:
    case OMPD_parallel_for_simd:
    case OMPD_cancel:
    case OMPD_cancellation_point:
    case OMPD_ordered:
    case OMPD_threadprivate:
    case OMPD_allocate:
    case OMPD_task:
    case OMPD_simd:
    case OMPD_sections:
    case OMPD_section:
    case OMPD_single:
    case OMPD_master:
    case OMPD_critical:
    case OMPD_taskyield:
    case OMPD_barrier:
    case OMPD_taskwait:
    case OMPD_taskgroup:
    case OMPD_atomic:
    case OMPD_flush:
    case OMPD_depobj:
    case OMPD_scan:
    case OMPD_teams:
    case OMPD_target_data:
    case OMPD_target_exit_data:
    case OMPD_target_enter_data:
    case OMPD_distribute:
    case OMPD_distribute_simd:
    case OMPD_distribute_parallel_for:
    case OMPD_distribute_parallel_for_simd:
    case OMPD_teams_distribute:
    case OMPD_teams_distribute_simd:
    case OMPD_teams_distribute_parallel_for:
    case OMPD_teams_distribute_parallel_for_simd:
    case OMPD_target_update:
    case OMPD_declare_simd:
    case OMPD_declare_variant:
    case OMPD_begin_declare_variant:
    case OMPD_end_declare_variant:
    case OMPD_declare_target:
    case OMPD_end_declare_target:
    case OMPD_declare_reduction:
    case OMPD_declare_mapper:
    case OMPD_taskloop:
    case OMPD_taskloop_simd:
    case OMPD_master_taskloop:
    case OMPD_master_taskloop_simd:
    case OMPD_parallel_master_taskloop:
    case OMPD_parallel_master_taskloop_simd:
    case OMPD_requires:
    case OMPD_unknown:
    default:
      llvm_unreachable("Unexpected directive.");
    }
  }

  return false;
}

static bool supportsSPMDExecutionMode(ASTContext &Ctx,
                                      const OMPExecutableDirective &D) {
  OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
  switch (DirectiveKind) {
  case OMPD_target:
  case OMPD_target_teams:
    return hasNestedSPMDDirective(Ctx, D);
  case OMPD_target_parallel:
  case OMPD_target_parallel_for:
  case OMPD_target_parallel_for_simd:
  case OMPD_target_teams_distribute_parallel_for:
  case OMPD_target_teams_distribute_parallel_for_simd:
  case OMPD_target_simd:
  case OMPD_target_teams_distribute_simd:
    return true;
  case OMPD_target_teams_distribute:
    return false;
  case OMPD_parallel:
  case OMPD_for:
  case OMPD_parallel_for:
  case OMPD_parallel_master:
  case OMPD_parallel_sections:
  case OMPD_for_simd:
  case OMPD_parallel_for_simd:
  case OMPD_cancel:
  case OMPD_cancellation_point:
  case OMPD_ordered:
  case OMPD_threadprivate:
  case OMPD_allocate:
  case OMPD_task:
  case OMPD_simd:
  case OMPD_sections:
  case OMPD_section:
  case OMPD_single:
  case OMPD_master:
  case OMPD_critical:
  case OMPD_taskyield:
  case OMPD_barrier:
  case OMPD_taskwait:
  case OMPD_taskgroup:
  case OMPD_atomic:
  case OMPD_flush:
  case OMPD_depobj:
  case OMPD_scan:
  case OMPD_teams:
  case OMPD_target_data:
  case OMPD_target_exit_data:
  case OMPD_target_enter_data:
  case OMPD_distribute:
  case OMPD_distribute_simd:
  case OMPD_distribute_parallel_for:
  case OMPD_distribute_parallel_for_simd:
  case OMPD_teams_distribute:
  case OMPD_teams_distribute_simd:
  case OMPD_teams_distribute_parallel_for:
  case OMPD_teams_distribute_parallel_for_simd:
  case OMPD_target_update:
  case OMPD_declare_simd:
  case OMPD_declare_variant:
  case OMPD_begin_declare_variant:
  case OMPD_end_declare_variant:
  case OMPD_declare_target:
  case OMPD_end_declare_target:
  case OMPD_declare_reduction:
  case OMPD_declare_mapper:
  case OMPD_taskloop:
  case OMPD_taskloop_simd:
  case OMPD_master_taskloop:
  case OMPD_master_taskloop_simd:
  case OMPD_parallel_master_taskloop:
  case OMPD_parallel_master_taskloop_simd:
  case OMPD_requires:
  case OMPD_unknown:
  default:
    break;
  }
  llvm_unreachable(
      "Unknown programming model for OpenMP directive on NVPTX target.");
}

/// Check if the loop-based directive has no 'ordered' clause and either no
/// 'schedule' clause at all or a 'schedule' clause with static scheduling.
static bool hasStaticScheduling(const OMPExecutableDirective &D) {
  assert(isOpenMPWorksharingDirective(D.getDirectiveKind()) &&
         isOpenMPLoopDirective(D.getDirectiveKind()) &&
         "Expected loop-based directive.");
  return !D.hasClausesOfKind<OMPOrderedClause>() &&
         (!D.hasClausesOfKind<OMPScheduleClause>() ||
          llvm::any_of(D.getClausesOfKind<OMPScheduleClause>(),
                       [](const OMPScheduleClause *C) {
                         return C->getScheduleKind() == OMPC_SCHEDULE_static;
                       }));
}

/// Check for inner (nested) lightweight runtime construct, if any
static bool hasNestedLightweightDirective(ASTContext &Ctx,
                                          const OMPExecutableDirective &D) {
  assert(supportsSPMDExecutionMode(Ctx, D) && "Expected SPMD mode directive.");
  const auto *CS = D.getInnermostCapturedStmt();
  const auto *Body =
      CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
  const Stmt *ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);

  if (const auto *NestedDir =
          dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
    OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
    switch (D.getDirectiveKind()) {
    case OMPD_target:
      if (isOpenMPParallelDirective(DKind) &&
          isOpenMPWorksharingDirective(DKind) && isOpenMPLoopDirective(DKind) &&
          hasStaticScheduling(*NestedDir))
        return true;
      if (DKind == OMPD_teams_distribute_simd || DKind == OMPD_simd)
        return true;
      if (DKind == OMPD_parallel) {
        Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
            /*IgnoreCaptured=*/true);
        if (!Body)
          return false;
        ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
        if (const auto *NND =
                dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
          DKind = NND->getDirectiveKind();
          if (isOpenMPWorksharingDirective(DKind) &&
              isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
            return true;
        }
      } else if (DKind == OMPD_teams) {
        Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
            /*IgnoreCaptured=*/true);
        if (!Body)
          return false;
        ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
        if (const auto *NND =
                dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
          DKind = NND->getDirectiveKind();
          if (isOpenMPParallelDirective(DKind) &&
              isOpenMPWorksharingDirective(DKind) &&
              isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
            return true;
          if (DKind == OMPD_parallel) {
            Body = NND->getInnermostCapturedStmt()->IgnoreContainers(
                /*IgnoreCaptured=*/true);
            if (!Body)
              return false;
            ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
            if (const auto *NND =
                    dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
              DKind = NND->getDirectiveKind();
              if (isOpenMPWorksharingDirective(DKind) &&
                  isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
                return true;
            }
          }
        }
      }
      return false;
    case OMPD_target_teams:
      if (isOpenMPParallelDirective(DKind) &&
          isOpenMPWorksharingDirective(DKind) && isOpenMPLoopDirective(DKind) &&
          hasStaticScheduling(*NestedDir))
        return true;
      if (DKind == OMPD_distribute_simd || DKind == OMPD_simd)
        return true;
      if (DKind == OMPD_parallel) {
        Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
            /*IgnoreCaptured=*/true);
        if (!Body)
          return false;
        ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
        if (const auto *NND =
                dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
          DKind = NND->getDirectiveKind();
          if (isOpenMPWorksharingDirective(DKind) &&
              isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
            return true;
        }
      }
      return false;
    case OMPD_target_parallel:
      if (DKind == OMPD_simd)
        return true;
      return isOpenMPWorksharingDirective(DKind) &&
             isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NestedDir);
    case OMPD_target_teams_distribute:
    case OMPD_target_simd:
    case OMPD_target_parallel_for:
    case OMPD_target_parallel_for_simd:
    case OMPD_target_teams_distribute_simd:
    case OMPD_target_teams_distribute_parallel_for:
    case OMPD_target_teams_distribute_parallel_for_simd:
    case OMPD_parallel:
    case OMPD_for:
    case OMPD_parallel_for:
    case OMPD_parallel_master:
    case OMPD_parallel_sections:
    case OMPD_for_simd:
    case OMPD_parallel_for_simd:
    case OMPD_cancel:
    case OMPD_cancellation_point:
    case OMPD_ordered:
    case OMPD_threadprivate:
    case OMPD_allocate:
    case OMPD_task:
    case OMPD_simd:
    case OMPD_sections:
    case OMPD_section:
    case OMPD_single:
    case OMPD_master:
    case OMPD_critical:
    case OMPD_taskyield:
    case OMPD_barrier:
    case OMPD_taskwait:
    case OMPD_taskgroup:
    case OMPD_atomic:
    case OMPD_flush:
    case OMPD_depobj:
    case OMPD_scan:
    case OMPD_teams:
    case OMPD_target_data:
    case OMPD_target_exit_data:
    case OMPD_target_enter_data:
    case OMPD_distribute:
    case OMPD_distribute_simd:
    case OMPD_distribute_parallel_for:
    case OMPD_distribute_parallel_for_simd:
    case OMPD_teams_distribute:
    case OMPD_teams_distribute_simd:
    case OMPD_teams_distribute_parallel_for:
    case OMPD_teams_distribute_parallel_for_simd:
    case OMPD_target_update:
    case OMPD_declare_simd:
    case OMPD_declare_variant:
    case OMPD_begin_declare_variant:
    case OMPD_end_declare_variant:
    case OMPD_declare_target:
    case OMPD_end_declare_target:
    case OMPD_declare_reduction:
    case OMPD_declare_mapper:
    case OMPD_taskloop:
    case OMPD_taskloop_simd:
    case OMPD_master_taskloop:
    case OMPD_master_taskloop_simd:
    case OMPD_parallel_master_taskloop:
    case OMPD_parallel_master_taskloop_simd:
    case OMPD_requires:
    case OMPD_unknown:
    default:
      llvm_unreachable("Unexpected directive.");
    }
  }

  return false;
}

/// Checks if the construct supports lightweight runtime. It must be an SPMD
/// construct with an inner loop-based construct that has static scheduling.
static bool supportsLightweightRuntime(ASTContext &Ctx,
                                       const OMPExecutableDirective &D) {
  if (!supportsSPMDExecutionMode(Ctx, D))
    return false;
  OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
  switch (DirectiveKind) {
  case OMPD_target:
  case OMPD_target_teams:
  case OMPD_target_parallel:
    return hasNestedLightweightDirective(Ctx, D);
  case OMPD_target_parallel_for:
  case OMPD_target_parallel_for_simd:
  case OMPD_target_teams_distribute_parallel_for:
  case OMPD_target_teams_distribute_parallel_for_simd:
    // (Last|First)-privates must be shared in parallel region.
    return hasStaticScheduling(D);
  case OMPD_target_simd:
  case OMPD_target_teams_distribute_simd:
    return true;
  case OMPD_target_teams_distribute:
    return false;
  case OMPD_parallel:
  case OMPD_for:
  case OMPD_parallel_for:
  case OMPD_parallel_master:
  case OMPD_parallel_sections:
  case OMPD_for_simd:
  case OMPD_parallel_for_simd:
  case OMPD_cancel:
  case OMPD_cancellation_point:
  case OMPD_ordered:
  case OMPD_threadprivate:
  case OMPD_allocate:
  case OMPD_task:
  case OMPD_simd:
  case OMPD_sections:
  case OMPD_section:
  case OMPD_single:
  case OMPD_master:
  case OMPD_critical:
  case OMPD_taskyield:
  case OMPD_barrier:
  case OMPD_taskwait:
  case OMPD_taskgroup:
  case OMPD_atomic:
  case OMPD_flush:
  case OMPD_depobj:
  case OMPD_scan:
  case OMPD_teams:
  case OMPD_target_data:
  case OMPD_target_exit_data:
  case OMPD_target_enter_data:
  case OMPD_distribute:
  case OMPD_distribute_simd:
  case OMPD_distribute_parallel_for:
  case OMPD_distribute_parallel_for_simd:
  case OMPD_teams_distribute:
  case OMPD_teams_distribute_simd:
  case OMPD_teams_distribute_parallel_for:
  case OMPD_teams_distribute_parallel_for_simd:
  case OMPD_target_update:
  case OMPD_declare_simd:
  case OMPD_declare_variant:
  case OMPD_begin_declare_variant:
  case OMPD_end_declare_variant:
  case OMPD_declare_target:
  case OMPD_end_declare_target:
  case OMPD_declare_reduction:
  case OMPD_declare_mapper:
  case OMPD_taskloop:
  case OMPD_taskloop_simd:
  case OMPD_master_taskloop:
  case OMPD_master_taskloop_simd:
  case OMPD_parallel_master_taskloop:
  case OMPD_parallel_master_taskloop_simd:
  case OMPD_requires:
  case OMPD_unknown:
  default:
    break;
  }
  llvm_unreachable(
      "Unknown programming model for OpenMP directive on NVPTX target.");
}

void CGOpenMPRuntimeGPU::emitNonSPMDKernel(const OMPExecutableDirective &D,
                                             StringRef ParentName,
                                             llvm::Function *&OutlinedFn,
                                             llvm::Constant *&OutlinedFnID,
                                             bool IsOffloadEntry,
                                             const RegionCodeGenTy &CodeGen) {
  ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode);
  EntryFunctionState EST;
  WorkerFunctionState WST(CGM, D.getBeginLoc());
  Work.clear();
  WrapperFunctionsMap.clear();

  // Emit target region as a standalone region.
  class NVPTXPrePostActionTy : public PrePostActionTy {
    CGOpenMPRuntimeGPU::EntryFunctionState &EST;
    CGOpenMPRuntimeGPU::WorkerFunctionState &WST;

  public:
    NVPTXPrePostActionTy(CGOpenMPRuntimeGPU::EntryFunctionState &EST,
                         CGOpenMPRuntimeGPU::WorkerFunctionState &WST)
        : EST(EST), WST(WST) {}
    void Enter(CodeGenFunction &CGF) override {
      auto &RT =
          static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
      RT.emitNonSPMDEntryHeader(CGF, EST, WST);
      // Skip target region initialization.
      RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
    }
    void Exit(CodeGenFunction &CGF) override {
      auto &RT =
          static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
      RT.clearLocThreadIdInsertPt(CGF);
      RT.emitNonSPMDEntryFooter(CGF, EST);
    }
  } Action(EST, WST);
  CodeGen.setAction(Action);
  IsInTTDRegion = true;
  // Reserve place for the globalized memory.
  GlobalizedRecords.emplace_back();
  if (!KernelStaticGlobalized) {
    KernelStaticGlobalized = new llvm::GlobalVariable(
        CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false,
        llvm::GlobalValue::InternalLinkage,
        llvm::UndefValue::get(CGM.VoidPtrTy),
        "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr,
        llvm::GlobalValue::NotThreadLocal,
        CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
  }
  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
                                   IsOffloadEntry, CodeGen);
  IsInTTDRegion = false;

  // Now change the name of the worker function to correspond to this target
  // region's entry function.
  WST.WorkerFn->setName(Twine(OutlinedFn->getName(), "_worker"));

  // Create the worker function
  emitWorkerFunction(WST);
}

// Set up NVPTX threads for the master-worker OpenMP scheme.
void CGOpenMPRuntimeGPU::emitNonSPMDEntryHeader(CodeGenFunction &CGF,
                                                  EntryFunctionState &EST,
                                                  WorkerFunctionState &WST) {
  CGBuilderTy &Bld = CGF.Builder;

  llvm::BasicBlock *WorkerBB = CGF.createBasicBlock(".worker");
  llvm::BasicBlock *MasterCheckBB = CGF.createBasicBlock(".mastercheck");
  llvm::BasicBlock *MasterBB = CGF.createBasicBlock(".master");
  EST.ExitBB = CGF.createBasicBlock(".exit");

  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
  llvm::Value *GPUThreadID = RT.getGPUThreadID(CGF);
  llvm::Value *ThreadLimit = getThreadLimit(CGF);
  llvm::Value *IsWorker = Bld.CreateICmpULT(GPUThreadID, ThreadLimit);
  Bld.CreateCondBr(IsWorker, WorkerBB, MasterCheckBB);

  CGF.EmitBlock(WorkerBB);
  emitCall(CGF, WST.Loc, WST.WorkerFn);
  CGF.EmitBranch(EST.ExitBB);

  CGF.EmitBlock(MasterCheckBB);
  GPUThreadID = RT.getGPUThreadID(CGF);
  llvm::Value *MasterThreadID = getMasterThreadID(CGF);
  llvm::Value *IsMaster = Bld.CreateICmpEQ(GPUThreadID, MasterThreadID);
  Bld.CreateCondBr(IsMaster, MasterBB, EST.ExitBB);

  CGF.EmitBlock(MasterBB);
  IsInTargetMasterThreadRegion = true;
  // SEQUENTIAL (MASTER) REGION START
  // First action in sequential region:
  // Initialize the state of the OpenMP runtime library on the GPU.
  // TODO: Optimize runtime initialization and pass in correct value.
  llvm::Value *Args[] = {getThreadLimit(CGF),
                         Bld.getInt16(/*RequiresOMPRuntime=*/1)};
  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                          CGM.getModule(), OMPRTL___kmpc_kernel_init),
                      Args);

  // For data sharing, we need to initialize the stack.
  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
      CGM.getModule(), OMPRTL___kmpc_data_sharing_init_stack));

  emitGenericVarsProlog(CGF, WST.Loc);
}

void CGOpenMPRuntimeGPU::emitNonSPMDEntryFooter(CodeGenFunction &CGF,
                                                  EntryFunctionState &EST) {
  IsInTargetMasterThreadRegion = false;
  if (!CGF.HaveInsertPoint())
    return;

  emitGenericVarsEpilog(CGF);

  if (!EST.ExitBB)
    EST.ExitBB = CGF.createBasicBlock(".exit");

  llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".termination.notifier");
  CGF.EmitBranch(TerminateBB);

  CGF.EmitBlock(TerminateBB);
  // Signal termination condition.
  // TODO: Optimize runtime initialization and pass in correct value.
  llvm::Value *Args[] = {CGF.Builder.getInt16(/*IsOMPRuntimeInitialized=*/1)};
  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                          CGM.getModule(), OMPRTL___kmpc_kernel_deinit),
                      Args);
  // Barrier to terminate worker threads.
  syncCTAThreads(CGF);
  // Master thread jumps to exit point.
  CGF.EmitBranch(EST.ExitBB);

  CGF.EmitBlock(EST.ExitBB);
  EST.ExitBB = nullptr;
}

void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective &D,
                                          StringRef ParentName,
                                          llvm::Function *&OutlinedFn,
                                          llvm::Constant *&OutlinedFnID,
                                          bool IsOffloadEntry,
                                          const RegionCodeGenTy &CodeGen) {
  ExecutionRuntimeModesRAII ModeRAII(
      CurrentExecutionMode, RequiresFullRuntime,
      CGM.getLangOpts().OpenMPCUDAForceFullRuntime ||
          !supportsLightweightRuntime(CGM.getContext(), D));
  EntryFunctionState EST;

  // Emit target region as a standalone region.
  class NVPTXPrePostActionTy : public PrePostActionTy {
    CGOpenMPRuntimeGPU &RT;
    CGOpenMPRuntimeGPU::EntryFunctionState &EST;
    const OMPExecutableDirective &D;

  public:
    NVPTXPrePostActionTy(CGOpenMPRuntimeGPU &RT,
                         CGOpenMPRuntimeGPU::EntryFunctionState &EST,
                         const OMPExecutableDirective &D)
        : RT(RT), EST(EST), D(D) {}
    void Enter(CodeGenFunction &CGF) override {
      RT.emitSPMDEntryHeader(CGF, EST, D);
      // Skip target region initialization.
      RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
    }
    void Exit(CodeGenFunction &CGF) override {
      RT.clearLocThreadIdInsertPt(CGF);
      RT.emitSPMDEntryFooter(CGF, EST);
    }
  } Action(*this, EST, D);
  CodeGen.setAction(Action);
  IsInTTDRegion = true;
  // Reserve place for the globalized memory.
  GlobalizedRecords.emplace_back();
  if (!KernelStaticGlobalized) {
    KernelStaticGlobalized = new llvm::GlobalVariable(
        CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false,
        llvm::GlobalValue::InternalLinkage,
        llvm::UndefValue::get(CGM.VoidPtrTy),
        "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr,
        llvm::GlobalValue::NotThreadLocal,
        CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
  }
  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
                                   IsOffloadEntry, CodeGen);
  IsInTTDRegion = false;
}

void CGOpenMPRuntimeGPU::emitSPMDEntryHeader(
    CodeGenFunction &CGF, EntryFunctionState &EST,
    const OMPExecutableDirective &D) {
  CGBuilderTy &Bld = CGF.Builder;

  // Setup BBs in entry function.
  llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute");
  EST.ExitBB = CGF.createBasicBlock(".exit");

  llvm::Value *Args[] = {getThreadLimit(CGF, /*IsInSPMDExecutionMode=*/true),
                         /*RequiresOMPRuntime=*/
                         Bld.getInt16(RequiresFullRuntime ? 1 : 0)};
  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                          CGM.getModule(), OMPRTL___kmpc_spmd_kernel_init),
                      Args);

  if (RequiresFullRuntime) {
    // For data sharing, we need to initialize the stack.
    CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
        CGM.getModule(), OMPRTL___kmpc_data_sharing_init_stack_spmd));
  }

  CGF.EmitBranch(ExecuteBB);

  CGF.EmitBlock(ExecuteBB);

  IsInTargetMasterThreadRegion = true;
}

void CGOpenMPRuntimeGPU::emitSPMDEntryFooter(CodeGenFunction &CGF,
                                               EntryFunctionState &EST) {
  IsInTargetMasterThreadRegion = false;
  if (!CGF.HaveInsertPoint())
    return;

  if (!EST.ExitBB)
    EST.ExitBB = CGF.createBasicBlock(".exit");

  llvm::BasicBlock *OMPDeInitBB = CGF.createBasicBlock(".omp.deinit");
  CGF.EmitBranch(OMPDeInitBB);

  CGF.EmitBlock(OMPDeInitBB);
  // DeInitialize the OMP state in the runtime; called by all active threads.
  llvm::Value *Args[] = {/*RequiresOMPRuntime=*/
                         CGF.Builder.getInt16(RequiresFullRuntime ? 1 : 0)};
  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                          CGM.getModule(), OMPRTL___kmpc_spmd_kernel_deinit_v2),
                      Args);
  CGF.EmitBranch(EST.ExitBB);

  CGF.EmitBlock(EST.ExitBB);
  EST.ExitBB = nullptr;
}

// Create a unique global variable to indicate the execution mode of this target
// region. The execution mode is either 'generic' or 'spmd', depending on the
// target directive. This variable is picked up by the offload library to set up
// the device appropriately before kernel launch. If the execution mode is
// 'generic', the runtime reserves one warp for the master, otherwise, all
// warps participate in parallel work.
static void setPropertyExecutionMode(CodeGenModule &CGM, StringRef Name,
                                     bool Mode) {
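  // Mode is true for SPMD kernels; the exec-mode global encodes SPMD as 0 and
  // generic as 1.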
  auto *GVMode =
      new llvm::GlobalVariable(CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true,
                               llvm::GlobalValue::WeakAnyLinkage,
                               llvm::ConstantInt::get(CGM.Int8Ty, Mode ? 0 : 1),
                               Twine(Name, "_exec_mode"));
  CGM.addCompilerUsedGlobal(GVMode);
}

void CGOpenMPRuntimeGPU::emitWorkerFunction(WorkerFunctionState &WST) {
  ASTContext &Ctx = CGM.getContext();

  CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
  CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, WST.WorkerFn, WST.CGFI, {},
                    WST.Loc, WST.Loc);
  emitWorkerLoop(CGF, WST);
  CGF.FinishFunction();
}

void CGOpenMPRuntimeGPU::emitWorkerLoop(CodeGenFunction &CGF,
                                        WorkerFunctionState &WST) {
  //
  // The workers enter this loop and wait for parallel work from the master.
  // When the master encounters a parallel region it sets up the work + variable
  // arguments, and wakes up the workers.  The workers first check to see if
  // they are required for the parallel region, i.e., within the # of requested
  // parallel threads.  The activated workers load the variable arguments and
  // execute the parallel work.
  //

  CGBuilderTy &Bld = CGF.Builder;

  llvm::BasicBlock *AwaitBB = CGF.createBasicBlock(".await.work");
  llvm::BasicBlock *SelectWorkersBB = CGF.createBasicBlock(".select.workers");
  llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute.parallel");
  llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".terminate.parallel");
  llvm::BasicBlock *BarrierBB = CGF.createBasicBlock(".barrier.parallel");
  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");

  CGF.EmitBranch(AwaitBB);

  // Workers wait for work from master.
  CGF.EmitBlock(AwaitBB);
  // Wait for parallel work
  syncCTAThreads(CGF);

  Address WorkFn =
      CGF.CreateDefaultAlignTempAlloca(CGF.Int8PtrTy, /*Name=*/"work_fn");
  Address ExecStatus =
      CGF.CreateDefaultAlignTempAlloca(CGF.Int8Ty, /*Name=*/"exec_status");
  CGF.InitTempAlloca(ExecStatus, Bld.getInt8(/*C=*/0));
  CGF.InitTempAlloca(WorkFn, llvm::Constant::getNullValue(CGF.Int8PtrTy));

  // TODO: Optimize runtime initialization and pass in correct value.
  llvm::Value *Args[] = {WorkFn.getPointer()};
  llvm::Value *Ret =
      CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                              CGM.getModule(), OMPRTL___kmpc_kernel_parallel),
                          Args);
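  // The call returns a non-zero value when this worker is part of the team
  // requested for the parallel region; WorkFn receives the outlined function
  // to run, or null when the kernel is terminating.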
1375  Bld.CreateStore(Bld.CreateZExt(Ret, CGF.Int8Ty), ExecStatus);
1376
1377  // On termination condition (workid == 0), exit loop.
1378  llvm::Value *WorkID = Bld.CreateLoad(WorkFn);
1379  llvm::Value *ShouldTerminate = Bld.CreateIsNull(WorkID, "should_terminate");
1380  Bld.CreateCondBr(ShouldTerminate, ExitBB, SelectWorkersBB);
1381
1382  // Activate requested workers.
1383  CGF.EmitBlock(SelectWorkersBB);
1384  llvm::Value *IsActive =
1385      Bld.CreateIsNotNull(Bld.CreateLoad(ExecStatus), "is_active");
1386  Bld.CreateCondBr(IsActive, ExecuteBB, BarrierBB);
1387
1388  // Signal start of parallel region.
1389  CGF.EmitBlock(ExecuteBB);
1390  // Skip initialization.
1391  setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
1392
1393  // Process work items: outlined parallel functions.
1394  for (llvm::Function *W : Work) {
1395    // Try to match this outlined function.
1396    llvm::Value *ID = Bld.CreatePointerBitCastOrAddrSpaceCast(W, CGM.Int8PtrTy);
1397
1398    llvm::Value *WorkFnMatch =
1399        Bld.CreateICmpEQ(Bld.CreateLoad(WorkFn), ID, "work_match");
1400
1401    llvm::BasicBlock *ExecuteFNBB = CGF.createBasicBlock(".execute.fn");
1402    llvm::BasicBlock *CheckNextBB = CGF.createBasicBlock(".check.next");
1403    Bld.CreateCondBr(WorkFnMatch, ExecuteFNBB, CheckNextBB);
1404
1405    // Execute this outlined function.
1406    CGF.EmitBlock(ExecuteFNBB);
1407
1408    // Insert call to work function via shared wrapper. The shared
1409    // wrapper takes two arguments:
1410    //   - the parallelism level;
1411    //   - the thread ID;
1412    emitCall(CGF, WST.Loc, W,
1413             {Bld.getInt16(/*ParallelLevel=*/0), getThreadID(CGF, WST.Loc)});
1414
1415    // Go to end of parallel region.
1416    CGF.EmitBranch(TerminateBB);
1417
1418    CGF.EmitBlock(CheckNextBB);
1419  }
  // Default case: call the outlined function through a pointer if the target
  // region makes a call to a declare target function that may contain an
  // orphaned parallel directive.
1423  auto *ParallelFnTy =
1424      llvm::FunctionType::get(CGM.VoidTy, {CGM.Int16Ty, CGM.Int32Ty},
1425                              /*isVarArg=*/false);
1426  llvm::Value *WorkFnCast =
1427      Bld.CreateBitCast(WorkID, ParallelFnTy->getPointerTo());
1428  // Insert call to work function via shared wrapper. The shared
1429  // wrapper takes two arguments:
1430  //   - the parallelism level;
1431  //   - the thread ID;
1432  emitCall(CGF, WST.Loc, {ParallelFnTy, WorkFnCast},
1433           {Bld.getInt16(/*ParallelLevel=*/0), getThreadID(CGF, WST.Loc)});
1434  // Go to end of parallel region.
1435  CGF.EmitBranch(TerminateBB);
1436
1437  // Signal end of parallel region.
1438  CGF.EmitBlock(TerminateBB);
1439  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1440                          CGM.getModule(), OMPRTL___kmpc_kernel_end_parallel),
1441                      llvm::None);
1442  CGF.EmitBranch(BarrierBB);
1443
1444  // All active and inactive workers wait at a barrier after parallel region.
1445  CGF.EmitBlock(BarrierBB);
1446  // Barrier after parallel region.
1447  syncCTAThreads(CGF);
1448  CGF.EmitBranch(AwaitBB);
1449
1450  // Exit target region.
1451  CGF.EmitBlock(ExitBB);
1452  // Skip initialization.
1453  clearLocThreadIdInsertPt(CGF);
1454}
1455
1456void CGOpenMPRuntimeGPU::createOffloadEntry(llvm::Constant *ID,
1457                                              llvm::Constant *Addr,
1458                                              uint64_t Size, int32_t,
1459                                              llvm::GlobalValue::LinkageTypes) {
1460  // TODO: Add support for global variables on the device after declare target
1461  // support.
1462  if (!isa<llvm::Function>(Addr))
1463    return;
1464  llvm::Module &M = CGM.getModule();
1465  llvm::LLVMContext &Ctx = CGM.getLLVMContext();
1466
1467  // Get "nvvm.annotations" metadata node
1468  llvm::NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
1469
1470  llvm::Metadata *MDVals[] = {
1471      llvm::ConstantAsMetadata::get(Addr), llvm::MDString::get(Ctx, "kernel"),
1472      llvm::ConstantAsMetadata::get(
1473          llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
1474  // Append metadata to nvvm.annotations
1475  MD->addOperand(llvm::MDNode::get(Ctx, MDVals));
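  // The net effect is a module-level annotation roughly of the form
  // (illustrative):
  //   !nvvm.annotations = !{..., !N}
  //   !N = !{<the kernel function>, !"kernel", i32 1}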
1476}
1477
1478void CGOpenMPRuntimeGPU::emitTargetOutlinedFunction(
1479    const OMPExecutableDirective &D, StringRef ParentName,
1480    llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
1481    bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
1482  if (!IsOffloadEntry) // Nothing to do.
1483    return;
1484
1485  assert(!ParentName.empty() && "Invalid target region parent name!");
1486
1487  bool Mode = supportsSPMDExecutionMode(CGM.getContext(), D);
1488  if (Mode)
1489    emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
1490                   CodeGen);
1491  else
1492    emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
1493                      CodeGen);
1494
1495  setPropertyExecutionMode(CGM, OutlinedFn->getName(), Mode);
1496}
1497
1498namespace {
1499LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
/// Enum for accessing the reserved_2 field of the ident_t struct.
1501enum ModeFlagsTy : unsigned {
1502  /// Bit set to 1 when in SPMD mode.
1503  KMP_IDENT_SPMD_MODE = 0x01,
1504  /// Bit set to 1 when a simplified runtime is used.
1505  KMP_IDENT_SIMPLE_RT_MODE = 0x02,
1506  LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/KMP_IDENT_SIMPLE_RT_MODE)
1507};
1508
/// Special mode Undefined: the combination of Non-SPMD mode and the simple
/// runtime.
1510static const ModeFlagsTy UndefinedMode =
1511    (~KMP_IDENT_SPMD_MODE) & KMP_IDENT_SIMPLE_RT_MODE;
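// For illustration, the resulting reserved_2 encodings produced by
// getDefaultLocationReserved2Flags() below are:
//   SPMD + simple runtime    -> 0x03
//   SPMD + full runtime      -> 0x01
//   Non-SPMD + full runtime  -> 0x00
//   Unknown (UndefinedMode)  -> 0x02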
1512} // anonymous namespace
1513
1514unsigned CGOpenMPRuntimeGPU::getDefaultLocationReserved2Flags() const {
1515  switch (getExecutionMode()) {
1516  case EM_SPMD:
1517    if (requiresFullRuntime())
1518      return KMP_IDENT_SPMD_MODE & (~KMP_IDENT_SIMPLE_RT_MODE);
1519    return KMP_IDENT_SPMD_MODE | KMP_IDENT_SIMPLE_RT_MODE;
1520  case EM_NonSPMD:
1521    assert(requiresFullRuntime() && "Expected full runtime.");
1522    return (~KMP_IDENT_SPMD_MODE) & (~KMP_IDENT_SIMPLE_RT_MODE);
1523  case EM_Unknown:
1524    return UndefinedMode;
1525  }
1526  llvm_unreachable("Unknown flags are requested.");
1527}
1528
1529CGOpenMPRuntimeGPU::CGOpenMPRuntimeGPU(CodeGenModule &CGM)
1530    : CGOpenMPRuntime(CGM, "_", "$") {
1531  if (!CGM.getLangOpts().OpenMPIsDevice)
1532    llvm_unreachable("OpenMP NVPTX can only handle device code.");
1533}
1534
1535void CGOpenMPRuntimeGPU::emitProcBindClause(CodeGenFunction &CGF,
1536                                              ProcBindKind ProcBind,
1537                                              SourceLocation Loc) {
1538  // Do nothing in case of SPMD mode and L0 parallel.
1539  if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD)
1540    return;
1541
1542  CGOpenMPRuntime::emitProcBindClause(CGF, ProcBind, Loc);
1543}
1544
1545void CGOpenMPRuntimeGPU::emitNumThreadsClause(CodeGenFunction &CGF,
1546                                                llvm::Value *NumThreads,
1547                                                SourceLocation Loc) {
1548  // Do nothing in case of SPMD mode and L0 parallel.
1549  if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD)
1550    return;
1551
1552  CGOpenMPRuntime::emitNumThreadsClause(CGF, NumThreads, Loc);
1553}
1554
1555void CGOpenMPRuntimeGPU::emitNumTeamsClause(CodeGenFunction &CGF,
1556                                              const Expr *NumTeams,
1557                                              const Expr *ThreadLimit,
1558                                              SourceLocation Loc) {}
1559
1560llvm::Function *CGOpenMPRuntimeGPU::emitParallelOutlinedFunction(
1561    const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
1562    OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
1563  // Emit target region as a standalone region.
1564  class NVPTXPrePostActionTy : public PrePostActionTy {
1565    bool &IsInParallelRegion;
1566    bool PrevIsInParallelRegion;
1567
1568  public:
1569    NVPTXPrePostActionTy(bool &IsInParallelRegion)
1570        : IsInParallelRegion(IsInParallelRegion) {}
1571    void Enter(CodeGenFunction &CGF) override {
1572      PrevIsInParallelRegion = IsInParallelRegion;
1573      IsInParallelRegion = true;
1574    }
1575    void Exit(CodeGenFunction &CGF) override {
1576      IsInParallelRegion = PrevIsInParallelRegion;
1577    }
1578  } Action(IsInParallelRegion);
1579  CodeGen.setAction(Action);
1580  bool PrevIsInTTDRegion = IsInTTDRegion;
1581  IsInTTDRegion = false;
1582  bool PrevIsInTargetMasterThreadRegion = IsInTargetMasterThreadRegion;
1583  IsInTargetMasterThreadRegion = false;
1584  auto *OutlinedFun =
1585      cast<llvm::Function>(CGOpenMPRuntime::emitParallelOutlinedFunction(
1586          D, ThreadIDVar, InnermostKind, CodeGen));
1587  IsInTargetMasterThreadRegion = PrevIsInTargetMasterThreadRegion;
1588  IsInTTDRegion = PrevIsInTTDRegion;
1589  if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD &&
1590      !IsInParallelRegion) {
1591    llvm::Function *WrapperFun =
1592        createParallelDataSharingWrapper(OutlinedFun, D);
1593    WrapperFunctionsMap[OutlinedFun] = WrapperFun;
1594  }
1595
1596  return OutlinedFun;
1597}
1598
1599/// Get list of lastprivate variables from the teams distribute ... or
1600/// teams {distribute ...} directives.
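/// For example (illustrative), this handles constructs such as
///   #pragma omp target teams distribute lastprivate(x)
/// as well as a 'teams' directive whose only child is a 'distribute'.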
1601static void
1602getDistributeLastprivateVars(ASTContext &Ctx, const OMPExecutableDirective &D,
1603                             llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
1604  assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
1605         "expected teams directive.");
1606  const OMPExecutableDirective *Dir = &D;
1607  if (!isOpenMPDistributeDirective(D.getDirectiveKind())) {
1608    if (const Stmt *S = CGOpenMPRuntime::getSingleCompoundChild(
1609            Ctx,
1610            D.getInnermostCapturedStmt()->getCapturedStmt()->IgnoreContainers(
1611                /*IgnoreCaptured=*/true))) {
1612      Dir = dyn_cast_or_null<OMPExecutableDirective>(S);
1613      if (Dir && !isOpenMPDistributeDirective(Dir->getDirectiveKind()))
1614        Dir = nullptr;
1615    }
1616  }
1617  if (!Dir)
1618    return;
1619  for (const auto *C : Dir->getClausesOfKind<OMPLastprivateClause>()) {
1620    for (const Expr *E : C->getVarRefs())
1621      Vars.push_back(getPrivateItem(E));
1622  }
1623}
1624
1625/// Get list of reduction variables from the teams ... directives.
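/// For example (illustrative):
///   #pragma omp target teams reduction(+ : sum)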
1626static void
1627getTeamsReductionVars(ASTContext &Ctx, const OMPExecutableDirective &D,
1628                      llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
1629  assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
1630         "expected teams directive.");
1631  for (const auto *C : D.getClausesOfKind<OMPReductionClause>()) {
1632    for (const Expr *E : C->privates())
1633      Vars.push_back(getPrivateItem(E));
1634  }
1635}
1636
1637llvm::Function *CGOpenMPRuntimeGPU::emitTeamsOutlinedFunction(
1638    const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
1639    OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
1640  SourceLocation Loc = D.getBeginLoc();
1641
1642  const RecordDecl *GlobalizedRD = nullptr;
1643  llvm::SmallVector<const ValueDecl *, 4> LastPrivatesReductions;
1644  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
1645  unsigned WarpSize = CGM.getTarget().getGridValue(llvm::omp::GV_Warp_Size);
  // Globalize team reduction variables unconditionally in all modes.
1647  if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
1648    getTeamsReductionVars(CGM.getContext(), D, LastPrivatesReductions);
1649  if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
1650    getDistributeLastprivateVars(CGM.getContext(), D, LastPrivatesReductions);
1651    if (!LastPrivatesReductions.empty()) {
1652      GlobalizedRD = ::buildRecordForGlobalizedVars(
1653          CGM.getContext(), llvm::None, LastPrivatesReductions,
1654          MappedDeclsFields, WarpSize);
1655    }
1656  } else if (!LastPrivatesReductions.empty()) {
1657    assert(!TeamAndReductions.first &&
1658           "Previous team declaration is not expected.");
1659    TeamAndReductions.first = D.getCapturedStmt(OMPD_teams)->getCapturedDecl();
1660    std::swap(TeamAndReductions.second, LastPrivatesReductions);
1661  }
1662
1663  // Emit target region as a standalone region.
1664  class NVPTXPrePostActionTy : public PrePostActionTy {
1665    SourceLocation &Loc;
1666    const RecordDecl *GlobalizedRD;
1667    llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
1668        &MappedDeclsFields;
1669
1670  public:
1671    NVPTXPrePostActionTy(
1672        SourceLocation &Loc, const RecordDecl *GlobalizedRD,
1673        llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
1674            &MappedDeclsFields)
1675        : Loc(Loc), GlobalizedRD(GlobalizedRD),
1676          MappedDeclsFields(MappedDeclsFields) {}
1677    void Enter(CodeGenFunction &CGF) override {
1678      auto &Rt =
1679          static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
1680      if (GlobalizedRD) {
1681        auto I = Rt.FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
1682        I->getSecond().GlobalRecord = GlobalizedRD;
1683        I->getSecond().MappedParams =
1684            std::make_unique<CodeGenFunction::OMPMapVars>();
1685        DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
1686        for (const auto &Pair : MappedDeclsFields) {
1687          assert(Pair.getFirst()->isCanonicalDecl() &&
1688                 "Expected canonical declaration");
1689          Data.insert(std::make_pair(Pair.getFirst(),
1690                                     MappedVarData(Pair.getSecond(),
1691                                                   /*IsOnePerTeam=*/true)));
1692        }
1693      }
1694      Rt.emitGenericVarsProlog(CGF, Loc);
1695    }
1696    void Exit(CodeGenFunction &CGF) override {
1697      static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime())
1698          .emitGenericVarsEpilog(CGF);
1699    }
1700  } Action(Loc, GlobalizedRD, MappedDeclsFields);
1701  CodeGen.setAction(Action);
1702  llvm::Function *OutlinedFun = CGOpenMPRuntime::emitTeamsOutlinedFunction(
1703      D, ThreadIDVar, InnermostKind, CodeGen);
1704
1705  return OutlinedFun;
1706}
1707
1708void CGOpenMPRuntimeGPU::emitGenericVarsProlog(CodeGenFunction &CGF,
1709                                                 SourceLocation Loc,
1710                                                 bool WithSPMDCheck) {
1711  if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic &&
1712      getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
1713    return;
1714
1715  CGBuilderTy &Bld = CGF.Builder;
1716
1717  const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
1718  if (I == FunctionGlobalizedDecls.end())
1719    return;
1720  if (const RecordDecl *GlobalizedVarsRecord = I->getSecond().GlobalRecord) {
1721    QualType GlobalRecTy = CGM.getContext().getRecordType(GlobalizedVarsRecord);
1722    QualType SecGlobalRecTy;
1723
1724    // Recover pointer to this function's global record. The runtime will
1725    // handle the specifics of the allocation of the memory.
1726    // Use actual memory size of the record including the padding
1727    // for alignment purposes.
1728    unsigned Alignment =
1729        CGM.getContext().getTypeAlignInChars(GlobalRecTy).getQuantity();
1730    unsigned GlobalRecordSize =
1731        CGM.getContext().getTypeSizeInChars(GlobalRecTy).getQuantity();
1732    GlobalRecordSize = llvm::alignTo(GlobalRecordSize, Alignment);
1733
1734    llvm::PointerType *GlobalRecPtrTy =
1735        CGF.ConvertTypeForMem(GlobalRecTy)->getPointerTo();
1736    llvm::Value *GlobalRecCastAddr;
1737    llvm::Value *IsTTD = nullptr;
1738    if (!IsInTTDRegion &&
1739        (WithSPMDCheck ||
1740         getExecutionMode() == CGOpenMPRuntimeGPU::EM_Unknown)) {
1741      llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
1742      llvm::BasicBlock *SPMDBB = CGF.createBasicBlock(".spmd");
1743      llvm::BasicBlock *NonSPMDBB = CGF.createBasicBlock(".non-spmd");
1744      if (I->getSecond().SecondaryGlobalRecord.hasValue()) {
1745        llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
1746        llvm::Value *ThreadID = getThreadID(CGF, Loc);
1747        llvm::Value *PL = CGF.EmitRuntimeCall(
1748            OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(),
1749                                                  OMPRTL___kmpc_parallel_level),
1750            {RTLoc, ThreadID});
1751        IsTTD = Bld.CreateIsNull(PL);
1752      }
1753      llvm::Value *IsSPMD = Bld.CreateIsNotNull(
1754          CGF.EmitNounwindRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1755              CGM.getModule(), OMPRTL___kmpc_is_spmd_exec_mode)));
1756      Bld.CreateCondBr(IsSPMD, SPMDBB, NonSPMDBB);
1757      // There is no need to emit line number for unconditional branch.
1758      (void)ApplyDebugLocation::CreateEmpty(CGF);
1759      CGF.EmitBlock(SPMDBB);
1760      Address RecPtr = Address(llvm::ConstantPointerNull::get(GlobalRecPtrTy),
1761                               CharUnits::fromQuantity(Alignment));
1762      CGF.EmitBranch(ExitBB);
1763      // There is no need to emit line number for unconditional branch.
1764      (void)ApplyDebugLocation::CreateEmpty(CGF);
1765      CGF.EmitBlock(NonSPMDBB);
1766      llvm::Value *Size = llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize);
1767      if (const RecordDecl *SecGlobalizedVarsRecord =
1768              I->getSecond().SecondaryGlobalRecord.getValueOr(nullptr)) {
1769        SecGlobalRecTy =
1770            CGM.getContext().getRecordType(SecGlobalizedVarsRecord);
1771
1772        // Recover pointer to this function's global record. The runtime will
1773        // handle the specifics of the allocation of the memory.
1774        // Use actual memory size of the record including the padding
1775        // for alignment purposes.
1776        unsigned Alignment =
1777            CGM.getContext().getTypeAlignInChars(SecGlobalRecTy).getQuantity();
1778        unsigned GlobalRecordSize =
1779            CGM.getContext().getTypeSizeInChars(SecGlobalRecTy).getQuantity();
1780        GlobalRecordSize = llvm::alignTo(GlobalRecordSize, Alignment);
1781        Size = Bld.CreateSelect(
1782            IsTTD, llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize), Size);
1783      }
      // TODO: allow the use of shared memory to be controlled by the user;
      // for now, default to global memory.
1786      llvm::Value *GlobalRecordSizeArg[] = {
1787          Size, CGF.Builder.getInt16(/*UseSharedMemory=*/0)};
1788      llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(
1789          OMPBuilder.getOrCreateRuntimeFunction(
1790              CGM.getModule(), OMPRTL___kmpc_data_sharing_coalesced_push_stack),
1791          GlobalRecordSizeArg);
1792      GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
1793          GlobalRecValue, GlobalRecPtrTy);
1794      CGF.EmitBlock(ExitBB);
1795      auto *Phi = Bld.CreatePHI(GlobalRecPtrTy,
1796                                /*NumReservedValues=*/2, "_select_stack");
1797      Phi->addIncoming(RecPtr.getPointer(), SPMDBB);
1798      Phi->addIncoming(GlobalRecCastAddr, NonSPMDBB);
1799      GlobalRecCastAddr = Phi;
1800      I->getSecond().GlobalRecordAddr = Phi;
1801      I->getSecond().IsInSPMDModeFlag = IsSPMD;
1802    } else if (!CGM.getLangOpts().OpenMPCUDATargetParallel && IsInTTDRegion) {
1803      assert(GlobalizedRecords.back().Records.size() < 2 &&
1804             "Expected less than 2 globalized records: one for target and one "
1805             "for teams.");
1806      unsigned Offset = 0;
1807      for (const RecordDecl *RD : GlobalizedRecords.back().Records) {
1808        QualType RDTy = CGM.getContext().getRecordType(RD);
1809        unsigned Alignment =
1810            CGM.getContext().getTypeAlignInChars(RDTy).getQuantity();
1811        unsigned Size = CGM.getContext().getTypeSizeInChars(RDTy).getQuantity();
1812        Offset =
1813            llvm::alignTo(llvm::alignTo(Offset, Alignment) + Size, Alignment);
1814      }
1815      unsigned Alignment =
1816          CGM.getContext().getTypeAlignInChars(GlobalRecTy).getQuantity();
1817      Offset = llvm::alignTo(Offset, Alignment);
1818      GlobalizedRecords.back().Records.push_back(GlobalizedVarsRecord);
1819      ++GlobalizedRecords.back().RegionCounter;
1820      if (GlobalizedRecords.back().Records.size() == 1) {
1821        assert(KernelStaticGlobalized &&
1822               "Kernel static pointer must be initialized already.");
1823        auto *UseSharedMemory = new llvm::GlobalVariable(
1824            CGM.getModule(), CGM.Int16Ty, /*isConstant=*/true,
1825            llvm::GlobalValue::InternalLinkage, nullptr,
1826            "_openmp_static_kernel$is_shared");
1827        UseSharedMemory->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
1828        QualType Int16Ty = CGM.getContext().getIntTypeForBitwidth(
1829            /*DestWidth=*/16, /*Signed=*/0);
1830        llvm::Value *IsInSharedMemory = CGF.EmitLoadOfScalar(
1831            Address(UseSharedMemory,
1832                    CGM.getContext().getTypeAlignInChars(Int16Ty)),
1833            /*Volatile=*/false, Int16Ty, Loc);
1834        auto *StaticGlobalized = new llvm::GlobalVariable(
1835            CGM.getModule(), CGM.Int8Ty, /*isConstant=*/false,
1836            llvm::GlobalValue::CommonLinkage, nullptr);
1837        auto *RecSize = new llvm::GlobalVariable(
1838            CGM.getModule(), CGM.SizeTy, /*isConstant=*/true,
1839            llvm::GlobalValue::InternalLinkage, nullptr,
1840            "_openmp_static_kernel$size");
1841        RecSize->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
1842        llvm::Value *Ld = CGF.EmitLoadOfScalar(
1843            Address(RecSize, CGM.getSizeAlign()), /*Volatile=*/false,
1844            CGM.getContext().getSizeType(), Loc);
1845        llvm::Value *ResAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
1846            KernelStaticGlobalized, CGM.VoidPtrPtrTy);
1847        llvm::Value *GlobalRecordSizeArg[] = {
1848            llvm::ConstantInt::get(
1849                CGM.Int16Ty,
1850                getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD ? 1 : 0),
1851            StaticGlobalized, Ld, IsInSharedMemory, ResAddr};
1852        CGF.EmitRuntimeCall(
1853            OMPBuilder.getOrCreateRuntimeFunction(
1854                CGM.getModule(), OMPRTL___kmpc_get_team_static_memory),
1855            GlobalRecordSizeArg);
1856        GlobalizedRecords.back().Buffer = StaticGlobalized;
1857        GlobalizedRecords.back().RecSize = RecSize;
1858        GlobalizedRecords.back().UseSharedMemory = UseSharedMemory;
1859        GlobalizedRecords.back().Loc = Loc;
1860      }
1861      assert(KernelStaticGlobalized && "Global address must be set already.");
1862      Address FrameAddr = CGF.EmitLoadOfPointer(
1863          Address(KernelStaticGlobalized, CGM.getPointerAlign()),
1864          CGM.getContext()
1865              .getPointerType(CGM.getContext().VoidPtrTy)
1866              .castAs<PointerType>());
1867      llvm::Value *GlobalRecValue =
1868          Bld.CreateConstInBoundsGEP(FrameAddr, Offset).getPointer();
1869      I->getSecond().GlobalRecordAddr = GlobalRecValue;
1870      I->getSecond().IsInSPMDModeFlag = nullptr;
1871      GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
1872          GlobalRecValue, CGF.ConvertTypeForMem(GlobalRecTy)->getPointerTo());
1873    } else {
      // TODO: allow the use of shared memory to be controlled by the user;
      // for now, default to global memory.
1876      bool UseSharedMemory =
1877          IsInTTDRegion && GlobalRecordSize <= SharedMemorySize;
1878      llvm::Value *GlobalRecordSizeArg[] = {
1879          llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize),
1880          CGF.Builder.getInt16(UseSharedMemory ? 1 : 0)};
1881      llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(
1882          OMPBuilder.getOrCreateRuntimeFunction(
1883              CGM.getModule(),
1884              IsInTTDRegion ? OMPRTL___kmpc_data_sharing_push_stack
1885                            : OMPRTL___kmpc_data_sharing_coalesced_push_stack),
1886          GlobalRecordSizeArg);
1887      GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
1888          GlobalRecValue, GlobalRecPtrTy);
1889      I->getSecond().GlobalRecordAddr = GlobalRecValue;
1890      I->getSecond().IsInSPMDModeFlag = nullptr;
1891    }
1892    LValue Base =
1893        CGF.MakeNaturalAlignPointeeAddrLValue(GlobalRecCastAddr, GlobalRecTy);
1894
1895    // Emit the "global alloca" which is a GEP from the global declaration
1896    // record using the pointer returned by the runtime.
1897    LValue SecBase;
1898    decltype(I->getSecond().LocalVarData)::const_iterator SecIt;
1899    if (IsTTD) {
1900      SecIt = I->getSecond().SecondaryLocalVarData->begin();
1901      llvm::PointerType *SecGlobalRecPtrTy =
1902          CGF.ConvertTypeForMem(SecGlobalRecTy)->getPointerTo();
1903      SecBase = CGF.MakeNaturalAlignPointeeAddrLValue(
1904          Bld.CreatePointerBitCastOrAddrSpaceCast(
1905              I->getSecond().GlobalRecordAddr, SecGlobalRecPtrTy),
1906          SecGlobalRecTy);
1907    }
1908    for (auto &Rec : I->getSecond().LocalVarData) {
1909      bool EscapedParam = I->getSecond().EscapedParameters.count(Rec.first);
1910      llvm::Value *ParValue;
1911      if (EscapedParam) {
1912        const auto *VD = cast<VarDecl>(Rec.first);
1913        LValue ParLVal =
1914            CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD), VD->getType());
1915        ParValue = CGF.EmitLoadOfScalar(ParLVal, Loc);
1916      }
1917      LValue VarAddr = CGF.EmitLValueForField(Base, Rec.second.FD);
      // Emit VarAddr based on the lane ID if required.
1919      QualType VarTy;
1920      if (Rec.second.IsOnePerTeam) {
1921        VarTy = Rec.second.FD->getType();
1922      } else {
1923        Address Addr = VarAddr.getAddress(CGF);
1924        llvm::Value *Ptr = CGF.Builder.CreateInBoundsGEP(
1925            Addr.getElementType(), Addr.getPointer(),
1926            {Bld.getInt32(0), getNVPTXLaneID(CGF)});
1927        VarTy =
1928            Rec.second.FD->getType()->castAsArrayTypeUnsafe()->getElementType();
1929        VarAddr = CGF.MakeAddrLValue(
1930            Address(Ptr, CGM.getContext().getDeclAlign(Rec.first)), VarTy,
1931            AlignmentSource::Decl);
1932      }
1933      Rec.second.PrivateAddr = VarAddr.getAddress(CGF);
1934      if (!IsInTTDRegion &&
1935          (WithSPMDCheck ||
1936           getExecutionMode() == CGOpenMPRuntimeGPU::EM_Unknown)) {
1937        assert(I->getSecond().IsInSPMDModeFlag &&
1938               "Expected unknown execution mode or required SPMD check.");
1939        if (IsTTD) {
1940          assert(SecIt->second.IsOnePerTeam &&
1941                 "Secondary glob data must be one per team.");
1942          LValue SecVarAddr = CGF.EmitLValueForField(SecBase, SecIt->second.FD);
1943          VarAddr.setAddress(
1944              Address(Bld.CreateSelect(IsTTD, SecVarAddr.getPointer(CGF),
1945                                       VarAddr.getPointer(CGF)),
1946                      VarAddr.getAlignment()));
1947          Rec.second.PrivateAddr = VarAddr.getAddress(CGF);
1948        }
1949        Address GlobalPtr = Rec.second.PrivateAddr;
1950        Address LocalAddr = CGF.CreateMemTemp(VarTy, Rec.second.FD->getName());
1951        Rec.second.PrivateAddr = Address(
1952            Bld.CreateSelect(I->getSecond().IsInSPMDModeFlag,
1953                             LocalAddr.getPointer(), GlobalPtr.getPointer()),
1954            LocalAddr.getAlignment());
1955      }
1956      if (EscapedParam) {
1957        const auto *VD = cast<VarDecl>(Rec.first);
1958        CGF.EmitStoreOfScalar(ParValue, VarAddr);
1959        I->getSecond().MappedParams->setVarAddr(CGF, VD,
1960                                                VarAddr.getAddress(CGF));
1961      }
1962      if (IsTTD)
1963        ++SecIt;
1964    }
1965  }
1966  for (const ValueDecl *VD : I->getSecond().EscapedVariableLengthDecls) {
1967    // Recover pointer to this function's global record. The runtime will
1968    // handle the specifics of the allocation of the memory.
1969    // Use actual memory size of the record including the padding
1970    // for alignment purposes.
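    // The allocation size below is rounded up to the declaration's alignment,
    // i.e. size = ((size + align - 1) / align) * align.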
1971    CGBuilderTy &Bld = CGF.Builder;
1972    llvm::Value *Size = CGF.getTypeSize(VD->getType());
1973    CharUnits Align = CGM.getContext().getDeclAlign(VD);
1974    Size = Bld.CreateNUWAdd(
1975        Size, llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity() - 1));
1976    llvm::Value *AlignVal =
1977        llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity());
1978    Size = Bld.CreateUDiv(Size, AlignVal);
1979    Size = Bld.CreateNUWMul(Size, AlignVal);
    // TODO: allow the use of shared memory to be controlled by the user;
    // for now, default to global memory.
1982    llvm::Value *GlobalRecordSizeArg[] = {
1983        Size, CGF.Builder.getInt16(/*UseSharedMemory=*/0)};
1984    llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(
1985        OMPBuilder.getOrCreateRuntimeFunction(
1986            CGM.getModule(), OMPRTL___kmpc_data_sharing_coalesced_push_stack),
1987        GlobalRecordSizeArg);
1988    llvm::Value *GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
1989        GlobalRecValue, CGF.ConvertTypeForMem(VD->getType())->getPointerTo());
1990    LValue Base = CGF.MakeAddrLValue(GlobalRecCastAddr, VD->getType(),
1991                                     CGM.getContext().getDeclAlign(VD),
1992                                     AlignmentSource::Decl);
1993    I->getSecond().MappedParams->setVarAddr(CGF, cast<VarDecl>(VD),
1994                                            Base.getAddress(CGF));
1995    I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(GlobalRecValue);
1996  }
1997  I->getSecond().MappedParams->apply(CGF);
1998}
1999
2000void CGOpenMPRuntimeGPU::emitGenericVarsEpilog(CodeGenFunction &CGF,
2001                                                 bool WithSPMDCheck) {
2002  if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic &&
2003      getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
2004    return;
2005
2006  const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
2007  if (I != FunctionGlobalizedDecls.end()) {
2008    I->getSecond().MappedParams->restore(CGF);
2009    if (!CGF.HaveInsertPoint())
2010      return;
2011    for (llvm::Value *Addr :
2012         llvm::reverse(I->getSecond().EscapedVariableLengthDeclsAddrs)) {
2013      CGF.EmitRuntimeCall(
2014          OMPBuilder.getOrCreateRuntimeFunction(
2015              CGM.getModule(), OMPRTL___kmpc_data_sharing_pop_stack),
2016          Addr);
2017    }
2018    if (I->getSecond().GlobalRecordAddr) {
2019      if (!IsInTTDRegion &&
2020          (WithSPMDCheck ||
2021           getExecutionMode() == CGOpenMPRuntimeGPU::EM_Unknown)) {
2022        CGBuilderTy &Bld = CGF.Builder;
2023        llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
2024        llvm::BasicBlock *NonSPMDBB = CGF.createBasicBlock(".non-spmd");
2025        Bld.CreateCondBr(I->getSecond().IsInSPMDModeFlag, ExitBB, NonSPMDBB);
2026        // There is no need to emit line number for unconditional branch.
2027        (void)ApplyDebugLocation::CreateEmpty(CGF);
2028        CGF.EmitBlock(NonSPMDBB);
2029        CGF.EmitRuntimeCall(
2030            OMPBuilder.getOrCreateRuntimeFunction(
2031                CGM.getModule(), OMPRTL___kmpc_data_sharing_pop_stack),
2032            CGF.EmitCastToVoidPtr(I->getSecond().GlobalRecordAddr));
2033        CGF.EmitBlock(ExitBB);
2034      } else if (!CGM.getLangOpts().OpenMPCUDATargetParallel && IsInTTDRegion) {
2035        assert(GlobalizedRecords.back().RegionCounter > 0 &&
2036               "region counter must be > 0.");
2037        --GlobalizedRecords.back().RegionCounter;
2038        // Emit the restore function only in the target region.
2039        if (GlobalizedRecords.back().RegionCounter == 0) {
2040          QualType Int16Ty = CGM.getContext().getIntTypeForBitwidth(
2041              /*DestWidth=*/16, /*Signed=*/0);
2042          llvm::Value *IsInSharedMemory = CGF.EmitLoadOfScalar(
2043              Address(GlobalizedRecords.back().UseSharedMemory,
2044                      CGM.getContext().getTypeAlignInChars(Int16Ty)),
2045              /*Volatile=*/false, Int16Ty, GlobalizedRecords.back().Loc);
2046          llvm::Value *Args[] = {
2047              llvm::ConstantInt::get(
2048                  CGM.Int16Ty,
2049                  getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD ? 1 : 0),
2050              IsInSharedMemory};
2051          CGF.EmitRuntimeCall(
2052              OMPBuilder.getOrCreateRuntimeFunction(
2053                  CGM.getModule(), OMPRTL___kmpc_restore_team_static_memory),
2054              Args);
2055        }
2056      } else {
2057        CGF.EmitRuntimeCall(
2058            OMPBuilder.getOrCreateRuntimeFunction(
2059                CGM.getModule(), OMPRTL___kmpc_data_sharing_pop_stack),
2060            I->getSecond().GlobalRecordAddr);
2061      }
2062    }
2063  }
2064}
2065
2066void CGOpenMPRuntimeGPU::emitTeamsCall(CodeGenFunction &CGF,
2067                                         const OMPExecutableDirective &D,
2068                                         SourceLocation Loc,
2069                                         llvm::Function *OutlinedFn,
2070                                         ArrayRef<llvm::Value *> CapturedVars) {
2071  if (!CGF.HaveInsertPoint())
2072    return;
2073
2074  Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
2075                                                      /*Name=*/".zero.addr");
2076  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
2077  llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
2078  OutlinedFnArgs.push_back(emitThreadIDAddress(CGF, Loc).getPointer());
2079  OutlinedFnArgs.push_back(ZeroAddr.getPointer());
2080  OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
2081  emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
2082}
2083
2084void CGOpenMPRuntimeGPU::emitParallelCall(CodeGenFunction &CGF,
2085                                          SourceLocation Loc,
2086                                          llvm::Function *OutlinedFn,
2087                                          ArrayRef<llvm::Value *> CapturedVars,
2088                                          const Expr *IfCond) {
2089  if (!CGF.HaveInsertPoint())
2090    return;
2091
2092  auto &&ParallelGen = [this, Loc, OutlinedFn, CapturedVars,
2093                        IfCond](CodeGenFunction &CGF, PrePostActionTy &Action) {
2094    CGBuilderTy &Bld = CGF.Builder;
2095    llvm::Function *WFn = WrapperFunctionsMap[OutlinedFn];
2096    llvm::Value *ID = llvm::ConstantPointerNull::get(CGM.Int8PtrTy);
2097    if (WFn) {
2098      ID = Bld.CreateBitOrPointerCast(WFn, CGM.Int8PtrTy);
2099      // Remember for post-processing in worker loop.
2100      Work.emplace_back(WFn);
2101    }
2102    llvm::Value *FnPtr = Bld.CreateBitOrPointerCast(OutlinedFn, CGM.Int8PtrTy);
2103
2104    // Create a private scope that will globalize the arguments
2105    // passed from the outside of the target region.
2106    // TODO: Is that needed?
2107    CodeGenFunction::OMPPrivateScope PrivateArgScope(CGF);
2108
2109    Address CapturedVarsAddrs = CGF.CreateDefaultAlignTempAlloca(
2110        llvm::ArrayType::get(CGM.VoidPtrTy, CapturedVars.size()),
2111        "captured_vars_addrs");
2112    // There's something to share.
2113    if (!CapturedVars.empty()) {
2114      // Prepare for parallel region. Indicate the outlined function.
2115      ASTContext &Ctx = CGF.getContext();
2116      unsigned Idx = 0;
2117      for (llvm::Value *V : CapturedVars) {
2118        Address Dst = Bld.CreateConstArrayGEP(CapturedVarsAddrs, Idx);
2119        llvm::Value *PtrV;
2120        if (V->getType()->isIntegerTy())
2121          PtrV = Bld.CreateIntToPtr(V, CGF.VoidPtrTy);
2122        else
2123          PtrV = Bld.CreatePointerBitCastOrAddrSpaceCast(V, CGF.VoidPtrTy);
2124        CGF.EmitStoreOfScalar(PtrV, Dst, /*Volatile=*/false,
2125                              Ctx.getPointerType(Ctx.VoidPtrTy));
2126        ++Idx;
2127      }
2128    }
2129
2130    llvm::Value *IfCondVal = nullptr;
2131    if (IfCond)
2132      IfCondVal = Bld.CreateIntCast(CGF.EvaluateExprAsBool(IfCond), CGF.Int32Ty,
2133                                    /* isSigned */ false);
2134    else
2135      IfCondVal = llvm::ConstantInt::get(CGF.Int32Ty, 1);
2136
2137    assert(IfCondVal && "Expected a value");
2138    llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
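    // A sketch of the argument list built below (assumed to follow the
    // __kmpc_parallel_51 device-runtime entry point): location, global thread
    // ID, if-condition, num_threads (-1 = unspecified), proc_bind (-1 =
    // unspecified), outlined function, wrapper/work-function ID, captured
    // variable addresses, and the number of captured variables.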
2139    llvm::Value *Args[] = {
2140        RTLoc,
2141        getThreadID(CGF, Loc),
2142        IfCondVal,
2143        llvm::ConstantInt::get(CGF.Int32Ty, -1),
2144        llvm::ConstantInt::get(CGF.Int32Ty, -1),
2145        FnPtr,
2146        ID,
2147        Bld.CreateBitOrPointerCast(CapturedVarsAddrs.getPointer(),
2148                                   CGF.VoidPtrPtrTy),
2149        llvm::ConstantInt::get(CGM.SizeTy, CapturedVars.size())};
2150    CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
2151                            CGM.getModule(), OMPRTL___kmpc_parallel_51),
2152                        Args);
2153  };
2154
2155  RegionCodeGenTy RCG(ParallelGen);
2156  RCG(CGF);
2157}
2158
2159void CGOpenMPRuntimeGPU::syncCTAThreads(CodeGenFunction &CGF) {
2160  // Always emit simple barriers!
2161  if (!CGF.HaveInsertPoint())
2162    return;
2163  // Build call __kmpc_barrier_simple_spmd(nullptr, 0);
2164  // This function does not use parameters, so we can emit just default values.
2165  llvm::Value *Args[] = {
2166      llvm::ConstantPointerNull::get(
2167          cast<llvm::PointerType>(getIdentTyPointerTy())),
2168      llvm::ConstantInt::get(CGF.Int32Ty, /*V=*/0, /*isSigned=*/true)};
2169  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
2170                          CGM.getModule(), OMPRTL___kmpc_barrier_simple_spmd),
2171                      Args);
2172}
2173
2174void CGOpenMPRuntimeGPU::emitBarrierCall(CodeGenFunction &CGF,
2175                                           SourceLocation Loc,
2176                                           OpenMPDirectiveKind Kind, bool,
2177                                           bool) {
2178  // Always emit simple barriers!
2179  if (!CGF.HaveInsertPoint())
2180    return;
2181  // Build call __kmpc_cancel_barrier(loc, thread_id);
2182  unsigned Flags = getDefaultFlagsForBarriers(Kind);
2183  llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc, Flags),
2184                         getThreadID(CGF, Loc)};
2185
2186  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
2187                          CGM.getModule(), OMPRTL___kmpc_barrier),
2188                      Args);
2189}
2190
2191void CGOpenMPRuntimeGPU::emitCriticalRegion(
2192    CodeGenFunction &CGF, StringRef CriticalName,
2193    const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc,
2194    const Expr *Hint) {
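  // The emitted control flow is roughly equivalent to the following sketch
  // (illustrative only):
  //   mask = __kmpc_warp_active_thread_mask();
  //   for (counter = 0; counter < team_width; ++counter) {
  //     if (thread_id == counter)
  //       <critical region body>;
  //     __kmpc_syncwarp(mask);
  //   }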
2195  llvm::BasicBlock *LoopBB = CGF.createBasicBlock("omp.critical.loop");
2196  llvm::BasicBlock *TestBB = CGF.createBasicBlock("omp.critical.test");
2197  llvm::BasicBlock *SyncBB = CGF.createBasicBlock("omp.critical.sync");
2198  llvm::BasicBlock *BodyBB = CGF.createBasicBlock("omp.critical.body");
2199  llvm::BasicBlock *ExitBB = CGF.createBasicBlock("omp.critical.exit");
2200
2201  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
2202
2203  // Get the mask of active threads in the warp.
2204  llvm::Value *Mask = CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
2205      CGM.getModule(), OMPRTL___kmpc_warp_active_thread_mask));
2206  // Fetch team-local id of the thread.
2207  llvm::Value *ThreadID = RT.getGPUThreadID(CGF);
2208
2209  // Get the width of the team.
2210  llvm::Value *TeamWidth = RT.getGPUNumThreads(CGF);
2211
2212  // Initialize the counter variable for the loop.
2213  QualType Int32Ty =
2214      CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/0);
2215  Address Counter = CGF.CreateMemTemp(Int32Ty, "critical_counter");
2216  LValue CounterLVal = CGF.MakeAddrLValue(Counter, Int32Ty);
2217  CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.Int32Ty), CounterLVal,
2218                        /*isInit=*/true);
2219
  // Block checks whether the loop counter exceeds the upper bound.
2221  CGF.EmitBlock(LoopBB);
2222  llvm::Value *CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
2223  llvm::Value *CmpLoopBound = CGF.Builder.CreateICmpSLT(CounterVal, TeamWidth);
2224  CGF.Builder.CreateCondBr(CmpLoopBound, TestBB, ExitBB);
2225
  // Block determines which single thread should execute the region, and which
  // threads should go straight to the synchronisation point.
2228  CGF.EmitBlock(TestBB);
2229  CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
2230  llvm::Value *CmpThreadToCounter =
2231      CGF.Builder.CreateICmpEQ(ThreadID, CounterVal);
2232  CGF.Builder.CreateCondBr(CmpThreadToCounter, BodyBB, SyncBB);
2233
2234  // Block emits the body of the critical region.
2235  CGF.EmitBlock(BodyBB);
2236
2237  // Output the critical statement.
2238  CGOpenMPRuntime::emitCriticalRegion(CGF, CriticalName, CriticalOpGen, Loc,
2239                                      Hint);
2240
  // After the body of the critical region, the single executing thread jumps
  // to the synchronisation point.
  // Block waits for all threads in the current team to finish, then increments
  // the counter variable and returns to the loop.
2245  CGF.EmitBlock(SyncBB);
2246  // Reconverge active threads in the warp.
2247  (void)CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
2248                                CGM.getModule(), OMPRTL___kmpc_syncwarp),
2249                            Mask);
2250
2251  llvm::Value *IncCounterVal =
2252      CGF.Builder.CreateNSWAdd(CounterVal, CGF.Builder.getInt32(1));
2253  CGF.EmitStoreOfScalar(IncCounterVal, CounterLVal);
2254  CGF.EmitBranch(LoopBB);
2255
  // Block that is reached when all threads in the team complete the region.
2257  CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
2258}
2259
2260/// Cast value to the specified type.
2261static llvm::Value *castValueToType(CodeGenFunction &CGF, llvm::Value *Val,
2262                                    QualType ValTy, QualType CastTy,
2263                                    SourceLocation Loc) {
  assert(!CGF.getContext().getTypeSizeInChars(CastTy).isZero() &&
         "Cast type must be sized.");
  assert(!CGF.getContext().getTypeSizeInChars(ValTy).isZero() &&
         "Val type must be sized.");
2268  llvm::Type *LLVMCastTy = CGF.ConvertTypeForMem(CastTy);
2269  if (ValTy == CastTy)
2270    return Val;
2271  if (CGF.getContext().getTypeSizeInChars(ValTy) ==
2272      CGF.getContext().getTypeSizeInChars(CastTy))
2273    return CGF.Builder.CreateBitCast(Val, LLVMCastTy);
2274  if (CastTy->isIntegerType() && ValTy->isIntegerType())
2275    return CGF.Builder.CreateIntCast(Val, LLVMCastTy,
2276                                     CastTy->hasSignedIntegerRepresentation());
2277  Address CastItem = CGF.CreateMemTemp(CastTy);
2278  Address ValCastItem = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
2279      CastItem, Val->getType()->getPointerTo(CastItem.getAddressSpace()));
2280  CGF.EmitStoreOfScalar(Val, ValCastItem, /*Volatile=*/false, ValTy,
2281                        LValueBaseInfo(AlignmentSource::Type),
2282                        TBAAAccessInfo());
2283  return CGF.EmitLoadOfScalar(CastItem, /*Volatile=*/false, CastTy, Loc,
2284                              LValueBaseInfo(AlignmentSource::Type),
2285                              TBAAAccessInfo());
2286}
2287
2288/// This function creates calls to one of two shuffle functions to copy
2289/// variables between lanes in a warp.
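/// For elements of at most 4 bytes this lowers to a call along the lines of
/// (illustrative):
///   %res = call @__kmpc_shuffle_int32(%elem, %offset, %warp_size)
/// and to __kmpc_shuffle_int64 for larger elements.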
2290static llvm::Value *createRuntimeShuffleFunction(CodeGenFunction &CGF,
2291                                                 llvm::Value *Elem,
2292                                                 QualType ElemType,
2293                                                 llvm::Value *Offset,
2294                                                 SourceLocation Loc) {
2295  CodeGenModule &CGM = CGF.CGM;
2296  CGBuilderTy &Bld = CGF.Builder;
2297  CGOpenMPRuntimeGPU &RT =
2298      *(static_cast<CGOpenMPRuntimeGPU *>(&CGM.getOpenMPRuntime()));
2299  llvm::OpenMPIRBuilder &OMPBuilder = RT.getOMPBuilder();
2300
2301  CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
2302  assert(Size.getQuantity() <= 8 &&
2303         "Unsupported bitwidth in shuffle instruction.");
2304
2305  RuntimeFunction ShuffleFn = Size.getQuantity() <= 4
2306                                  ? OMPRTL___kmpc_shuffle_int32
2307                                  : OMPRTL___kmpc_shuffle_int64;
2308
2309  // Cast all types to 32- or 64-bit values before calling shuffle routines.
2310  QualType CastTy = CGF.getContext().getIntTypeForBitwidth(
2311      Size.getQuantity() <= 4 ? 32 : 64, /*Signed=*/1);
2312  llvm::Value *ElemCast = castValueToType(CGF, Elem, ElemType, CastTy, Loc);
2313  llvm::Value *WarpSize =
2314      Bld.CreateIntCast(RT.getGPUWarpSize(CGF), CGM.Int16Ty, /*isSigned=*/true);
2315
2316  llvm::Value *ShuffledVal = CGF.EmitRuntimeCall(
2317      OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(), ShuffleFn),
2318      {ElemCast, Offset, WarpSize});
2319
2320  return castValueToType(CGF, ShuffledVal, CastTy, ElemType, Loc);
2321}
2322
2323static void shuffleAndStore(CodeGenFunction &CGF, Address SrcAddr,
2324                            Address DestAddr, QualType ElemType,
2325                            llvm::Value *Offset, SourceLocation Loc) {
2326  CGBuilderTy &Bld = CGF.Builder;
2327
2328  CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
2329  // Create the loop over the big sized data.
2330  // ptr = (void*)Elem;
2331  // ptrEnd = (void*) Elem + 1;
2332  // Step = 8;
  //   while (ptr + Step <= ptrEnd)
2334  //   shuffle((int64_t)*ptr);
2335  // Step = 4;
  //   while (ptr + Step <= ptrEnd)
2337  //   shuffle((int32_t)*ptr);
2338  // ...
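  // For example (illustrative), a 7-byte element is copied as one 4-byte
  // shuffle, then one 2-byte shuffle, then one 1-byte shuffle.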
2339  Address ElemPtr = DestAddr;
2340  Address Ptr = SrcAddr;
2341  Address PtrEnd = Bld.CreatePointerBitCastOrAddrSpaceCast(
2342      Bld.CreateConstGEP(SrcAddr, 1), CGF.VoidPtrTy);
2343  for (int IntSize = 8; IntSize >= 1; IntSize /= 2) {
2344    if (Size < CharUnits::fromQuantity(IntSize))
2345      continue;
2346    QualType IntType = CGF.getContext().getIntTypeForBitwidth(
2347        CGF.getContext().toBits(CharUnits::fromQuantity(IntSize)),
2348        /*Signed=*/1);
2349    llvm::Type *IntTy = CGF.ConvertTypeForMem(IntType);
2350    Ptr = Bld.CreatePointerBitCastOrAddrSpaceCast(Ptr, IntTy->getPointerTo());
2351    ElemPtr =
2352        Bld.CreatePointerBitCastOrAddrSpaceCast(ElemPtr, IntTy->getPointerTo());
2353    if (Size.getQuantity() / IntSize > 1) {
2354      llvm::BasicBlock *PreCondBB = CGF.createBasicBlock(".shuffle.pre_cond");
2355      llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".shuffle.then");
2356      llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".shuffle.exit");
2357      llvm::BasicBlock *CurrentBB = Bld.GetInsertBlock();
2358      CGF.EmitBlock(PreCondBB);
2359      llvm::PHINode *PhiSrc =
2360          Bld.CreatePHI(Ptr.getType(), /*NumReservedValues=*/2);
2361      PhiSrc->addIncoming(Ptr.getPointer(), CurrentBB);
2362      llvm::PHINode *PhiDest =
2363          Bld.CreatePHI(ElemPtr.getType(), /*NumReservedValues=*/2);
2364      PhiDest->addIncoming(ElemPtr.getPointer(), CurrentBB);
2365      Ptr = Address(PhiSrc, Ptr.getAlignment());
2366      ElemPtr = Address(PhiDest, ElemPtr.getAlignment());
2367      llvm::Value *PtrDiff = Bld.CreatePtrDiff(
2368          PtrEnd.getPointer(), Bld.CreatePointerBitCastOrAddrSpaceCast(
2369                                   Ptr.getPointer(), CGF.VoidPtrTy));
2370      Bld.CreateCondBr(Bld.CreateICmpSGT(PtrDiff, Bld.getInt64(IntSize - 1)),
2371                       ThenBB, ExitBB);
2372      CGF.EmitBlock(ThenBB);
2373      llvm::Value *Res = createRuntimeShuffleFunction(
2374          CGF,
2375          CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc,
2376                               LValueBaseInfo(AlignmentSource::Type),
2377                               TBAAAccessInfo()),
2378          IntType, Offset, Loc);
2379      CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType,
2380                            LValueBaseInfo(AlignmentSource::Type),
2381                            TBAAAccessInfo());
2382      Address LocalPtr = Bld.CreateConstGEP(Ptr, 1);
2383      Address LocalElemPtr = Bld.CreateConstGEP(ElemPtr, 1);
2384      PhiSrc->addIncoming(LocalPtr.getPointer(), ThenBB);
2385      PhiDest->addIncoming(LocalElemPtr.getPointer(), ThenBB);
2386      CGF.EmitBranch(PreCondBB);
2387      CGF.EmitBlock(ExitBB);
2388    } else {
2389      llvm::Value *Res = createRuntimeShuffleFunction(
2390          CGF,
2391          CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc,
2392                               LValueBaseInfo(AlignmentSource::Type),
2393                               TBAAAccessInfo()),
2394          IntType, Offset, Loc);
2395      CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType,
2396                            LValueBaseInfo(AlignmentSource::Type),
2397                            TBAAAccessInfo());
2398      Ptr = Bld.CreateConstGEP(Ptr, 1);
2399      ElemPtr = Bld.CreateConstGEP(ElemPtr, 1);
2400    }
2401    Size = Size % IntSize;
2402  }
2403}
2404
2405namespace {
2406enum CopyAction : unsigned {
2407  // RemoteLaneToThread: Copy over a Reduce list from a remote lane in
2408  // the warp using shuffle instructions.
2409  RemoteLaneToThread,
2410  // ThreadCopy: Make a copy of a Reduce list on the thread's stack.
2411  ThreadCopy,
2412  // ThreadToScratchpad: Copy a team-reduced array to the scratchpad.
2413  ThreadToScratchpad,
2414  // ScratchpadToThread: Copy from a scratchpad array in global memory
2415  // containing team-reduced data to a thread's stack.
2416  ScratchpadToThread,
2417};
2418} // namespace
2419
2420struct CopyOptionsTy {
2421  llvm::Value *RemoteLaneOffset;
2422  llvm::Value *ScratchpadIndex;
2423  llvm::Value *ScratchpadWidth;
2424};
2425
2426/// Emit instructions to copy a Reduce list, which contains partially
2427/// aggregated values, in the specified direction.
2428static void emitReductionListCopy(
2429    CopyAction Action, CodeGenFunction &CGF, QualType ReductionArrayTy,
2430    ArrayRef<const Expr *> Privates, Address SrcBase, Address DestBase,
2431    CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr}) {
2432
2433  CodeGenModule &CGM = CGF.CGM;
2434  ASTContext &C = CGM.getContext();
2435  CGBuilderTy &Bld = CGF.Builder;
2436
2437  llvm::Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
2438  llvm::Value *ScratchpadIndex = CopyOptions.ScratchpadIndex;
2439  llvm::Value *ScratchpadWidth = CopyOptions.ScratchpadWidth;
2440
  // Iterate, element by element, through the source Reduce list and make a
  // copy.
2443  unsigned Idx = 0;
2444  unsigned Size = Privates.size();
2445  for (const Expr *Private : Privates) {
2446    Address SrcElementAddr = Address::invalid();
2447    Address DestElementAddr = Address::invalid();
2448    Address DestElementPtrAddr = Address::invalid();
2449    // Should we shuffle in an element from a remote lane?
2450    bool ShuffleInElement = false;
2451    // Set to true to update the pointer in the dest Reduce list to a
2452    // newly created element.
2453    bool UpdateDestListPtr = false;
2454    // Increment the src or dest pointer to the scratchpad, for each
2455    // new element.
2456    bool IncrScratchpadSrc = false;
2457    bool IncrScratchpadDest = false;
2458
2459    switch (Action) {
2460    case RemoteLaneToThread: {
2461      // Step 1.1: Get the address for the src element in the Reduce list.
2462      Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
2463      SrcElementAddr = CGF.EmitLoadOfPointer(
2464          SrcElementPtrAddr,
2465          C.getPointerType(Private->getType())->castAs<PointerType>());
2466
2467      // Step 1.2: Create a temporary to store the element in the destination
2468      // Reduce list.
2469      DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
2470      DestElementAddr =
2471          CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
2472      ShuffleInElement = true;
2473      UpdateDestListPtr = true;
2474      break;
2475    }
2476    case ThreadCopy: {
2477      // Step 1.1: Get the address for the src element in the Reduce list.
2478      Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
2479      SrcElementAddr = CGF.EmitLoadOfPointer(
2480          SrcElementPtrAddr,
2481          C.getPointerType(Private->getType())->castAs<PointerType>());
2482
2483      // Step 1.2: Get the address for dest element.  The destination
2484      // element has already been created on the thread's stack.
2485      DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
2486      DestElementAddr = CGF.EmitLoadOfPointer(
2487          DestElementPtrAddr,
2488          C.getPointerType(Private->getType())->castAs<PointerType>());
2489      break;
2490    }
2491    case ThreadToScratchpad: {
2492      // Step 1.1: Get the address for the src element in the Reduce list.
2493      Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
2494      SrcElementAddr = CGF.EmitLoadOfPointer(
2495          SrcElementPtrAddr,
2496          C.getPointerType(Private->getType())->castAs<PointerType>());
2497
2498      // Step 1.2: Get the address for dest element:
2499      // address = base + index * ElementSizeInChars.
2500      llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
2501      llvm::Value *CurrentOffset =
2502          Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex);
2503      llvm::Value *ScratchPadElemAbsolutePtrVal =
2504          Bld.CreateNUWAdd(DestBase.getPointer(), CurrentOffset);
2505      ScratchPadElemAbsolutePtrVal =
2506          Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
2507      DestElementAddr = Address(ScratchPadElemAbsolutePtrVal,
2508                                C.getTypeAlignInChars(Private->getType()));
2509      IncrScratchpadDest = true;
2510      break;
2511    }
2512    case ScratchpadToThread: {
2513      // Step 1.1: Get the address for the src element in the scratchpad.
2514      // address = base + index * ElementSizeInChars.
2515      llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
2516      llvm::Value *CurrentOffset =
2517          Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex);
2518      llvm::Value *ScratchPadElemAbsolutePtrVal =
2519          Bld.CreateNUWAdd(SrcBase.getPointer(), CurrentOffset);
2520      ScratchPadElemAbsolutePtrVal =
2521          Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
2522      SrcElementAddr = Address(ScratchPadElemAbsolutePtrVal,
2523                               C.getTypeAlignInChars(Private->getType()));
2524      IncrScratchpadSrc = true;
2525
2526      // Step 1.2: Create a temporary to store the element in the destination
2527      // Reduce list.
2528      DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
2529      DestElementAddr =
2530          CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
2531      UpdateDestListPtr = true;
2532      break;
2533    }
2534    }
2535
    // Regardless of the source and destination of the copy, emit the load of
    // the source element, as this is required in all directions.
2538    SrcElementAddr = Bld.CreateElementBitCast(
2539        SrcElementAddr, CGF.ConvertTypeForMem(Private->getType()));
2540    DestElementAddr = Bld.CreateElementBitCast(DestElementAddr,
2541                                               SrcElementAddr.getElementType());
2542
2543    // Now that all active lanes have read the element in the
2544    // Reduce list, shuffle over the value from the remote lane.
2545    if (ShuffleInElement) {
2546      shuffleAndStore(CGF, SrcElementAddr, DestElementAddr, Private->getType(),
2547                      RemoteLaneOffset, Private->getExprLoc());
2548    } else {
2549      switch (CGF.getEvaluationKind(Private->getType())) {
2550      case TEK_Scalar: {
2551        llvm::Value *Elem = CGF.EmitLoadOfScalar(
2552            SrcElementAddr, /*Volatile=*/false, Private->getType(),
2553            Private->getExprLoc(), LValueBaseInfo(AlignmentSource::Type),
2554            TBAAAccessInfo());
2555        // Store the source element value to the dest element address.
2556        CGF.EmitStoreOfScalar(
2557            Elem, DestElementAddr, /*Volatile=*/false, Private->getType(),
2558            LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
2559        break;
2560      }
2561      case TEK_Complex: {
2562        CodeGenFunction::ComplexPairTy Elem = CGF.EmitLoadOfComplex(
2563            CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
2564            Private->getExprLoc());
2565        CGF.EmitStoreOfComplex(
2566            Elem, CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
2567            /*isInit=*/false);
2568        break;
2569      }
2570      case TEK_Aggregate:
2571        CGF.EmitAggregateCopy(
2572            CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
2573            CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
2574            Private->getType(), AggValueSlot::DoesNotOverlap);
2575        break;
2576      }
2577    }
2578
2579    // Step 3.1: Modify reference in dest Reduce list as needed.
2580    // Modifying the reference in Reduce list to point to the newly
2581    // created element.  The element is live in the current function
2582    // scope and that of functions it invokes (i.e., reduce_function).
2583    // RemoteReduceData[i] = (void*)&RemoteElem
2584    if (UpdateDestListPtr) {
2585      CGF.EmitStoreOfScalar(Bld.CreatePointerBitCastOrAddrSpaceCast(
2586                                DestElementAddr.getPointer(), CGF.VoidPtrTy),
2587                            DestElementPtrAddr, /*Volatile=*/false,
2588                            C.VoidPtrTy);
2589    }
2590
2591    // Step 4.1: Increment SrcBase/DestBase so that it points to the starting
2592    // address of the next element in scratchpad memory, unless we're currently
2593    // processing the last one.  Memory alignment is also taken care of here.
2594    if ((IncrScratchpadDest || IncrScratchpadSrc) && (Idx + 1 < Size)) {
2595      llvm::Value *ScratchpadBasePtr =
2596          IncrScratchpadDest ? DestBase.getPointer() : SrcBase.getPointer();
2597      llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
2598      ScratchpadBasePtr = Bld.CreateNUWAdd(
2599          ScratchpadBasePtr,
2600          Bld.CreateNUWMul(ScratchpadWidth, ElementSizeInChars));
2601
2602      // Take care of global memory alignment for performance
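      // Round the base pointer up to the next multiple of
      // GlobalMemoryAlignment: aligned = ((ptr - 1) / Align + 1) * Align,
      // e.g. with Align = 256, 300 rounds up to 512 while 256 stays at 256.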
2603      ScratchpadBasePtr = Bld.CreateNUWSub(
2604          ScratchpadBasePtr, llvm::ConstantInt::get(CGM.SizeTy, 1));
2605      ScratchpadBasePtr = Bld.CreateUDiv(
2606          ScratchpadBasePtr,
2607          llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));
2608      ScratchpadBasePtr = Bld.CreateNUWAdd(
2609          ScratchpadBasePtr, llvm::ConstantInt::get(CGM.SizeTy, 1));
2610      ScratchpadBasePtr = Bld.CreateNUWMul(
2611          ScratchpadBasePtr,
2612          llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));
2613
2614      if (IncrScratchpadDest)
2615        DestBase = Address(ScratchpadBasePtr, CGF.getPointerAlign());
2616      else /* IncrScratchpadSrc = true */
2617        SrcBase = Address(ScratchpadBasePtr, CGF.getPointerAlign());
2618    }
2619
2620    ++Idx;
2621  }
2622}
2623
2624/// This function emits a helper that gathers Reduce lists from the first
2625/// lane of every active warp to lanes in the first warp.
2626///
2627/// void inter_warp_copy_func(void* reduce_data, num_warps)
2628///   shared smem[warp_size];
2629///   For all data entries D in reduce_data:
2630///     sync
2631///     If (I am the first lane in each warp)
2632///       Copy my local D to smem[warp_id]
2633///     sync
2634///     if (I am the first warp)
2635///       Copy smem[thread_id] to my local D
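///
/// For example, with 4 active warps and a single 32-bit entry D, lane 0 of
/// warps 0-3 writes its partial value into smem[0..3]; after the barrier,
/// threads 0-3 of the first warp read smem[thread_id] back into their own
/// copies of D, so the remaining reduction can proceed within warp 0.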
2636static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
2637                                              ArrayRef<const Expr *> Privates,
2638                                              QualType ReductionArrayTy,
2639                                              SourceLocation Loc) {
2640  ASTContext &C = CGM.getContext();
2641  llvm::Module &M = CGM.getModule();
2642
2643  // ReduceList: thread local Reduce list.
2644  // At the stage of the computation when this function is called, partially
2645  // aggregated values reside in the first lane of every active warp.
2646  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2647                                  C.VoidPtrTy, ImplicitParamDecl::Other);
2648  // NumWarps: number of warps active in the parallel region.  This could
2649  // be smaller than 32 (max warps in a CTA) for partial block reduction.
2650  ImplicitParamDecl NumWarpsArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2651                                C.getIntTypeForBitwidth(32, /* Signed */ true),
2652                                ImplicitParamDecl::Other);
2653  FunctionArgList Args;
2654  Args.push_back(&ReduceListArg);
2655  Args.push_back(&NumWarpsArg);
2656
2657  const CGFunctionInfo &CGFI =
2658      CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2659  auto *Fn = llvm::Function::Create(CGM.getTypes().GetFunctionType(CGFI),
2660                                    llvm::GlobalValue::InternalLinkage,
2661                                    "_omp_reduction_inter_warp_copy_func", &M);
2662  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
2663  Fn->setDoesNotRecurse();
2664  CodeGenFunction CGF(CGM);
2665  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2666
2667  CGBuilderTy &Bld = CGF.Builder;
2668
2669  // This array is used as a medium to transfer, one reduce element at a time,
2670  // the data from the first lane of every warp to lanes in the first warp
2671  // in order to perform the final step of a reduction in a parallel region
2672  // (reduction across warps).  The array is placed in NVPTX __shared__ memory
2673  // for reduced latency, as well as to have a distinct copy for concurrently
  // executing target regions.  The array is declared with weak linkage so
  // that a single copy is shared across compilation units.
2676  StringRef TransferMediumName =
2677      "__openmp_nvptx_data_transfer_temporary_storage";
2678  llvm::GlobalVariable *TransferMedium =
2679      M.getGlobalVariable(TransferMediumName);
2680  unsigned WarpSize = CGF.getTarget().getGridValue(llvm::omp::GV_Warp_Size);
2681  if (!TransferMedium) {
2682    auto *Ty = llvm::ArrayType::get(CGM.Int32Ty, WarpSize);
2683    unsigned SharedAddressSpace = C.getTargetAddressSpace(LangAS::cuda_shared);
2684    TransferMedium = new llvm::GlobalVariable(
2685        M, Ty, /*isConstant=*/false, llvm::GlobalVariable::WeakAnyLinkage,
2686        llvm::UndefValue::get(Ty), TransferMediumName,
2687        /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
2688        SharedAddressSpace);
2689    CGM.addCompilerUsedGlobal(TransferMedium);
2690  }
2691
2692  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
2693  // Get the CUDA thread id of the current OpenMP thread on the GPU.
2694  llvm::Value *ThreadID = RT.getGPUThreadID(CGF);
2695  // nvptx_lane_id = nvptx_id % warpsize
2696  llvm::Value *LaneID = getNVPTXLaneID(CGF);
2697  // nvptx_warp_id = nvptx_id / warpsize
2698  llvm::Value *WarpID = getNVPTXWarpID(CGF);
2699
2700  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2701  Address LocalReduceList(
2702      Bld.CreatePointerBitCastOrAddrSpaceCast(
2703          CGF.EmitLoadOfScalar(
2704              AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc,
2705              LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo()),
2706          CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
2707      CGF.getPointerAlign());
2708
2709  unsigned Idx = 0;
2710  for (const Expr *Private : Privates) {
2711    //
2712    // Warp master copies reduce element to transfer medium in __shared__
2713    // memory.
2714    //
2715    unsigned RealTySize =
2716        C.getTypeSizeInChars(Private->getType())
2717            .alignTo(C.getTypeAlignInChars(Private->getType()))
2718            .getQuantity();
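    // Each value is staged through a 32-bit shared-memory slot, so it is
    // copied in chunks of 4, then 2, then 1 bytes.  E.g. a 12-byte element
    // takes three 4-byte iterations of the inner counted loop, while a
    // 7-byte element is moved as one 4-byte, one 2-byte and one 1-byte copy.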
    for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /= 2) {
2720      unsigned NumIters = RealTySize / TySize;
2721      if (NumIters == 0)
2722        continue;
2723      QualType CType = C.getIntTypeForBitwidth(
2724          C.toBits(CharUnits::fromQuantity(TySize)), /*Signed=*/1);
2725      llvm::Type *CopyType = CGF.ConvertTypeForMem(CType);
2726      CharUnits Align = CharUnits::fromQuantity(TySize);
2727      llvm::Value *Cnt = nullptr;
2728      Address CntAddr = Address::invalid();
2729      llvm::BasicBlock *PrecondBB = nullptr;
2730      llvm::BasicBlock *ExitBB = nullptr;
2731      if (NumIters > 1) {
2732        CntAddr = CGF.CreateMemTemp(C.IntTy, ".cnt.addr");
2733        CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.IntTy), CntAddr,
2734                              /*Volatile=*/false, C.IntTy);
2735        PrecondBB = CGF.createBasicBlock("precond");
2736        ExitBB = CGF.createBasicBlock("exit");
2737        llvm::BasicBlock *BodyBB = CGF.createBasicBlock("body");
        // There is no need to emit a line number for the unconditional branch.
2739        (void)ApplyDebugLocation::CreateEmpty(CGF);
2740        CGF.EmitBlock(PrecondBB);
2741        Cnt = CGF.EmitLoadOfScalar(CntAddr, /*Volatile=*/false, C.IntTy, Loc);
2742        llvm::Value *Cmp =
2743            Bld.CreateICmpULT(Cnt, llvm::ConstantInt::get(CGM.IntTy, NumIters));
2744        Bld.CreateCondBr(Cmp, BodyBB, ExitBB);
2745        CGF.EmitBlock(BodyBB);
2746      }
2747      // kmpc_barrier.
2748      CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
2749                                             /*EmitChecks=*/false,
2750                                             /*ForceSimpleCall=*/true);
2751      llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
2752      llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
2753      llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
2754
2755      // if (lane_id == 0)
2756      llvm::Value *IsWarpMaster = Bld.CreateIsNull(LaneID, "warp_master");
2757      Bld.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
2758      CGF.EmitBlock(ThenBB);
2759
2760      // Reduce element = LocalReduceList[i]
2761      Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
2762      llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
2763          ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
2764      // elemptr = ((CopyType*)(elemptrptr)) + I
2765      Address ElemPtr = Address(ElemPtrPtr, Align);
2766      ElemPtr = Bld.CreateElementBitCast(ElemPtr, CopyType);
2767      if (NumIters > 1) {
2768        ElemPtr = Address(Bld.CreateGEP(ElemPtr.getPointer(), Cnt),
2769                          ElemPtr.getAlignment());
2770      }
2771
2772      // Get pointer to location in transfer medium.
2773      // MediumPtr = &medium[warp_id]
2774      llvm::Value *MediumPtrVal = Bld.CreateInBoundsGEP(
2775          TransferMedium->getValueType(), TransferMedium,
2776          {llvm::Constant::getNullValue(CGM.Int64Ty), WarpID});
2777      Address MediumPtr(MediumPtrVal, Align);
2778      // Casting to actual data type.
2779      // MediumPtr = (CopyType*)MediumPtrAddr;
2780      MediumPtr = Bld.CreateElementBitCast(MediumPtr, CopyType);
2781
2782      // elem = *elemptr
      // *MediumPtr = elem
2784      llvm::Value *Elem = CGF.EmitLoadOfScalar(
2785          ElemPtr, /*Volatile=*/false, CType, Loc,
2786          LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
2787      // Store the source element value to the dest element address.
2788      CGF.EmitStoreOfScalar(Elem, MediumPtr, /*Volatile=*/true, CType,
2789                            LValueBaseInfo(AlignmentSource::Type),
2790                            TBAAAccessInfo());
2791
2792      Bld.CreateBr(MergeBB);
2793
2794      CGF.EmitBlock(ElseBB);
2795      Bld.CreateBr(MergeBB);
2796
2797      CGF.EmitBlock(MergeBB);
2798
2799      // kmpc_barrier.
2800      CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
2801                                             /*EmitChecks=*/false,
2802                                             /*ForceSimpleCall=*/true);
2803
2804      //
2805      // Warp 0 copies reduce element from transfer medium.
2806      //
2807      llvm::BasicBlock *W0ThenBB = CGF.createBasicBlock("then");
2808      llvm::BasicBlock *W0ElseBB = CGF.createBasicBlock("else");
2809      llvm::BasicBlock *W0MergeBB = CGF.createBasicBlock("ifcont");
2810
2811      Address AddrNumWarpsArg = CGF.GetAddrOfLocalVar(&NumWarpsArg);
2812      llvm::Value *NumWarpsVal = CGF.EmitLoadOfScalar(
2813          AddrNumWarpsArg, /*Volatile=*/false, C.IntTy, Loc);
2814
      // Up to WarpSize threads in warp 0 are active.
2816      llvm::Value *IsActiveThread =
2817          Bld.CreateICmpULT(ThreadID, NumWarpsVal, "is_active_thread");
2818      Bld.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
2819
2820      CGF.EmitBlock(W0ThenBB);
2821
2822      // SrcMediumPtr = &medium[tid]
2823      llvm::Value *SrcMediumPtrVal = Bld.CreateInBoundsGEP(
2824          TransferMedium->getValueType(), TransferMedium,
2825          {llvm::Constant::getNullValue(CGM.Int64Ty), ThreadID});
2826      Address SrcMediumPtr(SrcMediumPtrVal, Align);
2827      // SrcMediumVal = *SrcMediumPtr;
2828      SrcMediumPtr = Bld.CreateElementBitCast(SrcMediumPtr, CopyType);
2829
2830      // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I
2831      Address TargetElemPtrPtr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
2832      llvm::Value *TargetElemPtrVal = CGF.EmitLoadOfScalar(
2833          TargetElemPtrPtr, /*Volatile=*/false, C.VoidPtrTy, Loc);
2834      Address TargetElemPtr = Address(TargetElemPtrVal, Align);
2835      TargetElemPtr = Bld.CreateElementBitCast(TargetElemPtr, CopyType);
2836      if (NumIters > 1) {
2837        TargetElemPtr = Address(Bld.CreateGEP(TargetElemPtr.getPointer(), Cnt),
2838                                TargetElemPtr.getAlignment());
2839      }
2840
2841      // *TargetElemPtr = SrcMediumVal;
2842      llvm::Value *SrcMediumValue =
2843          CGF.EmitLoadOfScalar(SrcMediumPtr, /*Volatile=*/true, CType, Loc);
2844      CGF.EmitStoreOfScalar(SrcMediumValue, TargetElemPtr, /*Volatile=*/false,
2845                            CType);
2846      Bld.CreateBr(W0MergeBB);
2847
2848      CGF.EmitBlock(W0ElseBB);
2849      Bld.CreateBr(W0MergeBB);
2850
2851      CGF.EmitBlock(W0MergeBB);
2852
2853      if (NumIters > 1) {
2854        Cnt = Bld.CreateNSWAdd(Cnt, llvm::ConstantInt::get(CGM.IntTy, /*V=*/1));
2855        CGF.EmitStoreOfScalar(Cnt, CntAddr, /*Volatile=*/false, C.IntTy);
2856        CGF.EmitBranch(PrecondBB);
2857        (void)ApplyDebugLocation::CreateEmpty(CGF);
2858        CGF.EmitBlock(ExitBB);
2859      }
2860      RealTySize %= TySize;
2861    }
2862    ++Idx;
2863  }
2864
2865  CGF.FinishFunction();
2866  return Fn;
2867}
2868
2869/// Emit a helper that reduces data across two OpenMP threads (lanes)
2870/// in the same warp.  It uses shuffle instructions to copy over data from
2871/// a remote lane's stack.  The reduction algorithm performed is specified
2872/// by the fourth parameter.
2873///
2874/// Algorithm Versions.
2875/// Full Warp Reduce (argument value 0):
2876///   This algorithm assumes that all 32 lanes are active and gathers
2877///   data from these 32 lanes, producing a single resultant value.
2878/// Contiguous Partial Warp Reduce (argument value 1):
2879///   This algorithm assumes that only a *contiguous* subset of lanes
2880///   are active.  This happens for the last warp in a parallel region
2881///   when the user specified num_threads is not an integer multiple of
2882///   32.  This contiguous subset always starts with the zeroth lane.
2883/// Partial Warp Reduce (argument value 2):
2884///   This algorithm gathers data from any number of lanes at any position.
2885/// All reduced values are stored in the lowest possible lane.  The set
/// of problems every algorithm addresses is a superset of those
2887/// addressable by algorithms with a lower version number.  Overhead
2888/// increases as algorithm version increases.
2889///
2890/// Terminology
2891/// Reduce element:
2892///   Reduce element refers to the individual data field with primitive
2893///   data types to be combined and reduced across threads.
2894/// Reduce list:
2895///   Reduce list refers to a collection of local, thread-private
2896///   reduce elements.
2897/// Remote Reduce list:
2898///   Remote Reduce list refers to a collection of remote (relative to
2899///   the current thread) reduce elements.
2900///
2901/// We distinguish between three states of threads that are important to
2902/// the implementation of this function.
2903/// Alive threads:
2904///   Threads in a warp executing the SIMT instruction, as distinguished from
2905///   threads that are inactive due to divergent control flow.
2906/// Active threads:
2907///   The minimal set of threads that has to be alive upon entry to this
2908///   function.  The computation is correct iff active threads are alive.
2909///   Some threads are alive but they are not active because they do not
2910///   contribute to the computation in any useful manner.  Turning them off
2911///   may introduce control flow overheads without any tangible benefits.
2912/// Effective threads:
2913///   In order to comply with the argument requirements of the shuffle
2914///   function, we must keep all lanes holding data alive.  But at most
2915///   half of them perform value aggregation; we refer to this half of
///   threads as effective.  The other half simply hands off its data.
2918///
2919/// Procedure
2920/// Value shuffle:
2921///   In this step active threads transfer data from higher lane positions
2922///   in the warp to lower lane positions, creating Remote Reduce list.
2923/// Value aggregation:
2924///   In this step, effective threads combine their thread local Reduce list
2925///   with Remote Reduce list and store the result in the thread local
2926///   Reduce list.
2927/// Value copy:
2928///   In this step, we deal with the assumption made by algorithm 2
2929///   (i.e. contiguity assumption).  When we have an odd number of lanes
2930///   active, say 2k+1, only k threads will be effective and therefore k
2931///   new values will be produced.  However, the Reduce list owned by the
2932///   (2k+1)th thread is ignored in the value aggregation.  Therefore
2933///   we copy the Reduce list from the (2k+1)th lane to (k+1)th lane so
2934///   that the contiguity assumption still holds.
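///
/// A simplified C-like sketch of the helper emitted below (illustrative
/// names, not the literal generated IR):
///
/// void shuffle_and_reduce(void *reduce_list, short lane_id,
///                         short remote_offset, short algo_ver) {
///   # build the Remote Reduce list by shuffling each element over from the
///   # lane at lane_id + remote_offset
///   remote = shuffle_down_each_element(reduce_list, remote_offset);
///   if (algo_ver == 0 ||
///       (algo_ver == 1 && lane_id < remote_offset) ||
///       (algo_ver == 2 && lane_id % 2 == 0 && remote_offset > 0))
///     reduce_function(reduce_list, remote);
///   if (algo_ver == 1 && lane_id >= remote_offset)
///     copy(remote, reduce_list);  # keep the contiguity assumption
/// }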
2935static llvm::Function *emitShuffleAndReduceFunction(
2936    CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2937    QualType ReductionArrayTy, llvm::Function *ReduceFn, SourceLocation Loc) {
2938  ASTContext &C = CGM.getContext();
2939
2940  // Thread local Reduce list used to host the values of data to be reduced.
2941  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2942                                  C.VoidPtrTy, ImplicitParamDecl::Other);
2943  // Current lane id; could be logical.
2944  ImplicitParamDecl LaneIDArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.ShortTy,
2945                              ImplicitParamDecl::Other);
2946  // Offset of the remote source lane relative to the current lane.
2947  ImplicitParamDecl RemoteLaneOffsetArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2948                                        C.ShortTy, ImplicitParamDecl::Other);
2949  // Algorithm version.  This is expected to be known at compile time.
2950  ImplicitParamDecl AlgoVerArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2951                               C.ShortTy, ImplicitParamDecl::Other);
2952  FunctionArgList Args;
2953  Args.push_back(&ReduceListArg);
2954  Args.push_back(&LaneIDArg);
2955  Args.push_back(&RemoteLaneOffsetArg);
2956  Args.push_back(&AlgoVerArg);
2957
2958  const CGFunctionInfo &CGFI =
2959      CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2960  auto *Fn = llvm::Function::Create(
2961      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
2962      "_omp_reduction_shuffle_and_reduce_func", &CGM.getModule());
2963  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
2964  Fn->setDoesNotRecurse();
2965
2966  CodeGenFunction CGF(CGM);
2967  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2968
2969  CGBuilderTy &Bld = CGF.Builder;
2970
2971  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2972  Address LocalReduceList(
2973      Bld.CreatePointerBitCastOrAddrSpaceCast(
2974          CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
2975                               C.VoidPtrTy, SourceLocation()),
2976          CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
2977      CGF.getPointerAlign());
2978
2979  Address AddrLaneIDArg = CGF.GetAddrOfLocalVar(&LaneIDArg);
2980  llvm::Value *LaneIDArgVal = CGF.EmitLoadOfScalar(
2981      AddrLaneIDArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
2982
2983  Address AddrRemoteLaneOffsetArg = CGF.GetAddrOfLocalVar(&RemoteLaneOffsetArg);
2984  llvm::Value *RemoteLaneOffsetArgVal = CGF.EmitLoadOfScalar(
2985      AddrRemoteLaneOffsetArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
2986
2987  Address AddrAlgoVerArg = CGF.GetAddrOfLocalVar(&AlgoVerArg);
2988  llvm::Value *AlgoVerArgVal = CGF.EmitLoadOfScalar(
2989      AddrAlgoVerArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
2990
2991  // Create a local thread-private variable to host the Reduce list
2992  // from a remote lane.
2993  Address RemoteReduceList =
2994      CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.remote_reduce_list");
2995
2996  // This loop iterates through the list of reduce elements and copies,
2997  // element by element, from a remote lane in the warp to RemoteReduceList,
2998  // hosted on the thread's stack.
2999  emitReductionListCopy(RemoteLaneToThread, CGF, ReductionArrayTy, Privates,
3000                        LocalReduceList, RemoteReduceList,
3001                        {/*RemoteLaneOffset=*/RemoteLaneOffsetArgVal,
3002                         /*ScratchpadIndex=*/nullptr,
3003                         /*ScratchpadWidth=*/nullptr});
3004
  // The actions to be performed on the Remote Reduce list depend
3006  // on the algorithm version.
3007  //
3008  //  if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
3009  //  LaneId % 2 == 0 && Offset > 0):
3010  //    do the reduction value aggregation
3011  //
3012  //  The thread local variable Reduce list is mutated in place to host the
3013  //  reduced data, which is the aggregated value produced from local and
3014  //  remote lanes.
3015  //
3016  //  Note that AlgoVer is expected to be a constant integer known at compile
3017  //  time.
  //  When AlgoVer==0, the first conjunction evaluates to true, making
  //    the entire predicate true at compile time.
  //  When AlgoVer==1, only the second part of the second conjunction has
  //    to be evaluated at runtime.  The other conjunctions evaluate to
  //    false at compile time.
  //  When AlgoVer==2, only the second part of the third conjunction has
  //    to be evaluated at runtime.  The other conjunctions evaluate to
  //    false at compile time.
3026  llvm::Value *CondAlgo0 = Bld.CreateIsNull(AlgoVerArgVal);
3027
3028  llvm::Value *Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
3029  llvm::Value *CondAlgo1 = Bld.CreateAnd(
3030      Algo1, Bld.CreateICmpULT(LaneIDArgVal, RemoteLaneOffsetArgVal));
3031
3032  llvm::Value *Algo2 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(2));
3033  llvm::Value *CondAlgo2 = Bld.CreateAnd(
3034      Algo2, Bld.CreateIsNull(Bld.CreateAnd(LaneIDArgVal, Bld.getInt16(1))));
3035  CondAlgo2 = Bld.CreateAnd(
3036      CondAlgo2, Bld.CreateICmpSGT(RemoteLaneOffsetArgVal, Bld.getInt16(0)));
3037
3038  llvm::Value *CondReduce = Bld.CreateOr(CondAlgo0, CondAlgo1);
3039  CondReduce = Bld.CreateOr(CondReduce, CondAlgo2);
3040
3041  llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
3042  llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
3043  llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
3044  Bld.CreateCondBr(CondReduce, ThenBB, ElseBB);
3045
3046  CGF.EmitBlock(ThenBB);
3047  // reduce_function(LocalReduceList, RemoteReduceList)
3048  llvm::Value *LocalReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3049      LocalReduceList.getPointer(), CGF.VoidPtrTy);
3050  llvm::Value *RemoteReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3051      RemoteReduceList.getPointer(), CGF.VoidPtrTy);
3052  CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
3053      CGF, Loc, ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr});
3054  Bld.CreateBr(MergeBB);
3055
3056  CGF.EmitBlock(ElseBB);
3057  Bld.CreateBr(MergeBB);
3058
3059  CGF.EmitBlock(MergeBB);
3060
3061  // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
3062  // Reduce list.
3063  Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
3064  llvm::Value *CondCopy = Bld.CreateAnd(
3065      Algo1, Bld.CreateICmpUGE(LaneIDArgVal, RemoteLaneOffsetArgVal));
3066
3067  llvm::BasicBlock *CpyThenBB = CGF.createBasicBlock("then");
3068  llvm::BasicBlock *CpyElseBB = CGF.createBasicBlock("else");
3069  llvm::BasicBlock *CpyMergeBB = CGF.createBasicBlock("ifcont");
3070  Bld.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
3071
3072  CGF.EmitBlock(CpyThenBB);
3073  emitReductionListCopy(ThreadCopy, CGF, ReductionArrayTy, Privates,
3074                        RemoteReduceList, LocalReduceList);
3075  Bld.CreateBr(CpyMergeBB);
3076
3077  CGF.EmitBlock(CpyElseBB);
3078  Bld.CreateBr(CpyMergeBB);
3079
3080  CGF.EmitBlock(CpyMergeBB);
3081
3082  CGF.FinishFunction();
3083  return Fn;
3084}
3085
3086/// This function emits a helper that copies all the reduction variables from
3087/// the team into the provided global buffer for the reduction variables.
3088///
3089/// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data)
3090///   For all data entries D in reduce_data:
3091///     Copy local D to buffer.D[Idx]
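///
/// A rough C-like sketch, assuming the buffer is laid out as a
/// struct-of-arrays with one slot per team (types and names illustrative):
///
///   struct Buffer { T0 D0[NumSlots]; /* ... */ Tn Dn[NumSlots]; };
///   void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data) {
///     Buffer *B = (Buffer *)buffer;
///     void **List = (void **)reduce_data;
///     B->D0[Idx] = *(T0 *)List[0];
///     /* ... */
///     B->Dn[Idx] = *(Tn *)List[n];
///   }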
3092static llvm::Value *emitListToGlobalCopyFunction(
3093    CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
3094    QualType ReductionArrayTy, SourceLocation Loc,
3095    const RecordDecl *TeamReductionRec,
3096    const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
3097        &VarFieldMap) {
3098  ASTContext &C = CGM.getContext();
3099
3100  // Buffer: global reduction buffer.
3101  ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3102                              C.VoidPtrTy, ImplicitParamDecl::Other);
3103  // Idx: index of the buffer.
3104  ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
3105                           ImplicitParamDecl::Other);
3106  // ReduceList: thread local Reduce list.
3107  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3108                                  C.VoidPtrTy, ImplicitParamDecl::Other);
3109  FunctionArgList Args;
3110  Args.push_back(&BufferArg);
3111  Args.push_back(&IdxArg);
3112  Args.push_back(&ReduceListArg);
3113
3114  const CGFunctionInfo &CGFI =
3115      CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
3116  auto *Fn = llvm::Function::Create(
3117      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
3118      "_omp_reduction_list_to_global_copy_func", &CGM.getModule());
3119  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
3120  Fn->setDoesNotRecurse();
3121  CodeGenFunction CGF(CGM);
3122  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
3123
3124  CGBuilderTy &Bld = CGF.Builder;
3125
3126  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
3127  Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
3128  Address LocalReduceList(
3129      Bld.CreatePointerBitCastOrAddrSpaceCast(
3130          CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
3131                               C.VoidPtrTy, Loc),
3132          CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
3133      CGF.getPointerAlign());
3134  QualType StaticTy = C.getRecordType(TeamReductionRec);
3135  llvm::Type *LLVMReductionsBufferTy =
3136      CGM.getTypes().ConvertTypeForMem(StaticTy);
3137  llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3138      CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
3139      LLVMReductionsBufferTy->getPointerTo());
3140  llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
3141                         CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
3142                                              /*Volatile=*/false, C.IntTy,
3143                                              Loc)};
3144  unsigned Idx = 0;
3145  for (const Expr *Private : Privates) {
3146    // Reduce element = LocalReduceList[i]
3147    Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
3148    llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
3149        ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
3150    // elemptr = ((CopyType*)(elemptrptr)) + I
3151    ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3152        ElemPtrPtr, CGF.ConvertTypeForMem(Private->getType())->getPointerTo());
3153    Address ElemPtr =
3154        Address(ElemPtrPtr, C.getTypeAlignInChars(Private->getType()));
3155    const ValueDecl *VD = cast<DeclRefExpr>(Private)->getDecl();
3156    // Global = Buffer.VD[Idx];
3157    const FieldDecl *FD = VarFieldMap.lookup(VD);
3158    LValue GlobLVal = CGF.EmitLValueForField(
3159        CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
3160    Address GlobAddr = GlobLVal.getAddress(CGF);
3161    llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(
3162        GlobAddr.getElementType(), GlobAddr.getPointer(), Idxs);
3163    GlobLVal.setAddress(Address(BufferPtr, GlobAddr.getAlignment()));
3164    switch (CGF.getEvaluationKind(Private->getType())) {
3165    case TEK_Scalar: {
3166      llvm::Value *V = CGF.EmitLoadOfScalar(
3167          ElemPtr, /*Volatile=*/false, Private->getType(), Loc,
3168          LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
3169      CGF.EmitStoreOfScalar(V, GlobLVal);
3170      break;
3171    }
3172    case TEK_Complex: {
3173      CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex(
3174          CGF.MakeAddrLValue(ElemPtr, Private->getType()), Loc);
3175      CGF.EmitStoreOfComplex(V, GlobLVal, /*isInit=*/false);
3176      break;
3177    }
3178    case TEK_Aggregate:
3179      CGF.EmitAggregateCopy(GlobLVal,
3180                            CGF.MakeAddrLValue(ElemPtr, Private->getType()),
3181                            Private->getType(), AggValueSlot::DoesNotOverlap);
3182      break;
3183    }
3184    ++Idx;
3185  }
3186
3187  CGF.FinishFunction();
3188  return Fn;
3189}
3190
3191/// This function emits a helper that reduces all the reduction variables from
3192/// the team into the provided global buffer for the reduction variables.
3193///
3194/// void list_to_global_reduce_func(void *buffer, int Idx, void *reduce_data)
3195///  void *GlobPtrs[];
3196///  GlobPtrs[0] = (void*)&buffer.D0[Idx];
3197///  ...
3198///  GlobPtrs[N] = (void*)&buffer.DN[Idx];
3199///  reduce_function(GlobPtrs, reduce_data);
3200static llvm::Value *emitListToGlobalReduceFunction(
3201    CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
3202    QualType ReductionArrayTy, SourceLocation Loc,
3203    const RecordDecl *TeamReductionRec,
3204    const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
3205        &VarFieldMap,
3206    llvm::Function *ReduceFn) {
3207  ASTContext &C = CGM.getContext();
3208
3209  // Buffer: global reduction buffer.
3210  ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3211                              C.VoidPtrTy, ImplicitParamDecl::Other);
3212  // Idx: index of the buffer.
3213  ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
3214                           ImplicitParamDecl::Other);
3215  // ReduceList: thread local Reduce list.
3216  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3217                                  C.VoidPtrTy, ImplicitParamDecl::Other);
3218  FunctionArgList Args;
3219  Args.push_back(&BufferArg);
3220  Args.push_back(&IdxArg);
3221  Args.push_back(&ReduceListArg);
3222
3223  const CGFunctionInfo &CGFI =
3224      CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
3225  auto *Fn = llvm::Function::Create(
3226      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
3227      "_omp_reduction_list_to_global_reduce_func", &CGM.getModule());
3228  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
3229  Fn->setDoesNotRecurse();
3230  CodeGenFunction CGF(CGM);
3231  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
3232
3233  CGBuilderTy &Bld = CGF.Builder;
3234
3235  Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
3236  QualType StaticTy = C.getRecordType(TeamReductionRec);
3237  llvm::Type *LLVMReductionsBufferTy =
3238      CGM.getTypes().ConvertTypeForMem(StaticTy);
3239  llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3240      CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
3241      LLVMReductionsBufferTy->getPointerTo());
3242
3243  // 1. Build a list of reduction variables.
3244  // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3245  Address ReductionList =
3246      CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
3247  auto IPriv = Privates.begin();
3248  llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
3249                         CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
3250                                              /*Volatile=*/false, C.IntTy,
3251                                              Loc)};
3252  unsigned Idx = 0;
3253  for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) {
3254    Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
3255    // Global = Buffer.VD[Idx];
3256    const ValueDecl *VD = cast<DeclRefExpr>(*IPriv)->getDecl();
3257    const FieldDecl *FD = VarFieldMap.lookup(VD);
3258    LValue GlobLVal = CGF.EmitLValueForField(
3259        CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
3260    Address GlobAddr = GlobLVal.getAddress(CGF);
3261    llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(
3262        GlobAddr.getElementType(), GlobAddr.getPointer(), Idxs);
3263    llvm::Value *Ptr = CGF.EmitCastToVoidPtr(BufferPtr);
3264    CGF.EmitStoreOfScalar(Ptr, Elem, /*Volatile=*/false, C.VoidPtrTy);
3265    if ((*IPriv)->getType()->isVariablyModifiedType()) {
3266      // Store array size.
3267      ++Idx;
3268      Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
3269      llvm::Value *Size = CGF.Builder.CreateIntCast(
3270          CGF.getVLASize(
3271                 CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
3272              .NumElts,
3273          CGF.SizeTy, /*isSigned=*/false);
3274      CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
3275                              Elem);
3276    }
3277  }
3278
3279  // Call reduce_function(GlobalReduceList, ReduceList)
3280  llvm::Value *GlobalReduceList =
3281      CGF.EmitCastToVoidPtr(ReductionList.getPointer());
3282  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
3283  llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar(
3284      AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
3285  CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
3286      CGF, Loc, ReduceFn, {GlobalReduceList, ReducedPtr});
3287  CGF.FinishFunction();
3288  return Fn;
3289}
3290
/// This function emits a helper that copies all the reduction variables from
/// the provided global buffer back into the team's thread-local reduce list.
///
/// void global_to_list_copy_func(void *buffer, int Idx, void *reduce_data)
3295///   For all data entries D in reduce_data:
3296///     Copy buffer.D[Idx] to local D;
3297static llvm::Value *emitGlobalToListCopyFunction(
3298    CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
3299    QualType ReductionArrayTy, SourceLocation Loc,
3300    const RecordDecl *TeamReductionRec,
3301    const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
3302        &VarFieldMap) {
3303  ASTContext &C = CGM.getContext();
3304
3305  // Buffer: global reduction buffer.
3306  ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3307                              C.VoidPtrTy, ImplicitParamDecl::Other);
3308  // Idx: index of the buffer.
3309  ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
3310                           ImplicitParamDecl::Other);
3311  // ReduceList: thread local Reduce list.
3312  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3313                                  C.VoidPtrTy, ImplicitParamDecl::Other);
3314  FunctionArgList Args;
3315  Args.push_back(&BufferArg);
3316  Args.push_back(&IdxArg);
3317  Args.push_back(&ReduceListArg);
3318
3319  const CGFunctionInfo &CGFI =
3320      CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
3321  auto *Fn = llvm::Function::Create(
3322      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
3323      "_omp_reduction_global_to_list_copy_func", &CGM.getModule());
3324  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
3325  Fn->setDoesNotRecurse();
3326  CodeGenFunction CGF(CGM);
3327  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
3328
3329  CGBuilderTy &Bld = CGF.Builder;
3330
3331  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
3332  Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
3333  Address LocalReduceList(
3334      Bld.CreatePointerBitCastOrAddrSpaceCast(
3335          CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
3336                               C.VoidPtrTy, Loc),
3337          CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
3338      CGF.getPointerAlign());
3339  QualType StaticTy = C.getRecordType(TeamReductionRec);
3340  llvm::Type *LLVMReductionsBufferTy =
3341      CGM.getTypes().ConvertTypeForMem(StaticTy);
3342  llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3343      CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
3344      LLVMReductionsBufferTy->getPointerTo());
3345
3346  llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
3347                         CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
3348                                              /*Volatile=*/false, C.IntTy,
3349                                              Loc)};
3350  unsigned Idx = 0;
3351  for (const Expr *Private : Privates) {
3352    // Reduce element = LocalReduceList[i]
3353    Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
3354    llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
3355        ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
3356    // elemptr = ((CopyType*)(elemptrptr)) + I
3357    ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3358        ElemPtrPtr, CGF.ConvertTypeForMem(Private->getType())->getPointerTo());
3359    Address ElemPtr =
3360        Address(ElemPtrPtr, C.getTypeAlignInChars(Private->getType()));
3361    const ValueDecl *VD = cast<DeclRefExpr>(Private)->getDecl();
3362    // Global = Buffer.VD[Idx];
3363    const FieldDecl *FD = VarFieldMap.lookup(VD);
3364    LValue GlobLVal = CGF.EmitLValueForField(
3365        CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
3366    Address GlobAddr = GlobLVal.getAddress(CGF);
3367    llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(
3368        GlobAddr.getElementType(), GlobAddr.getPointer(), Idxs);
3369    GlobLVal.setAddress(Address(BufferPtr, GlobAddr.getAlignment()));
3370    switch (CGF.getEvaluationKind(Private->getType())) {
3371    case TEK_Scalar: {
3372      llvm::Value *V = CGF.EmitLoadOfScalar(GlobLVal, Loc);
3373      CGF.EmitStoreOfScalar(V, ElemPtr, /*Volatile=*/false, Private->getType(),
3374                            LValueBaseInfo(AlignmentSource::Type),
3375                            TBAAAccessInfo());
3376      break;
3377    }
3378    case TEK_Complex: {
3379      CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex(GlobLVal, Loc);
3380      CGF.EmitStoreOfComplex(V, CGF.MakeAddrLValue(ElemPtr, Private->getType()),
3381                             /*isInit=*/false);
3382      break;
3383    }
3384    case TEK_Aggregate:
3385      CGF.EmitAggregateCopy(CGF.MakeAddrLValue(ElemPtr, Private->getType()),
3386                            GlobLVal, Private->getType(),
3387                            AggValueSlot::DoesNotOverlap);
3388      break;
3389    }
3390    ++Idx;
3391  }
3392
3393  CGF.FinishFunction();
3394  return Fn;
3395}
3396
/// This function emits a helper that reduces the data in the provided global
/// buffer into the team's thread-local reduce list.
3399///
3400/// void global_to_list_reduce_func(void *buffer, int Idx, void *reduce_data)
3401///  void *GlobPtrs[];
3402///  GlobPtrs[0] = (void*)&buffer.D0[Idx];
3403///  ...
3404///  GlobPtrs[N] = (void*)&buffer.DN[Idx];
3405///  reduce_function(reduce_data, GlobPtrs);
3406static llvm::Value *emitGlobalToListReduceFunction(
3407    CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
3408    QualType ReductionArrayTy, SourceLocation Loc,
3409    const RecordDecl *TeamReductionRec,
3410    const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
3411        &VarFieldMap,
3412    llvm::Function *ReduceFn) {
3413  ASTContext &C = CGM.getContext();
3414
3415  // Buffer: global reduction buffer.
3416  ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3417                              C.VoidPtrTy, ImplicitParamDecl::Other);
3418  // Idx: index of the buffer.
3419  ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
3420                           ImplicitParamDecl::Other);
3421  // ReduceList: thread local Reduce list.
3422  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3423                                  C.VoidPtrTy, ImplicitParamDecl::Other);
3424  FunctionArgList Args;
3425  Args.push_back(&BufferArg);
3426  Args.push_back(&IdxArg);
3427  Args.push_back(&ReduceListArg);
3428
3429  const CGFunctionInfo &CGFI =
3430      CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
3431  auto *Fn = llvm::Function::Create(
3432      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
3433      "_omp_reduction_global_to_list_reduce_func", &CGM.getModule());
3434  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
3435  Fn->setDoesNotRecurse();
3436  CodeGenFunction CGF(CGM);
3437  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
3438
3439  CGBuilderTy &Bld = CGF.Builder;
3440
3441  Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
3442  QualType StaticTy = C.getRecordType(TeamReductionRec);
3443  llvm::Type *LLVMReductionsBufferTy =
3444      CGM.getTypes().ConvertTypeForMem(StaticTy);
3445  llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3446      CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
3447      LLVMReductionsBufferTy->getPointerTo());
3448
3449  // 1. Build a list of reduction variables.
3450  // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3451  Address ReductionList =
3452      CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
3453  auto IPriv = Privates.begin();
3454  llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
3455                         CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
3456                                              /*Volatile=*/false, C.IntTy,
3457                                              Loc)};
3458  unsigned Idx = 0;
3459  for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) {
3460    Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
3461    // Global = Buffer.VD[Idx];
3462    const ValueDecl *VD = cast<DeclRefExpr>(*IPriv)->getDecl();
3463    const FieldDecl *FD = VarFieldMap.lookup(VD);
3464    LValue GlobLVal = CGF.EmitLValueForField(
3465        CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
3466    Address GlobAddr = GlobLVal.getAddress(CGF);
3467    llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(
3468        GlobAddr.getElementType(), GlobAddr.getPointer(), Idxs);
3469    llvm::Value *Ptr = CGF.EmitCastToVoidPtr(BufferPtr);
3470    CGF.EmitStoreOfScalar(Ptr, Elem, /*Volatile=*/false, C.VoidPtrTy);
3471    if ((*IPriv)->getType()->isVariablyModifiedType()) {
3472      // Store array size.
3473      ++Idx;
3474      Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
3475      llvm::Value *Size = CGF.Builder.CreateIntCast(
3476          CGF.getVLASize(
3477                 CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
3478              .NumElts,
3479          CGF.SizeTy, /*isSigned=*/false);
3480      CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
3481                              Elem);
3482    }
3483  }
3484
3485  // Call reduce_function(ReduceList, GlobalReduceList)
3486  llvm::Value *GlobalReduceList =
3487      CGF.EmitCastToVoidPtr(ReductionList.getPointer());
3488  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
3489  llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar(
3490      AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
3491  CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
3492      CGF, Loc, ReduceFn, {ReducedPtr, GlobalReduceList});
3493  CGF.FinishFunction();
3494  return Fn;
3495}
3496
3497///
3498/// Design of OpenMP reductions on the GPU
3499///
3500/// Consider a typical OpenMP program with one or more reduction
3501/// clauses:
3502///
3503/// float foo;
3504/// double bar;
3505/// #pragma omp target teams distribute parallel for \
3506///             reduction(+:foo) reduction(*:bar)
3507/// for (int i = 0; i < N; i++) {
3508///   foo += A[i]; bar *= B[i];
3509/// }
3510///
3511/// where 'foo' and 'bar' are reduced across all OpenMP threads in
3512/// all teams.  In our OpenMP implementation on the NVPTX device an
3513/// OpenMP team is mapped to a CUDA threadblock and OpenMP threads
3514/// within a team are mapped to CUDA threads within a threadblock.
3515/// Our goal is to efficiently aggregate values across all OpenMP
3516/// threads such that:
3517///
3518///   - the compiler and runtime are logically concise, and
3519///   - the reduction is performed efficiently in a hierarchical
3520///     manner as follows: within OpenMP threads in the same warp,
3521///     across warps in a threadblock, and finally across teams on
3522///     the NVPTX device.
3523///
3524/// Introduction to Decoupling
3525///
3526/// We would like to decouple the compiler and the runtime so that the
3527/// latter is ignorant of the reduction variables (number, data types)
3528/// and the reduction operators.  This allows a simpler interface
3529/// and implementation while still attaining good performance.
3530///
3531/// Pseudocode for the aforementioned OpenMP program generated by the
3532/// compiler is as follows:
3533///
3534/// 1. Create private copies of reduction variables on each OpenMP
3535///    thread: 'foo_private', 'bar_private'
3536/// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned
3537///    to it and writes the result in 'foo_private' and 'bar_private'
3538///    respectively.
3539/// 3. Call the OpenMP runtime on the GPU to reduce within a team
3540///    and store the result on the team master:
3541///
3542///     __kmpc_nvptx_parallel_reduce_nowait_v2(...,
3543///        reduceData, shuffleReduceFn, interWarpCpyFn)
3544///
3545///     where:
3546///       struct ReduceData {
///         float *foo;
3548///         double *bar;
3549///       } reduceData
3550///       reduceData.foo = &foo_private
3551///       reduceData.bar = &bar_private
3552///
3553///     'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two
3554///     auxiliary functions generated by the compiler that operate on
///     variables of type 'ReduceData'.  They help the runtime perform the
///     algorithmic steps in a data-agnostic manner.
3557///
3558///     'shuffleReduceFn' is a pointer to a function that reduces data
3559///     of type 'ReduceData' across two OpenMP threads (lanes) in the
3560///     same warp.  It takes the following arguments as input:
3561///
3562///     a. variable of type 'ReduceData' on the calling lane,
3563///     b. its lane_id,
3564///     c. an offset relative to the current lane_id to generate a
3565///        remote_lane_id.  The remote lane contains the second
3566///        variable of type 'ReduceData' that is to be reduced.
3567///     d. an algorithm version parameter determining which reduction
3568///        algorithm to use.
3569///
3570///     'shuffleReduceFn' retrieves data from the remote lane using
3571///     efficient GPU shuffle intrinsics and reduces, using the
3572///     algorithm specified by the 4th parameter, the two operands
3573///     element-wise.  The result is written to the first operand.
3574///
3575///     Different reduction algorithms are implemented in different
3576///     runtime functions, all calling 'shuffleReduceFn' to perform
3577///     the essential reduction step.  Therefore, based on the 4th
3578///     parameter, this function behaves slightly differently to
3579///     cooperate with the runtime to ensure correctness under
3580///     different circumstances.
3581///
3582///     'InterWarpCpyFn' is a pointer to a function that transfers
3583///     reduced variables across warps.  It tunnels, through CUDA
3584///     shared memory, the thread-private data of type 'ReduceData'
3585///     from lane 0 of each warp to a lane in the first warp.
3586/// 4. Call the OpenMP runtime on the GPU to reduce across teams.
3587///    The last team writes the global reduced value to memory.
3588///
3589///     ret = __kmpc_nvptx_teams_reduce_nowait(...,
3590///             reduceData, shuffleReduceFn, interWarpCpyFn,
3591///             scratchpadCopyFn, loadAndReduceFn)
3592///
3593///     'scratchpadCopyFn' is a helper that stores reduced
3594///     data from the team master to a scratchpad array in
3595///     global memory.
3596///
3597///     'loadAndReduceFn' is a helper that loads data from
3598///     the scratchpad array and reduces it with the input
3599///     operand.
3600///
3601///     These compiler generated functions hide address
3602///     calculation and alignment information from the runtime.
3603/// 5. if ret == 1:
3604///     The team master of the last team stores the reduced
3605///     result to the globals in memory.
3606///     foo += reduceData.foo; bar *= reduceData.bar
3607///
3608///
3609/// Warp Reduction Algorithms
3610///
3611/// On the warp level, we have three algorithms implemented in the
3612/// OpenMP runtime depending on the number of active lanes:
3613///
3614/// Full Warp Reduction
3615///
3616/// The reduce algorithm within a warp where all lanes are active
3617/// is implemented in the runtime as follows:
3618///
3619/// full_warp_reduce(void *reduce_data,
3620///                  kmp_ShuffleReductFctPtr ShuffleReduceFn) {
3621///   for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
3622///     ShuffleReduceFn(reduce_data, 0, offset, 0);
3623/// }
3624///
3625/// The algorithm completes in log(2, WARPSIZE) steps.
3626///
/// 'ShuffleReduceFn' is used here with lane_id set to 0 because lane_id is
/// not actually used; we thus save instructions by not retrieving it from
/// the corresponding special registers.  The 4th parameter, which
3630/// represents the version of the algorithm being used, is set to 0 to
3631/// signify full warp reduction.
3632///
3633/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
3634///
3635/// #reduce_elem refers to an element in the local lane's data structure
3636/// #remote_elem is retrieved from a remote lane
3637/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
3638/// reduce_elem = reduce_elem REDUCE_OP remote_elem;
3639///
3640/// Contiguous Partial Warp Reduction
3641///
3642/// This reduce algorithm is used within a warp where only the first
3643/// 'n' (n <= WARPSIZE) lanes are active.  It is typically used when the
3644/// number of OpenMP threads in a parallel region is not a multiple of
3645/// WARPSIZE.  The algorithm is implemented in the runtime as follows:
3646///
3647/// void
3648/// contiguous_partial_reduce(void *reduce_data,
3649///                           kmp_ShuffleReductFctPtr ShuffleReduceFn,
3650///                           int size, int lane_id) {
3651///   int curr_size;
3652///   int offset;
3653///   curr_size = size;
///   offset = curr_size/2;
3655///   while (offset>0) {
3656///     ShuffleReduceFn(reduce_data, lane_id, offset, 1);
3657///     curr_size = (curr_size+1)/2;
3658///     offset = curr_size/2;
3659///   }
3660/// }
3661///
3662/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
3663///
3664/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
3665/// if (lane_id < offset)
3666///     reduce_elem = reduce_elem REDUCE_OP remote_elem
3667/// else
3668///     reduce_elem = remote_elem
3669///
3670/// This algorithm assumes that the data to be reduced are located in a
3671/// contiguous subset of lanes starting from the first.  When there is
3672/// an odd number of active lanes, the data in the last lane is not
/// aggregated with any other lane's data but is instead copied over.
3674///
3675/// Dispersed Partial Warp Reduction
3676///
3677/// This algorithm is used within a warp when any discontiguous subset of
3678/// lanes are active.  It is used to implement the reduction operation
3679/// across lanes in an OpenMP simd region or in a nested parallel region.
3680///
3681/// void
3682/// dispersed_partial_reduce(void *reduce_data,
3683///                          kmp_ShuffleReductFctPtr ShuffleReduceFn) {
3684///   int size, remote_id;
3685///   int logical_lane_id = number_of_active_lanes_before_me() * 2;
3686///   do {
3687///       remote_id = next_active_lane_id_right_after_me();
///       # the above function returns 0 if no active lane
3689///       # is present right after the current lane.
3690///       size = number_of_active_lanes_in_this_warp();
3691///       logical_lane_id /= 2;
3692///       ShuffleReduceFn(reduce_data, logical_lane_id,
3693///                       remote_id-1-threadIdx.x, 2);
3694///   } while (logical_lane_id % 2 == 0 && size > 1);
3695/// }
3696///
3697/// There is no assumption made about the initial state of the reduction.
3698/// Any number of lanes (>=1) could be active at any position.  The reduction
3699/// result is returned in the first active lane.
3700///
3701/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
3702///
3703/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
3704/// if (lane_id % 2 == 0 && offset > 0)
3705///     reduce_elem = reduce_elem REDUCE_OP remote_elem
3706/// else
3707///     reduce_elem = remote_elem
3708///
3709///
3710/// Intra-Team Reduction
3711///
3712/// This function, as implemented in the runtime call
3713/// '__kmpc_nvptx_parallel_reduce_nowait_v2', aggregates data across OpenMP
3714/// threads in a team.  It first reduces within a warp using the
3715/// aforementioned algorithms.  We then proceed to gather all such
3716/// reduced values at the first warp.
3717///
3718/// The runtime makes use of the function 'InterWarpCpyFn', which copies
/// data from each "warp master" (the zeroth lane of each warp, where
3720/// warp-reduced data is held) to the zeroth warp.  This step reduces (in
3721/// a mathematical sense) the problem of reduction across warp masters in
3722/// a block to the problem of warp reduction.
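///
/// A simplified sketch of that sequence (runtime details omitted):
///
///   warp_reduce(reduce_data, ShuffleReduceFn);      # within each warp
///   InterWarpCpyFn(reduce_data, num_active_warps);  # warp masters -> warp 0
///   if (I am in the first warp)
///     warp_reduce(reduce_data, ShuffleReduceFn);    # across warp masters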
3723///
3724///
3725/// Inter-Team Reduction
3726///
3727/// Once a team has reduced its data to a single value, it is stored in
3728/// a global scratchpad array.  Since each team has a distinct slot, this
3729/// can be done without locking.
3730///
3731/// The last team to write to the scratchpad array proceeds to reduce the
3732/// scratchpad array.  One or more workers in the last team use the helper
/// 'loadAndReduceFn' to load and reduce values from the array, i.e.,
3734/// the k'th worker reduces every k'th element.
3735///
3736/// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait_v2' to
3737/// reduce across workers and compute a globally reduced value.
3738///
3739void CGOpenMPRuntimeGPU::emitReduction(
3740    CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> Privates,
3741    ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs,
3742    ArrayRef<const Expr *> ReductionOps, ReductionOptionsTy Options) {
3743  if (!CGF.HaveInsertPoint())
3744    return;
3745
3746  bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind);
3747#ifndef NDEBUG
3748  bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind);
3749#endif
3750
3751  if (Options.SimpleReduction) {
3752    assert(!TeamsReduction && !ParallelReduction &&
3753           "Invalid reduction selection in emitReduction.");
3754    CGOpenMPRuntime::emitReduction(CGF, Loc, Privates, LHSExprs, RHSExprs,
3755                                   ReductionOps, Options);
3756    return;
3757  }
3758
3759  assert((TeamsReduction || ParallelReduction) &&
3760         "Invalid reduction selection in emitReduction.");
3761
  // Build res = __kmpc_nvptx_parallel_reduce_nowait_v2(<loc>, <gtid>, <n>,
  // sizeof(RedList), RedList, shuffle_reduce_func, interwarp_copy_func);
  // or, for teams reductions,
  // res = __kmpc_nvptx_teams_reduce_nowait_v2(<loc>, <gtid>, <global_buffer>,
  // <buf_num>, RedList, shuffle_reduce_func, interwarp_copy_func, ...);
3766  llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
3767  llvm::Value *ThreadId = getThreadID(CGF, Loc);
3768
3769  llvm::Value *Res;
3770  ASTContext &C = CGM.getContext();
3771  // 1. Build a list of reduction variables.
3772  // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3773  auto Size = RHSExprs.size();
3774  for (const Expr *E : Privates) {
3775    if (E->getType()->isVariablyModifiedType())
3776      // Reserve place for array size.
3777      ++Size;
3778  }
3779  llvm::APInt ArraySize(/*unsigned int numBits=*/32, Size);
3780  QualType ReductionArrayTy =
3781      C.getConstantArrayType(C.VoidPtrTy, ArraySize, nullptr, ArrayType::Normal,
3782                             /*IndexTypeQuals=*/0);
3783  Address ReductionList =
3784      CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
3785  auto IPriv = Privates.begin();
3786  unsigned Idx = 0;
3787  for (unsigned I = 0, E = RHSExprs.size(); I < E; ++I, ++IPriv, ++Idx) {
3788    Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
3789    CGF.Builder.CreateStore(
3790        CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3791            CGF.EmitLValue(RHSExprs[I]).getPointer(CGF), CGF.VoidPtrTy),
3792        Elem);
3793    if ((*IPriv)->getType()->isVariablyModifiedType()) {
3794      // Store array size.
3795      ++Idx;
3796      Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
3797      llvm::Value *Size = CGF.Builder.CreateIntCast(
3798          CGF.getVLASize(
3799                 CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
3800              .NumElts,
3801          CGF.SizeTy, /*isSigned=*/false);
3802      CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
3803                              Elem);
3804    }
3805  }
3806
3807  llvm::Value *RL = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3808      ReductionList.getPointer(), CGF.VoidPtrTy);
3809  llvm::Function *ReductionFn = emitReductionFunction(
3810      Loc, CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo(), Privates,
3811      LHSExprs, RHSExprs, ReductionOps);
3812  llvm::Value *ReductionArrayTySize = CGF.getTypeSize(ReductionArrayTy);
3813  llvm::Function *ShuffleAndReduceFn = emitShuffleAndReduceFunction(
3814      CGM, Privates, ReductionArrayTy, ReductionFn, Loc);
3815  llvm::Value *InterWarpCopyFn =
3816      emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy, Loc);
3817
3818  if (ParallelReduction) {
3819    llvm::Value *Args[] = {RTLoc,
3820                           ThreadId,
3821                           CGF.Builder.getInt32(RHSExprs.size()),
3822                           ReductionArrayTySize,
3823                           RL,
3824                           ShuffleAndReduceFn,
3825                           InterWarpCopyFn};
3826
3827    Res = CGF.EmitRuntimeCall(
3828        OMPBuilder.getOrCreateRuntimeFunction(
3829            CGM.getModule(), OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2),
3830        Args);
3831  } else {
3832    assert(TeamsReduction && "expected teams reduction.");
3833    llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> VarFieldMap;
3834    llvm::SmallVector<const ValueDecl *, 4> PrivatesReductions(Privates.size());
3835    int Cnt = 0;
3836    for (const Expr *DRE : Privates) {
3837      PrivatesReductions[Cnt] = cast<DeclRefExpr>(DRE)->getDecl();
3838      ++Cnt;
3839    }
3840    const RecordDecl *TeamReductionRec = ::buildRecordForGlobalizedVars(
3841        CGM.getContext(), PrivatesReductions, llvm::None, VarFieldMap,
3842        C.getLangOpts().OpenMPCUDAReductionBufNum);
3843    TeamsReductions.push_back(TeamReductionRec);
3844    if (!KernelTeamsReductionPtr) {
3845      KernelTeamsReductionPtr = new llvm::GlobalVariable(
3846          CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/true,
3847          llvm::GlobalValue::InternalLinkage, nullptr,
3848          "_openmp_teams_reductions_buffer_$_$ptr");
3849    }
3850    llvm::Value *GlobalBufferPtr = CGF.EmitLoadOfScalar(
3851        Address(KernelTeamsReductionPtr, CGM.getPointerAlign()),
3852        /*Volatile=*/false, C.getPointerType(C.VoidPtrTy), Loc);
3853    llvm::Value *GlobalToBufferCpyFn = ::emitListToGlobalCopyFunction(
3854        CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap);
3855    llvm::Value *GlobalToBufferRedFn = ::emitListToGlobalReduceFunction(
3856        CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap,
3857        ReductionFn);
3858    llvm::Value *BufferToGlobalCpyFn = ::emitGlobalToListCopyFunction(
3859        CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap);
3860    llvm::Value *BufferToGlobalRedFn = ::emitGlobalToListReduceFunction(
3861        CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap,
3862        ReductionFn);
3863
3864    llvm::Value *Args[] = {
3865        RTLoc,
3866        ThreadId,
3867        GlobalBufferPtr,
3868        CGF.Builder.getInt32(C.getLangOpts().OpenMPCUDAReductionBufNum),
3869        RL,
3870        ShuffleAndReduceFn,
3871        InterWarpCopyFn,
3872        GlobalToBufferCpyFn,
3873        GlobalToBufferRedFn,
3874        BufferToGlobalCpyFn,
3875        BufferToGlobalRedFn};
3876
3877    Res = CGF.EmitRuntimeCall(
3878        OMPBuilder.getOrCreateRuntimeFunction(
3879            CGM.getModule(), OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2),
3880        Args);
3881  }
3882
3883  // 5. Build if (res == 1)
3884  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".omp.reduction.done");
3885  llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".omp.reduction.then");
3886  llvm::Value *Cond = CGF.Builder.CreateICmpEQ(
3887      Res, llvm::ConstantInt::get(CGM.Int32Ty, /*V=*/1));
3888  CGF.Builder.CreateCondBr(Cond, ThenBB, ExitBB);
3889
3890  // 6. Build then branch: where we have reduced values in the master
3891  //    thread in each team.
  //    __kmpc_nvptx_end_reduce_nowait(<gtid>);
3893  //    break;
3894  CGF.EmitBlock(ThenBB);
3895
  // Combine the partial values and emit __kmpc_nvptx_end_reduce_nowait(<gtid>).
3897  auto &&CodeGen = [Privates, LHSExprs, RHSExprs, ReductionOps,
3898                    this](CodeGenFunction &CGF, PrePostActionTy &Action) {
3899    auto IPriv = Privates.begin();
3900    auto ILHS = LHSExprs.begin();
3901    auto IRHS = RHSExprs.begin();
3902    for (const Expr *E : ReductionOps) {
3903      emitSingleReductionCombiner(CGF, E, *IPriv, cast<DeclRefExpr>(*ILHS),
3904                                  cast<DeclRefExpr>(*IRHS));
3905      ++IPriv;
3906      ++ILHS;
3907      ++IRHS;
3908    }
3909  };
3910  llvm::Value *EndArgs[] = {ThreadId};
3911  RegionCodeGenTy RCG(CodeGen);
3912  NVPTXActionTy Action(
3913      nullptr, llvm::None,
3914      OMPBuilder.getOrCreateRuntimeFunction(
3915          CGM.getModule(), OMPRTL___kmpc_nvptx_end_reduce_nowait),
3916      EndArgs);
3917  RCG.setAction(Action);
3918  RCG(CGF);
3919  // There is no need to emit line number for unconditional branch.
3920  (void)ApplyDebugLocation::CreateEmpty(CGF);
3921  CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
3922}
3923
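/// Translate a reference parameter of the native function into the form used
/// by the GPU outlined function: the reference is rewritten as a
/// restrict-qualified pointer whose pointee address space reflects the
/// parameter's capture kind.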
3924const VarDecl *
3925CGOpenMPRuntimeGPU::translateParameter(const FieldDecl *FD,
3926                                       const VarDecl *NativeParam) const {
3927  if (!NativeParam->getType()->isReferenceType())
3928    return NativeParam;
3929  QualType ArgType = NativeParam->getType();
3930  QualifierCollector QC;
3931  const Type *NonQualTy = QC.strip(ArgType);
3932  QualType PointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
3933  if (const auto *Attr = FD->getAttr<OMPCaptureKindAttr>()) {
3934    if (Attr->getCaptureKind() == OMPC_map) {
3935      PointeeTy = CGM.getContext().getAddrSpaceQualType(PointeeTy,
3936                                                        LangAS::opencl_global);
3937    } else if (Attr->getCaptureKind() == OMPC_firstprivate &&
3938               PointeeTy.isConstant(CGM.getContext())) {
3939      PointeeTy = CGM.getContext().getAddrSpaceQualType(PointeeTy,
3940                                                        LangAS::opencl_generic);
3941    }
3942  }
3943  ArgType = CGM.getContext().getPointerType(PointeeTy);
3944  QC.addRestrict();
3945  enum { NVPTX_local_addr = 5 };
3946  QC.addAddressSpace(getLangASFromTargetAS(NVPTX_local_addr));
3947  ArgType = QC.apply(CGM.getContext(), ArgType);
3948  if (isa<ImplicitParamDecl>(NativeParam))
3949    return ImplicitParamDecl::Create(
3950        CGM.getContext(), /*DC=*/nullptr, NativeParam->getLocation(),
3951        NativeParam->getIdentifier(), ArgType, ImplicitParamDecl::Other);
3952  return ParmVarDecl::Create(
3953      CGM.getContext(),
3954      const_cast<DeclContext *>(NativeParam->getDeclContext()),
3955      NativeParam->getBeginLoc(), NativeParam->getLocation(),
3956      NativeParam->getIdentifier(), ArgType,
3957      /*TInfo=*/nullptr, SC_None, /*DefArg=*/nullptr);
3958}
3959
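/// Translate the address used for a reference parameter inside the outlined
/// function back to the native parameter: the pointer is loaded from the
/// target parameter, cast from the generic address space to the native
/// pointee address space, and stored into a fresh temporary whose address is
/// returned.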
3960Address
3961CGOpenMPRuntimeGPU::getParameterAddress(CodeGenFunction &CGF,
3962                                          const VarDecl *NativeParam,
3963                                          const VarDecl *TargetParam) const {
3964  assert(NativeParam != TargetParam &&
3965         NativeParam->getType()->isReferenceType() &&
3966         "Native arg must not be the same as target arg.");
3967  Address LocalAddr = CGF.GetAddrOfLocalVar(TargetParam);
3968  QualType NativeParamType = NativeParam->getType();
3969  QualifierCollector QC;
3970  const Type *NonQualTy = QC.strip(NativeParamType);
3971  QualType NativePointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
3972  unsigned NativePointeeAddrSpace =
3973      CGF.getContext().getTargetAddressSpace(NativePointeeTy);
3974  QualType TargetTy = TargetParam->getType();
3975  llvm::Value *TargetAddr = CGF.EmitLoadOfScalar(
3976      LocalAddr, /*Volatile=*/false, TargetTy, SourceLocation());
3977  // First cast to generic.
3978  TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3979      TargetAddr, TargetAddr->getType()->getPointerElementType()->getPointerTo(
3980                      /*AddrSpace=*/0));
3981  // Cast from generic to native address space.
3982  TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3983      TargetAddr, TargetAddr->getType()->getPointerElementType()->getPointerTo(
3984                      NativePointeeAddrSpace));
3985  Address NativeParamAddr = CGF.CreateMemTemp(NativeParamType);
3986  CGF.EmitStoreOfScalar(TargetAddr, NativeParamAddr, /*Volatile=*/false,
3987                        NativeParamType);
3988  return NativeParamAddr;
3989}
3990
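/// Emit a call to the outlined function, address-space-casting pointer
/// arguments as needed so that they match the pointer parameter types
/// expected by the callee.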
3991void CGOpenMPRuntimeGPU::emitOutlinedFunctionCall(
3992    CodeGenFunction &CGF, SourceLocation Loc, llvm::FunctionCallee OutlinedFn,
3993    ArrayRef<llvm::Value *> Args) const {
3994  SmallVector<llvm::Value *, 4> TargetArgs;
3995  TargetArgs.reserve(Args.size());
3996  auto *FnType = OutlinedFn.getFunctionType();
3997  for (unsigned I = 0, E = Args.size(); I < E; ++I) {
3998    if (FnType->isVarArg() && FnType->getNumParams() <= I) {
3999      TargetArgs.append(std::next(Args.begin(), I), Args.end());
4000      break;
4001    }
4002    llvm::Type *TargetType = FnType->getParamType(I);
4003    llvm::Value *NativeArg = Args[I];
4004    if (!TargetType->isPointerTy()) {
4005      TargetArgs.emplace_back(NativeArg);
4006      continue;
4007    }
4008    llvm::Value *TargetArg = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
4009        NativeArg,
4010        NativeArg->getType()->getPointerElementType()->getPointerTo());
4011    TargetArgs.emplace_back(
4012        CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(TargetArg, TargetType));
4013  }
4014  CGOpenMPRuntime::emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, TargetArgs);
4015}
4016
/// Emit a function that wraps the outlined parallel region
4018/// and controls the arguments which are passed to this function.
4019/// The wrapper ensures that the outlined function is called
4020/// with the correct arguments when data is shared.
4021llvm::Function *CGOpenMPRuntimeGPU::createParallelDataSharingWrapper(
4022    llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) {
4023  ASTContext &Ctx = CGM.getContext();
4024  const auto &CS = *D.getCapturedStmt(OMPD_parallel);
4025
  // Create a wrapper function that takes the parallelism level and the
  // source thread ID as arguments.
4027  FunctionArgList WrapperArgs;
4028  QualType Int16QTy =
4029      Ctx.getIntTypeForBitwidth(/*DestWidth=*/16, /*Signed=*/false);
4030  QualType Int32QTy =
4031      Ctx.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/false);
4032  ImplicitParamDecl ParallelLevelArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
4033                                     /*Id=*/nullptr, Int16QTy,
4034                                     ImplicitParamDecl::Other);
4035  ImplicitParamDecl WrapperArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
4036                               /*Id=*/nullptr, Int32QTy,
4037                               ImplicitParamDecl::Other);
4038  WrapperArgs.emplace_back(&ParallelLevelArg);
4039  WrapperArgs.emplace_back(&WrapperArg);
4040
4041  const CGFunctionInfo &CGFI =
4042      CGM.getTypes().arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, WrapperArgs);
4043
4044  auto *Fn = llvm::Function::Create(
4045      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
4046      Twine(OutlinedParallelFn->getName(), "_wrapper"), &CGM.getModule());
4047
  // Ensure we do not inline the function. This is trivially true for the ones
  // passed to __kmpc_fork_call, but the ones called in serialized regions
  // could be inlined. This is not perfect, but it is closer to the invariant
  // we want, namely that every data environment starts with a new function.
4052  // TODO: We should pass the if condition to the runtime function and do the
4053  //       handling there. Much cleaner code.
4054  Fn->addFnAttr(llvm::Attribute::NoInline);
4055
4056  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
4057  Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
4058  Fn->setDoesNotRecurse();
4059
4060  CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
4061  CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, Fn, CGFI, WrapperArgs,
4062                    D.getBeginLoc(), D.getBeginLoc());
4063
4064  const auto *RD = CS.getCapturedRecordDecl();
4065  auto CurField = RD->field_begin();
4066
4067  Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
4068                                                      /*Name=*/".zero.addr");
4069  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
4070  // Get the array of arguments.
4071  SmallVector<llvm::Value *, 8> Args;
4072
4073  Args.emplace_back(CGF.GetAddrOfLocalVar(&WrapperArg).getPointer());
4074  Args.emplace_back(ZeroAddr.getPointer());
4075
4076  CGBuilderTy &Bld = CGF.Builder;
4077  auto CI = CS.capture_begin();
4078
4079  // Use global memory for data sharing.
4080  // Handle passing of global args to workers.
4081  Address GlobalArgs =
4082      CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "global_args");
4083  llvm::Value *GlobalArgsPtr = GlobalArgs.getPointer();
4084  llvm::Value *DataSharingArgs[] = {GlobalArgsPtr};
4085  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
4086                          CGM.getModule(), OMPRTL___kmpc_get_shared_variables),
4087                      DataSharingArgs);
4088
4089  // Retrieve the shared variables from the list of references returned
4090  // by the runtime. Pass the variables to the outlined function.
4091  Address SharedArgListAddress = Address::invalid();
4092  if (CS.capture_size() > 0 ||
4093      isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
4094    SharedArgListAddress = CGF.EmitLoadOfPointer(
4095        GlobalArgs, CGF.getContext()
4096                        .getPointerType(CGF.getContext().getPointerType(
4097                            CGF.getContext().VoidPtrTy))
4098                        .castAs<PointerType>());
4099  }
4100  unsigned Idx = 0;
4101  if (isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
4102    Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
4103    Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
4104        Src, CGF.SizeTy->getPointerTo());
4105    llvm::Value *LB = CGF.EmitLoadOfScalar(
4106        TypedAddress,
4107        /*Volatile=*/false,
4108        CGF.getContext().getPointerType(CGF.getContext().getSizeType()),
4109        cast<OMPLoopDirective>(D).getLowerBoundVariable()->getExprLoc());
4110    Args.emplace_back(LB);
4111    ++Idx;
4112    Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
4113    TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
4114        Src, CGF.SizeTy->getPointerTo());
4115    llvm::Value *UB = CGF.EmitLoadOfScalar(
4116        TypedAddress,
4117        /*Volatile=*/false,
4118        CGF.getContext().getPointerType(CGF.getContext().getSizeType()),
4119        cast<OMPLoopDirective>(D).getUpperBoundVariable()->getExprLoc());
4120    Args.emplace_back(UB);
4121    ++Idx;
4122  }
4123  if (CS.capture_size() > 0) {
4124    ASTContext &CGFContext = CGF.getContext();
4125    for (unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CI, ++CurField) {
4126      QualType ElemTy = CurField->getType();
4127      Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, I + Idx);
4128      Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
4129          Src, CGF.ConvertTypeForMem(CGFContext.getPointerType(ElemTy)));
4130      llvm::Value *Arg = CGF.EmitLoadOfScalar(TypedAddress,
4131                                              /*Volatile=*/false,
4132                                              CGFContext.getPointerType(ElemTy),
4133                                              CI->getLocation());
4134      if (CI->capturesVariableByCopy() &&
4135          !CI->getCapturedVar()->getType()->isAnyPointerType()) {
4136        Arg = castValueToType(CGF, Arg, ElemTy, CGFContext.getUIntPtrType(),
4137                              CI->getLocation());
4138      }
4139      Args.emplace_back(Arg);
4140    }
4141  }
4142
4143  emitOutlinedFunctionCall(CGF, D.getBeginLoc(), OutlinedParallelFn, Args);
4144  CGF.FinishFunction();
4145  return Fn;
4146}
4147
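/// In generic data-sharing mode, analyze which local variables of the
/// function escape into parallel regions, record them for globalization,
/// and (unless globalization must be delayed) emit the globalization prolog
/// together with a cleanup that emits the matching epilog.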
4148void CGOpenMPRuntimeGPU::emitFunctionProlog(CodeGenFunction &CGF,
4149                                              const Decl *D) {
4150  if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic)
4151    return;
4152
4153  assert(D && "Expected function or captured|block decl.");
4154  assert(FunctionGlobalizedDecls.count(CGF.CurFn) == 0 &&
4155         "Function is registered already.");
4156  assert((!TeamAndReductions.first || TeamAndReductions.first == D) &&
4157         "Team is set but not processed.");
4158  const Stmt *Body = nullptr;
4159  bool NeedToDelayGlobalization = false;
4160  if (const auto *FD = dyn_cast<FunctionDecl>(D)) {
4161    Body = FD->getBody();
4162  } else if (const auto *BD = dyn_cast<BlockDecl>(D)) {
4163    Body = BD->getBody();
4164  } else if (const auto *CD = dyn_cast<CapturedDecl>(D)) {
4165    Body = CD->getBody();
4166    NeedToDelayGlobalization = CGF.CapturedStmtInfo->getKind() == CR_OpenMP;
4167    if (NeedToDelayGlobalization &&
4168        getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD)
4169      return;
4170  }
4171  if (!Body)
4172    return;
4173  CheckVarsEscapingDeclContext VarChecker(CGF, TeamAndReductions.second);
4174  VarChecker.Visit(Body);
4175  const RecordDecl *GlobalizedVarsRecord =
4176      VarChecker.getGlobalizedRecord(IsInTTDRegion);
4177  TeamAndReductions.first = nullptr;
4178  TeamAndReductions.second.clear();
4179  ArrayRef<const ValueDecl *> EscapedVariableLengthDecls =
4180      VarChecker.getEscapedVariableLengthDecls();
4181  if (!GlobalizedVarsRecord && EscapedVariableLengthDecls.empty())
4182    return;
4183  auto I = FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
4184  I->getSecond().MappedParams =
4185      std::make_unique<CodeGenFunction::OMPMapVars>();
4186  I->getSecond().GlobalRecord = GlobalizedVarsRecord;
4187  I->getSecond().EscapedParameters.insert(
4188      VarChecker.getEscapedParameters().begin(),
4189      VarChecker.getEscapedParameters().end());
4190  I->getSecond().EscapedVariableLengthDecls.append(
4191      EscapedVariableLengthDecls.begin(), EscapedVariableLengthDecls.end());
4192  DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
4193  for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
4194    assert(VD->isCanonicalDecl() && "Expected canonical declaration");
4195    const FieldDecl *FD = VarChecker.getFieldForGlobalizedVar(VD);
4196    Data.insert(std::make_pair(VD, MappedVarData(FD, IsInTTDRegion)));
4197  }
4198  if (!IsInTTDRegion && !NeedToDelayGlobalization && !IsInParallelRegion) {
4199    CheckVarsEscapingDeclContext VarChecker(CGF, llvm::None);
4200    VarChecker.Visit(Body);
4201    I->getSecond().SecondaryGlobalRecord =
4202        VarChecker.getGlobalizedRecord(/*IsInTTDRegion=*/true);
4203    I->getSecond().SecondaryLocalVarData.emplace();
4204    DeclToAddrMapTy &Data = I->getSecond().SecondaryLocalVarData.getValue();
4205    for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
4206      assert(VD->isCanonicalDecl() && "Expected canonical declaration");
4207      const FieldDecl *FD = VarChecker.getFieldForGlobalizedVar(VD);
4208      Data.insert(
4209          std::make_pair(VD, MappedVarData(FD, /*IsInTTDRegion=*/true)));
4210    }
4211  }
4212  if (!NeedToDelayGlobalization) {
4213    emitGenericVarsProlog(CGF, D->getBeginLoc(), /*WithSPMDCheck=*/true);
4214    struct GlobalizationScope final : EHScopeStack::Cleanup {
4215      GlobalizationScope() = default;
4216
4217      void Emit(CodeGenFunction &CGF, Flags flags) override {
4218        static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime())
4219            .emitGenericVarsEpilog(CGF, /*WithSPMDCheck=*/true);
4220      }
4221    };
4222    CGF.EHStack.pushCleanup<GlobalizationScope>(NormalAndEHCleanup);
4223  }
4224}
4225
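/// Return the GPU-specific address of a local variable: variables with an
/// 'omp allocate' attribute are placed in the memory space implied by their
/// allocator (constant, shared, ...), escaped variables resolve to their
/// globalized address, and Address::invalid() is returned otherwise so the
/// default local allocation is used.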
4226Address CGOpenMPRuntimeGPU::getAddressOfLocalVariable(CodeGenFunction &CGF,
4227                                                        const VarDecl *VD) {
4228  if (VD && VD->hasAttr<OMPAllocateDeclAttr>()) {
4229    const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
4230    auto AS = LangAS::Default;
4231    switch (A->getAllocatorType()) {
4232      // Use the default allocator here as by default local vars are
4233      // threadlocal.
4234    case OMPAllocateDeclAttr::OMPNullMemAlloc:
4235    case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
4236    case OMPAllocateDeclAttr::OMPThreadMemAlloc:
4237    case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
4238    case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
4239      // Follow the user decision - use default allocation.
4240      return Address::invalid();
4241    case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
      // TODO: implement support for user-defined allocators.
4243      return Address::invalid();
4244    case OMPAllocateDeclAttr::OMPConstMemAlloc:
4245      AS = LangAS::cuda_constant;
4246      break;
4247    case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
4248      AS = LangAS::cuda_shared;
4249      break;
4250    case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
4251    case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
4252      break;
4253    }
4254    llvm::Type *VarTy = CGF.ConvertTypeForMem(VD->getType());
4255    auto *GV = new llvm::GlobalVariable(
4256        CGM.getModule(), VarTy, /*isConstant=*/false,
4257        llvm::GlobalValue::InternalLinkage, llvm::Constant::getNullValue(VarTy),
4258        VD->getName(),
4259        /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
4260        CGM.getContext().getTargetAddressSpace(AS));
4261    CharUnits Align = CGM.getContext().getDeclAlign(VD);
4262    GV->setAlignment(Align.getAsAlign());
4263    return Address(
4264        CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
4265            GV, VarTy->getPointerTo(CGM.getContext().getTargetAddressSpace(
4266                    VD->getType().getAddressSpace()))),
4267        Align);
4268  }
4269
4270  if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic)
4271    return Address::invalid();
4272
4273  VD = VD->getCanonicalDecl();
4274  auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
4275  if (I == FunctionGlobalizedDecls.end())
4276    return Address::invalid();
4277  auto VDI = I->getSecond().LocalVarData.find(VD);
4278  if (VDI != I->getSecond().LocalVarData.end())
4279    return VDI->second.PrivateAddr;
4280  if (VD->hasAttrs()) {
4281    for (specific_attr_iterator<OMPReferencedVarAttr> IT(VD->attr_begin()),
4282         E(VD->attr_end());
4283         IT != E; ++IT) {
4284      auto VDI = I->getSecond().LocalVarData.find(
4285          cast<VarDecl>(cast<DeclRefExpr>(IT->getRef())->getDecl())
4286              ->getCanonicalDecl());
4287      if (VDI != I->getSecond().LocalVarData.end())
4288        return VDI->second.PrivateAddr;
4289    }
4290  }
4291
4292  return Address::invalid();
4293}
4294
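/// Drop the per-function globalization bookkeeping once code generation for
/// the function has finished.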
4295void CGOpenMPRuntimeGPU::functionFinished(CodeGenFunction &CGF) {
4296  FunctionGlobalizedDecls.erase(CGF.CurFn);
4297  CGOpenMPRuntime::functionFinished(CGF);
4298}
4299
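/// In SPMD mode, default 'dist_schedule' to static with a chunk size equal
/// to the number of GPU threads per team; otherwise fall back to the generic
/// host default.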
4300void CGOpenMPRuntimeGPU::getDefaultDistScheduleAndChunk(
4301    CodeGenFunction &CGF, const OMPLoopDirective &S,
4302    OpenMPDistScheduleClauseKind &ScheduleKind,
4303    llvm::Value *&Chunk) const {
4304  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
4305  if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
4306    ScheduleKind = OMPC_DIST_SCHEDULE_static;
4307    Chunk = CGF.EmitScalarConversion(
4308        RT.getGPUNumThreads(CGF),
4309        CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
4310        S.getIterationVariable()->getType(), S.getBeginLoc());
4311    return;
4312  }
4313  CGOpenMPRuntime::getDefaultDistScheduleAndChunk(
4314      CGF, S, ScheduleKind, Chunk);
4315}
4316
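/// Default worksharing loops on the GPU to schedule(static, 1) so that
/// consecutive iterations are assigned to consecutive threads.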
4317void CGOpenMPRuntimeGPU::getDefaultScheduleAndChunk(
4318    CodeGenFunction &CGF, const OMPLoopDirective &S,
4319    OpenMPScheduleClauseKind &ScheduleKind,
4320    const Expr *&ChunkExpr) const {
4321  ScheduleKind = OMPC_SCHEDULE_static;
4322  // Chunk size is 1 in this case.
4323  llvm::APInt ChunkSize(32, 1);
4324  ChunkExpr = IntegerLiteral::Create(CGF.getContext(), ChunkSize,
4325      CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
4326      SourceLocation());
4327}
4328
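/// For lambdas captured by a target directive, rewire the device copy of the
/// lambda object so that its by-reference captures (and the captured 'this',
/// if any) point at the corresponding device-side variables.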
4329void CGOpenMPRuntimeGPU::adjustTargetSpecificDataForLambdas(
4330    CodeGenFunction &CGF, const OMPExecutableDirective &D) const {
4331  assert(isOpenMPTargetExecutionDirective(D.getDirectiveKind()) &&
4332         " Expected target-based directive.");
4333  const CapturedStmt *CS = D.getCapturedStmt(OMPD_target);
4334  for (const CapturedStmt::Capture &C : CS->captures()) {
4335    // Capture variables captured by reference in lambdas for target-based
4336    // directives.
4337    if (!C.capturesVariable())
4338      continue;
4339    const VarDecl *VD = C.getCapturedVar();
4340    const auto *RD = VD->getType()
4341                         .getCanonicalType()
4342                         .getNonReferenceType()
4343                         ->getAsCXXRecordDecl();
4344    if (!RD || !RD->isLambda())
4345      continue;
4346    Address VDAddr = CGF.GetAddrOfLocalVar(VD);
4347    LValue VDLVal;
4348    if (VD->getType().getCanonicalType()->isReferenceType())
4349      VDLVal = CGF.EmitLoadOfReferenceLValue(VDAddr, VD->getType());
4350    else
4351      VDLVal = CGF.MakeAddrLValue(
4352          VDAddr, VD->getType().getCanonicalType().getNonReferenceType());
4353    llvm::DenseMap<const VarDecl *, FieldDecl *> Captures;
4354    FieldDecl *ThisCapture = nullptr;
4355    RD->getCaptureFields(Captures, ThisCapture);
4356    if (ThisCapture && CGF.CapturedStmtInfo->isCXXThisExprCaptured()) {
4357      LValue ThisLVal =
4358          CGF.EmitLValueForFieldInitialization(VDLVal, ThisCapture);
4359      llvm::Value *CXXThis = CGF.LoadCXXThis();
4360      CGF.EmitStoreOfScalar(CXXThis, ThisLVal);
4361    }
4362    for (const LambdaCapture &LC : RD->captures()) {
4363      if (LC.getCaptureKind() != LCK_ByRef)
4364        continue;
4365      const VarDecl *VD = LC.getCapturedVar();
4366      if (!CS->capturesVariable(VD))
4367        continue;
4368      auto It = Captures.find(VD);
4369      assert(It != Captures.end() && "Found lambda capture without field.");
4370      LValue VarLVal = CGF.EmitLValueForFieldInitialization(VDLVal, It->second);
4371      Address VDAddr = CGF.GetAddrOfLocalVar(VD);
4372      if (VD->getType().getCanonicalType()->isReferenceType())
4373        VDAddr = CGF.EmitLoadOfReferenceLValue(VDAddr,
4374                                               VD->getType().getCanonicalType())
4375                     .getAddress(CGF);
4376      CGF.EmitStoreOfScalar(VDAddr.getPointer(), VarLVal);
4377    }
4378  }
4379}
4380
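/// The default address space for firstprivate variables in target regions is
/// CUDA constant memory.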
4381unsigned CGOpenMPRuntimeGPU::getDefaultFirstprivateAddressSpace() const {
4382  return CGM.getContext().getTargetAddressSpace(LangAS::cuda_constant);
4383}
4384
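/// If the global variable has an 'omp allocate' attribute, report the GPU
/// memory space implied by its allocator in \p AS and return true.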
4385bool CGOpenMPRuntimeGPU::hasAllocateAttributeForGlobalVar(const VarDecl *VD,
4386                                                            LangAS &AS) {
4387  if (!VD || !VD->hasAttr<OMPAllocateDeclAttr>())
4388    return false;
4389  const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
4390  switch(A->getAllocatorType()) {
4391  case OMPAllocateDeclAttr::OMPNullMemAlloc:
4392  case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
  // Not supported, fall back to the default memory space.
4394  case OMPAllocateDeclAttr::OMPThreadMemAlloc:
4395  case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
4396  case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
4397  case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
4398  case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
4399    AS = LangAS::Default;
4400    return true;
4401  case OMPAllocateDeclAttr::OMPConstMemAlloc:
4402    AS = LangAS::cuda_constant;
4403    return true;
4404  case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
4405    AS = LangAS::cuda_shared;
4406    return true;
4407  case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
    llvm_unreachable("Expected predefined allocator for variables with "
                     "static storage.");
4410  }
4411  return false;
4412}
4413
// Get the current CudaArch, ignoring any unknown values.
4415static CudaArch getCudaArch(CodeGenModule &CGM) {
4416  if (!CGM.getTarget().hasFeature("ptx"))
4417    return CudaArch::UNKNOWN;
4418  for (const auto &Feature : CGM.getTarget().getTargetOpts().FeatureMap) {
4419    if (Feature.getValue()) {
4420      CudaArch Arch = StringToCudaArch(Feature.getKey());
4421      if (Arch != CudaArch::UNKNOWN)
4422        return Arch;
4423    }
4424  }
4425  return CudaArch::UNKNOWN;
4426}
4427
/// Check whether the target architecture supports unified addressing, which
/// the OpenMP requires clause "unified_shared_memory" depends on.
4430void CGOpenMPRuntimeGPU::processRequiresDirective(
4431    const OMPRequiresDecl *D) {
4432  for (const OMPClause *Clause : D->clauselists()) {
4433    if (Clause->getClauseKind() == OMPC_unified_shared_memory) {
4434      CudaArch Arch = getCudaArch(CGM);
4435      switch (Arch) {
4436      case CudaArch::SM_20:
4437      case CudaArch::SM_21:
4438      case CudaArch::SM_30:
4439      case CudaArch::SM_32:
4440      case CudaArch::SM_35:
4441      case CudaArch::SM_37:
4442      case CudaArch::SM_50:
4443      case CudaArch::SM_52:
4444      case CudaArch::SM_53: {
4445        SmallString<256> Buffer;
4446        llvm::raw_svector_ostream Out(Buffer);
4447        Out << "Target architecture " << CudaArchToString(Arch)
4448            << " does not support unified addressing";
4449        CGM.Error(Clause->getBeginLoc(), Out.str());
4450        return;
4451      }
4452      case CudaArch::SM_60:
4453      case CudaArch::SM_61:
4454      case CudaArch::SM_62:
4455      case CudaArch::SM_70:
4456      case CudaArch::SM_72:
4457      case CudaArch::SM_75:
4458      case CudaArch::SM_80:
4459      case CudaArch::SM_86:
4460      case CudaArch::GFX600:
4461      case CudaArch::GFX601:
4462      case CudaArch::GFX602:
4463      case CudaArch::GFX700:
4464      case CudaArch::GFX701:
4465      case CudaArch::GFX702:
4466      case CudaArch::GFX703:
4467      case CudaArch::GFX704:
4468      case CudaArch::GFX705:
4469      case CudaArch::GFX801:
4470      case CudaArch::GFX802:
4471      case CudaArch::GFX803:
4472      case CudaArch::GFX805:
4473      case CudaArch::GFX810:
4474      case CudaArch::GFX900:
4475      case CudaArch::GFX902:
4476      case CudaArch::GFX904:
4477      case CudaArch::GFX906:
4478      case CudaArch::GFX908:
4479      case CudaArch::GFX909:
4480      case CudaArch::GFX90a:
4481      case CudaArch::GFX90c:
4482      case CudaArch::GFX1010:
4483      case CudaArch::GFX1011:
4484      case CudaArch::GFX1012:
4485      case CudaArch::GFX1030:
4486      case CudaArch::GFX1031:
4487      case CudaArch::GFX1032:
4488      case CudaArch::GFX1033:
4489      case CudaArch::GFX1034:
4490      case CudaArch::UNUSED:
4491      case CudaArch::UNKNOWN:
4492        break;
4493      case CudaArch::LAST:
4494        llvm_unreachable("Unexpected Cuda arch.");
4495      }
4496    }
4497  }
4498  CGOpenMPRuntime::processRequiresDirective(D);
4499}
4500
4501/// Get number of SMs and number of blocks per SM.
4502static std::pair<unsigned, unsigned> getSMsBlocksPerSM(CodeGenModule &CGM) {
4503  std::pair<unsigned, unsigned> Data;
4504  if (CGM.getLangOpts().OpenMPCUDANumSMs)
4505    Data.first = CGM.getLangOpts().OpenMPCUDANumSMs;
4506  if (CGM.getLangOpts().OpenMPCUDABlocksPerSM)
4507    Data.second = CGM.getLangOpts().OpenMPCUDABlocksPerSM;
4508  if (Data.first && Data.second)
4509    return Data;
4510  switch (getCudaArch(CGM)) {
4511  case CudaArch::SM_20:
4512  case CudaArch::SM_21:
4513  case CudaArch::SM_30:
4514  case CudaArch::SM_32:
4515  case CudaArch::SM_35:
4516  case CudaArch::SM_37:
4517  case CudaArch::SM_50:
4518  case CudaArch::SM_52:
4519  case CudaArch::SM_53:
4520    return {16, 16};
4521  case CudaArch::SM_60:
4522  case CudaArch::SM_61:
4523  case CudaArch::SM_62:
4524    return {56, 32};
4525  case CudaArch::SM_70:
4526  case CudaArch::SM_72:
4527  case CudaArch::SM_75:
4528  case CudaArch::SM_80:
4529  case CudaArch::SM_86:
4530    return {84, 32};
4531  case CudaArch::GFX600:
4532  case CudaArch::GFX601:
4533  case CudaArch::GFX602:
4534  case CudaArch::GFX700:
4535  case CudaArch::GFX701:
4536  case CudaArch::GFX702:
4537  case CudaArch::GFX703:
4538  case CudaArch::GFX704:
4539  case CudaArch::GFX705:
4540  case CudaArch::GFX801:
4541  case CudaArch::GFX802:
4542  case CudaArch::GFX803:
4543  case CudaArch::GFX805:
4544  case CudaArch::GFX810:
4545  case CudaArch::GFX900:
4546  case CudaArch::GFX902:
4547  case CudaArch::GFX904:
4548  case CudaArch::GFX906:
4549  case CudaArch::GFX908:
4550  case CudaArch::GFX909:
4551  case CudaArch::GFX90a:
4552  case CudaArch::GFX90c:
4553  case CudaArch::GFX1010:
4554  case CudaArch::GFX1011:
4555  case CudaArch::GFX1012:
4556  case CudaArch::GFX1030:
4557  case CudaArch::GFX1031:
4558  case CudaArch::GFX1032:
4559  case CudaArch::GFX1033:
4560  case CudaArch::GFX1034:
4561  case CudaArch::UNUSED:
4562  case CudaArch::UNKNOWN:
4563    break;
4564  case CudaArch::LAST:
4565    llvm_unreachable("Unexpected Cuda arch.");
4566  }
4567  llvm_unreachable("Unexpected NVPTX target without ptx feature.");
4568}
4569
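/// Finalize module-level state: lay out all globalized records into the
/// statically allocated shared/global buffers and the teams-reduction buffer,
/// and replace the placeholder globals with the final allocations.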
4570void CGOpenMPRuntimeGPU::clear() {
4571  if (!GlobalizedRecords.empty() &&
4572      !CGM.getLangOpts().OpenMPCUDATargetParallel) {
4573    ASTContext &C = CGM.getContext();
4574    llvm::SmallVector<const GlobalPtrSizeRecsTy *, 4> GlobalRecs;
4575    llvm::SmallVector<const GlobalPtrSizeRecsTy *, 4> SharedRecs;
4576    RecordDecl *StaticRD = C.buildImplicitRecord(
4577        "_openmp_static_memory_type_$_", RecordDecl::TagKind::TTK_Union);
4578    StaticRD->startDefinition();
4579    RecordDecl *SharedStaticRD = C.buildImplicitRecord(
4580        "_shared_openmp_static_memory_type_$_", RecordDecl::TagKind::TTK_Union);
4581    SharedStaticRD->startDefinition();
4582    for (const GlobalPtrSizeRecsTy &Records : GlobalizedRecords) {
4583      if (Records.Records.empty())
4584        continue;
4585      unsigned Size = 0;
4586      unsigned RecAlignment = 0;
4587      for (const RecordDecl *RD : Records.Records) {
4588        QualType RDTy = C.getRecordType(RD);
4589        unsigned Alignment = C.getTypeAlignInChars(RDTy).getQuantity();
4590        RecAlignment = std::max(RecAlignment, Alignment);
4591        unsigned RecSize = C.getTypeSizeInChars(RDTy).getQuantity();
4592        Size =
4593            llvm::alignTo(llvm::alignTo(Size, Alignment) + RecSize, Alignment);
4594      }
4595      Size = llvm::alignTo(Size, RecAlignment);
4596      llvm::APInt ArySize(/*numBits=*/64, Size);
4597      QualType SubTy = C.getConstantArrayType(
4598          C.CharTy, ArySize, nullptr, ArrayType::Normal, /*IndexTypeQuals=*/0);
4599      const bool UseSharedMemory = Size <= SharedMemorySize;
4600      auto *Field =
4601          FieldDecl::Create(C, UseSharedMemory ? SharedStaticRD : StaticRD,
4602                            SourceLocation(), SourceLocation(), nullptr, SubTy,
4603                            C.getTrivialTypeSourceInfo(SubTy, SourceLocation()),
4604                            /*BW=*/nullptr, /*Mutable=*/false,
4605                            /*InitStyle=*/ICIS_NoInit);
4606      Field->setAccess(AS_public);
4607      if (UseSharedMemory) {
4608        SharedStaticRD->addDecl(Field);
4609        SharedRecs.push_back(&Records);
4610      } else {
4611        StaticRD->addDecl(Field);
4612        GlobalRecs.push_back(&Records);
4613      }
4614      Records.RecSize->setInitializer(llvm::ConstantInt::get(CGM.SizeTy, Size));
4615      Records.UseSharedMemory->setInitializer(
4616          llvm::ConstantInt::get(CGM.Int16Ty, UseSharedMemory ? 1 : 0));
4617    }
    // Allocate a buffer of SharedMemorySize bytes for the shared memory.
    // FIXME: nvlink does not handle weak linkage correctly (objects with
    // different sizes are reported as erroneous).
    // Restore this code as soon as nvlink is fixed.
4622    if (!SharedStaticRD->field_empty()) {
4623      llvm::APInt ArySize(/*numBits=*/64, SharedMemorySize);
4624      QualType SubTy = C.getConstantArrayType(
4625          C.CharTy, ArySize, nullptr, ArrayType::Normal, /*IndexTypeQuals=*/0);
4626      auto *Field = FieldDecl::Create(
4627          C, SharedStaticRD, SourceLocation(), SourceLocation(), nullptr, SubTy,
4628          C.getTrivialTypeSourceInfo(SubTy, SourceLocation()),
4629          /*BW=*/nullptr, /*Mutable=*/false,
4630          /*InitStyle=*/ICIS_NoInit);
4631      Field->setAccess(AS_public);
4632      SharedStaticRD->addDecl(Field);
4633    }
4634    SharedStaticRD->completeDefinition();
4635    if (!SharedStaticRD->field_empty()) {
4636      QualType StaticTy = C.getRecordType(SharedStaticRD);
4637      llvm::Type *LLVMStaticTy = CGM.getTypes().ConvertTypeForMem(StaticTy);
4638      auto *GV = new llvm::GlobalVariable(
4639          CGM.getModule(), LLVMStaticTy,
4640          /*isConstant=*/false, llvm::GlobalValue::WeakAnyLinkage,
4641          llvm::UndefValue::get(LLVMStaticTy),
4642          "_openmp_shared_static_glob_rd_$_", /*InsertBefore=*/nullptr,
4643          llvm::GlobalValue::NotThreadLocal,
4644          C.getTargetAddressSpace(LangAS::cuda_shared));
4645      auto *Replacement = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
4646          GV, CGM.VoidPtrTy);
4647      for (const GlobalPtrSizeRecsTy *Rec : SharedRecs) {
4648        Rec->Buffer->replaceAllUsesWith(Replacement);
4649        Rec->Buffer->eraseFromParent();
4650      }
4651    }
4652    StaticRD->completeDefinition();
4653    if (!StaticRD->field_empty()) {
4654      QualType StaticTy = C.getRecordType(StaticRD);
4655      std::pair<unsigned, unsigned> SMsBlockPerSM = getSMsBlocksPerSM(CGM);
4656      llvm::APInt Size1(32, SMsBlockPerSM.second);
4657      QualType Arr1Ty =
4658          C.getConstantArrayType(StaticTy, Size1, nullptr, ArrayType::Normal,
4659                                 /*IndexTypeQuals=*/0);
4660      llvm::APInt Size2(32, SMsBlockPerSM.first);
4661      QualType Arr2Ty =
4662          C.getConstantArrayType(Arr1Ty, Size2, nullptr, ArrayType::Normal,
4663                                 /*IndexTypeQuals=*/0);
4664      llvm::Type *LLVMArr2Ty = CGM.getTypes().ConvertTypeForMem(Arr2Ty);
      // FIXME: nvlink does not handle weak linkage correctly (objects with
      // different sizes are reported as erroneous).
      // Restore CommonLinkage as soon as nvlink is fixed.
4668      auto *GV = new llvm::GlobalVariable(
4669          CGM.getModule(), LLVMArr2Ty,
4670          /*isConstant=*/false, llvm::GlobalValue::InternalLinkage,
4671          llvm::Constant::getNullValue(LLVMArr2Ty),
4672          "_openmp_static_glob_rd_$_");
4673      auto *Replacement = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
4674          GV, CGM.VoidPtrTy);
4675      for (const GlobalPtrSizeRecsTy *Rec : GlobalRecs) {
4676        Rec->Buffer->replaceAllUsesWith(Replacement);
4677        Rec->Buffer->eraseFromParent();
4678      }
4679    }
4680  }
4681  if (!TeamsReductions.empty()) {
4682    ASTContext &C = CGM.getContext();
4683    RecordDecl *StaticRD = C.buildImplicitRecord(
4684        "_openmp_teams_reduction_type_$_", RecordDecl::TagKind::TTK_Union);
4685    StaticRD->startDefinition();
4686    for (const RecordDecl *TeamReductionRec : TeamsReductions) {
4687      QualType RecTy = C.getRecordType(TeamReductionRec);
4688      auto *Field = FieldDecl::Create(
4689          C, StaticRD, SourceLocation(), SourceLocation(), nullptr, RecTy,
4690          C.getTrivialTypeSourceInfo(RecTy, SourceLocation()),
4691          /*BW=*/nullptr, /*Mutable=*/false,
4692          /*InitStyle=*/ICIS_NoInit);
4693      Field->setAccess(AS_public);
4694      StaticRD->addDecl(Field);
4695    }
4696    StaticRD->completeDefinition();
4697    QualType StaticTy = C.getRecordType(StaticRD);
4698    llvm::Type *LLVMReductionsBufferTy =
4699        CGM.getTypes().ConvertTypeForMem(StaticTy);
    // FIXME: nvlink does not handle weak linkage correctly (objects with
    // different sizes are reported as erroneous).
    // Restore CommonLinkage as soon as nvlink is fixed.
4703    auto *GV = new llvm::GlobalVariable(
4704        CGM.getModule(), LLVMReductionsBufferTy,
4705        /*isConstant=*/false, llvm::GlobalValue::InternalLinkage,
4706        llvm::Constant::getNullValue(LLVMReductionsBufferTy),
4707        "_openmp_teams_reductions_buffer_$_");
4708    KernelTeamsReductionPtr->setInitializer(
4709        llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(GV,
4710                                                             CGM.VoidPtrTy));
4711  }
4712  CGOpenMPRuntime::clear();
4713}
4714