//===---- CGOpenMPRuntimeGPU.cpp - Interface to OpenMP GPU Runtimes ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This provides a generalized class for OpenMP runtime code generation
// specialized by GPU targets NVPTX and AMDGCN.
//
//===----------------------------------------------------------------------===//

#include "CGOpenMPRuntimeGPU.h"
#include "CGOpenMPRuntimeNVPTX.h"
#include "CodeGenFunction.h"
#include "clang/AST/Attr.h"
#include "clang/AST/DeclOpenMP.h"
#include "clang/AST/StmtOpenMP.h"
#include "clang/AST/StmtVisitor.h"
#include "clang/Basic/Cuda.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Frontend/OpenMP/OMPGridValues.h"
#include "llvm/IR/IntrinsicsNVPTX.h"

using namespace clang;
using namespace CodeGen;
using namespace llvm::omp;

namespace {
/// Pre(post)-action for different OpenMP constructs specialized for NVPTX.
class NVPTXActionTy final : public PrePostActionTy {
  llvm::FunctionCallee EnterCallee = nullptr;
  ArrayRef<llvm::Value *> EnterArgs;
  llvm::FunctionCallee ExitCallee = nullptr;
  ArrayRef<llvm::Value *> ExitArgs;
  bool Conditional = false;
  llvm::BasicBlock *ContBlock = nullptr;

public:
  NVPTXActionTy(llvm::FunctionCallee EnterCallee,
                ArrayRef<llvm::Value *> EnterArgs,
                llvm::FunctionCallee ExitCallee,
                ArrayRef<llvm::Value *> ExitArgs, bool Conditional = false)
      : EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee),
        ExitArgs(ExitArgs), Conditional(Conditional) {}
  void Enter(CodeGenFunction &CGF) override {
    llvm::Value *EnterRes = CGF.EmitRuntimeCall(EnterCallee, EnterArgs);
    if (Conditional) {
      llvm::Value *CallBool = CGF.Builder.CreateIsNotNull(EnterRes);
      auto *ThenBlock = CGF.createBasicBlock("omp_if.then");
      ContBlock = CGF.createBasicBlock("omp_if.end");
      // Generate the branch (If-stmt).
      CGF.Builder.CreateCondBr(CallBool, ThenBlock, ContBlock);
      CGF.EmitBlock(ThenBlock);
    }
  }
  void Done(CodeGenFunction &CGF) {
    // Emit the rest of the blocks/branches.
    CGF.EmitBranch(ContBlock);
    CGF.EmitBlock(ContBlock, true);
  }
  void Exit(CodeGenFunction &CGF) override {
    CGF.EmitRuntimeCall(ExitCallee, ExitArgs);
  }
};

/// A class to track the execution mode when generating code for directives
/// within a target region. The appropriate mode (SPMD|NON-SPMD) is set on
/// entry to the target region and used by the directives nested in it, such
/// as 'parallel', to emit optimized code.
class ExecutionRuntimeModesRAII {
private:
  CGOpenMPRuntimeGPU::ExecutionMode SavedExecMode =
      CGOpenMPRuntimeGPU::EM_Unknown;
  CGOpenMPRuntimeGPU::ExecutionMode &ExecMode;
  bool SavedRuntimeMode = false;
  bool *RuntimeMode = nullptr;

public:
  /// Constructor for Non-SPMD mode.
  ExecutionRuntimeModesRAII(CGOpenMPRuntimeGPU::ExecutionMode &ExecMode)
      : ExecMode(ExecMode) {
    SavedExecMode = ExecMode;
    ExecMode = CGOpenMPRuntimeGPU::EM_NonSPMD;
  }
  /// Constructor for SPMD mode.
  ExecutionRuntimeModesRAII(CGOpenMPRuntimeGPU::ExecutionMode &ExecMode,
                            bool &RuntimeMode, bool FullRuntimeMode)
      : ExecMode(ExecMode), RuntimeMode(&RuntimeMode) {
    SavedExecMode = ExecMode;
    SavedRuntimeMode = RuntimeMode;
    ExecMode = CGOpenMPRuntimeGPU::EM_SPMD;
    RuntimeMode = FullRuntimeMode;
  }
  ~ExecutionRuntimeModesRAII() {
    ExecMode = SavedExecMode;
    if (RuntimeMode)
      *RuntimeMode = SavedRuntimeMode;
  }
};

/// GPU Configuration: This information can be derived from CUDA registers,
/// however, providing compile time constants helps generate more efficient
/// code. For all practical purposes this is fine because the configuration
/// is the same for all known NVPTX architectures.
enum MachineConfiguration : unsigned {
  /// See "llvm/Frontend/OpenMP/OMPGridValues.h" for various related target
  /// specific Grid Values like GV_Warp_Size, GV_Warp_Size_Log2,
  /// and GV_Warp_Size_Log2_Mask.

  /// Global memory alignment for performance.
  GlobalMemoryAlignment = 128,

  /// Maximal size of the shared memory buffer.
  SharedMemorySize = 128,
};

static const ValueDecl *getPrivateItem(const Expr *RefExpr) {
  RefExpr = RefExpr->IgnoreParens();
  if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(RefExpr)) {
    const Expr *Base = ASE->getBase()->IgnoreParenImpCasts();
    while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
      Base = TempASE->getBase()->IgnoreParenImpCasts();
    RefExpr = Base;
  } else if (auto *OASE = dyn_cast<OMPArraySectionExpr>(RefExpr)) {
    const Expr *Base = OASE->getBase()->IgnoreParenImpCasts();
    while (const auto *TempOASE = dyn_cast<OMPArraySectionExpr>(Base))
      Base = TempOASE->getBase()->IgnoreParenImpCasts();
    while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
      Base = TempASE->getBase()->IgnoreParenImpCasts();
    RefExpr = Base;
  }
  RefExpr = RefExpr->IgnoreParenImpCasts();
  if (const auto *DE = dyn_cast<DeclRefExpr>(RefExpr))
    return cast<ValueDecl>(DE->getDecl()->getCanonicalDecl());
  const auto *ME = cast<MemberExpr>(RefExpr);
  return cast<ValueDecl>(ME->getMemberDecl()->getCanonicalDecl());
}

static RecordDecl *buildRecordForGlobalizedVars(
    ASTContext &C, ArrayRef<const ValueDecl *> EscapedDecls,
    ArrayRef<const ValueDecl *> EscapedDeclsForTeams,
    llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
        &MappedDeclsFields, int BufSize) {
  using VarsDataTy = std::pair<CharUnits /*Align*/, const ValueDecl *>;
  if (EscapedDecls.empty() && EscapedDeclsForTeams.empty())
    return nullptr;
  SmallVector<VarsDataTy, 4> GlobalizedVars;
  for (const ValueDecl *D : EscapedDecls)
    GlobalizedVars.emplace_back(
        CharUnits::fromQuantity(std::max(
            C.getDeclAlign(D).getQuantity(),
            static_cast<CharUnits::QuantityType>(GlobalMemoryAlignment))),
        D);
  for (const ValueDecl *D : EscapedDeclsForTeams)
    GlobalizedVars.emplace_back(C.getDeclAlign(D), D);
  llvm::stable_sort(GlobalizedVars, [](VarsDataTy L, VarsDataTy R) {
    return L.first > R.first;
  });

  // Build struct _globalized_locals_ty {
  //   /* globalized vars */[WarpSize] align (max(decl_align,
  //                                              GlobalMemoryAlignment))
  //   /* globalized vars */ for EscapedDeclsForTeams
  // };
  RecordDecl *GlobalizedRD = C.buildImplicitRecord("_globalized_locals_ty");
  GlobalizedRD->startDefinition();
  llvm::SmallPtrSet<const ValueDecl *, 16> SingleEscaped(
      EscapedDeclsForTeams.begin(),
      EscapedDeclsForTeams.end());
  for (const auto &Pair : GlobalizedVars) {
    const ValueDecl *VD = Pair.second;
    QualType Type = VD->getType();
    if (Type->isLValueReferenceType())
      Type = C.getPointerType(Type.getNonReferenceType());
    else
      Type = Type.getNonReferenceType();
    SourceLocation Loc = VD->getLocation();
    FieldDecl *Field;
    if (SingleEscaped.count(VD)) {
      Field = FieldDecl::Create(
          C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
          C.getTrivialTypeSourceInfo(Type, SourceLocation()),
          /*BW=*/nullptr, /*Mutable=*/false,
          /*InitStyle=*/ICIS_NoInit);
      Field->setAccess(AS_public);
      if (VD->hasAttrs()) {
        for (specific_attr_iterator<AlignedAttr> I(VD->getAttrs().begin()),
                                                 E(VD->getAttrs().end());
             I != E; ++I)
          Field->addAttr(*I);
      }
    } else {
      llvm::APInt ArraySize(32, BufSize);
      Type = C.getConstantArrayType(Type, ArraySize, nullptr, ArrayType::Normal,
                                    0);
      Field = FieldDecl::Create(
          C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
          C.getTrivialTypeSourceInfo(Type, SourceLocation()),
          /*BW=*/nullptr, /*Mutable=*/false,
          /*InitStyle=*/ICIS_NoInit);
      Field->setAccess(AS_public);
      llvm::APInt Align(32, std::max(C.getDeclAlign(VD).getQuantity(),
                                     static_cast<CharUnits::QuantityType>(
                                         GlobalMemoryAlignment)));
      Field->addAttr(AlignedAttr::CreateImplicit(
          C, /*IsAlignmentExpr=*/true,
          IntegerLiteral::Create(C, Align,
                                 C.getIntTypeForBitwidth(32, /*Signed=*/0),
                                 SourceLocation()),
          {}, AttributeCommonInfo::AS_GNU, AlignedAttr::GNU_aligned));
    }
    GlobalizedRD->addDecl(Field);
    MappedDeclsFields.try_emplace(VD, Field);
  }
  GlobalizedRD->completeDefinition();
  return GlobalizedRD;
}

/// Get the list of variables that can escape their declaration context.
class CheckVarsEscapingDeclContext final
    : public ConstStmtVisitor<CheckVarsEscapingDeclContext> {
  CodeGenFunction &CGF;
  llvm::SetVector<const ValueDecl *> EscapedDecls;
  llvm::SetVector<const ValueDecl *> EscapedVariableLengthDecls;
  llvm::SmallPtrSet<const Decl *, 4> EscapedParameters;
  RecordDecl *GlobalizedRD = nullptr;
  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
  bool AllEscaped = false;
  bool IsForCombinedParallelRegion = false;

  void markAsEscaped(const ValueDecl *VD) {
    // Do not globalize declare target variables.
    if (!isa<VarDecl>(VD) ||
        OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD))
      return;
    VD = cast<ValueDecl>(VD->getCanonicalDecl());
    // Use user-specified allocation.
    if (VD->hasAttrs() && VD->hasAttr<OMPAllocateDeclAttr>())
      return;
    // Variables captured by value must be globalized.
    if (auto *CSI = CGF.CapturedStmtInfo) {
      if (const FieldDecl *FD = CSI->lookup(cast<VarDecl>(VD))) {
        // Check if we need to capture the variable that was already captured
        // by value in the outer region.
        if (!IsForCombinedParallelRegion) {
          if (!FD->hasAttrs())
            return;
          const auto *Attr = FD->getAttr<OMPCaptureKindAttr>();
          if (!Attr)
            return;
          if (((Attr->getCaptureKind() != OMPC_map) &&
               !isOpenMPPrivate(Attr->getCaptureKind())) ||
              ((Attr->getCaptureKind() == OMPC_map) &&
               !FD->getType()->isAnyPointerType()))
            return;
        }
        if (!FD->getType()->isReferenceType()) {
          assert(!VD->getType()->isVariablyModifiedType() &&
                 "Parameter captured by value with variably modified type");
          EscapedParameters.insert(VD);
        } else if (!IsForCombinedParallelRegion) {
          return;
        }
      }
    }
    if ((!CGF.CapturedStmtInfo ||
         (IsForCombinedParallelRegion && CGF.CapturedStmtInfo)) &&
        VD->getType()->isReferenceType())
      // Do not globalize variables with reference type.
      return;
    if (VD->getType()->isVariablyModifiedType())
      EscapedVariableLengthDecls.insert(VD);
    else
      EscapedDecls.insert(VD);
  }

  void VisitValueDecl(const ValueDecl *VD) {
    if (VD->getType()->isLValueReferenceType())
      markAsEscaped(VD);
    if (const auto *VarD = dyn_cast<VarDecl>(VD)) {
      if (!isa<ParmVarDecl>(VarD) && VarD->hasInit()) {
        const bool SavedAllEscaped = AllEscaped;
        AllEscaped = VD->getType()->isLValueReferenceType();
        Visit(VarD->getInit());
        AllEscaped = SavedAllEscaped;
      }
    }
  }
  void VisitOpenMPCapturedStmt(const CapturedStmt *S,
                               ArrayRef<OMPClause *> Clauses,
                               bool IsCombinedParallelRegion) {
    if (!S)
      return;
    for (const CapturedStmt::Capture &C : S->captures()) {
      if (C.capturesVariable() && !C.capturesVariableByCopy()) {
        const ValueDecl *VD = C.getCapturedVar();
        bool SavedIsForCombinedParallelRegion = IsForCombinedParallelRegion;
        if (IsCombinedParallelRegion) {
          // Check if the variable is privatized in the combined construct;
          // such private copies must be shared in the inner parallel
          // directive.
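          // For example, given a combined construct such as
          //   #pragma omp target teams distribute parallel for firstprivate(a)
          // (an illustrative snippet, not taken from this file), the
          // firstprivate copy of 'a' is made by the combined construct but
          // must be visible to the inner parallel region, so the variable is
          // treated as escaped below.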
          IsForCombinedParallelRegion = false;
          for (const OMPClause *C : Clauses) {
            if (!isOpenMPPrivate(C->getClauseKind()) ||
                C->getClauseKind() == OMPC_reduction ||
                C->getClauseKind() == OMPC_linear ||
                C->getClauseKind() == OMPC_private)
              continue;
            ArrayRef<const Expr *> Vars;
            if (const auto *PC = dyn_cast<OMPFirstprivateClause>(C))
              Vars = PC->getVarRefs();
            else if (const auto *PC = dyn_cast<OMPLastprivateClause>(C))
              Vars = PC->getVarRefs();
            else
              llvm_unreachable("Unexpected clause.");
            for (const auto *E : Vars) {
              const Decl *D =
                  cast<DeclRefExpr>(E)->getDecl()->getCanonicalDecl();
              if (D == VD->getCanonicalDecl()) {
                IsForCombinedParallelRegion = true;
                break;
              }
            }
            if (IsForCombinedParallelRegion)
              break;
          }
        }
        markAsEscaped(VD);
        if (isa<OMPCapturedExprDecl>(VD))
          VisitValueDecl(VD);
        IsForCombinedParallelRegion = SavedIsForCombinedParallelRegion;
      }
    }
  }

  void buildRecordForGlobalizedVars(bool IsInTTDRegion) {
    assert(!GlobalizedRD &&
           "Record for globalized variables is built already.");
    ArrayRef<const ValueDecl *> EscapedDeclsForParallel, EscapedDeclsForTeams;
    unsigned WarpSize = CGF.getTarget().getGridValue(llvm::omp::GV_Warp_Size);
    if (IsInTTDRegion)
      EscapedDeclsForTeams = EscapedDecls.getArrayRef();
    else
      EscapedDeclsForParallel = EscapedDecls.getArrayRef();
    GlobalizedRD = ::buildRecordForGlobalizedVars(
        CGF.getContext(), EscapedDeclsForParallel, EscapedDeclsForTeams,
        MappedDeclsFields, WarpSize);
  }

public:
  CheckVarsEscapingDeclContext(CodeGenFunction &CGF,
                               ArrayRef<const ValueDecl *> TeamsReductions)
      : CGF(CGF), EscapedDecls(TeamsReductions.begin(), TeamsReductions.end()) {
  }
  virtual ~CheckVarsEscapingDeclContext() = default;
  void VisitDeclStmt(const DeclStmt *S) {
    if (!S)
      return;
    for (const Decl *D : S->decls())
      if (const auto *VD = dyn_cast_or_null<ValueDecl>(D))
        VisitValueDecl(VD);
  }
  void VisitOMPExecutableDirective(const OMPExecutableDirective *D) {
    if (!D)
      return;
    if (!D->hasAssociatedStmt())
      return;
    if (const auto *S =
            dyn_cast_or_null<CapturedStmt>(D->getAssociatedStmt())) {
      // Do not analyze directives that do not actually require capturing,
      // like `omp for` or `omp simd` directives.
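      // Such directives report a single OMPD_unknown capture region; in that
      // case the captured body is visited directly below instead of being
      // treated as a real captured region.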
      llvm::SmallVector<OpenMPDirectiveKind, 4> CaptureRegions;
      getOpenMPCaptureRegions(CaptureRegions, D->getDirectiveKind());
      if (CaptureRegions.size() == 1 && CaptureRegions.back() == OMPD_unknown) {
        VisitStmt(S->getCapturedStmt());
        return;
      }
      VisitOpenMPCapturedStmt(
          S, D->clauses(),
          CaptureRegions.back() == OMPD_parallel &&
              isOpenMPDistributeDirective(D->getDirectiveKind()));
    }
  }
  void VisitCapturedStmt(const CapturedStmt *S) {
    if (!S)
      return;
    for (const CapturedStmt::Capture &C : S->captures()) {
      if (C.capturesVariable() && !C.capturesVariableByCopy()) {
        const ValueDecl *VD = C.getCapturedVar();
        markAsEscaped(VD);
        if (isa<OMPCapturedExprDecl>(VD))
          VisitValueDecl(VD);
      }
    }
  }
  void VisitLambdaExpr(const LambdaExpr *E) {
    if (!E)
      return;
    for (const LambdaCapture &C : E->captures()) {
      if (C.capturesVariable()) {
        if (C.getCaptureKind() == LCK_ByRef) {
          const ValueDecl *VD = C.getCapturedVar();
          markAsEscaped(VD);
          if (E->isInitCapture(&C) || isa<OMPCapturedExprDecl>(VD))
            VisitValueDecl(VD);
        }
      }
    }
  }
  void VisitBlockExpr(const BlockExpr *E) {
    if (!E)
      return;
    for (const BlockDecl::Capture &C : E->getBlockDecl()->captures()) {
      if (C.isByRef()) {
        const VarDecl *VD = C.getVariable();
        markAsEscaped(VD);
        if (isa<OMPCapturedExprDecl>(VD) || VD->isInitCapture())
          VisitValueDecl(VD);
      }
    }
  }
  void VisitCallExpr(const CallExpr *E) {
    if (!E)
      return;
    for (const Expr *Arg : E->arguments()) {
      if (!Arg)
        continue;
      if (Arg->isLValue()) {
        const bool SavedAllEscaped = AllEscaped;
        AllEscaped = true;
        Visit(Arg);
        AllEscaped = SavedAllEscaped;
      } else {
        Visit(Arg);
      }
    }
    Visit(E->getCallee());
  }
  void VisitDeclRefExpr(const DeclRefExpr *E) {
    if (!E)
      return;
    const ValueDecl *VD = E->getDecl();
    if (AllEscaped)
      markAsEscaped(VD);
    if (isa<OMPCapturedExprDecl>(VD))
      VisitValueDecl(VD);
    else if (const auto *VarD = dyn_cast<VarDecl>(VD))
      if (VarD->isInitCapture())
        VisitValueDecl(VD);
  }
  void VisitUnaryOperator(const UnaryOperator *E) {
    if (!E)
      return;
    if (E->getOpcode() == UO_AddrOf) {
      const bool SavedAllEscaped = AllEscaped;
      AllEscaped = true;
      Visit(E->getSubExpr());
      AllEscaped = SavedAllEscaped;
    } else {
      Visit(E->getSubExpr());
    }
  }
  void VisitImplicitCastExpr(const ImplicitCastExpr *E) {
    if (!E)
      return;
    if (E->getCastKind() == CK_ArrayToPointerDecay) {
      const bool SavedAllEscaped = AllEscaped;
      AllEscaped = true;
      Visit(E->getSubExpr());
      AllEscaped = SavedAllEscaped;
    } else {
      Visit(E->getSubExpr());
    }
  }
  void VisitExpr(const Expr *E) {
    if (!E)
      return;
    bool SavedAllEscaped = AllEscaped;
    if (!E->isLValue())
      AllEscaped = false;
    for (const Stmt *Child : E->children())
      if (Child)
        Visit(Child);
    AllEscaped = SavedAllEscaped;
  }
  void VisitStmt(const Stmt *S) {
    if (!S)
      return;
    for (const Stmt *Child : S->children())
      if (Child)
        Visit(Child);
  }

  /// Returns the record that handles all the escaped local variables and is
  /// used instead of their original storage.
  const RecordDecl *getGlobalizedRecord(bool IsInTTDRegion) {
    if (!GlobalizedRD)
      buildRecordForGlobalizedVars(IsInTTDRegion);
    return GlobalizedRD;
  }

  /// Returns the field in the globalized record for the escaped variable.
  const FieldDecl *getFieldForGlobalizedVar(const ValueDecl *VD) const {
    assert(GlobalizedRD &&
           "Record for globalized variables must be generated already.");
    auto I = MappedDeclsFields.find(VD);
    if (I == MappedDeclsFields.end())
      return nullptr;
    return I->getSecond();
  }

  /// Returns the list of the escaped local variables/parameters.
  ArrayRef<const ValueDecl *> getEscapedDecls() const {
    return EscapedDecls.getArrayRef();
  }

  /// Returns the set of escaped parameters, i.e. the escaped local variables
  /// that are actually parameters passed by value.
  const llvm::SmallPtrSetImpl<const Decl *> &getEscapedParameters() const {
    return EscapedParameters;
  }

  /// Returns the list of the escaped variables with the variably modified
  /// types.
  ArrayRef<const ValueDecl *> getEscapedVariableLengthDecls() const {
    return EscapedVariableLengthDecls.getArrayRef();
  }
};
} // anonymous namespace

/// Get the id of the warp in the block.
/// We assume that the warp size is 32, which is always the case
/// on the NVPTX device, to generate more efficient code.
static llvm::Value *getNVPTXWarpID(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  unsigned LaneIDBits =
      CGF.getTarget().getGridValue(llvm::omp::GV_Warp_Size_Log2);
  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
  return Bld.CreateAShr(RT.getGPUThreadID(CGF), LaneIDBits, "nvptx_warp_id");
}

/// Get the id of the current lane in the warp.
/// We assume that the warp size is 32, which is always the case
/// on the NVPTX device, to generate more efficient code.
static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  unsigned LaneIDMask = CGF.getContext().getTargetInfo().getGridValue(
      llvm::omp::GV_Warp_Size_Log2_Mask);
  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
  return Bld.CreateAnd(RT.getGPUThreadID(CGF), Bld.getInt32(LaneIDMask),
                       "nvptx_lane_id");
}

/// Get the value of the thread_limit clause in the teams directive.
/// For the 'generic' execution mode, the runtime encodes thread_limit in
/// the launch parameters, always starting thread_limit+warpSize threads per
/// CTA. The threads in the last warp are reserved for master execution.
/// For the 'spmd' execution mode, all threads in a CTA are part of the team.
static llvm::Value *getThreadLimit(CodeGenFunction &CGF,
                                   bool IsInSPMDExecutionMode = false) {
  CGBuilderTy &Bld = CGF.Builder;
  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
  llvm::Value *ThreadLimit = nullptr;
  if (IsInSPMDExecutionMode)
    ThreadLimit = RT.getGPUNumThreads(CGF);
  else {
    llvm::Value *GPUNumThreads = RT.getGPUNumThreads(CGF);
    llvm::Value *GPUWarpSize = RT.getGPUWarpSize(CGF);
    ThreadLimit = Bld.CreateNUWSub(GPUNumThreads, GPUWarpSize, "thread_limit");
  }
  assert(ThreadLimit != nullptr && "Expected non-null ThreadLimit");
  return ThreadLimit;
}

/// Get the thread id of the OMP master thread.
/// The master thread id is the first thread (lane) of the last warp in the
/// GPU block. Warp size is assumed to be some power of 2.
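/// Computed below as: MasterID = (NumThreads - 1) & ~(WarpSize - 1).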
/// Thread id is 0 indexed.
/// E.g: If NumThreads is 33, master id is 32.
///      If NumThreads is 64, master id is 32.
///      If NumThreads is 1024, master id is 992.
static llvm::Value *getMasterThreadID(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
  llvm::Value *NumThreads = RT.getGPUNumThreads(CGF);
  // We assume that the warp size is a power of 2.
  llvm::Value *Mask = Bld.CreateNUWSub(RT.getGPUWarpSize(CGF), Bld.getInt32(1));

  llvm::Value *NumThreadsSubOne = Bld.CreateNUWSub(NumThreads, Bld.getInt32(1));
  return Bld.CreateAnd(NumThreadsSubOne, Bld.CreateNot(Mask), "master_tid");
}

CGOpenMPRuntimeGPU::WorkerFunctionState::WorkerFunctionState(
    CodeGenModule &CGM, SourceLocation Loc)
    : WorkerFn(nullptr), CGFI(CGM.getTypes().arrangeNullaryFunction()),
      Loc(Loc) {
  createWorkerFunction(CGM);
}

void CGOpenMPRuntimeGPU::WorkerFunctionState::createWorkerFunction(
    CodeGenModule &CGM) {
  // Create a worker function with no arguments.
  WorkerFn = llvm::Function::Create(
      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
      /*placeholder=*/"_worker", &CGM.getModule());
  CGM.SetInternalFunctionAttributes(GlobalDecl(), WorkerFn, CGFI);
  WorkerFn->setDoesNotRecurse();
}

CGOpenMPRuntimeGPU::ExecutionMode
CGOpenMPRuntimeGPU::getExecutionMode() const {
  return CurrentExecutionMode;
}

static CGOpenMPRuntimeGPU::DataSharingMode
getDataSharingMode(CodeGenModule &CGM) {
  return CGM.getLangOpts().OpenMPCUDAMode ? CGOpenMPRuntimeGPU::CUDA
                                          : CGOpenMPRuntimeGPU::Generic;
}

/// Check for an inner (nested) SPMD construct, if any.
static bool hasNestedSPMDDirective(ASTContext &Ctx,
                                   const OMPExecutableDirective &D) {
  const auto *CS = D.getInnermostCapturedStmt();
  const auto *Body =
      CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
  const Stmt *ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);

  if (const auto *NestedDir =
          dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
    OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
    switch (D.getDirectiveKind()) {
    case OMPD_target:
      if (isOpenMPParallelDirective(DKind))
        return true;
      if (DKind == OMPD_teams) {
        Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
            /*IgnoreCaptured=*/true);
        if (!Body)
          return false;
        ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
        if (const auto *NND =
                dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
          DKind = NND->getDirectiveKind();
          if (isOpenMPParallelDirective(DKind))
            return true;
        }
      }
      return false;
    case OMPD_target_teams:
      return isOpenMPParallelDirective(DKind);
    case OMPD_target_simd:
    case OMPD_target_parallel:
    case OMPD_target_parallel_for:
    case OMPD_target_parallel_for_simd:
    case OMPD_target_teams_distribute:
    case OMPD_target_teams_distribute_simd:
    case OMPD_target_teams_distribute_parallel_for:
    case OMPD_target_teams_distribute_parallel_for_simd:
    case OMPD_parallel:
    case OMPD_for:
    case OMPD_parallel_for:
    case OMPD_parallel_master:
    case OMPD_parallel_sections:
    case OMPD_for_simd:
    case OMPD_parallel_for_simd:
    case OMPD_cancel:
    case OMPD_cancellation_point:
    case OMPD_ordered:
    case OMPD_threadprivate:
    case OMPD_allocate:
    case OMPD_task:
    case OMPD_simd:
    case OMPD_sections:
    case OMPD_section:
    case OMPD_single:
    case OMPD_master:
    case OMPD_critical:
    case OMPD_taskyield:
    case OMPD_barrier:
    case OMPD_taskwait:
    case OMPD_taskgroup:
    case OMPD_atomic:
    case OMPD_flush:
    case OMPD_depobj:
    case OMPD_scan:
    case OMPD_teams:
    case OMPD_target_data:
    case OMPD_target_exit_data:
    case OMPD_target_enter_data:
    case OMPD_distribute:
    case OMPD_distribute_simd:
    case OMPD_distribute_parallel_for:
    case OMPD_distribute_parallel_for_simd:
    case OMPD_teams_distribute:
    case OMPD_teams_distribute_simd:
    case OMPD_teams_distribute_parallel_for:
    case OMPD_teams_distribute_parallel_for_simd:
    case OMPD_target_update:
    case OMPD_declare_simd:
    case OMPD_declare_variant:
    case OMPD_begin_declare_variant:
    case OMPD_end_declare_variant:
    case OMPD_declare_target:
    case OMPD_end_declare_target:
    case OMPD_declare_reduction:
    case OMPD_declare_mapper:
    case OMPD_taskloop:
    case OMPD_taskloop_simd:
    case OMPD_master_taskloop:
    case OMPD_master_taskloop_simd:
    case OMPD_parallel_master_taskloop:
    case OMPD_parallel_master_taskloop_simd:
    case OMPD_requires:
    case OMPD_unknown:
    default:
      llvm_unreachable("Unexpected directive.");
    }
  }

  return false;
}

static bool supportsSPMDExecutionMode(ASTContext &Ctx,
                                      const OMPExecutableDirective &D) {
  OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
  switch (DirectiveKind) {
  case OMPD_target:
  case OMPD_target_teams:
    return hasNestedSPMDDirective(Ctx, D);
  case OMPD_target_parallel:
  case OMPD_target_parallel_for:
  case OMPD_target_parallel_for_simd:
  case OMPD_target_teams_distribute_parallel_for:
  case OMPD_target_teams_distribute_parallel_for_simd:
  case OMPD_target_simd:
  case OMPD_target_teams_distribute_simd:
    return true;
  case OMPD_target_teams_distribute:
    return false;
  case OMPD_parallel:
  case OMPD_for:
  case OMPD_parallel_for:
  case OMPD_parallel_master:
  case OMPD_parallel_sections:
  case OMPD_for_simd:
  case OMPD_parallel_for_simd:
  case OMPD_cancel:
  case OMPD_cancellation_point:
  case OMPD_ordered:
  case OMPD_threadprivate:
  case OMPD_allocate:
  case OMPD_task:
  case OMPD_simd:
  case OMPD_sections:
  case OMPD_section:
  case OMPD_single:
  case OMPD_master:
  case OMPD_critical:
  case OMPD_taskyield:
  case OMPD_barrier:
  case OMPD_taskwait:
  case OMPD_taskgroup:
  case OMPD_atomic:
  case OMPD_flush:
  case OMPD_depobj:
  case OMPD_scan:
  case OMPD_teams:
  case OMPD_target_data:
  case OMPD_target_exit_data:
  case OMPD_target_enter_data:
  case OMPD_distribute:
  case OMPD_distribute_simd:
  case OMPD_distribute_parallel_for:
  case OMPD_distribute_parallel_for_simd:
  case OMPD_teams_distribute:
  case OMPD_teams_distribute_simd:
  case OMPD_teams_distribute_parallel_for:
  case OMPD_teams_distribute_parallel_for_simd:
  case OMPD_target_update:
  case OMPD_declare_simd:
  case OMPD_declare_variant:
  case OMPD_begin_declare_variant:
  case OMPD_end_declare_variant:
  case OMPD_declare_target:
  case OMPD_end_declare_target:
  case OMPD_declare_reduction:
  case OMPD_declare_mapper:
  case OMPD_taskloop:
  case OMPD_taskloop_simd:
  case OMPD_master_taskloop:
  case OMPD_master_taskloop_simd:
  case OMPD_parallel_master_taskloop:
  case OMPD_parallel_master_taskloop_simd:
  case OMPD_requires:
  case OMPD_unknown:
  default:
    break;
  }
  llvm_unreachable(
      "Unknown programming model for OpenMP directive on NVPTX target.");
}

/// Check that the loop-based directive has no ordered clause and either has
/// no schedule clause or uses static scheduling.
static bool hasStaticScheduling(const OMPExecutableDirective &D) {
  assert(isOpenMPWorksharingDirective(D.getDirectiveKind()) &&
         isOpenMPLoopDirective(D.getDirectiveKind()) &&
         "Expected loop-based directive.");
  return !D.hasClausesOfKind<OMPOrderedClause>() &&
         (!D.hasClausesOfKind<OMPScheduleClause>() ||
          llvm::any_of(D.getClausesOfKind<OMPScheduleClause>(),
                       [](const OMPScheduleClause *C) {
                         return C->getScheduleKind() == OMPC_SCHEDULE_static;
                       }));
}

/// Check for an inner (nested) lightweight runtime construct, if any.
static bool hasNestedLightweightDirective(ASTContext &Ctx,
                                          const OMPExecutableDirective &D) {
  assert(supportsSPMDExecutionMode(Ctx, D) && "Expected SPMD mode directive.");
  const auto *CS = D.getInnermostCapturedStmt();
  const auto *Body =
      CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
  const Stmt *ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);

  if (const auto *NestedDir =
          dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
    OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
    switch (D.getDirectiveKind()) {
    case OMPD_target:
      if (isOpenMPParallelDirective(DKind) &&
          isOpenMPWorksharingDirective(DKind) && isOpenMPLoopDirective(DKind) &&
          hasStaticScheduling(*NestedDir))
        return true;
      if (DKind == OMPD_teams_distribute_simd || DKind == OMPD_simd)
        return true;
      if (DKind == OMPD_parallel) {
        Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
            /*IgnoreCaptured=*/true);
        if (!Body)
          return false;
        ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
        if (const auto *NND =
                dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
          DKind = NND->getDirectiveKind();
          if (isOpenMPWorksharingDirective(DKind) &&
              isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
            return true;
        }
      } else if (DKind == OMPD_teams) {
        Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
            /*IgnoreCaptured=*/true);
        if (!Body)
          return false;
        ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
        if (const auto *NND =
                dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
          DKind = NND->getDirectiveKind();
          if (isOpenMPParallelDirective(DKind) &&
              isOpenMPWorksharingDirective(DKind) &&
              isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
            return true;
          if (DKind == OMPD_parallel) {
            Body = NND->getInnermostCapturedStmt()->IgnoreContainers(
                /*IgnoreCaptured=*/true);
            if (!Body)
              return false;
            ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
            if (const auto *NND =
                    dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
              DKind = NND->getDirectiveKind();
              if (isOpenMPWorksharingDirective(DKind) &&
                  isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
                return true;
            }
          }
        }
      }
      return false;
    case OMPD_target_teams:
      if (isOpenMPParallelDirective(DKind) &&
          isOpenMPWorksharingDirective(DKind) && isOpenMPLoopDirective(DKind) &&
          hasStaticScheduling(*NestedDir))
        return true;
      if (DKind == OMPD_distribute_simd || DKind == OMPD_simd)
        return true;
      if (DKind == OMPD_parallel) {
        Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
            /*IgnoreCaptured=*/true);
        if (!Body)
          return false;
        ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
        if (const auto *NND =
                dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
          DKind = NND->getDirectiveKind();
          if (isOpenMPWorksharingDirective(DKind) &&
              isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
            return true;
        }
      }
      return false;
    case OMPD_target_parallel:
      if (DKind == OMPD_simd)
        return true;
      return isOpenMPWorksharingDirective(DKind) &&
             isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NestedDir);
    case OMPD_target_teams_distribute:
    case OMPD_target_simd:
    case OMPD_target_parallel_for:
    case OMPD_target_parallel_for_simd:
    case OMPD_target_teams_distribute_simd:
    case OMPD_target_teams_distribute_parallel_for:
    case OMPD_target_teams_distribute_parallel_for_simd:
    case OMPD_parallel:
    case OMPD_for:
    case OMPD_parallel_for:
    case OMPD_parallel_master:
    case OMPD_parallel_sections:
    case OMPD_for_simd:
    case OMPD_parallel_for_simd:
    case OMPD_cancel:
    case OMPD_cancellation_point:
    case OMPD_ordered:
    case OMPD_threadprivate:
    case OMPD_allocate:
    case OMPD_task:
    case OMPD_simd:
    case OMPD_sections:
    case OMPD_section:
    case OMPD_single:
    case OMPD_master:
    case OMPD_critical:
    case OMPD_taskyield:
    case OMPD_barrier:
    case OMPD_taskwait:
    case OMPD_taskgroup:
    case OMPD_atomic:
    case OMPD_flush:
    case OMPD_depobj:
    case OMPD_scan:
    case OMPD_teams:
    case OMPD_target_data:
    case OMPD_target_exit_data:
    case OMPD_target_enter_data:
    case OMPD_distribute:
    case OMPD_distribute_simd:
    case OMPD_distribute_parallel_for:
    case OMPD_distribute_parallel_for_simd:
    case OMPD_teams_distribute:
    case OMPD_teams_distribute_simd:
    case OMPD_teams_distribute_parallel_for:
    case OMPD_teams_distribute_parallel_for_simd:
    case OMPD_target_update:
    case OMPD_declare_simd:
    case OMPD_declare_variant:
    case OMPD_begin_declare_variant:
    case OMPD_end_declare_variant:
    case OMPD_declare_target:
    case OMPD_end_declare_target:
    case OMPD_declare_reduction:
    case OMPD_declare_mapper:
    case OMPD_taskloop:
    case OMPD_taskloop_simd:
    case OMPD_master_taskloop:
    case OMPD_master_taskloop_simd:
    case OMPD_parallel_master_taskloop:
    case OMPD_parallel_master_taskloop_simd:
    case OMPD_requires:
    case OMPD_unknown:
    default:
      llvm_unreachable("Unexpected directive.");
    }
  }

  return false;
}

/// Checks if the construct supports the lightweight runtime. It must be an
/// SPMD construct with an inner loop-based construct using static scheduling.
static bool supportsLightweightRuntime(ASTContext &Ctx,
                                       const OMPExecutableDirective &D) {
  if (!supportsSPMDExecutionMode(Ctx, D))
    return false;
  OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
  switch (DirectiveKind) {
  case OMPD_target:
  case OMPD_target_teams:
  case OMPD_target_parallel:
    return hasNestedLightweightDirective(Ctx, D);
  case OMPD_target_parallel_for:
  case OMPD_target_parallel_for_simd:
  case OMPD_target_teams_distribute_parallel_for:
  case OMPD_target_teams_distribute_parallel_for_simd:
    // (Last|First)-privates must be shared in parallel region.
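    // Static scheduling needs no dynamic runtime support for distributing
    // loop iterations, which is why it is the precondition for using the
    // lightweight runtime here.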
    return hasStaticScheduling(D);
  case OMPD_target_simd:
  case OMPD_target_teams_distribute_simd:
    return true;
  case OMPD_target_teams_distribute:
    return false;
  case OMPD_parallel:
  case OMPD_for:
  case OMPD_parallel_for:
  case OMPD_parallel_master:
  case OMPD_parallel_sections:
  case OMPD_for_simd:
  case OMPD_parallel_for_simd:
  case OMPD_cancel:
  case OMPD_cancellation_point:
  case OMPD_ordered:
  case OMPD_threadprivate:
  case OMPD_allocate:
  case OMPD_task:
  case OMPD_simd:
  case OMPD_sections:
  case OMPD_section:
  case OMPD_single:
  case OMPD_master:
  case OMPD_critical:
  case OMPD_taskyield:
  case OMPD_barrier:
  case OMPD_taskwait:
  case OMPD_taskgroup:
  case OMPD_atomic:
  case OMPD_flush:
  case OMPD_depobj:
  case OMPD_scan:
  case OMPD_teams:
  case OMPD_target_data:
  case OMPD_target_exit_data:
  case OMPD_target_enter_data:
  case OMPD_distribute:
  case OMPD_distribute_simd:
  case OMPD_distribute_parallel_for:
  case OMPD_distribute_parallel_for_simd:
  case OMPD_teams_distribute:
  case OMPD_teams_distribute_simd:
  case OMPD_teams_distribute_parallel_for:
  case OMPD_teams_distribute_parallel_for_simd:
  case OMPD_target_update:
  case OMPD_declare_simd:
  case OMPD_declare_variant:
  case OMPD_begin_declare_variant:
  case OMPD_end_declare_variant:
  case OMPD_declare_target:
  case OMPD_end_declare_target:
  case OMPD_declare_reduction:
  case OMPD_declare_mapper:
  case OMPD_taskloop:
  case OMPD_taskloop_simd:
  case OMPD_master_taskloop:
  case OMPD_master_taskloop_simd:
  case OMPD_parallel_master_taskloop:
  case OMPD_parallel_master_taskloop_simd:
  case OMPD_requires:
  case OMPD_unknown:
  default:
    break;
  }
  llvm_unreachable(
      "Unknown programming model for OpenMP directive on NVPTX target.");
}

void CGOpenMPRuntimeGPU::emitNonSPMDKernel(const OMPExecutableDirective &D,
                                           StringRef ParentName,
                                           llvm::Function *&OutlinedFn,
                                           llvm::Constant *&OutlinedFnID,
                                           bool IsOffloadEntry,
                                           const RegionCodeGenTy &CodeGen) {
  ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode);
  EntryFunctionState EST;
  WorkerFunctionState WST(CGM, D.getBeginLoc());
  Work.clear();
  WrapperFunctionsMap.clear();

  // Emit target region as a standalone region.
  class NVPTXPrePostActionTy : public PrePostActionTy {
    CGOpenMPRuntimeGPU::EntryFunctionState &EST;
    CGOpenMPRuntimeGPU::WorkerFunctionState &WST;

  public:
    NVPTXPrePostActionTy(CGOpenMPRuntimeGPU::EntryFunctionState &EST,
                         CGOpenMPRuntimeGPU::WorkerFunctionState &WST)
        : EST(EST), WST(WST) {}
    void Enter(CodeGenFunction &CGF) override {
      auto &RT =
          static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
      RT.emitNonSPMDEntryHeader(CGF, EST, WST);
      // Skip target region initialization.
      RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
    }
    void Exit(CodeGenFunction &CGF) override {
      auto &RT =
          static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
      RT.clearLocThreadIdInsertPt(CGF);
      RT.emitNonSPMDEntryFooter(CGF, EST);
    }
  } Action(EST, WST);
  CodeGen.setAction(Action);
  IsInTTDRegion = true;
  // Reserve a place for the globalized memory.
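  // KernelStaticGlobalized (created below if not done yet) is a pointer in
  // shared memory that the runtime points at the statically allocated block
  // of globalized storage for this kernel.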
  GlobalizedRecords.emplace_back();
  if (!KernelStaticGlobalized) {
    KernelStaticGlobalized = new llvm::GlobalVariable(
        CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false,
        llvm::GlobalValue::InternalLinkage,
        llvm::UndefValue::get(CGM.VoidPtrTy),
        "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr,
        llvm::GlobalValue::NotThreadLocal,
        CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
  }
  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
                                   IsOffloadEntry, CodeGen);
  IsInTTDRegion = false;

  // Now change the name of the worker function to correspond to this target
  // region's entry function.
  WST.WorkerFn->setName(Twine(OutlinedFn->getName(), "_worker"));

  // Create the worker function.
  emitWorkerFunction(WST);
}

// Set up NVPTX threads for the master-worker OpenMP scheme.
void CGOpenMPRuntimeGPU::emitNonSPMDEntryHeader(CodeGenFunction &CGF,
                                                EntryFunctionState &EST,
                                                WorkerFunctionState &WST) {
  CGBuilderTy &Bld = CGF.Builder;

  llvm::BasicBlock *WorkerBB = CGF.createBasicBlock(".worker");
  llvm::BasicBlock *MasterCheckBB = CGF.createBasicBlock(".mastercheck");
  llvm::BasicBlock *MasterBB = CGF.createBasicBlock(".master");
  EST.ExitBB = CGF.createBasicBlock(".exit");

  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
  llvm::Value *GPUThreadID = RT.getGPUThreadID(CGF);
  llvm::Value *ThreadLimit = getThreadLimit(CGF);
  llvm::Value *IsWorker = Bld.CreateICmpULT(GPUThreadID, ThreadLimit);
  Bld.CreateCondBr(IsWorker, WorkerBB, MasterCheckBB);

  CGF.EmitBlock(WorkerBB);
  emitCall(CGF, WST.Loc, WST.WorkerFn);
  CGF.EmitBranch(EST.ExitBB);

  CGF.EmitBlock(MasterCheckBB);
  GPUThreadID = RT.getGPUThreadID(CGF);
  llvm::Value *MasterThreadID = getMasterThreadID(CGF);
  llvm::Value *IsMaster = Bld.CreateICmpEQ(GPUThreadID, MasterThreadID);
  Bld.CreateCondBr(IsMaster, MasterBB, EST.ExitBB);

  CGF.EmitBlock(MasterBB);
  IsInTargetMasterThreadRegion = true;
  // SEQUENTIAL (MASTER) REGION START
  // First action in sequential region:
  // Initialize the state of the OpenMP runtime library on the GPU.
  // TODO: Optimize runtime initialization and pass in correct value.
  llvm::Value *Args[] = {getThreadLimit(CGF),
                         Bld.getInt16(/*RequiresOMPRuntime=*/1)};
  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                          CGM.getModule(), OMPRTL___kmpc_kernel_init),
                      Args);

  // For data sharing, we need to initialize the stack.
  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
      CGM.getModule(), OMPRTL___kmpc_data_sharing_init_stack));

  emitGenericVarsProlog(CGF, WST.Loc);
}

void CGOpenMPRuntimeGPU::emitNonSPMDEntryFooter(CodeGenFunction &CGF,
                                                EntryFunctionState &EST) {
  IsInTargetMasterThreadRegion = false;
  if (!CGF.HaveInsertPoint())
    return;

  emitGenericVarsEpilog(CGF);

  if (!EST.ExitBB)
    EST.ExitBB = CGF.createBasicBlock(".exit");

  llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".termination.notifier");
  CGF.EmitBranch(TerminateBB);

  CGF.EmitBlock(TerminateBB);
  // Signal termination condition.
  // TODO: Optimize runtime initialization and pass in correct value.
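  // Only the master thread reaches this point; the worker threads are
  // released by the barrier emitted below.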
  llvm::Value *Args[] = {CGF.Builder.getInt16(/*IsOMPRuntimeInitialized=*/1)};
  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                          CGM.getModule(), OMPRTL___kmpc_kernel_deinit),
                      Args);
  // Barrier to terminate worker threads.
  syncCTAThreads(CGF);
  // Master thread jumps to exit point.
  CGF.EmitBranch(EST.ExitBB);

  CGF.EmitBlock(EST.ExitBB);
  EST.ExitBB = nullptr;
}

void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective &D,
                                        StringRef ParentName,
                                        llvm::Function *&OutlinedFn,
                                        llvm::Constant *&OutlinedFnID,
                                        bool IsOffloadEntry,
                                        const RegionCodeGenTy &CodeGen) {
  ExecutionRuntimeModesRAII ModeRAII(
      CurrentExecutionMode, RequiresFullRuntime,
      CGM.getLangOpts().OpenMPCUDAForceFullRuntime ||
          !supportsLightweightRuntime(CGM.getContext(), D));
  EntryFunctionState EST;

  // Emit target region as a standalone region.
  class NVPTXPrePostActionTy : public PrePostActionTy {
    CGOpenMPRuntimeGPU &RT;
    CGOpenMPRuntimeGPU::EntryFunctionState &EST;
    const OMPExecutableDirective &D;

  public:
    NVPTXPrePostActionTy(CGOpenMPRuntimeGPU &RT,
                         CGOpenMPRuntimeGPU::EntryFunctionState &EST,
                         const OMPExecutableDirective &D)
        : RT(RT), EST(EST), D(D) {}
    void Enter(CodeGenFunction &CGF) override {
      RT.emitSPMDEntryHeader(CGF, EST, D);
      // Skip target region initialization.
      RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
    }
    void Exit(CodeGenFunction &CGF) override {
      RT.clearLocThreadIdInsertPt(CGF);
      RT.emitSPMDEntryFooter(CGF, EST);
    }
  } Action(*this, EST, D);
  CodeGen.setAction(Action);
  IsInTTDRegion = true;
  // Reserve a place for the globalized memory.
  GlobalizedRecords.emplace_back();
  if (!KernelStaticGlobalized) {
    KernelStaticGlobalized = new llvm::GlobalVariable(
        CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false,
        llvm::GlobalValue::InternalLinkage,
        llvm::UndefValue::get(CGM.VoidPtrTy),
        "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr,
        llvm::GlobalValue::NotThreadLocal,
        CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
  }
  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
                                   IsOffloadEntry, CodeGen);
  IsInTTDRegion = false;
}

void CGOpenMPRuntimeGPU::emitSPMDEntryHeader(
    CodeGenFunction &CGF, EntryFunctionState &EST,
    const OMPExecutableDirective &D) {
  CGBuilderTy &Bld = CGF.Builder;

  // Set up BBs in entry function.
  llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute");
  EST.ExitBB = CGF.createBasicBlock(".exit");

  llvm::Value *Args[] = {getThreadLimit(CGF, /*IsInSPMDExecutionMode=*/true),
                         /*RequiresOMPRuntime=*/
                         Bld.getInt16(RequiresFullRuntime ? 1 : 0)};
  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                          CGM.getModule(), OMPRTL___kmpc_spmd_kernel_init),
                      Args);

  if (RequiresFullRuntime) {
    // For data sharing, we need to initialize the stack.
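    // The _spmd variant of the stack initializer is used because in SPMD
    // mode all threads of the team are active from kernel start.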
    CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
        CGM.getModule(), OMPRTL___kmpc_data_sharing_init_stack_spmd));
  }

  CGF.EmitBranch(ExecuteBB);

  CGF.EmitBlock(ExecuteBB);

  IsInTargetMasterThreadRegion = true;
}

void CGOpenMPRuntimeGPU::emitSPMDEntryFooter(CodeGenFunction &CGF,
                                             EntryFunctionState &EST) {
  IsInTargetMasterThreadRegion = false;
  if (!CGF.HaveInsertPoint())
    return;

  if (!EST.ExitBB)
    EST.ExitBB = CGF.createBasicBlock(".exit");

  llvm::BasicBlock *OMPDeInitBB = CGF.createBasicBlock(".omp.deinit");
  CGF.EmitBranch(OMPDeInitBB);

  CGF.EmitBlock(OMPDeInitBB);
  // De-initialize the OMP state in the runtime; called by all active threads.
  llvm::Value *Args[] = {/*RequiresOMPRuntime=*/
                         CGF.Builder.getInt16(RequiresFullRuntime ? 1 : 0)};
  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                          CGM.getModule(), OMPRTL___kmpc_spmd_kernel_deinit_v2),
                      Args);
  CGF.EmitBranch(EST.ExitBB);

  CGF.EmitBlock(EST.ExitBB);
  EST.ExitBB = nullptr;
}

// Create a unique global variable to indicate the execution mode of this
// target region. The execution mode is either 'generic', or 'spmd' depending
// on the target directive. This variable is picked up by the offload library
// to set up the device appropriately before kernel launch. If the execution
// mode is 'generic', the runtime reserves one warp for the master, otherwise,
// all warps participate in parallel work.
static void setPropertyExecutionMode(CodeGenModule &CGM, StringRef Name,
                                     bool Mode) {
  auto *GVMode =
      new llvm::GlobalVariable(CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true,
                               llvm::GlobalValue::WeakAnyLinkage,
                               llvm::ConstantInt::get(CGM.Int8Ty, Mode ? 0 : 1),
                               Twine(Name, "_exec_mode"));
  CGM.addCompilerUsedGlobal(GVMode);
}

void CGOpenMPRuntimeGPU::emitWorkerFunction(WorkerFunctionState &WST) {
  ASTContext &Ctx = CGM.getContext();

  CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
  CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, WST.WorkerFn, WST.CGFI, {},
                    WST.Loc, WST.Loc);
  emitWorkerLoop(CGF, WST);
  CGF.FinishFunction();
}

void CGOpenMPRuntimeGPU::emitWorkerLoop(CodeGenFunction &CGF,
                                        WorkerFunctionState &WST) {
  //
  // The workers enter this loop and wait for parallel work from the master.
  // When the master encounters a parallel region it sets up the work +
  // variable arguments, and wakes up the workers. The workers first check to
  // see if they are required for the parallel region, i.e., within the # of
  // requested parallel threads. The activated workers load the variable
  // arguments and execute the parallel work.
  //

  CGBuilderTy &Bld = CGF.Builder;

  llvm::BasicBlock *AwaitBB = CGF.createBasicBlock(".await.work");
  llvm::BasicBlock *SelectWorkersBB = CGF.createBasicBlock(".select.workers");
  llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute.parallel");
  llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".terminate.parallel");
  llvm::BasicBlock *BarrierBB = CGF.createBasicBlock(".barrier.parallel");
  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");

  CGF.EmitBranch(AwaitBB);

  // Workers wait for work from master.
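  // The control flow emitted below corresponds roughly to this sketch:
  //   for (;;) {
  //     barrier();                               // .await.work
  //     active = __kmpc_kernel_parallel(&work_fn);
  //     if (!work_fn) break;                     // master signalled exit
  //     if (active) work_fn(...);                // .execute.parallel
  //     barrier();                               // .barrier.parallel
  //   }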
  CGF.EmitBlock(AwaitBB);
  // Wait for parallel work.
  syncCTAThreads(CGF);

  Address WorkFn =
      CGF.CreateDefaultAlignTempAlloca(CGF.Int8PtrTy, /*Name=*/"work_fn");
  Address ExecStatus =
      CGF.CreateDefaultAlignTempAlloca(CGF.Int8Ty, /*Name=*/"exec_status");
  CGF.InitTempAlloca(ExecStatus, Bld.getInt8(/*C=*/0));
  CGF.InitTempAlloca(WorkFn, llvm::Constant::getNullValue(CGF.Int8PtrTy));

  // TODO: Optimize runtime initialization and pass in correct value.
  llvm::Value *Args[] = {WorkFn.getPointer()};
  llvm::Value *Ret =
      CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                              CGM.getModule(), OMPRTL___kmpc_kernel_parallel),
                          Args);
  Bld.CreateStore(Bld.CreateZExt(Ret, CGF.Int8Ty), ExecStatus);

  // On termination condition (workid == 0), exit loop.
  llvm::Value *WorkID = Bld.CreateLoad(WorkFn);
  llvm::Value *ShouldTerminate = Bld.CreateIsNull(WorkID, "should_terminate");
  Bld.CreateCondBr(ShouldTerminate, ExitBB, SelectWorkersBB);

  // Activate requested workers.
  CGF.EmitBlock(SelectWorkersBB);
  llvm::Value *IsActive =
      Bld.CreateIsNotNull(Bld.CreateLoad(ExecStatus), "is_active");
  Bld.CreateCondBr(IsActive, ExecuteBB, BarrierBB);

  // Signal start of parallel region.
  CGF.EmitBlock(ExecuteBB);
  // Skip initialization.
  setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);

  // Process work items: outlined parallel functions.
  for (llvm::Function *W : Work) {
    // Try to match this outlined function.
    llvm::Value *ID = Bld.CreatePointerBitCastOrAddrSpaceCast(W, CGM.Int8PtrTy);

    llvm::Value *WorkFnMatch =
        Bld.CreateICmpEQ(Bld.CreateLoad(WorkFn), ID, "work_match");

    llvm::BasicBlock *ExecuteFNBB = CGF.createBasicBlock(".execute.fn");
    llvm::BasicBlock *CheckNextBB = CGF.createBasicBlock(".check.next");
    Bld.CreateCondBr(WorkFnMatch, ExecuteFNBB, CheckNextBB);

    // Execute this outlined function.
    CGF.EmitBlock(ExecuteFNBB);

    // Insert call to work function via shared wrapper. The shared
    // wrapper takes two arguments:
    //   - the parallelism level;
    //   - the thread ID.
    emitCall(CGF, WST.Loc, W,
             {Bld.getInt16(/*ParallelLevel=*/0), getThreadID(CGF, WST.Loc)});

    // Go to end of parallel region.
    CGF.EmitBranch(TerminateBB);

    CGF.EmitBlock(CheckNextBB);
  }
  // Default case: call to outlined function through pointer if the target
  // region makes a declare target call that may contain an orphaned parallel
  // directive.
  auto *ParallelFnTy =
      llvm::FunctionType::get(CGM.VoidTy, {CGM.Int16Ty, CGM.Int32Ty},
                              /*isVarArg=*/false);
  llvm::Value *WorkFnCast =
      Bld.CreateBitCast(WorkID, ParallelFnTy->getPointerTo());
  // Insert call to work function via shared wrapper. The shared
  // wrapper takes two arguments:
  //   - the parallelism level;
  //   - the thread ID.
  emitCall(CGF, WST.Loc, {ParallelFnTy, WorkFnCast},
           {Bld.getInt16(/*ParallelLevel=*/0), getThreadID(CGF, WST.Loc)});
  // Go to end of parallel region.
  CGF.EmitBranch(TerminateBB);

  // Signal end of parallel region.
  CGF.EmitBlock(TerminateBB);
  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                          CGM.getModule(), OMPRTL___kmpc_kernel_end_parallel),
                      llvm::None);
  CGF.EmitBranch(BarrierBB);

  // All active and inactive workers wait at a barrier after the parallel
  // region.
  CGF.EmitBlock(BarrierBB);
  // Barrier after parallel region.
  syncCTAThreads(CGF);
  CGF.EmitBranch(AwaitBB);

  // Exit target region.
  CGF.EmitBlock(ExitBB);
  // Skip initialization.
  clearLocThreadIdInsertPt(CGF);
}

void CGOpenMPRuntimeGPU::createOffloadEntry(llvm::Constant *ID,
                                            llvm::Constant *Addr,
                                            uint64_t Size, int32_t,
                                            llvm::GlobalValue::LinkageTypes) {
  // TODO: Add support for global variables on the device after declare target
  // support.
  if (!isa<llvm::Function>(Addr))
    return;
  llvm::Module &M = CGM.getModule();
  llvm::LLVMContext &Ctx = CGM.getLLVMContext();

  // Get "nvvm.annotations" metadata node.
  llvm::NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");

  llvm::Metadata *MDVals[] = {
      llvm::ConstantAsMetadata::get(Addr), llvm::MDString::get(Ctx, "kernel"),
      llvm::ConstantAsMetadata::get(
          llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
  // Append metadata to nvvm.annotations.
  MD->addOperand(llvm::MDNode::get(Ctx, MDVals));
}

void CGOpenMPRuntimeGPU::emitTargetOutlinedFunction(
    const OMPExecutableDirective &D, StringRef ParentName,
    llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
    bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
  if (!IsOffloadEntry) // Nothing to do.
    return;

  assert(!ParentName.empty() && "Invalid target region parent name!");

  bool Mode = supportsSPMDExecutionMode(CGM.getContext(), D);
  if (Mode)
    emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
                   CodeGen);
  else
    emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
                      CodeGen);

  setPropertyExecutionMode(CGM, OutlinedFn->getName(), Mode);
}

namespace {
LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
/// Enum for accessing the reserved_2 field of the ident_t struct.
enum ModeFlagsTy : unsigned {
  /// Bit set to 1 when in SPMD mode.
  KMP_IDENT_SPMD_MODE = 0x01,
  /// Bit set to 1 when a simplified runtime is used.
  KMP_IDENT_SIMPLE_RT_MODE = 0x02,
  LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/KMP_IDENT_SIMPLE_RT_MODE)
};

/// The special Undefined mode: the combination of Non-SPMD mode and the
/// simplified runtime.
static const ModeFlagsTy UndefinedMode =
    (~KMP_IDENT_SPMD_MODE) & KMP_IDENT_SIMPLE_RT_MODE;
} // anonymous namespace

unsigned CGOpenMPRuntimeGPU::getDefaultLocationReserved2Flags() const {
  switch (getExecutionMode()) {
  case EM_SPMD:
    if (requiresFullRuntime())
      return KMP_IDENT_SPMD_MODE & (~KMP_IDENT_SIMPLE_RT_MODE);
    return KMP_IDENT_SPMD_MODE | KMP_IDENT_SIMPLE_RT_MODE;
  case EM_NonSPMD:
    assert(requiresFullRuntime() && "Expected full runtime.");
    return (~KMP_IDENT_SPMD_MODE) & (~KMP_IDENT_SIMPLE_RT_MODE);
  case EM_Unknown:
    return UndefinedMode;
  }
  llvm_unreachable("Unknown flags are requested.");
}

CGOpenMPRuntimeGPU::CGOpenMPRuntimeGPU(CodeGenModule &CGM)
    : CGOpenMPRuntime(CGM, "_", "$") {
  if (!CGM.getLangOpts().OpenMPIsDevice)
    llvm_unreachable("OpenMP NVPTX can only handle device code.");
}

void CGOpenMPRuntimeGPU::emitProcBindClause(CodeGenFunction &CGF,
                                            ProcBindKind ProcBind,
                                            SourceLocation Loc) {
  // Do nothing in case of SPMD mode and L0 parallel.
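  // In SPMD mode the thread configuration is fixed by the kernel launch, so
  // the clause has no effect on the device and the runtime call is skipped.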
  if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD)
    return;

  CGOpenMPRuntime::emitProcBindClause(CGF, ProcBind, Loc);
}

void CGOpenMPRuntimeGPU::emitNumThreadsClause(CodeGenFunction &CGF,
                                              llvm::Value *NumThreads,
                                              SourceLocation Loc) {
  // Do nothing in case of SPMD mode and L0 parallel.
  if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD)
    return;

  CGOpenMPRuntime::emitNumThreadsClause(CGF, NumThreads, Loc);
}

void CGOpenMPRuntimeGPU::emitNumTeamsClause(CodeGenFunction &CGF,
                                            const Expr *NumTeams,
                                            const Expr *ThreadLimit,
                                            SourceLocation Loc) {}

llvm::Function *CGOpenMPRuntimeGPU::emitParallelOutlinedFunction(
    const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
    OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
  // Emit target region as a standalone region.
  class NVPTXPrePostActionTy : public PrePostActionTy {
    bool &IsInParallelRegion;
    bool PrevIsInParallelRegion;

  public:
    NVPTXPrePostActionTy(bool &IsInParallelRegion)
        : IsInParallelRegion(IsInParallelRegion) {}
    void Enter(CodeGenFunction &CGF) override {
      PrevIsInParallelRegion = IsInParallelRegion;
      IsInParallelRegion = true;
    }
    void Exit(CodeGenFunction &CGF) override {
      IsInParallelRegion = PrevIsInParallelRegion;
    }
  } Action(IsInParallelRegion);
  CodeGen.setAction(Action);
  bool PrevIsInTTDRegion = IsInTTDRegion;
  IsInTTDRegion = false;
  bool PrevIsInTargetMasterThreadRegion = IsInTargetMasterThreadRegion;
  IsInTargetMasterThreadRegion = false;
  auto *OutlinedFun =
      cast<llvm::Function>(CGOpenMPRuntime::emitParallelOutlinedFunction(
          D, ThreadIDVar, InnermostKind, CodeGen));
  IsInTargetMasterThreadRegion = PrevIsInTargetMasterThreadRegion;
  IsInTTDRegion = PrevIsInTTDRegion;
  if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD &&
      !IsInParallelRegion) {
    llvm::Function *WrapperFun =
        createParallelDataSharingWrapper(OutlinedFun, D);
    WrapperFunctionsMap[OutlinedFun] = WrapperFun;
  }

  return OutlinedFun;
}

/// Get the list of lastprivate variables from the teams distribute ... or
/// teams {distribute ...} directives.
static void
getDistributeLastprivateVars(ASTContext &Ctx, const OMPExecutableDirective &D,
                             llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
  assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
         "expected teams directive.");
  const OMPExecutableDirective *Dir = &D;
  if (!isOpenMPDistributeDirective(D.getDirectiveKind())) {
    if (const Stmt *S = CGOpenMPRuntime::getSingleCompoundChild(
            Ctx,
            D.getInnermostCapturedStmt()->getCapturedStmt()->IgnoreContainers(
                /*IgnoreCaptured=*/true))) {
      Dir = dyn_cast_or_null<OMPExecutableDirective>(S);
      if (Dir && !isOpenMPDistributeDirective(Dir->getDirectiveKind()))
        Dir = nullptr;
    }
  }
  if (!Dir)
    return;
  for (const auto *C : Dir->getClausesOfKind<OMPLastprivateClause>()) {
    for (const Expr *E : C->getVarRefs())
      Vars.push_back(getPrivateItem(E));
  }
}

/// Get the list of reduction variables from the teams ... directives.
static void
getTeamsReductionVars(ASTContext &Ctx, const OMPExecutableDirective &D,
                      llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
  assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
         "expected teams directive.");
  for (const auto *C : D.getClausesOfKind<OMPReductionClause>()) {
    for (const Expr *E : C->privates())
      Vars.push_back(getPrivateItem(E));
  }
}

llvm::Function *CGOpenMPRuntimeGPU::emitTeamsOutlinedFunction(
    const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
    OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
  SourceLocation Loc = D.getBeginLoc();

  const RecordDecl *GlobalizedRD = nullptr;
  llvm::SmallVector<const ValueDecl *, 4> LastPrivatesReductions;
  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
  unsigned WarpSize = CGM.getTarget().getGridValue(llvm::omp::GV_Warp_Size);
  // Globalize team reduction variables unconditionally in all modes.
  if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
    getTeamsReductionVars(CGM.getContext(), D, LastPrivatesReductions);
  if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
    getDistributeLastprivateVars(CGM.getContext(), D, LastPrivatesReductions);
    if (!LastPrivatesReductions.empty()) {
      GlobalizedRD = ::buildRecordForGlobalizedVars(
          CGM.getContext(), llvm::None, LastPrivatesReductions,
          MappedDeclsFields, WarpSize);
    }
  } else if (!LastPrivatesReductions.empty()) {
    assert(!TeamAndReductions.first &&
           "Previous team declaration is not expected.");
    TeamAndReductions.first = D.getCapturedStmt(OMPD_teams)->getCapturedDecl();
    std::swap(TeamAndReductions.second, LastPrivatesReductions);
  }

  // Emit target region as a standalone region.
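  // The pre/post action installed below pairs emitGenericVarsProlog with
  // emitGenericVarsEpilog around the outlined teams body; conceptually
  // (an explanatory sketch only, the allocation strategy is chosen later
  // in emitGenericVarsProlog):
  //
  //   teams_outlined() {
  //     prolog: materialize the globalized vars (GlobalizedRD) and remap
  //             their addresses;
  //     <teams body>;
  //     epilog: pop/restore the globalized storage;
  //   }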
  class NVPTXPrePostActionTy : public PrePostActionTy {
    SourceLocation &Loc;
    const RecordDecl *GlobalizedRD;
    llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
        &MappedDeclsFields;

  public:
    NVPTXPrePostActionTy(
        SourceLocation &Loc, const RecordDecl *GlobalizedRD,
        llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
            &MappedDeclsFields)
        : Loc(Loc), GlobalizedRD(GlobalizedRD),
          MappedDeclsFields(MappedDeclsFields) {}
    void Enter(CodeGenFunction &CGF) override {
      auto &Rt =
          static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
      if (GlobalizedRD) {
        auto I = Rt.FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
        I->getSecond().GlobalRecord = GlobalizedRD;
        I->getSecond().MappedParams =
            std::make_unique<CodeGenFunction::OMPMapVars>();
        DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
        for (const auto &Pair : MappedDeclsFields) {
          assert(Pair.getFirst()->isCanonicalDecl() &&
                 "Expected canonical declaration");
          Data.insert(std::make_pair(Pair.getFirst(),
                                     MappedVarData(Pair.getSecond(),
                                                   /*IsOnePerTeam=*/true)));
        }
      }
      Rt.emitGenericVarsProlog(CGF, Loc);
    }
    void Exit(CodeGenFunction &CGF) override {
      static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime())
          .emitGenericVarsEpilog(CGF);
    }
  } Action(Loc, GlobalizedRD, MappedDeclsFields);
  CodeGen.setAction(Action);
  llvm::Function *OutlinedFun = CGOpenMPRuntime::emitTeamsOutlinedFunction(
      D, ThreadIDVar, InnermostKind, CodeGen);

  return OutlinedFun;
}

void CGOpenMPRuntimeGPU::emitGenericVarsProlog(CodeGenFunction &CGF,
                                               SourceLocation Loc,
                                               bool WithSPMDCheck) {
  if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic &&
      getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
    return;

  CGBuilderTy &Bld = CGF.Builder;

  const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
  if (I == FunctionGlobalizedDecls.end())
    return;
  if (const RecordDecl *GlobalizedVarsRecord = I->getSecond().GlobalRecord) {
    QualType GlobalRecTy = CGM.getContext().getRecordType(GlobalizedVarsRecord);
    QualType SecGlobalRecTy;

    // Recover pointer to this function's global record. The runtime will
    // handle the specifics of the allocation of the memory.
    // Use actual memory size of the record including the padding
    // for alignment purposes.
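    // For example (illustrative numbers only): a record of 20 bytes with
    // 16-byte alignment is padded below to llvm::alignTo(20, 16) == 32
    // bytes, so consecutive stack pushes of such records stay aligned.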
    unsigned Alignment =
        CGM.getContext().getTypeAlignInChars(GlobalRecTy).getQuantity();
    unsigned GlobalRecordSize =
        CGM.getContext().getTypeSizeInChars(GlobalRecTy).getQuantity();
    GlobalRecordSize = llvm::alignTo(GlobalRecordSize, Alignment);

    llvm::PointerType *GlobalRecPtrTy =
        CGF.ConvertTypeForMem(GlobalRecTy)->getPointerTo();
    llvm::Value *GlobalRecCastAddr;
    llvm::Value *IsTTD = nullptr;
    if (!IsInTTDRegion &&
        (WithSPMDCheck ||
         getExecutionMode() == CGOpenMPRuntimeGPU::EM_Unknown)) {
      llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
      llvm::BasicBlock *SPMDBB = CGF.createBasicBlock(".spmd");
      llvm::BasicBlock *NonSPMDBB = CGF.createBasicBlock(".non-spmd");
      if (I->getSecond().SecondaryGlobalRecord.hasValue()) {
        llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
        llvm::Value *ThreadID = getThreadID(CGF, Loc);
        llvm::Value *PL = CGF.EmitRuntimeCall(
            OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(),
                                                  OMPRTL___kmpc_parallel_level),
            {RTLoc, ThreadID});
        IsTTD = Bld.CreateIsNull(PL);
      }
      llvm::Value *IsSPMD = Bld.CreateIsNotNull(
          CGF.EmitNounwindRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
              CGM.getModule(), OMPRTL___kmpc_is_spmd_exec_mode)));
      Bld.CreateCondBr(IsSPMD, SPMDBB, NonSPMDBB);
      // There is no need to emit line number for unconditional branch.
      (void)ApplyDebugLocation::CreateEmpty(CGF);
      CGF.EmitBlock(SPMDBB);
      Address RecPtr = Address(llvm::ConstantPointerNull::get(GlobalRecPtrTy),
                               CharUnits::fromQuantity(Alignment));
      CGF.EmitBranch(ExitBB);
      // There is no need to emit line number for unconditional branch.
      (void)ApplyDebugLocation::CreateEmpty(CGF);
      CGF.EmitBlock(NonSPMDBB);
      llvm::Value *Size = llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize);
      if (const RecordDecl *SecGlobalizedVarsRecord =
              I->getSecond().SecondaryGlobalRecord.getValueOr(nullptr)) {
        SecGlobalRecTy =
            CGM.getContext().getRecordType(SecGlobalizedVarsRecord);

        // Recover pointer to this function's global record. The runtime will
        // handle the specifics of the allocation of the memory.
        // Use actual memory size of the record including the padding
        // for alignment purposes.
        unsigned Alignment =
            CGM.getContext().getTypeAlignInChars(SecGlobalRecTy).getQuantity();
        unsigned GlobalRecordSize =
            CGM.getContext().getTypeSizeInChars(SecGlobalRecTy).getQuantity();
        GlobalRecordSize = llvm::alignTo(GlobalRecordSize, Alignment);
        Size = Bld.CreateSelect(
            IsTTD, llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize), Size);
      }
      // TODO: allow the usage of shared memory to be controlled by
      // the user, for now, default to global.
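      // A sketch of the device-runtime entry point called below, for
      // orientation (signature as declared in the NVPTX device runtime of
      // this era; shown as an assumption, not emitted here):
      //
      //   void *__kmpc_data_sharing_coalesced_push_stack(size_t Size,
      //                                                  int16_t UseSharedMem);
      //
      // It returns the base address of the newly pushed globalized frame.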
      llvm::Value *GlobalRecordSizeArg[] = {
          Size, CGF.Builder.getInt16(/*UseSharedMemory=*/0)};
      llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(
          OMPBuilder.getOrCreateRuntimeFunction(
              CGM.getModule(), OMPRTL___kmpc_data_sharing_coalesced_push_stack),
          GlobalRecordSizeArg);
      GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
          GlobalRecValue, GlobalRecPtrTy);
      CGF.EmitBlock(ExitBB);
      auto *Phi = Bld.CreatePHI(GlobalRecPtrTy,
                                /*NumReservedValues=*/2, "_select_stack");
      Phi->addIncoming(RecPtr.getPointer(), SPMDBB);
      Phi->addIncoming(GlobalRecCastAddr, NonSPMDBB);
      GlobalRecCastAddr = Phi;
      I->getSecond().GlobalRecordAddr = Phi;
      I->getSecond().IsInSPMDModeFlag = IsSPMD;
    } else if (!CGM.getLangOpts().OpenMPCUDATargetParallel && IsInTTDRegion) {
      assert(GlobalizedRecords.back().Records.size() < 2 &&
             "Expected less than 2 globalized records: one for target and one "
             "for teams.");
      unsigned Offset = 0;
      for (const RecordDecl *RD : GlobalizedRecords.back().Records) {
        QualType RDTy = CGM.getContext().getRecordType(RD);
        unsigned Alignment =
            CGM.getContext().getTypeAlignInChars(RDTy).getQuantity();
        unsigned Size = CGM.getContext().getTypeSizeInChars(RDTy).getQuantity();
        Offset =
            llvm::alignTo(llvm::alignTo(Offset, Alignment) + Size, Alignment);
      }
      unsigned Alignment =
          CGM.getContext().getTypeAlignInChars(GlobalRecTy).getQuantity();
      Offset = llvm::alignTo(Offset, Alignment);
      GlobalizedRecords.back().Records.push_back(GlobalizedVarsRecord);
      ++GlobalizedRecords.back().RegionCounter;
      if (GlobalizedRecords.back().Records.size() == 1) {
        assert(KernelStaticGlobalized &&
               "Kernel static pointer must be initialized already.");
        auto *UseSharedMemory = new llvm::GlobalVariable(
            CGM.getModule(), CGM.Int16Ty, /*isConstant=*/true,
            llvm::GlobalValue::InternalLinkage, nullptr,
            "_openmp_static_kernel$is_shared");
        UseSharedMemory->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
        QualType Int16Ty = CGM.getContext().getIntTypeForBitwidth(
            /*DestWidth=*/16, /*Signed=*/0);
        llvm::Value *IsInSharedMemory = CGF.EmitLoadOfScalar(
            Address(UseSharedMemory,
                    CGM.getContext().getTypeAlignInChars(Int16Ty)),
            /*Volatile=*/false, Int16Ty, Loc);
        auto *StaticGlobalized = new llvm::GlobalVariable(
            CGM.getModule(), CGM.Int8Ty, /*isConstant=*/false,
            llvm::GlobalValue::CommonLinkage, nullptr);
        auto *RecSize = new llvm::GlobalVariable(
            CGM.getModule(), CGM.SizeTy, /*isConstant=*/true,
            llvm::GlobalValue::InternalLinkage, nullptr,
            "_openmp_static_kernel$size");
        RecSize->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
        llvm::Value *Ld = CGF.EmitLoadOfScalar(
            Address(RecSize, CGM.getSizeAlign()), /*Volatile=*/false,
            CGM.getContext().getSizeType(), Loc);
        llvm::Value *ResAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
            KernelStaticGlobalized, CGM.VoidPtrPtrTy);
        llvm::Value *GlobalRecordSizeArg[] = {
            llvm::ConstantInt::get(
                CGM.Int16Ty,
                getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD ?
                    1 : 0),
            StaticGlobalized, Ld, IsInSharedMemory, ResAddr};
        CGF.EmitRuntimeCall(
            OMPBuilder.getOrCreateRuntimeFunction(
                CGM.getModule(), OMPRTL___kmpc_get_team_static_memory),
            GlobalRecordSizeArg);
        GlobalizedRecords.back().Buffer = StaticGlobalized;
        GlobalizedRecords.back().RecSize = RecSize;
        GlobalizedRecords.back().UseSharedMemory = UseSharedMemory;
        GlobalizedRecords.back().Loc = Loc;
      }
      assert(KernelStaticGlobalized && "Global address must be set already.");
      Address FrameAddr = CGF.EmitLoadOfPointer(
          Address(KernelStaticGlobalized, CGM.getPointerAlign()),
          CGM.getContext()
              .getPointerType(CGM.getContext().VoidPtrTy)
              .castAs<PointerType>());
      llvm::Value *GlobalRecValue =
          Bld.CreateConstInBoundsGEP(FrameAddr, Offset).getPointer();
      I->getSecond().GlobalRecordAddr = GlobalRecValue;
      I->getSecond().IsInSPMDModeFlag = nullptr;
      GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
          GlobalRecValue, CGF.ConvertTypeForMem(GlobalRecTy)->getPointerTo());
    } else {
      // TODO: allow the usage of shared memory to be controlled by
      // the user, for now, default to global.
      bool UseSharedMemory =
          IsInTTDRegion && GlobalRecordSize <= SharedMemorySize;
      llvm::Value *GlobalRecordSizeArg[] = {
          llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize),
          CGF.Builder.getInt16(UseSharedMemory ? 1 : 0)};
      llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(
          OMPBuilder.getOrCreateRuntimeFunction(
              CGM.getModule(),
              IsInTTDRegion ? OMPRTL___kmpc_data_sharing_push_stack
                            : OMPRTL___kmpc_data_sharing_coalesced_push_stack),
          GlobalRecordSizeArg);
      GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
          GlobalRecValue, GlobalRecPtrTy);
      I->getSecond().GlobalRecordAddr = GlobalRecValue;
      I->getSecond().IsInSPMDModeFlag = nullptr;
    }
    LValue Base =
        CGF.MakeNaturalAlignPointeeAddrLValue(GlobalRecCastAddr, GlobalRecTy);

    // Emit the "global alloca" which is a GEP from the global declaration
    // record using the pointer returned by the runtime.
    LValue SecBase;
    decltype(I->getSecond().LocalVarData)::const_iterator SecIt;
    if (IsTTD) {
      SecIt = I->getSecond().SecondaryLocalVarData->begin();
      llvm::PointerType *SecGlobalRecPtrTy =
          CGF.ConvertTypeForMem(SecGlobalRecTy)->getPointerTo();
      SecBase = CGF.MakeNaturalAlignPointeeAddrLValue(
          Bld.CreatePointerBitCastOrAddrSpaceCast(
              I->getSecond().GlobalRecordAddr, SecGlobalRecPtrTy),
          SecGlobalRecTy);
    }
    for (auto &Rec : I->getSecond().LocalVarData) {
      bool EscapedParam = I->getSecond().EscapedParameters.count(Rec.first);
      llvm::Value *ParValue;
      if (EscapedParam) {
        const auto *VD = cast<VarDecl>(Rec.first);
        LValue ParLVal =
            CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD), VD->getType());
        ParValue = CGF.EmitLoadOfScalar(ParLVal, Loc);
      }
      LValue VarAddr = CGF.EmitLValueForField(Base, Rec.second.FD);
      // Emit VarAddr based on lane-id if required.
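      // (Sketch: one-per-team fields are used through their field address
      // directly; per-lane fields live in a [WarpSize] array inside the
      // record and each lane addresses its own slot, roughly
      // &field[lane_id], via the GEP with getNVPTXLaneID below.)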
      QualType VarTy;
      if (Rec.second.IsOnePerTeam) {
        VarTy = Rec.second.FD->getType();
      } else {
        Address Addr = VarAddr.getAddress(CGF);
        llvm::Value *Ptr = CGF.Builder.CreateInBoundsGEP(
            Addr.getElementType(), Addr.getPointer(),
            {Bld.getInt32(0), getNVPTXLaneID(CGF)});
        VarTy =
            Rec.second.FD->getType()->castAsArrayTypeUnsafe()->getElementType();
        VarAddr = CGF.MakeAddrLValue(
            Address(Ptr, CGM.getContext().getDeclAlign(Rec.first)), VarTy,
            AlignmentSource::Decl);
      }
      Rec.second.PrivateAddr = VarAddr.getAddress(CGF);
      if (!IsInTTDRegion &&
          (WithSPMDCheck ||
           getExecutionMode() == CGOpenMPRuntimeGPU::EM_Unknown)) {
        assert(I->getSecond().IsInSPMDModeFlag &&
               "Expected unknown execution mode or required SPMD check.");
        if (IsTTD) {
          assert(SecIt->second.IsOnePerTeam &&
                 "Secondary globalized data must be one per team.");
          LValue SecVarAddr = CGF.EmitLValueForField(SecBase, SecIt->second.FD);
          VarAddr.setAddress(
              Address(Bld.CreateSelect(IsTTD, SecVarAddr.getPointer(CGF),
                                       VarAddr.getPointer(CGF)),
                      VarAddr.getAlignment()));
          Rec.second.PrivateAddr = VarAddr.getAddress(CGF);
        }
        Address GlobalPtr = Rec.second.PrivateAddr;
        Address LocalAddr = CGF.CreateMemTemp(VarTy, Rec.second.FD->getName());
        Rec.second.PrivateAddr = Address(
            Bld.CreateSelect(I->getSecond().IsInSPMDModeFlag,
                             LocalAddr.getPointer(), GlobalPtr.getPointer()),
            LocalAddr.getAlignment());
      }
      if (EscapedParam) {
        const auto *VD = cast<VarDecl>(Rec.first);
        CGF.EmitStoreOfScalar(ParValue, VarAddr);
        I->getSecond().MappedParams->setVarAddr(CGF, VD,
                                                VarAddr.getAddress(CGF));
      }
      if (IsTTD)
        ++SecIt;
    }
  }
  for (const ValueDecl *VD : I->getSecond().EscapedVariableLengthDecls) {
    // Recover pointer to this function's global record. The runtime will
    // handle the specifics of the allocation of the memory.
    // Use actual memory size of the record including the padding
    // for alignment purposes.
    CGBuilderTy &Bld = CGF.Builder;
    llvm::Value *Size = CGF.getTypeSize(VD->getType());
    CharUnits Align = CGM.getContext().getDeclAlign(VD);
    Size = Bld.CreateNUWAdd(
        Size, llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity() - 1));
    llvm::Value *AlignVal =
        llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity());
    Size = Bld.CreateUDiv(Size, AlignVal);
    Size = Bld.CreateNUWMul(Size, AlignVal);
    // TODO: allow the usage of shared memory to be controlled by
    // the user, for now, default to global.
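    // (The NUWAdd/UDiv/NUWMul sequence above is the classic round-up
    //   Size = ((Size + Align - 1) / Align) * Align;
    // e.g. Size = 13 with Align = 8 yields 16. Illustrative numbers only.)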
    llvm::Value *GlobalRecordSizeArg[] = {
        Size, CGF.Builder.getInt16(/*UseSharedMemory=*/0)};
    llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(
        OMPBuilder.getOrCreateRuntimeFunction(
            CGM.getModule(), OMPRTL___kmpc_data_sharing_coalesced_push_stack),
        GlobalRecordSizeArg);
    llvm::Value *GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
        GlobalRecValue, CGF.ConvertTypeForMem(VD->getType())->getPointerTo());
    LValue Base = CGF.MakeAddrLValue(GlobalRecCastAddr, VD->getType(),
                                     CGM.getContext().getDeclAlign(VD),
                                     AlignmentSource::Decl);
    I->getSecond().MappedParams->setVarAddr(CGF, cast<VarDecl>(VD),
                                            Base.getAddress(CGF));
    I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(GlobalRecValue);
  }
  I->getSecond().MappedParams->apply(CGF);
}

void CGOpenMPRuntimeGPU::emitGenericVarsEpilog(CodeGenFunction &CGF,
                                               bool WithSPMDCheck) {
  if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic &&
      getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
    return;

  const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
  if (I != FunctionGlobalizedDecls.end()) {
    I->getSecond().MappedParams->restore(CGF);
    if (!CGF.HaveInsertPoint())
      return;
    for (llvm::Value *Addr :
         llvm::reverse(I->getSecond().EscapedVariableLengthDeclsAddrs)) {
      CGF.EmitRuntimeCall(
          OMPBuilder.getOrCreateRuntimeFunction(
              CGM.getModule(), OMPRTL___kmpc_data_sharing_pop_stack),
          Addr);
    }
    if (I->getSecond().GlobalRecordAddr) {
      if (!IsInTTDRegion &&
          (WithSPMDCheck ||
           getExecutionMode() == CGOpenMPRuntimeGPU::EM_Unknown)) {
        CGBuilderTy &Bld = CGF.Builder;
        llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
        llvm::BasicBlock *NonSPMDBB = CGF.createBasicBlock(".non-spmd");
        Bld.CreateCondBr(I->getSecond().IsInSPMDModeFlag, ExitBB, NonSPMDBB);
        // There is no need to emit line number for unconditional branch.
        (void)ApplyDebugLocation::CreateEmpty(CGF);
        CGF.EmitBlock(NonSPMDBB);
        CGF.EmitRuntimeCall(
            OMPBuilder.getOrCreateRuntimeFunction(
                CGM.getModule(), OMPRTL___kmpc_data_sharing_pop_stack),
            CGF.EmitCastToVoidPtr(I->getSecond().GlobalRecordAddr));
        CGF.EmitBlock(ExitBB);
      } else if (!CGM.getLangOpts().OpenMPCUDATargetParallel && IsInTTDRegion) {
        assert(GlobalizedRecords.back().RegionCounter > 0 &&
               "region counter must be > 0.");
        --GlobalizedRecords.back().RegionCounter;
        // Emit the restore function only in the target region.
        if (GlobalizedRecords.back().RegionCounter == 0) {
          QualType Int16Ty = CGM.getContext().getIntTypeForBitwidth(
              /*DestWidth=*/16, /*Signed=*/0);
          llvm::Value *IsInSharedMemory = CGF.EmitLoadOfScalar(
              Address(GlobalizedRecords.back().UseSharedMemory,
                      CGM.getContext().getTypeAlignInChars(Int16Ty)),
              /*Volatile=*/false, Int16Ty, GlobalizedRecords.back().Loc);
          llvm::Value *Args[] = {
              llvm::ConstantInt::get(
                  CGM.Int16Ty,
                  getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD ?
                      1 : 0),
              IsInSharedMemory};
          CGF.EmitRuntimeCall(
              OMPBuilder.getOrCreateRuntimeFunction(
                  CGM.getModule(), OMPRTL___kmpc_restore_team_static_memory),
              Args);
        }
      } else {
        CGF.EmitRuntimeCall(
            OMPBuilder.getOrCreateRuntimeFunction(
                CGM.getModule(), OMPRTL___kmpc_data_sharing_pop_stack),
            I->getSecond().GlobalRecordAddr);
      }
    }
  }
}

void CGOpenMPRuntimeGPU::emitTeamsCall(CodeGenFunction &CGF,
                                       const OMPExecutableDirective &D,
                                       SourceLocation Loc,
                                       llvm::Function *OutlinedFn,
                                       ArrayRef<llvm::Value *> CapturedVars) {
  if (!CGF.HaveInsertPoint())
    return;

  Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
                                                      /*Name=*/".zero.addr");
  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
  llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
  OutlinedFnArgs.push_back(emitThreadIDAddress(CGF, Loc).getPointer());
  OutlinedFnArgs.push_back(ZeroAddr.getPointer());
  OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
  emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
}

void CGOpenMPRuntimeGPU::emitParallelCall(CodeGenFunction &CGF,
                                          SourceLocation Loc,
                                          llvm::Function *OutlinedFn,
                                          ArrayRef<llvm::Value *> CapturedVars,
                                          const Expr *IfCond) {
  if (!CGF.HaveInsertPoint())
    return;

  auto &&ParallelGen = [this, Loc, OutlinedFn, CapturedVars,
                        IfCond](CodeGenFunction &CGF, PrePostActionTy &Action) {
    CGBuilderTy &Bld = CGF.Builder;
    llvm::Function *WFn = WrapperFunctionsMap[OutlinedFn];
    llvm::Value *ID = llvm::ConstantPointerNull::get(CGM.Int8PtrTy);
    if (WFn) {
      ID = Bld.CreateBitOrPointerCast(WFn, CGM.Int8PtrTy);
      // Remember for post-processing in worker loop.
      Work.emplace_back(WFn);
    }
    llvm::Value *FnPtr = Bld.CreateBitOrPointerCast(OutlinedFn, CGM.Int8PtrTy);

    // Create a private scope that will globalize the arguments
    // passed from the outside of the target region.
    // TODO: Is that needed?
    CodeGenFunction::OMPPrivateScope PrivateArgScope(CGF);

    Address CapturedVarsAddrs = CGF.CreateDefaultAlignTempAlloca(
        llvm::ArrayType::get(CGM.VoidPtrTy, CapturedVars.size()),
        "captured_vars_addrs");
    // There's something to share.
    if (!CapturedVars.empty()) {
      // Prepare for parallel region. Indicate the outlined function.
      ASTContext &Ctx = CGF.getContext();
      unsigned Idx = 0;
      for (llvm::Value *V : CapturedVars) {
        Address Dst = Bld.CreateConstArrayGEP(CapturedVarsAddrs, Idx);
        llvm::Value *PtrV;
        if (V->getType()->isIntegerTy())
          PtrV = Bld.CreateIntToPtr(V, CGF.VoidPtrTy);
        else
          PtrV = Bld.CreatePointerBitCastOrAddrSpaceCast(V, CGF.VoidPtrTy);
        CGF.EmitStoreOfScalar(PtrV, Dst, /*Volatile=*/false,
                              Ctx.getPointerType(Ctx.VoidPtrTy));
        ++Idx;
      }
    }

    llvm::Value *IfCondVal = nullptr;
    if (IfCond)
      IfCondVal = Bld.CreateIntCast(CGF.EvaluateExprAsBool(IfCond), CGF.Int32Ty,
                                    /* isSigned */ false);
    else
      IfCondVal = llvm::ConstantInt::get(CGF.Int32Ty, 1);

    assert(IfCondVal && "Expected a value");
    llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
    llvm::Value *Args[] = {
        RTLoc,
        getThreadID(CGF, Loc),
        IfCondVal,
        llvm::ConstantInt::get(CGF.Int32Ty, -1),
        llvm::ConstantInt::get(CGF.Int32Ty, -1),
        FnPtr,
        ID,
        Bld.CreateBitOrPointerCast(CapturedVarsAddrs.getPointer(),
                                   CGF.VoidPtrPtrTy),
        llvm::ConstantInt::get(CGM.SizeTy, CapturedVars.size())};
    CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                            CGM.getModule(), OMPRTL___kmpc_parallel_51),
                        Args);
  };

  RegionCodeGenTy RCG(ParallelGen);
  RCG(CGF);
}

void CGOpenMPRuntimeGPU::syncCTAThreads(CodeGenFunction &CGF) {
  // Always emit simple barriers!
  if (!CGF.HaveInsertPoint())
    return;
  // Build call __kmpc_barrier_simple_spmd(nullptr, 0);
  // This function does not use parameters, so we can emit just default values.
  llvm::Value *Args[] = {
      llvm::ConstantPointerNull::get(
          cast<llvm::PointerType>(getIdentTyPointerTy())),
      llvm::ConstantInt::get(CGF.Int32Ty, /*V=*/0, /*isSigned=*/true)};
  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                          CGM.getModule(), OMPRTL___kmpc_barrier_simple_spmd),
                      Args);
}

void CGOpenMPRuntimeGPU::emitBarrierCall(CodeGenFunction &CGF,
                                         SourceLocation Loc,
                                         OpenMPDirectiveKind Kind, bool,
                                         bool) {
  // Always emit simple barriers!
  if (!CGF.HaveInsertPoint())
    return;
  // Build call __kmpc_barrier(loc, thread_id);
  unsigned Flags = getDefaultFlagsForBarriers(Kind);
  llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc, Flags),
                         getThreadID(CGF, Loc)};

  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                          CGM.getModule(), OMPRTL___kmpc_barrier),
                      Args);
}

void CGOpenMPRuntimeGPU::emitCriticalRegion(
    CodeGenFunction &CGF, StringRef CriticalName,
    const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc,
    const Expr *Hint) {
  llvm::BasicBlock *LoopBB = CGF.createBasicBlock("omp.critical.loop");
  llvm::BasicBlock *TestBB = CGF.createBasicBlock("omp.critical.test");
  llvm::BasicBlock *SyncBB = CGF.createBasicBlock("omp.critical.sync");
  llvm::BasicBlock *BodyBB = CGF.createBasicBlock("omp.critical.body");
  llvm::BasicBlock *ExitBB = CGF.createBasicBlock("omp.critical.exit");

  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());

  // Get the mask of active threads in the warp.
  llvm::Value *Mask = CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
      CGM.getModule(), OMPRTL___kmpc_warp_active_thread_mask));
  // Fetch team-local id of the thread.
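  // (The blocks emitted below serialize the critical body across the team;
  // as a sketch of the emitted control flow, not literal code:
  //
  //   for (Counter = 0; Counter < TeamWidth; ++Counter) {
  //     if (ThreadID == Counter)
  //       <critical body>;
  //     __kmpc_syncwarp(Mask); // reconverge before the next iteration
  //   }
  //
  // ThreadID and TeamWidth are fetched next.)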
  llvm::Value *ThreadID = RT.getGPUThreadID(CGF);

  // Get the width of the team.
  llvm::Value *TeamWidth = RT.getGPUNumThreads(CGF);

  // Initialize the counter variable for the loop.
  QualType Int32Ty =
      CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/0);
  Address Counter = CGF.CreateMemTemp(Int32Ty, "critical_counter");
  LValue CounterLVal = CGF.MakeAddrLValue(Counter, Int32Ty);
  CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.Int32Ty), CounterLVal,
                        /*isInit=*/true);

  // Block checks if the loop counter exceeds the upper bound.
  CGF.EmitBlock(LoopBB);
  llvm::Value *CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
  llvm::Value *CmpLoopBound = CGF.Builder.CreateICmpSLT(CounterVal, TeamWidth);
  CGF.Builder.CreateCondBr(CmpLoopBound, TestBB, ExitBB);

  // Block tests which single thread should execute the region, and which
  // threads should go straight to the synchronisation point.
  CGF.EmitBlock(TestBB);
  CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
  llvm::Value *CmpThreadToCounter =
      CGF.Builder.CreateICmpEQ(ThreadID, CounterVal);
  CGF.Builder.CreateCondBr(CmpThreadToCounter, BodyBB, SyncBB);

  // Block emits the body of the critical region.
  CGF.EmitBlock(BodyBB);

  // Output the critical statement.
  CGOpenMPRuntime::emitCriticalRegion(CGF, CriticalName, CriticalOpGen, Loc,
                                      Hint);

  // After the body, the single executing thread jumps to the synchronisation
  // point, where all threads in the current team wait; the counter is then
  // incremented and control returns to the loop.
  CGF.EmitBlock(SyncBB);
  // Reconverge active threads in the warp.
  (void)CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                                CGM.getModule(), OMPRTL___kmpc_syncwarp),
                            Mask);

  llvm::Value *IncCounterVal =
      CGF.Builder.CreateNSWAdd(CounterVal, CGF.Builder.getInt32(1));
  CGF.EmitStoreOfScalar(IncCounterVal, CounterLVal);
  CGF.EmitBranch(LoopBB);

  // Block that is reached when all threads in the team complete the region.
  CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
}

/// Cast value to the specified type.
static llvm::Value *castValueToType(CodeGenFunction &CGF, llvm::Value *Val,
                                    QualType ValTy, QualType CastTy,
                                    SourceLocation Loc) {
  assert(!CGF.getContext().getTypeSizeInChars(CastTy).isZero() &&
         "Cast type must be sized.");
  assert(!CGF.getContext().getTypeSizeInChars(ValTy).isZero() &&
         "Val type must be sized.");
  llvm::Type *LLVMCastTy = CGF.ConvertTypeForMem(CastTy);
  if (ValTy == CastTy)
    return Val;
  if (CGF.getContext().getTypeSizeInChars(ValTy) ==
      CGF.getContext().getTypeSizeInChars(CastTy))
    return CGF.Builder.CreateBitCast(Val, LLVMCastTy);
  if (CastTy->isIntegerType() && ValTy->isIntegerType())
    return CGF.Builder.CreateIntCast(Val, LLVMCastTy,
                                     CastTy->hasSignedIntegerRepresentation());
  Address CastItem = CGF.CreateMemTemp(CastTy);
  Address ValCastItem = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
      CastItem, Val->getType()->getPointerTo(CastItem.getAddressSpace()));
  CGF.EmitStoreOfScalar(Val, ValCastItem, /*Volatile=*/false, ValTy,
                        LValueBaseInfo(AlignmentSource::Type),
                        TBAAAccessInfo());
  return CGF.EmitLoadOfScalar(CastItem, /*Volatile=*/false, CastTy, Loc,
                              LValueBaseInfo(AlignmentSource::Type),
                              TBAAAccessInfo());
}

/// This function creates calls to one of two shuffle functions to copy
/// variables between lanes in a warp.
static llvm::Value *createRuntimeShuffleFunction(CodeGenFunction &CGF,
                                                 llvm::Value *Elem,
                                                 QualType ElemType,
                                                 llvm::Value *Offset,
                                                 SourceLocation Loc) {
  CodeGenModule &CGM = CGF.CGM;
  CGBuilderTy &Bld = CGF.Builder;
  CGOpenMPRuntimeGPU &RT =
      *(static_cast<CGOpenMPRuntimeGPU *>(&CGM.getOpenMPRuntime()));
  llvm::OpenMPIRBuilder &OMPBuilder = RT.getOMPBuilder();

  CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
  assert(Size.getQuantity() <= 8 &&
         "Unsupported bitwidth in shuffle instruction.");

  RuntimeFunction ShuffleFn = Size.getQuantity() <= 4
                                  ? OMPRTL___kmpc_shuffle_int32
                                  : OMPRTL___kmpc_shuffle_int64;

  // Cast all types to 32- or 64-bit values before calling shuffle routines.
  QualType CastTy = CGF.getContext().getIntTypeForBitwidth(
      Size.getQuantity() <= 4 ? 32 : 64, /*Signed=*/1);
  llvm::Value *ElemCast = castValueToType(CGF, Elem, ElemType, CastTy, Loc);
  llvm::Value *WarpSize =
      Bld.CreateIntCast(RT.getGPUWarpSize(CGF), CGM.Int16Ty, /*isSigned=*/true);

  llvm::Value *ShuffledVal = CGF.EmitRuntimeCall(
      OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(), ShuffleFn),
      {ElemCast, Offset, WarpSize});

  return castValueToType(CGF, ShuffledVal, CastTy, ElemType, Loc);
}

static void shuffleAndStore(CodeGenFunction &CGF, Address SrcAddr,
                            Address DestAddr, QualType ElemType,
                            llvm::Value *Offset, SourceLocation Loc) {
  CGBuilderTy &Bld = CGF.Builder;

  CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
  // Create the loop over the big sized data.
  // ptr = (void*)Elem;
  // ptrEnd = (void*) Elem + 1;
  // Step = 8;
  // while (ptr + Step < ptrEnd)
  //   shuffle((int64_t)*ptr);
  // Step = 4;
  // while (ptr + Step < ptrEnd)
  //   shuffle((int32_t)*ptr);
  // ...
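  // For example, a 7-byte element is copied as one 4-byte, one 2-byte and
  // one 1-byte shuffle (the 8-byte step is skipped since 7 < 8); the
  // remainder Size %= IntSize drives each step. Illustrative only.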
  Address ElemPtr = DestAddr;
  Address Ptr = SrcAddr;
  Address PtrEnd = Bld.CreatePointerBitCastOrAddrSpaceCast(
      Bld.CreateConstGEP(SrcAddr, 1), CGF.VoidPtrTy);
  for (int IntSize = 8; IntSize >= 1; IntSize /= 2) {
    if (Size < CharUnits::fromQuantity(IntSize))
      continue;
    QualType IntType = CGF.getContext().getIntTypeForBitwidth(
        CGF.getContext().toBits(CharUnits::fromQuantity(IntSize)),
        /*Signed=*/1);
    llvm::Type *IntTy = CGF.ConvertTypeForMem(IntType);
    Ptr = Bld.CreatePointerBitCastOrAddrSpaceCast(Ptr, IntTy->getPointerTo());
    ElemPtr =
        Bld.CreatePointerBitCastOrAddrSpaceCast(ElemPtr, IntTy->getPointerTo());
    if (Size.getQuantity() / IntSize > 1) {
      llvm::BasicBlock *PreCondBB = CGF.createBasicBlock(".shuffle.pre_cond");
      llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".shuffle.then");
      llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".shuffle.exit");
      llvm::BasicBlock *CurrentBB = Bld.GetInsertBlock();
      CGF.EmitBlock(PreCondBB);
      llvm::PHINode *PhiSrc =
          Bld.CreatePHI(Ptr.getType(), /*NumReservedValues=*/2);
      PhiSrc->addIncoming(Ptr.getPointer(), CurrentBB);
      llvm::PHINode *PhiDest =
          Bld.CreatePHI(ElemPtr.getType(), /*NumReservedValues=*/2);
      PhiDest->addIncoming(ElemPtr.getPointer(), CurrentBB);
      Ptr = Address(PhiSrc, Ptr.getAlignment());
      ElemPtr = Address(PhiDest, ElemPtr.getAlignment());
      llvm::Value *PtrDiff = Bld.CreatePtrDiff(
          PtrEnd.getPointer(), Bld.CreatePointerBitCastOrAddrSpaceCast(
                                   Ptr.getPointer(), CGF.VoidPtrTy));
      Bld.CreateCondBr(Bld.CreateICmpSGT(PtrDiff, Bld.getInt64(IntSize - 1)),
                       ThenBB, ExitBB);
      CGF.EmitBlock(ThenBB);
      llvm::Value *Res = createRuntimeShuffleFunction(
          CGF,
          CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc,
                               LValueBaseInfo(AlignmentSource::Type),
                               TBAAAccessInfo()),
          IntType, Offset, Loc);
      CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType,
                            LValueBaseInfo(AlignmentSource::Type),
                            TBAAAccessInfo());
      Address LocalPtr = Bld.CreateConstGEP(Ptr, 1);
      Address LocalElemPtr = Bld.CreateConstGEP(ElemPtr, 1);
      PhiSrc->addIncoming(LocalPtr.getPointer(), ThenBB);
      PhiDest->addIncoming(LocalElemPtr.getPointer(), ThenBB);
      CGF.EmitBranch(PreCondBB);
      CGF.EmitBlock(ExitBB);
    } else {
      llvm::Value *Res = createRuntimeShuffleFunction(
          CGF,
          CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc,
                               LValueBaseInfo(AlignmentSource::Type),
                               TBAAAccessInfo()),
          IntType, Offset, Loc);
      CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType,
                            LValueBaseInfo(AlignmentSource::Type),
                            TBAAAccessInfo());
      Ptr = Bld.CreateConstGEP(Ptr, 1);
      ElemPtr = Bld.CreateConstGEP(ElemPtr, 1);
    }
    Size = Size % IntSize;
  }
}

namespace {
enum CopyAction : unsigned {
  // RemoteLaneToThread: Copy over a Reduce list from a remote lane in
  // the warp using shuffle instructions.
  RemoteLaneToThread,
  // ThreadCopy: Make a copy of a Reduce list on the thread's stack.
  ThreadCopy,
  // ThreadToScratchpad: Copy a team-reduced array to the scratchpad.
  ThreadToScratchpad,
  // ScratchpadToThread: Copy from a scratchpad array in global memory
  // containing team-reduced data to a thread's stack.
  ScratchpadToThread,
};
} // namespace

struct CopyOptionsTy {
  llvm::Value *RemoteLaneOffset;
  llvm::Value *ScratchpadIndex;
  llvm::Value *ScratchpadWidth;
};

/// Emit instructions to copy a Reduce list, which contains partially
/// aggregated values, in the specified direction.
static void emitReductionListCopy(
    CopyAction Action, CodeGenFunction &CGF, QualType ReductionArrayTy,
    ArrayRef<const Expr *> Privates, Address SrcBase, Address DestBase,
    CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr}) {

  CodeGenModule &CGM = CGF.CGM;
  ASTContext &C = CGM.getContext();
  CGBuilderTy &Bld = CGF.Builder;

  llvm::Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
  llvm::Value *ScratchpadIndex = CopyOptions.ScratchpadIndex;
  llvm::Value *ScratchpadWidth = CopyOptions.ScratchpadWidth;

  // Iterate, element by element, through the source Reduce list and
  // make a copy.
  unsigned Idx = 0;
  unsigned Size = Privates.size();
  for (const Expr *Private : Privates) {
    Address SrcElementAddr = Address::invalid();
    Address DestElementAddr = Address::invalid();
    Address DestElementPtrAddr = Address::invalid();
    // Should we shuffle in an element from a remote lane?
    bool ShuffleInElement = false;
    // Set to true to update the pointer in the dest Reduce list to a
    // newly created element.
    bool UpdateDestListPtr = false;
    // Increment the src or dest pointer to the scratchpad, for each
    // new element.
    bool IncrScratchpadSrc = false;
    bool IncrScratchpadDest = false;

    switch (Action) {
    case RemoteLaneToThread: {
      // Step 1.1: Get the address for the src element in the Reduce list.
      Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
      SrcElementAddr = CGF.EmitLoadOfPointer(
          SrcElementPtrAddr,
          C.getPointerType(Private->getType())->castAs<PointerType>());

      // Step 1.2: Create a temporary to store the element in the destination
      // Reduce list.
      DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
      DestElementAddr =
          CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
      ShuffleInElement = true;
      UpdateDestListPtr = true;
      break;
    }
    case ThreadCopy: {
      // Step 1.1: Get the address for the src element in the Reduce list.
      Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
      SrcElementAddr = CGF.EmitLoadOfPointer(
          SrcElementPtrAddr,
          C.getPointerType(Private->getType())->castAs<PointerType>());

      // Step 1.2: Get the address for dest element. The destination
      // element has already been created on the thread's stack.
      DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
      DestElementAddr = CGF.EmitLoadOfPointer(
          DestElementPtrAddr,
          C.getPointerType(Private->getType())->castAs<PointerType>());
      break;
    }
    case ThreadToScratchpad: {
      // Step 1.1: Get the address for the src element in the Reduce list.
      Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
      SrcElementAddr = CGF.EmitLoadOfPointer(
          SrcElementPtrAddr,
          C.getPointerType(Private->getType())->castAs<PointerType>());

      // Step 1.2: Get the address for dest element:
      // address = base + index * ElementSizeInChars.
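      // (Sketch: the scratchpad base is carried as an integer, so the
      // address below is effectively
      //   dest = (void *)(Base + ScratchpadIndex * ElementSizeInChars);
      // computed with NUW arithmetic and then int-to-ptr cast.)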
      llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
      llvm::Value *CurrentOffset =
          Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex);
      llvm::Value *ScratchPadElemAbsolutePtrVal =
          Bld.CreateNUWAdd(DestBase.getPointer(), CurrentOffset);
      ScratchPadElemAbsolutePtrVal =
          Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
      DestElementAddr = Address(ScratchPadElemAbsolutePtrVal,
                                C.getTypeAlignInChars(Private->getType()));
      IncrScratchpadDest = true;
      break;
    }
    case ScratchpadToThread: {
      // Step 1.1: Get the address for the src element in the scratchpad.
      // address = base + index * ElementSizeInChars.
      llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
      llvm::Value *CurrentOffset =
          Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex);
      llvm::Value *ScratchPadElemAbsolutePtrVal =
          Bld.CreateNUWAdd(SrcBase.getPointer(), CurrentOffset);
      ScratchPadElemAbsolutePtrVal =
          Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
      SrcElementAddr = Address(ScratchPadElemAbsolutePtrVal,
                               C.getTypeAlignInChars(Private->getType()));
      IncrScratchpadSrc = true;

      // Step 1.2: Create a temporary to store the element in the destination
      // Reduce list.
      DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
      DestElementAddr =
          CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
      UpdateDestListPtr = true;
      break;
    }
    }

    // Regardless of the direction of the copy, we emit the load of the src
    // element, as it is required in all directions.
    SrcElementAddr = Bld.CreateElementBitCast(
        SrcElementAddr, CGF.ConvertTypeForMem(Private->getType()));
    DestElementAddr = Bld.CreateElementBitCast(DestElementAddr,
                                               SrcElementAddr.getElementType());

    // Now that all active lanes have read the element in the
    // Reduce list, shuffle over the value from the remote lane.
    if (ShuffleInElement) {
      shuffleAndStore(CGF, SrcElementAddr, DestElementAddr, Private->getType(),
                      RemoteLaneOffset, Private->getExprLoc());
    } else {
      switch (CGF.getEvaluationKind(Private->getType())) {
      case TEK_Scalar: {
        llvm::Value *Elem = CGF.EmitLoadOfScalar(
            SrcElementAddr, /*Volatile=*/false, Private->getType(),
            Private->getExprLoc(), LValueBaseInfo(AlignmentSource::Type),
            TBAAAccessInfo());
        // Store the source element value to the dest element address.
        CGF.EmitStoreOfScalar(
            Elem, DestElementAddr, /*Volatile=*/false, Private->getType(),
            LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
        break;
      }
      case TEK_Complex: {
        CodeGenFunction::ComplexPairTy Elem = CGF.EmitLoadOfComplex(
            CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
            Private->getExprLoc());
        CGF.EmitStoreOfComplex(
            Elem, CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
            /*isInit=*/false);
        break;
      }
      case TEK_Aggregate:
        CGF.EmitAggregateCopy(
            CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
            CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
            Private->getType(), AggValueSlot::DoesNotOverlap);
        break;
      }
    }

    // Step 3.1: Modify reference in dest Reduce list as needed.
    // Modifying the reference in Reduce list to point to the newly
    // created element.
    // The element is live in the current function scope and that of
    // functions it invokes (i.e., reduce_function).
    // RemoteReduceData[i] = (void*)&RemoteElem
    if (UpdateDestListPtr) {
      CGF.EmitStoreOfScalar(Bld.CreatePointerBitCastOrAddrSpaceCast(
                                DestElementAddr.getPointer(), CGF.VoidPtrTy),
                            DestElementPtrAddr, /*Volatile=*/false,
                            C.VoidPtrTy);
    }

    // Step 4.1: Increment SrcBase/DestBase so that it points to the starting
    // address of the next element in scratchpad memory, unless we're currently
    // processing the last one. Memory alignment is also taken care of here.
    if ((IncrScratchpadDest || IncrScratchpadSrc) && (Idx + 1 < Size)) {
      llvm::Value *ScratchpadBasePtr =
          IncrScratchpadDest ? DestBase.getPointer() : SrcBase.getPointer();
      llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
      ScratchpadBasePtr = Bld.CreateNUWAdd(
          ScratchpadBasePtr,
          Bld.CreateNUWMul(ScratchpadWidth, ElementSizeInChars));

      // Take care of global memory alignment for performance.
      ScratchpadBasePtr = Bld.CreateNUWSub(
          ScratchpadBasePtr, llvm::ConstantInt::get(CGM.SizeTy, 1));
      ScratchpadBasePtr = Bld.CreateUDiv(
          ScratchpadBasePtr,
          llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));
      ScratchpadBasePtr = Bld.CreateNUWAdd(
          ScratchpadBasePtr, llvm::ConstantInt::get(CGM.SizeTy, 1));
      ScratchpadBasePtr = Bld.CreateNUWMul(
          ScratchpadBasePtr,
          llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));

      if (IncrScratchpadDest)
        DestBase = Address(ScratchpadBasePtr, CGF.getPointerAlign());
      else /* IncrScratchpadSrc = true */
        SrcBase = Address(ScratchpadBasePtr, CGF.getPointerAlign());
    }

    ++Idx;
  }
}

/// This function emits a helper that gathers Reduce lists from the first
/// lane of every active warp to lanes in the first warp.
///
/// void inter_warp_copy_func(void* reduce_data, num_warps)
///   shared smem[warp_size];
///   For all data entries D in reduce_data:
///     sync
///     If (I am the first lane in each warp)
///       Copy my local D to smem[warp_id]
///     sync
///     if (I am the first warp)
///       Copy smem[thread_id] to my local D
static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
                                              ArrayRef<const Expr *> Privates,
                                              QualType ReductionArrayTy,
                                              SourceLocation Loc) {
  ASTContext &C = CGM.getContext();
  llvm::Module &M = CGM.getModule();

  // ReduceList: thread local Reduce list.
  // At the stage of the computation when this function is called, partially
  // aggregated values reside in the first lane of every active warp.
  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
                                  C.VoidPtrTy, ImplicitParamDecl::Other);
  // NumWarps: number of warps active in the parallel region. This could
  // be smaller than 32 (max warps in a CTA) for partial block reduction.
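  // (E.g., illustrative numbers only: a region running 96 threads with a
  // 32-lane warp size has NumWarps == 3, so only threads 0..2 of warp 0
  // later read a value back from the transfer medium.)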
  ImplicitParamDecl NumWarpsArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
                                C.getIntTypeForBitwidth(32, /* Signed */ true),
                                ImplicitParamDecl::Other);
  FunctionArgList Args;
  Args.push_back(&ReduceListArg);
  Args.push_back(&NumWarpsArg);

  const CGFunctionInfo &CGFI =
      CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
  auto *Fn = llvm::Function::Create(CGM.getTypes().GetFunctionType(CGFI),
                                    llvm::GlobalValue::InternalLinkage,
                                    "_omp_reduction_inter_warp_copy_func", &M);
  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
  Fn->setDoesNotRecurse();
  CodeGenFunction CGF(CGM);
  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);

  CGBuilderTy &Bld = CGF.Builder;

  // This array is used as a medium to transfer, one reduce element at a time,
  // the data from the first lane of every warp to lanes in the first warp
  // in order to perform the final step of a reduction in a parallel region
  // (reduction across warps). The array is placed in NVPTX __shared__ memory
  // for reduced latency, as well as to have a distinct copy for concurrently
  // executing target regions. The array is declared with weak linkage so
  // that a single definition is shared across compilation units.
  StringRef TransferMediumName =
      "__openmp_nvptx_data_transfer_temporary_storage";
  llvm::GlobalVariable *TransferMedium =
      M.getGlobalVariable(TransferMediumName);
  unsigned WarpSize = CGF.getTarget().getGridValue(llvm::omp::GV_Warp_Size);
  if (!TransferMedium) {
    auto *Ty = llvm::ArrayType::get(CGM.Int32Ty, WarpSize);
    unsigned SharedAddressSpace = C.getTargetAddressSpace(LangAS::cuda_shared);
    TransferMedium = new llvm::GlobalVariable(
        M, Ty, /*isConstant=*/false, llvm::GlobalVariable::WeakAnyLinkage,
        llvm::UndefValue::get(Ty), TransferMediumName,
        /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
        SharedAddressSpace);
    CGM.addCompilerUsedGlobal(TransferMedium);
  }

  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
  // Get the CUDA thread id of the current OpenMP thread on the GPU.
  llvm::Value *ThreadID = RT.getGPUThreadID(CGF);
  // nvptx_lane_id = nvptx_id % warpsize
  llvm::Value *LaneID = getNVPTXLaneID(CGF);
  // nvptx_warp_id = nvptx_id / warpsize
  llvm::Value *WarpID = getNVPTXWarpID(CGF);

  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
  Address LocalReduceList(
      Bld.CreatePointerBitCastOrAddrSpaceCast(
          CGF.EmitLoadOfScalar(
              AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc,
              LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo()),
          CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
      CGF.getPointerAlign());

  unsigned Idx = 0;
  for (const Expr *Private : Privates) {
    //
    // Warp master copies reduce element to transfer medium in __shared__
    // memory.
    //
    unsigned RealTySize =
        C.getTypeSizeInChars(Private->getType())
            .alignTo(C.getTypeAlignInChars(Private->getType()))
            .getQuantity();
    for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /= 2) {
      unsigned NumIters = RealTySize / TySize;
      if (NumIters == 0)
        continue;
      QualType CType = C.getIntTypeForBitwidth(
          C.toBits(CharUnits::fromQuantity(TySize)), /*Signed=*/1);
      llvm::Type *CopyType = CGF.ConvertTypeForMem(CType);
      CharUnits Align = CharUnits::fromQuantity(TySize);
      llvm::Value *Cnt = nullptr;
      Address CntAddr = Address::invalid();
      llvm::BasicBlock *PrecondBB = nullptr;
      llvm::BasicBlock *ExitBB = nullptr;
      if (NumIters > 1) {
        CntAddr = CGF.CreateMemTemp(C.IntTy, ".cnt.addr");
        CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.IntTy), CntAddr,
                              /*Volatile=*/false, C.IntTy);
        PrecondBB = CGF.createBasicBlock("precond");
        ExitBB = CGF.createBasicBlock("exit");
        llvm::BasicBlock *BodyBB = CGF.createBasicBlock("body");
        // There is no need to emit line number for unconditional branch.
        (void)ApplyDebugLocation::CreateEmpty(CGF);
        CGF.EmitBlock(PrecondBB);
        Cnt = CGF.EmitLoadOfScalar(CntAddr, /*Volatile=*/false, C.IntTy, Loc);
        llvm::Value *Cmp =
            Bld.CreateICmpULT(Cnt, llvm::ConstantInt::get(CGM.IntTy, NumIters));
        Bld.CreateCondBr(Cmp, BodyBB, ExitBB);
        CGF.EmitBlock(BodyBB);
      }
      // kmpc_barrier.
      CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
                                             /*EmitChecks=*/false,
                                             /*ForceSimpleCall=*/true);
      llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
      llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
      llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");

      // if (lane_id == 0)
      llvm::Value *IsWarpMaster = Bld.CreateIsNull(LaneID, "warp_master");
      Bld.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
      CGF.EmitBlock(ThenBB);

      // Reduce element = LocalReduceList[i]
      Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
      llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
          ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
      // elemptr = ((CopyType*)(elemptrptr)) + I
      Address ElemPtr = Address(ElemPtrPtr, Align);
      ElemPtr = Bld.CreateElementBitCast(ElemPtr, CopyType);
      if (NumIters > 1) {
        ElemPtr = Address(Bld.CreateGEP(ElemPtr.getPointer(), Cnt),
                          ElemPtr.getAlignment());
      }

      // Get pointer to location in transfer medium.
      // MediumPtr = &medium[warp_id]
      llvm::Value *MediumPtrVal = Bld.CreateInBoundsGEP(
          TransferMedium->getValueType(), TransferMedium,
          {llvm::Constant::getNullValue(CGM.Int64Ty), WarpID});
      Address MediumPtr(MediumPtrVal, Align);
      // Casting to actual data type.
      // MediumPtr = (CopyType*)MediumPtrAddr;
      MediumPtr = Bld.CreateElementBitCast(MediumPtr, CopyType);

      // elem = *elemptr
      // *MediumPtr = elem
      llvm::Value *Elem = CGF.EmitLoadOfScalar(
          ElemPtr, /*Volatile=*/false, CType, Loc,
          LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
      // Store the source element value to the dest element address.
      CGF.EmitStoreOfScalar(Elem, MediumPtr, /*Volatile=*/true, CType,
                            LValueBaseInfo(AlignmentSource::Type),
                            TBAAAccessInfo());

      Bld.CreateBr(MergeBB);

      CGF.EmitBlock(ElseBB);
      Bld.CreateBr(MergeBB);

      CGF.EmitBlock(MergeBB);

      // kmpc_barrier.
      CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
                                             /*EmitChecks=*/false,
                                             /*ForceSimpleCall=*/true);

      //
      // Warp 0 copies reduce element from transfer medium.
      //
      llvm::BasicBlock *W0ThenBB = CGF.createBasicBlock("then");
      llvm::BasicBlock *W0ElseBB = CGF.createBasicBlock("else");
      llvm::BasicBlock *W0MergeBB = CGF.createBasicBlock("ifcont");

      Address AddrNumWarpsArg = CGF.GetAddrOfLocalVar(&NumWarpsArg);
      llvm::Value *NumWarpsVal = CGF.EmitLoadOfScalar(
          AddrNumWarpsArg, /*Volatile=*/false, C.IntTy, Loc);

      // Up to 32 threads in warp 0 are active.
      llvm::Value *IsActiveThread =
          Bld.CreateICmpULT(ThreadID, NumWarpsVal, "is_active_thread");
      Bld.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);

      CGF.EmitBlock(W0ThenBB);

      // SrcMediumPtr = &medium[tid]
      llvm::Value *SrcMediumPtrVal = Bld.CreateInBoundsGEP(
          TransferMedium->getValueType(), TransferMedium,
          {llvm::Constant::getNullValue(CGM.Int64Ty), ThreadID});
      Address SrcMediumPtr(SrcMediumPtrVal, Align);
      // SrcMediumVal = *SrcMediumPtr;
      SrcMediumPtr = Bld.CreateElementBitCast(SrcMediumPtr, CopyType);

      // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I
      Address TargetElemPtrPtr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
      llvm::Value *TargetElemPtrVal = CGF.EmitLoadOfScalar(
          TargetElemPtrPtr, /*Volatile=*/false, C.VoidPtrTy, Loc);
      Address TargetElemPtr = Address(TargetElemPtrVal, Align);
      TargetElemPtr = Bld.CreateElementBitCast(TargetElemPtr, CopyType);
      if (NumIters > 1) {
        TargetElemPtr = Address(Bld.CreateGEP(TargetElemPtr.getPointer(), Cnt),
                                TargetElemPtr.getAlignment());
      }

      // *TargetElemPtr = SrcMediumVal;
      llvm::Value *SrcMediumValue =
          CGF.EmitLoadOfScalar(SrcMediumPtr, /*Volatile=*/true, CType, Loc);
      CGF.EmitStoreOfScalar(SrcMediumValue, TargetElemPtr, /*Volatile=*/false,
                            CType);
      Bld.CreateBr(W0MergeBB);

      CGF.EmitBlock(W0ElseBB);
      Bld.CreateBr(W0MergeBB);

      CGF.EmitBlock(W0MergeBB);

      if (NumIters > 1) {
        Cnt = Bld.CreateNSWAdd(Cnt, llvm::ConstantInt::get(CGM.IntTy, /*V=*/1));
        CGF.EmitStoreOfScalar(Cnt, CntAddr, /*Volatile=*/false, C.IntTy);
        CGF.EmitBranch(PrecondBB);
        (void)ApplyDebugLocation::CreateEmpty(CGF);
        CGF.EmitBlock(ExitBB);
      }
      RealTySize %= TySize;
    }
    ++Idx;
  }

  CGF.FinishFunction();
  return Fn;
}

/// Emit a helper that reduces data across two OpenMP threads (lanes)
/// in the same warp. It uses shuffle instructions to copy over data from
/// a remote lane's stack. The reduction algorithm performed is specified
/// by the fourth parameter.
///
/// Algorithm Versions.
/// Full Warp Reduce (argument value 0):
///   This algorithm assumes that all 32 lanes are active and gathers
///   data from these 32 lanes, producing a single resultant value.
/// Contiguous Partial Warp Reduce (argument value 1):
///   This algorithm assumes that only a *contiguous* subset of lanes
///   are active. This happens for the last warp in a parallel region
///   when the user specified num_threads is not an integer multiple of
///   32. This contiguous subset always starts with the zeroth lane.
/// Partial Warp Reduce (argument value 2):
///   This algorithm gathers data from any number of lanes at any position.
/// All reduced values are stored in the lowest possible lane.
///
/// The set of problems every algorithm addresses is a superset of those
/// addressable by algorithms with a lower version number. Overhead
/// increases as the algorithm version increases.
///
/// Terminology
/// Reduce element:
///   Reduce element refers to the individual data field with primitive
///   data types to be combined and reduced across threads.
/// Reduce list:
///   Reduce list refers to a collection of local, thread-private
///   reduce elements.
/// Remote Reduce list:
///   Remote Reduce list refers to a collection of remote (relative to
///   the current thread) reduce elements.
///
/// We distinguish between three states of threads that are important to
/// the implementation of this function.
/// Alive threads:
///   Threads in a warp executing the SIMT instruction, as distinguished from
///   threads that are inactive due to divergent control flow.
/// Active threads:
///   The minimal set of threads that has to be alive upon entry to this
///   function. The computation is correct iff active threads are alive.
///   Some threads are alive but they are not active because they do not
///   contribute to the computation in any useful manner. Turning them off
///   may introduce control flow overheads without any tangible benefits.
/// Effective threads:
///   In order to comply with the argument requirements of the shuffle
///   function, we must keep all lanes holding data alive. But at most
///   half of them perform value aggregation; we refer to this half of
///   threads as effective. The other half is simply handing off their
///   data.
///
/// Procedure
/// Value shuffle:
///   In this step active threads transfer data from higher lane positions
///   in the warp to lower lane positions, creating the Remote Reduce list.
/// Value aggregation:
///   In this step, effective threads combine their thread local Reduce list
///   with the Remote Reduce list and store the result in the thread local
///   Reduce list.
/// Value copy:
///   In this step, we deal with the assumption made by algorithm 2
///   (i.e. the contiguity assumption). When we have an odd number of lanes
///   active, say 2k+1, only k threads will be effective and therefore k
///   new values will be produced. However, the Reduce list owned by the
///   (2k+1)th thread is ignored in the value aggregation. Therefore
///   we copy the Reduce list from the (2k+1)th lane to the (k+1)th lane so
///   that the contiguity assumption still holds.
static llvm::Function *emitShuffleAndReduceFunction(
    CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
    QualType ReductionArrayTy, llvm::Function *ReduceFn, SourceLocation Loc) {
  ASTContext &C = CGM.getContext();

  // Thread local Reduce list used to host the values of data to be reduced.
  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
                                  C.VoidPtrTy, ImplicitParamDecl::Other);
  // Current lane id; could be logical.
  ImplicitParamDecl LaneIDArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.ShortTy,
                              ImplicitParamDecl::Other);
  // Offset of the remote source lane relative to the current lane.
  ImplicitParamDecl RemoteLaneOffsetArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
                                        C.ShortTy, ImplicitParamDecl::Other);
  // Algorithm version. This is expected to be known at compile time.
2950 ImplicitParamDecl AlgoVerArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2951 C.ShortTy, ImplicitParamDecl::Other);
2952 FunctionArgList Args;
2953 Args.push_back(&ReduceListArg);
2954 Args.push_back(&LaneIDArg);
2955 Args.push_back(&RemoteLaneOffsetArg);
2956 Args.push_back(&AlgoVerArg);
2957
2958 const CGFunctionInfo &CGFI =
2959 CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2960 auto *Fn = llvm::Function::Create(
2961 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
2962 "_omp_reduction_shuffle_and_reduce_func", &CGM.getModule());
2963 CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
2964 Fn->setDoesNotRecurse();
2965
2966 CodeGenFunction CGF(CGM);
2967 CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2968
2969 CGBuilderTy &Bld = CGF.Builder;
2970
2971 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2972 Address LocalReduceList(
2973 Bld.CreatePointerBitCastOrAddrSpaceCast(
2974 CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
2975 C.VoidPtrTy, SourceLocation()),
2976 CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
2977 CGF.getPointerAlign());
2978
2979 Address AddrLaneIDArg = CGF.GetAddrOfLocalVar(&LaneIDArg);
2980 llvm::Value *LaneIDArgVal = CGF.EmitLoadOfScalar(
2981 AddrLaneIDArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
2982
2983 Address AddrRemoteLaneOffsetArg = CGF.GetAddrOfLocalVar(&RemoteLaneOffsetArg);
2984 llvm::Value *RemoteLaneOffsetArgVal = CGF.EmitLoadOfScalar(
2985 AddrRemoteLaneOffsetArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
2986
2987 Address AddrAlgoVerArg = CGF.GetAddrOfLocalVar(&AlgoVerArg);
2988 llvm::Value *AlgoVerArgVal = CGF.EmitLoadOfScalar(
2989 AddrAlgoVerArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
2990
2991 // Create a local thread-private variable to host the Reduce list
2992 // from a remote lane.
2993 Address RemoteReduceList =
2994 CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.remote_reduce_list");
2995
2996 // This loop iterates through the list of reduce elements and copies,
2997 // element by element, from a remote lane in the warp to RemoteReduceList,
2998 // hosted on the thread's stack.
2999 emitReductionListCopy(RemoteLaneToThread, CGF, ReductionArrayTy, Privates,
3000 LocalReduceList, RemoteReduceList,
3001 {/*RemoteLaneOffset=*/RemoteLaneOffsetArgVal,
3002 /*ScratchpadIndex=*/nullptr,
3003 /*ScratchpadWidth=*/nullptr});
3004
3005 // The actions to be performed on the Remote Reduce list are dependent
3006 // on the algorithm version.
3007 //
3008 // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
3009 // LaneId % 2 == 0 && Offset > 0):
3010 // do the reduction value aggregation
3011 //
3012 // The thread local variable Reduce list is mutated in place to host the
3013 // reduced data, which is the aggregated value produced from local and
3014 // remote lanes.
3015 //
3016 // Note that AlgoVer is expected to be a constant integer known at compile
3017 // time.
3018 // When AlgoVer==0, the first conjunction evaluates to true, making
3019 // the entire predicate true at compile time.
3020 // When AlgoVer==1, only the second part of the second conjunction must
3021 // be evaluated at runtime; the other conjunctions fold to false
3022 // at compile time.
3023 // When AlgoVer==2, only the second part of the third conjunction must
3024 // be evaluated at runtime; the other conjunctions fold to false
3025 // at compile time.
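 // For example (an illustrative sketch, not emitted code): a call with
 // AlgoVer==2, LaneId==4, and RemoteLaneOffset==8 folds the predicate to
 // (4 % 2 == 0 && 8 > 0), i.e. true, so the lane performs the aggregation
 // emitted below; with LaneId==5 the predicate folds to false and the lane
 // merely hands its data off through the shuffle performed above.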
3026 llvm::Value *CondAlgo0 = Bld.CreateIsNull(AlgoVerArgVal); 3027 3028 llvm::Value *Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1)); 3029 llvm::Value *CondAlgo1 = Bld.CreateAnd( 3030 Algo1, Bld.CreateICmpULT(LaneIDArgVal, RemoteLaneOffsetArgVal)); 3031 3032 llvm::Value *Algo2 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(2)); 3033 llvm::Value *CondAlgo2 = Bld.CreateAnd( 3034 Algo2, Bld.CreateIsNull(Bld.CreateAnd(LaneIDArgVal, Bld.getInt16(1)))); 3035 CondAlgo2 = Bld.CreateAnd( 3036 CondAlgo2, Bld.CreateICmpSGT(RemoteLaneOffsetArgVal, Bld.getInt16(0))); 3037 3038 llvm::Value *CondReduce = Bld.CreateOr(CondAlgo0, CondAlgo1); 3039 CondReduce = Bld.CreateOr(CondReduce, CondAlgo2); 3040 3041 llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then"); 3042 llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else"); 3043 llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont"); 3044 Bld.CreateCondBr(CondReduce, ThenBB, ElseBB); 3045 3046 CGF.EmitBlock(ThenBB); 3047 // reduce_function(LocalReduceList, RemoteReduceList) 3048 llvm::Value *LocalReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast( 3049 LocalReduceList.getPointer(), CGF.VoidPtrTy); 3050 llvm::Value *RemoteReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast( 3051 RemoteReduceList.getPointer(), CGF.VoidPtrTy); 3052 CGM.getOpenMPRuntime().emitOutlinedFunctionCall( 3053 CGF, Loc, ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr}); 3054 Bld.CreateBr(MergeBB); 3055 3056 CGF.EmitBlock(ElseBB); 3057 Bld.CreateBr(MergeBB); 3058 3059 CGF.EmitBlock(MergeBB); 3060 3061 // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local 3062 // Reduce list. 3063 Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1)); 3064 llvm::Value *CondCopy = Bld.CreateAnd( 3065 Algo1, Bld.CreateICmpUGE(LaneIDArgVal, RemoteLaneOffsetArgVal)); 3066 3067 llvm::BasicBlock *CpyThenBB = CGF.createBasicBlock("then"); 3068 llvm::BasicBlock *CpyElseBB = CGF.createBasicBlock("else"); 3069 llvm::BasicBlock *CpyMergeBB = CGF.createBasicBlock("ifcont"); 3070 Bld.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB); 3071 3072 CGF.EmitBlock(CpyThenBB); 3073 emitReductionListCopy(ThreadCopy, CGF, ReductionArrayTy, Privates, 3074 RemoteReduceList, LocalReduceList); 3075 Bld.CreateBr(CpyMergeBB); 3076 3077 CGF.EmitBlock(CpyElseBB); 3078 Bld.CreateBr(CpyMergeBB); 3079 3080 CGF.EmitBlock(CpyMergeBB); 3081 3082 CGF.FinishFunction(); 3083 return Fn; 3084} 3085 3086/// This function emits a helper that copies all the reduction variables from 3087/// the team into the provided global buffer for the reduction variables. 3088/// 3089/// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data) 3090/// For all data entries D in reduce_data: 3091/// Copy local D to buffer.D[Idx] 3092static llvm::Value *emitListToGlobalCopyFunction( 3093 CodeGenModule &CGM, ArrayRef<const Expr *> Privates, 3094 QualType ReductionArrayTy, SourceLocation Loc, 3095 const RecordDecl *TeamReductionRec, 3096 const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> 3097 &VarFieldMap) { 3098 ASTContext &C = CGM.getContext(); 3099 3100 // Buffer: global reduction buffer. 3101 ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, 3102 C.VoidPtrTy, ImplicitParamDecl::Other); 3103 // Idx: index of the buffer. 3104 ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy, 3105 ImplicitParamDecl::Other); 3106 // ReduceList: thread local Reduce list. 
3107 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, 3108 C.VoidPtrTy, ImplicitParamDecl::Other); 3109 FunctionArgList Args; 3110 Args.push_back(&BufferArg); 3111 Args.push_back(&IdxArg); 3112 Args.push_back(&ReduceListArg); 3113 3114 const CGFunctionInfo &CGFI = 3115 CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args); 3116 auto *Fn = llvm::Function::Create( 3117 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage, 3118 "_omp_reduction_list_to_global_copy_func", &CGM.getModule()); 3119 CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI); 3120 Fn->setDoesNotRecurse(); 3121 CodeGenFunction CGF(CGM); 3122 CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc); 3123 3124 CGBuilderTy &Bld = CGF.Builder; 3125 3126 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg); 3127 Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg); 3128 Address LocalReduceList( 3129 Bld.CreatePointerBitCastOrAddrSpaceCast( 3130 CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false, 3131 C.VoidPtrTy, Loc), 3132 CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()), 3133 CGF.getPointerAlign()); 3134 QualType StaticTy = C.getRecordType(TeamReductionRec); 3135 llvm::Type *LLVMReductionsBufferTy = 3136 CGM.getTypes().ConvertTypeForMem(StaticTy); 3137 llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast( 3138 CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc), 3139 LLVMReductionsBufferTy->getPointerTo()); 3140 llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty), 3141 CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg), 3142 /*Volatile=*/false, C.IntTy, 3143 Loc)}; 3144 unsigned Idx = 0; 3145 for (const Expr *Private : Privates) { 3146 // Reduce element = LocalReduceList[i] 3147 Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx); 3148 llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar( 3149 ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation()); 3150 // elemptr = ((CopyType*)(elemptrptr)) + I 3151 ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast( 3152 ElemPtrPtr, CGF.ConvertTypeForMem(Private->getType())->getPointerTo()); 3153 Address ElemPtr = 3154 Address(ElemPtrPtr, C.getTypeAlignInChars(Private->getType())); 3155 const ValueDecl *VD = cast<DeclRefExpr>(Private)->getDecl(); 3156 // Global = Buffer.VD[Idx]; 3157 const FieldDecl *FD = VarFieldMap.lookup(VD); 3158 LValue GlobLVal = CGF.EmitLValueForField( 3159 CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD); 3160 Address GlobAddr = GlobLVal.getAddress(CGF); 3161 llvm::Value *BufferPtr = Bld.CreateInBoundsGEP( 3162 GlobAddr.getElementType(), GlobAddr.getPointer(), Idxs); 3163 GlobLVal.setAddress(Address(BufferPtr, GlobAddr.getAlignment())); 3164 switch (CGF.getEvaluationKind(Private->getType())) { 3165 case TEK_Scalar: { 3166 llvm::Value *V = CGF.EmitLoadOfScalar( 3167 ElemPtr, /*Volatile=*/false, Private->getType(), Loc, 3168 LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo()); 3169 CGF.EmitStoreOfScalar(V, GlobLVal); 3170 break; 3171 } 3172 case TEK_Complex: { 3173 CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex( 3174 CGF.MakeAddrLValue(ElemPtr, Private->getType()), Loc); 3175 CGF.EmitStoreOfComplex(V, GlobLVal, /*isInit=*/false); 3176 break; 3177 } 3178 case TEK_Aggregate: 3179 CGF.EmitAggregateCopy(GlobLVal, 3180 CGF.MakeAddrLValue(ElemPtr, Private->getType()), 3181 Private->getType(), AggValueSlot::DoesNotOverlap); 3182 break; 3183 } 3184 ++Idx; 3185 } 3186 
3187 CGF.FinishFunction(); 3188 return Fn; 3189} 3190 3191/// This function emits a helper that reduces all the reduction variables from 3192/// the team into the provided global buffer for the reduction variables. 3193/// 3194/// void list_to_global_reduce_func(void *buffer, int Idx, void *reduce_data) 3195/// void *GlobPtrs[]; 3196/// GlobPtrs[0] = (void*)&buffer.D0[Idx]; 3197/// ... 3198/// GlobPtrs[N] = (void*)&buffer.DN[Idx]; 3199/// reduce_function(GlobPtrs, reduce_data); 3200static llvm::Value *emitListToGlobalReduceFunction( 3201 CodeGenModule &CGM, ArrayRef<const Expr *> Privates, 3202 QualType ReductionArrayTy, SourceLocation Loc, 3203 const RecordDecl *TeamReductionRec, 3204 const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> 3205 &VarFieldMap, 3206 llvm::Function *ReduceFn) { 3207 ASTContext &C = CGM.getContext(); 3208 3209 // Buffer: global reduction buffer. 3210 ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, 3211 C.VoidPtrTy, ImplicitParamDecl::Other); 3212 // Idx: index of the buffer. 3213 ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy, 3214 ImplicitParamDecl::Other); 3215 // ReduceList: thread local Reduce list. 3216 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, 3217 C.VoidPtrTy, ImplicitParamDecl::Other); 3218 FunctionArgList Args; 3219 Args.push_back(&BufferArg); 3220 Args.push_back(&IdxArg); 3221 Args.push_back(&ReduceListArg); 3222 3223 const CGFunctionInfo &CGFI = 3224 CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args); 3225 auto *Fn = llvm::Function::Create( 3226 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage, 3227 "_omp_reduction_list_to_global_reduce_func", &CGM.getModule()); 3228 CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI); 3229 Fn->setDoesNotRecurse(); 3230 CodeGenFunction CGF(CGM); 3231 CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc); 3232 3233 CGBuilderTy &Bld = CGF.Builder; 3234 3235 Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg); 3236 QualType StaticTy = C.getRecordType(TeamReductionRec); 3237 llvm::Type *LLVMReductionsBufferTy = 3238 CGM.getTypes().ConvertTypeForMem(StaticTy); 3239 llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast( 3240 CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc), 3241 LLVMReductionsBufferTy->getPointerTo()); 3242 3243 // 1. Build a list of reduction variables. 
3244 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3245 Address ReductionList =
3246 CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
3247 auto IPriv = Privates.begin();
3248 llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
3249 CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
3250 /*Volatile=*/false, C.IntTy,
3251 Loc)};
3252 unsigned Idx = 0;
3253 for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) {
3254 Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
3255 // Global = Buffer.VD[Idx];
3256 const ValueDecl *VD = cast<DeclRefExpr>(*IPriv)->getDecl();
3257 const FieldDecl *FD = VarFieldMap.lookup(VD);
3258 LValue GlobLVal = CGF.EmitLValueForField(
3259 CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
3260 Address GlobAddr = GlobLVal.getAddress(CGF);
3261 llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(
3262 GlobAddr.getElementType(), GlobAddr.getPointer(), Idxs);
3263 llvm::Value *Ptr = CGF.EmitCastToVoidPtr(BufferPtr);
3264 CGF.EmitStoreOfScalar(Ptr, Elem, /*Volatile=*/false, C.VoidPtrTy);
3265 if ((*IPriv)->getType()->isVariablyModifiedType()) {
3266 // Store array size.
3267 ++Idx;
3268 Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
3269 llvm::Value *Size = CGF.Builder.CreateIntCast(
3270 CGF.getVLASize(
3271 CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
3272 .NumElts,
3273 CGF.SizeTy, /*isSigned=*/false);
3274 CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
3275 Elem);
3276 }
3277 }
3278
3279 // Call reduce_function(GlobalReduceList, ReduceList)
3280 llvm::Value *GlobalReduceList =
3281 CGF.EmitCastToVoidPtr(ReductionList.getPointer());
3282 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
3283 llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar(
3284 AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
3285 CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
3286 CGF, Loc, ReduceFn, {GlobalReduceList, ReducedPtr});
3287 CGF.FinishFunction();
3288 return Fn;
3289}
3290
3291/// This function emits a helper that copies all of the reduction variables
3292/// from the provided global buffer back into the team's local copies.
3293///
3294/// void global_to_list_copy_func(void *buffer, int Idx, void *reduce_data)
3295/// For all data entries D in reduce_data:
3296/// Copy buffer.D[Idx] to local D;
3297static llvm::Value *emitGlobalToListCopyFunction(
3298 CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
3299 QualType ReductionArrayTy, SourceLocation Loc,
3300 const RecordDecl *TeamReductionRec,
3301 const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
3302 &VarFieldMap) {
3303 ASTContext &C = CGM.getContext();
3304
3305 // Buffer: global reduction buffer.
3306 ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3307 C.VoidPtrTy, ImplicitParamDecl::Other);
3308 // Idx: index of the buffer.
3309 ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
3310 ImplicitParamDecl::Other);
3311 // ReduceList: thread local Reduce list.
3312 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, 3313 C.VoidPtrTy, ImplicitParamDecl::Other); 3314 FunctionArgList Args; 3315 Args.push_back(&BufferArg); 3316 Args.push_back(&IdxArg); 3317 Args.push_back(&ReduceListArg); 3318 3319 const CGFunctionInfo &CGFI = 3320 CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args); 3321 auto *Fn = llvm::Function::Create( 3322 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage, 3323 "_omp_reduction_global_to_list_copy_func", &CGM.getModule()); 3324 CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI); 3325 Fn->setDoesNotRecurse(); 3326 CodeGenFunction CGF(CGM); 3327 CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc); 3328 3329 CGBuilderTy &Bld = CGF.Builder; 3330 3331 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg); 3332 Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg); 3333 Address LocalReduceList( 3334 Bld.CreatePointerBitCastOrAddrSpaceCast( 3335 CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false, 3336 C.VoidPtrTy, Loc), 3337 CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()), 3338 CGF.getPointerAlign()); 3339 QualType StaticTy = C.getRecordType(TeamReductionRec); 3340 llvm::Type *LLVMReductionsBufferTy = 3341 CGM.getTypes().ConvertTypeForMem(StaticTy); 3342 llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast( 3343 CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc), 3344 LLVMReductionsBufferTy->getPointerTo()); 3345 3346 llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty), 3347 CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg), 3348 /*Volatile=*/false, C.IntTy, 3349 Loc)}; 3350 unsigned Idx = 0; 3351 for (const Expr *Private : Privates) { 3352 // Reduce element = LocalReduceList[i] 3353 Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx); 3354 llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar( 3355 ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation()); 3356 // elemptr = ((CopyType*)(elemptrptr)) + I 3357 ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast( 3358 ElemPtrPtr, CGF.ConvertTypeForMem(Private->getType())->getPointerTo()); 3359 Address ElemPtr = 3360 Address(ElemPtrPtr, C.getTypeAlignInChars(Private->getType())); 3361 const ValueDecl *VD = cast<DeclRefExpr>(Private)->getDecl(); 3362 // Global = Buffer.VD[Idx]; 3363 const FieldDecl *FD = VarFieldMap.lookup(VD); 3364 LValue GlobLVal = CGF.EmitLValueForField( 3365 CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD); 3366 Address GlobAddr = GlobLVal.getAddress(CGF); 3367 llvm::Value *BufferPtr = Bld.CreateInBoundsGEP( 3368 GlobAddr.getElementType(), GlobAddr.getPointer(), Idxs); 3369 GlobLVal.setAddress(Address(BufferPtr, GlobAddr.getAlignment())); 3370 switch (CGF.getEvaluationKind(Private->getType())) { 3371 case TEK_Scalar: { 3372 llvm::Value *V = CGF.EmitLoadOfScalar(GlobLVal, Loc); 3373 CGF.EmitStoreOfScalar(V, ElemPtr, /*Volatile=*/false, Private->getType(), 3374 LValueBaseInfo(AlignmentSource::Type), 3375 TBAAAccessInfo()); 3376 break; 3377 } 3378 case TEK_Complex: { 3379 CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex(GlobLVal, Loc); 3380 CGF.EmitStoreOfComplex(V, CGF.MakeAddrLValue(ElemPtr, Private->getType()), 3381 /*isInit=*/false); 3382 break; 3383 } 3384 case TEK_Aggregate: 3385 CGF.EmitAggregateCopy(CGF.MakeAddrLValue(ElemPtr, Private->getType()), 3386 GlobLVal, Private->getType(), 3387 AggValueSlot::DoesNotOverlap); 3388 break; 3389 } 3390 ++Idx; 3391 } 3392 
3393 CGF.FinishFunction();
3394 return Fn;
3395}
3396
3397/// This function emits a helper that reduces the reduction variables held in
3398/// the provided global buffer into the team's local reduction data.
3399///
3400/// void global_to_list_reduce_func(void *buffer, int Idx, void *reduce_data)
3401/// void *GlobPtrs[];
3402/// GlobPtrs[0] = (void*)&buffer.D0[Idx];
3403/// ...
3404/// GlobPtrs[N] = (void*)&buffer.DN[Idx];
3405/// reduce_function(reduce_data, GlobPtrs);
3406static llvm::Value *emitGlobalToListReduceFunction(
3407 CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
3408 QualType ReductionArrayTy, SourceLocation Loc,
3409 const RecordDecl *TeamReductionRec,
3410 const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
3411 &VarFieldMap,
3412 llvm::Function *ReduceFn) {
3413 ASTContext &C = CGM.getContext();
3414
3415 // Buffer: global reduction buffer.
3416 ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3417 C.VoidPtrTy, ImplicitParamDecl::Other);
3418 // Idx: index of the buffer.
3419 ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
3420 ImplicitParamDecl::Other);
3421 // ReduceList: thread local Reduce list.
3422 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3423 C.VoidPtrTy, ImplicitParamDecl::Other);
3424 FunctionArgList Args;
3425 Args.push_back(&BufferArg);
3426 Args.push_back(&IdxArg);
3427 Args.push_back(&ReduceListArg);
3428
3429 const CGFunctionInfo &CGFI =
3430 CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
3431 auto *Fn = llvm::Function::Create(
3432 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
3433 "_omp_reduction_global_to_list_reduce_func", &CGM.getModule());
3434 CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
3435 Fn->setDoesNotRecurse();
3436 CodeGenFunction CGF(CGM);
3437 CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
3438
3439 CGBuilderTy &Bld = CGF.Builder;
3440
3441 Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
3442 QualType StaticTy = C.getRecordType(TeamReductionRec);
3443 llvm::Type *LLVMReductionsBufferTy =
3444 CGM.getTypes().ConvertTypeForMem(StaticTy);
3445 llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3446 CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
3447 LLVMReductionsBufferTy->getPointerTo());
3448
3449 // 1. Build a list of reduction variables.
3450 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]}; 3451 Address ReductionList = 3452 CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list"); 3453 auto IPriv = Privates.begin(); 3454 llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty), 3455 CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg), 3456 /*Volatile=*/false, C.IntTy, 3457 Loc)}; 3458 unsigned Idx = 0; 3459 for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) { 3460 Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx); 3461 // Global = Buffer.VD[Idx]; 3462 const ValueDecl *VD = cast<DeclRefExpr>(*IPriv)->getDecl(); 3463 const FieldDecl *FD = VarFieldMap.lookup(VD); 3464 LValue GlobLVal = CGF.EmitLValueForField( 3465 CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD); 3466 Address GlobAddr = GlobLVal.getAddress(CGF); 3467 llvm::Value *BufferPtr = Bld.CreateInBoundsGEP( 3468 GlobAddr.getElementType(), GlobAddr.getPointer(), Idxs); 3469 llvm::Value *Ptr = CGF.EmitCastToVoidPtr(BufferPtr); 3470 CGF.EmitStoreOfScalar(Ptr, Elem, /*Volatile=*/false, C.VoidPtrTy); 3471 if ((*IPriv)->getType()->isVariablyModifiedType()) { 3472 // Store array size. 3473 ++Idx; 3474 Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx); 3475 llvm::Value *Size = CGF.Builder.CreateIntCast( 3476 CGF.getVLASize( 3477 CGF.getContext().getAsVariableArrayType((*IPriv)->getType())) 3478 .NumElts, 3479 CGF.SizeTy, /*isSigned=*/false); 3480 CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy), 3481 Elem); 3482 } 3483 } 3484 3485 // Call reduce_function(ReduceList, GlobalReduceList) 3486 llvm::Value *GlobalReduceList = 3487 CGF.EmitCastToVoidPtr(ReductionList.getPointer()); 3488 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg); 3489 llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar( 3490 AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc); 3491 CGM.getOpenMPRuntime().emitOutlinedFunctionCall( 3492 CGF, Loc, ReduceFn, {ReducedPtr, GlobalReduceList}); 3493 CGF.FinishFunction(); 3494 return Fn; 3495} 3496 3497/// 3498/// Design of OpenMP reductions on the GPU 3499/// 3500/// Consider a typical OpenMP program with one or more reduction 3501/// clauses: 3502/// 3503/// float foo; 3504/// double bar; 3505/// #pragma omp target teams distribute parallel for \ 3506/// reduction(+:foo) reduction(*:bar) 3507/// for (int i = 0; i < N; i++) { 3508/// foo += A[i]; bar *= B[i]; 3509/// } 3510/// 3511/// where 'foo' and 'bar' are reduced across all OpenMP threads in 3512/// all teams. In our OpenMP implementation on the NVPTX device an 3513/// OpenMP team is mapped to a CUDA threadblock and OpenMP threads 3514/// within a team are mapped to CUDA threads within a threadblock. 3515/// Our goal is to efficiently aggregate values across all OpenMP 3516/// threads such that: 3517/// 3518/// - the compiler and runtime are logically concise, and 3519/// - the reduction is performed efficiently in a hierarchical 3520/// manner as follows: within OpenMP threads in the same warp, 3521/// across warps in a threadblock, and finally across teams on 3522/// the NVPTX device. 3523/// 3524/// Introduction to Decoupling 3525/// 3526/// We would like to decouple the compiler and the runtime so that the 3527/// latter is ignorant of the reduction variables (number, data types) 3528/// and the reduction operators. This allows a simpler interface 3529/// and implementation while still attaining good performance. 
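///
/// As a conceptual sketch (an illustration only; argument names are
/// simplified and the exact prototypes live in the device runtime), the
/// runtime entry point therefore sees nothing but opaque data and
/// compiler-generated callbacks:
///
///   int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(
///       ident_t *loc, int32_t gtid, int32_t num_vars, size_t reduce_size,
///       void *reduce_data, ShuffleReduceFn shuffleReduceFn,
///       InterWarpCopyFn interWarpCpyFn);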
3530///
3531/// Pseudocode for the aforementioned OpenMP program generated by the
3532/// compiler is as follows:
3533///
3534/// 1. Create private copies of reduction variables on each OpenMP
3535/// thread: 'foo_private', 'bar_private'
3536/// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned
3537/// to it and writes the result in 'foo_private' and 'bar_private'
3538/// respectively.
3539/// 3. Call the OpenMP runtime on the GPU to reduce within a team
3540/// and store the result on the team master:
3541///
3542/// __kmpc_nvptx_parallel_reduce_nowait_v2(...,
3543/// reduceData, shuffleReduceFn, interWarpCpyFn)
3544///
3545/// where:
3546/// struct ReduceData {
3547/// float *foo;
3548/// double *bar;
3549/// } reduceData
3550/// reduceData.foo = &foo_private
3551/// reduceData.bar = &bar_private
3552///
3553/// 'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two
3554/// auxiliary functions generated by the compiler that operate on
3555/// variables of type 'ReduceData'. They help the runtime perform
3556/// the algorithmic steps in a data-agnostic manner.
3557///
3558/// 'shuffleReduceFn' is a pointer to a function that reduces data
3559/// of type 'ReduceData' across two OpenMP threads (lanes) in the
3560/// same warp. It takes the following arguments as input:
3561///
3562/// a. variable of type 'ReduceData' on the calling lane,
3563/// b. its lane_id,
3564/// c. an offset relative to the current lane_id to generate a
3565/// remote_lane_id. The remote lane contains the second
3566/// variable of type 'ReduceData' that is to be reduced.
3567/// d. an algorithm version parameter determining which reduction
3568/// algorithm to use.
3569///
3570/// 'shuffleReduceFn' retrieves data from the remote lane using
3571/// efficient GPU shuffle intrinsics and reduces, using the
3572/// algorithm specified by the 4th parameter, the two operands
3573/// element-wise. The result is written to the first operand.
3574///
3575/// Different reduction algorithms are implemented in different
3576/// runtime functions, all calling 'shuffleReduceFn' to perform
3577/// the essential reduction step. Therefore, based on the 4th
3578/// parameter, this function behaves slightly differently to
3579/// cooperate with the runtime to ensure correctness under
3580/// different circumstances.
3581///
3582/// 'InterWarpCpyFn' is a pointer to a function that transfers
3583/// reduced variables across warps. It tunnels, through CUDA
3584/// shared memory, the thread-private data of type 'ReduceData'
3585/// from lane 0 of each warp to a lane in the first warp.
3586/// 4. Call the OpenMP runtime on the GPU to reduce across teams.
3587/// The last team writes the global reduced value to memory.
3588///
3589/// ret = __kmpc_nvptx_teams_reduce_nowait(...,
3590/// reduceData, shuffleReduceFn, interWarpCpyFn,
3591/// scratchpadCopyFn, loadAndReduceFn)
3592///
3593/// 'scratchpadCopyFn' is a helper that stores reduced
3594/// data from the team master to a scratchpad array in
3595/// global memory.
3596///
3597/// 'loadAndReduceFn' is a helper that loads data from
3598/// the scratchpad array and reduces it with the input
3599/// operand.
3600///
3601/// These compiler-generated functions hide address
3602/// calculation and alignment information from the runtime.
3603/// 5. if ret == 1:
3604/// The team master of the last team stores the reduced
3605/// result to the globals in memory.
3606/// foo += reduceData.foo; bar *= reduceData.bar
3607///
3608///
3609/// Warp Reduction Algorithms
3610///
3611/// On the warp level, we have three algorithms implemented in the
3612/// OpenMP runtime depending on the number of active lanes:
3613///
3614/// Full Warp Reduction
3615///
3616/// The reduce algorithm within a warp where all lanes are active
3617/// is implemented in the runtime as follows:
3618///
3619/// full_warp_reduce(void *reduce_data,
3620/// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
3621/// for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
3622/// ShuffleReduceFn(reduce_data, 0, offset, 0);
3623/// }
3624///
3625/// The algorithm completes in log(2, WARPSIZE) steps.
3626///
3627/// 'ShuffleReduceFn' is used here with lane_id set to 0 because lane_id is
3628/// not used; we therefore save instructions by not retrieving lane_id
3629/// from the corresponding special registers. The 4th parameter, which
3630/// represents the version of the algorithm being used, is set to 0 to
3631/// signify full warp reduction.
3632///
3633/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
3634///
3635/// #reduce_elem refers to an element in the local lane's data structure
3636/// #remote_elem is retrieved from a remote lane
3637/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
3638/// reduce_elem = reduce_elem REDUCE_OP remote_elem;
3639///
3640/// Contiguous Partial Warp Reduction
3641///
3642/// This reduce algorithm is used within a warp where only the first
3643/// 'n' (n <= WARPSIZE) lanes are active. It is typically used when the
3644/// number of OpenMP threads in a parallel region is not a multiple of
3645/// WARPSIZE. The algorithm is implemented in the runtime as follows:
3646///
3647/// void
3648/// contiguous_partial_reduce(void *reduce_data,
3649/// kmp_ShuffleReductFctPtr ShuffleReduceFn,
3650/// int size, int lane_id) {
3651/// int curr_size;
3652/// int offset;
3653/// curr_size = size;
3654/// offset = curr_size/2;
3655/// while (offset>0) {
3656/// ShuffleReduceFn(reduce_data, lane_id, offset, 1);
3657/// curr_size = (curr_size+1)/2;
3658/// offset = curr_size/2;
3659/// }
3660/// }
3661///
3662/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
3663///
3664/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
3665/// if (lane_id < offset)
3666/// reduce_elem = reduce_elem REDUCE_OP remote_elem
3667/// else
3668/// reduce_elem = remote_elem
3669///
3670/// This algorithm assumes that the data to be reduced are located in a
3671/// contiguous subset of lanes starting from the first. When there is
3672/// an odd number of active lanes, the data in the last lane is not
3673/// aggregated with any other lane's data but is instead copied over.
3674///
3675/// Dispersed Partial Warp Reduction
3676///
3677/// This algorithm is used within a warp when any discontiguous subset of
3678/// lanes are active. It is used to implement the reduction operation
3679/// across lanes in an OpenMP simd region or in a nested parallel region.
3680///
3681/// void
3682/// dispersed_partial_reduce(void *reduce_data,
3683/// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
3684/// int size, remote_id;
3685/// int logical_lane_id = number_of_active_lanes_before_me() * 2;
3686/// do {
3687/// remote_id = next_active_lane_id_right_after_me();
3688/// # the above function returns 0 if no active lane
3689/// # is present right after the current lane.
3690/// size = number_of_active_lanes_in_this_warp();
3691/// logical_lane_id /= 2;
3692/// ShuffleReduceFn(reduce_data, logical_lane_id,
3693/// remote_id-1-threadIdx.x, 2);
3694/// } while (logical_lane_id % 2 == 0 && size > 1);
3695/// }
3696///
3697/// There is no assumption made about the initial state of the reduction.
3698/// Any number of lanes (>=1) could be active at any position. The reduction
3699/// result is returned in the first active lane.
3700///
3701/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
3702///
3703/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
3704/// if (lane_id % 2 == 0 && offset > 0)
3705/// reduce_elem = reduce_elem REDUCE_OP remote_elem
3706/// else
3707/// reduce_elem = remote_elem
3708///
3709///
3710/// Intra-Team Reduction
3711///
3712/// This function, as implemented in the runtime call
3713/// '__kmpc_nvptx_parallel_reduce_nowait_v2', aggregates data across OpenMP
3714/// threads in a team. It first reduces within a warp using the
3715/// aforementioned algorithms. We then proceed to gather all such
3716/// reduced values at the first warp.
3717///
3718/// The runtime makes use of the function 'InterWarpCpyFn', which copies
3719/// data from each of the "warp masters" (the zeroth lane of each warp, where
3720/// the warp-reduced data is held) to the zeroth warp. This step reduces (in
3721/// a mathematical sense) the problem of reduction across warp masters in
3722/// a block to the problem of warp reduction.
3723///
3724///
3725/// Inter-Team Reduction
3726///
3727/// Once a team has reduced its data to a single value, it is stored in
3728/// a global scratchpad array. Since each team has a distinct slot, this
3729/// can be done without locking.
3730///
3731/// The last team to write to the scratchpad array proceeds to reduce the
3732/// scratchpad array. One or more workers in the last team use the helper
3733/// 'loadAndReduceFn' to load and reduce values from the array, i.e.,
3734/// the k'th worker reduces every k'th element.
3735///
3736/// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait_v2' to
3737/// reduce across workers and compute a globally reduced value.
3738///
3739void CGOpenMPRuntimeGPU::emitReduction(
3740 CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> Privates,
3741 ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs,
3742 ArrayRef<const Expr *> ReductionOps, ReductionOptionsTy Options) {
3743 if (!CGF.HaveInsertPoint())
3744 return;
3745
3746 bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind);
3747#ifndef NDEBUG
3748 bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind);
3749#endif
3750
3751 if (Options.SimpleReduction) {
3752 assert(!TeamsReduction && !ParallelReduction &&
3753 "Invalid reduction selection in emitReduction.");
3754 CGOpenMPRuntime::emitReduction(CGF, Loc, Privates, LHSExprs, RHSExprs,
3755 ReductionOps, Options);
3756 return;
3757 }
3758
3759 assert((TeamsReduction || ParallelReduction) &&
3760 "Invalid reduction selection in emitReduction.");
3761
3762 // Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
3763 // RedList, shuffle_reduce_func, interwarp_copy_func);
3764 // or
3765 // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>);
3766 llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
3767 llvm::Value *ThreadId = getThreadID(CGF, Loc);
3768
3769 llvm::Value *Res;
3770 ASTContext &C = CGM.getContext();
3771 // 1. Build a list of reduction variables.
3772 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]}; 3773 auto Size = RHSExprs.size(); 3774 for (const Expr *E : Privates) { 3775 if (E->getType()->isVariablyModifiedType()) 3776 // Reserve place for array size. 3777 ++Size; 3778 } 3779 llvm::APInt ArraySize(/*unsigned int numBits=*/32, Size); 3780 QualType ReductionArrayTy = 3781 C.getConstantArrayType(C.VoidPtrTy, ArraySize, nullptr, ArrayType::Normal, 3782 /*IndexTypeQuals=*/0); 3783 Address ReductionList = 3784 CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list"); 3785 auto IPriv = Privates.begin(); 3786 unsigned Idx = 0; 3787 for (unsigned I = 0, E = RHSExprs.size(); I < E; ++I, ++IPriv, ++Idx) { 3788 Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx); 3789 CGF.Builder.CreateStore( 3790 CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( 3791 CGF.EmitLValue(RHSExprs[I]).getPointer(CGF), CGF.VoidPtrTy), 3792 Elem); 3793 if ((*IPriv)->getType()->isVariablyModifiedType()) { 3794 // Store array size. 3795 ++Idx; 3796 Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx); 3797 llvm::Value *Size = CGF.Builder.CreateIntCast( 3798 CGF.getVLASize( 3799 CGF.getContext().getAsVariableArrayType((*IPriv)->getType())) 3800 .NumElts, 3801 CGF.SizeTy, /*isSigned=*/false); 3802 CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy), 3803 Elem); 3804 } 3805 } 3806 3807 llvm::Value *RL = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( 3808 ReductionList.getPointer(), CGF.VoidPtrTy); 3809 llvm::Function *ReductionFn = emitReductionFunction( 3810 Loc, CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo(), Privates, 3811 LHSExprs, RHSExprs, ReductionOps); 3812 llvm::Value *ReductionArrayTySize = CGF.getTypeSize(ReductionArrayTy); 3813 llvm::Function *ShuffleAndReduceFn = emitShuffleAndReduceFunction( 3814 CGM, Privates, ReductionArrayTy, ReductionFn, Loc); 3815 llvm::Value *InterWarpCopyFn = 3816 emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy, Loc); 3817 3818 if (ParallelReduction) { 3819 llvm::Value *Args[] = {RTLoc, 3820 ThreadId, 3821 CGF.Builder.getInt32(RHSExprs.size()), 3822 ReductionArrayTySize, 3823 RL, 3824 ShuffleAndReduceFn, 3825 InterWarpCopyFn}; 3826 3827 Res = CGF.EmitRuntimeCall( 3828 OMPBuilder.getOrCreateRuntimeFunction( 3829 CGM.getModule(), OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2), 3830 Args); 3831 } else { 3832 assert(TeamsReduction && "expected teams reduction."); 3833 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> VarFieldMap; 3834 llvm::SmallVector<const ValueDecl *, 4> PrivatesReductions(Privates.size()); 3835 int Cnt = 0; 3836 for (const Expr *DRE : Privates) { 3837 PrivatesReductions[Cnt] = cast<DeclRefExpr>(DRE)->getDecl(); 3838 ++Cnt; 3839 } 3840 const RecordDecl *TeamReductionRec = ::buildRecordForGlobalizedVars( 3841 CGM.getContext(), PrivatesReductions, llvm::None, VarFieldMap, 3842 C.getLangOpts().OpenMPCUDAReductionBufNum); 3843 TeamsReductions.push_back(TeamReductionRec); 3844 if (!KernelTeamsReductionPtr) { 3845 KernelTeamsReductionPtr = new llvm::GlobalVariable( 3846 CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/true, 3847 llvm::GlobalValue::InternalLinkage, nullptr, 3848 "_openmp_teams_reductions_buffer_$_$ptr"); 3849 } 3850 llvm::Value *GlobalBufferPtr = CGF.EmitLoadOfScalar( 3851 Address(KernelTeamsReductionPtr, CGM.getPointerAlign()), 3852 /*Volatile=*/false, C.getPointerType(C.VoidPtrTy), Loc); 3853 llvm::Value *GlobalToBufferCpyFn = ::emitListToGlobalCopyFunction( 3854 CGM, Privates, ReductionArrayTy, Loc, 
TeamReductionRec, VarFieldMap); 3855 llvm::Value *GlobalToBufferRedFn = ::emitListToGlobalReduceFunction( 3856 CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap, 3857 ReductionFn); 3858 llvm::Value *BufferToGlobalCpyFn = ::emitGlobalToListCopyFunction( 3859 CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap); 3860 llvm::Value *BufferToGlobalRedFn = ::emitGlobalToListReduceFunction( 3861 CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap, 3862 ReductionFn); 3863 3864 llvm::Value *Args[] = { 3865 RTLoc, 3866 ThreadId, 3867 GlobalBufferPtr, 3868 CGF.Builder.getInt32(C.getLangOpts().OpenMPCUDAReductionBufNum), 3869 RL, 3870 ShuffleAndReduceFn, 3871 InterWarpCopyFn, 3872 GlobalToBufferCpyFn, 3873 GlobalToBufferRedFn, 3874 BufferToGlobalCpyFn, 3875 BufferToGlobalRedFn}; 3876 3877 Res = CGF.EmitRuntimeCall( 3878 OMPBuilder.getOrCreateRuntimeFunction( 3879 CGM.getModule(), OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2), 3880 Args); 3881 } 3882 3883 // 5. Build if (res == 1) 3884 llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".omp.reduction.done"); 3885 llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".omp.reduction.then"); 3886 llvm::Value *Cond = CGF.Builder.CreateICmpEQ( 3887 Res, llvm::ConstantInt::get(CGM.Int32Ty, /*V=*/1)); 3888 CGF.Builder.CreateCondBr(Cond, ThenBB, ExitBB); 3889 3890 // 6. Build then branch: where we have reduced values in the master 3891 // thread in each team. 3892 // __kmpc_end_reduce{_nowait}(<gtid>); 3893 // break; 3894 CGF.EmitBlock(ThenBB); 3895 3896 // Add emission of __kmpc_end_reduce{_nowait}(<gtid>); 3897 auto &&CodeGen = [Privates, LHSExprs, RHSExprs, ReductionOps, 3898 this](CodeGenFunction &CGF, PrePostActionTy &Action) { 3899 auto IPriv = Privates.begin(); 3900 auto ILHS = LHSExprs.begin(); 3901 auto IRHS = RHSExprs.begin(); 3902 for (const Expr *E : ReductionOps) { 3903 emitSingleReductionCombiner(CGF, E, *IPriv, cast<DeclRefExpr>(*ILHS), 3904 cast<DeclRefExpr>(*IRHS)); 3905 ++IPriv; 3906 ++ILHS; 3907 ++IRHS; 3908 } 3909 }; 3910 llvm::Value *EndArgs[] = {ThreadId}; 3911 RegionCodeGenTy RCG(CodeGen); 3912 NVPTXActionTy Action( 3913 nullptr, llvm::None, 3914 OMPBuilder.getOrCreateRuntimeFunction( 3915 CGM.getModule(), OMPRTL___kmpc_nvptx_end_reduce_nowait), 3916 EndArgs); 3917 RCG.setAction(Action); 3918 RCG(CGF); 3919 // There is no need to emit line number for unconditional branch. 
3920 (void)ApplyDebugLocation::CreateEmpty(CGF); 3921 CGF.EmitBlock(ExitBB, /*IsFinished=*/true); 3922} 3923 3924const VarDecl * 3925CGOpenMPRuntimeGPU::translateParameter(const FieldDecl *FD, 3926 const VarDecl *NativeParam) const { 3927 if (!NativeParam->getType()->isReferenceType()) 3928 return NativeParam; 3929 QualType ArgType = NativeParam->getType(); 3930 QualifierCollector QC; 3931 const Type *NonQualTy = QC.strip(ArgType); 3932 QualType PointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType(); 3933 if (const auto *Attr = FD->getAttr<OMPCaptureKindAttr>()) { 3934 if (Attr->getCaptureKind() == OMPC_map) { 3935 PointeeTy = CGM.getContext().getAddrSpaceQualType(PointeeTy, 3936 LangAS::opencl_global); 3937 } else if (Attr->getCaptureKind() == OMPC_firstprivate && 3938 PointeeTy.isConstant(CGM.getContext())) { 3939 PointeeTy = CGM.getContext().getAddrSpaceQualType(PointeeTy, 3940 LangAS::opencl_generic); 3941 } 3942 } 3943 ArgType = CGM.getContext().getPointerType(PointeeTy); 3944 QC.addRestrict(); 3945 enum { NVPTX_local_addr = 5 }; 3946 QC.addAddressSpace(getLangASFromTargetAS(NVPTX_local_addr)); 3947 ArgType = QC.apply(CGM.getContext(), ArgType); 3948 if (isa<ImplicitParamDecl>(NativeParam)) 3949 return ImplicitParamDecl::Create( 3950 CGM.getContext(), /*DC=*/nullptr, NativeParam->getLocation(), 3951 NativeParam->getIdentifier(), ArgType, ImplicitParamDecl::Other); 3952 return ParmVarDecl::Create( 3953 CGM.getContext(), 3954 const_cast<DeclContext *>(NativeParam->getDeclContext()), 3955 NativeParam->getBeginLoc(), NativeParam->getLocation(), 3956 NativeParam->getIdentifier(), ArgType, 3957 /*TInfo=*/nullptr, SC_None, /*DefArg=*/nullptr); 3958} 3959 3960Address 3961CGOpenMPRuntimeGPU::getParameterAddress(CodeGenFunction &CGF, 3962 const VarDecl *NativeParam, 3963 const VarDecl *TargetParam) const { 3964 assert(NativeParam != TargetParam && 3965 NativeParam->getType()->isReferenceType() && 3966 "Native arg must not be the same as target arg."); 3967 Address LocalAddr = CGF.GetAddrOfLocalVar(TargetParam); 3968 QualType NativeParamType = NativeParam->getType(); 3969 QualifierCollector QC; 3970 const Type *NonQualTy = QC.strip(NativeParamType); 3971 QualType NativePointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType(); 3972 unsigned NativePointeeAddrSpace = 3973 CGF.getContext().getTargetAddressSpace(NativePointeeTy); 3974 QualType TargetTy = TargetParam->getType(); 3975 llvm::Value *TargetAddr = CGF.EmitLoadOfScalar( 3976 LocalAddr, /*Volatile=*/false, TargetTy, SourceLocation()); 3977 // First cast to generic. 3978 TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( 3979 TargetAddr, TargetAddr->getType()->getPointerElementType()->getPointerTo( 3980 /*AddrSpace=*/0)); 3981 // Cast from generic to native address space. 
3982 TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3983 TargetAddr, TargetAddr->getType()->getPointerElementType()->getPointerTo(
3984 NativePointeeAddrSpace));
3985 Address NativeParamAddr = CGF.CreateMemTemp(NativeParamType);
3986 CGF.EmitStoreOfScalar(TargetAddr, NativeParamAddr, /*Volatile=*/false,
3987 NativeParamType);
3988 return NativeParamAddr;
3989}
3990
3991void CGOpenMPRuntimeGPU::emitOutlinedFunctionCall(
3992 CodeGenFunction &CGF, SourceLocation Loc, llvm::FunctionCallee OutlinedFn,
3993 ArrayRef<llvm::Value *> Args) const {
3994 SmallVector<llvm::Value *, 4> TargetArgs;
3995 TargetArgs.reserve(Args.size());
3996 auto *FnType = OutlinedFn.getFunctionType();
3997 for (unsigned I = 0, E = Args.size(); I < E; ++I) {
3998 if (FnType->isVarArg() && FnType->getNumParams() <= I) {
3999 TargetArgs.append(std::next(Args.begin(), I), Args.end());
4000 break;
4001 }
4002 llvm::Type *TargetType = FnType->getParamType(I);
4003 llvm::Value *NativeArg = Args[I];
4004 if (!TargetType->isPointerTy()) {
4005 TargetArgs.emplace_back(NativeArg);
4006 continue;
4007 }
4008 llvm::Value *TargetArg = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
4009 NativeArg,
4010 NativeArg->getType()->getPointerElementType()->getPointerTo());
4011 TargetArgs.emplace_back(
4012 CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(TargetArg, TargetType));
4013 }
4014 CGOpenMPRuntime::emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, TargetArgs);
4015}
4016
4017/// Emit a function which wraps the outlined parallel region
4018/// and controls the arguments which are passed to this function.
4019/// The wrapper ensures that the outlined function is called
4020/// with the correct arguments when data is shared.
4021llvm::Function *CGOpenMPRuntimeGPU::createParallelDataSharingWrapper(
4022 llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) {
4023 ASTContext &Ctx = CGM.getContext();
4024 const auto &CS = *D.getCapturedStmt(OMPD_parallel);
4025
4026 // Create a function that takes as argument the source thread.
4027 FunctionArgList WrapperArgs;
4028 QualType Int16QTy =
4029 Ctx.getIntTypeForBitwidth(/*DestWidth=*/16, /*Signed=*/false);
4030 QualType Int32QTy =
4031 Ctx.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/false);
4032 ImplicitParamDecl ParallelLevelArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
4033 /*Id=*/nullptr, Int16QTy,
4034 ImplicitParamDecl::Other);
4035 ImplicitParamDecl WrapperArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
4036 /*Id=*/nullptr, Int32QTy,
4037 ImplicitParamDecl::Other);
4038 WrapperArgs.emplace_back(&ParallelLevelArg);
4039 WrapperArgs.emplace_back(&WrapperArg);
4040
4041 const CGFunctionInfo &CGFI =
4042 CGM.getTypes().arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, WrapperArgs);
4043
4044 auto *Fn = llvm::Function::Create(
4045 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
4046 Twine(OutlinedParallelFn->getName(), "_wrapper"), &CGM.getModule());
4047
4048 // Ensure we do not inline the function. This is trivially true for the ones
4049 // passed to __kmpc_fork_call but the ones called in serialized regions
4050 // could be inlined. This is not perfect, but it is closer to the invariant
4051 // we want, namely, every data environment starts with a new function.
4052 // TODO: We should pass the if condition to the runtime function and do the
4053 // handling there. Much cleaner code.
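 // For instance (illustrative only), under '#pragma omp parallel if(0)' the
 // wrapper can be reached without going through __kmpc_fork_call, and
 // inlining it would fold the serialized region's data environment into its
 // caller.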
4054 Fn->addFnAttr(llvm::Attribute::NoInline); 4055 4056 CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI); 4057 Fn->setLinkage(llvm::GlobalValue::InternalLinkage); 4058 Fn->setDoesNotRecurse(); 4059 4060 CodeGenFunction CGF(CGM, /*suppressNewContext=*/true); 4061 CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, Fn, CGFI, WrapperArgs, 4062 D.getBeginLoc(), D.getBeginLoc()); 4063 4064 const auto *RD = CS.getCapturedRecordDecl(); 4065 auto CurField = RD->field_begin(); 4066 4067 Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty, 4068 /*Name=*/".zero.addr"); 4069 CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0)); 4070 // Get the array of arguments. 4071 SmallVector<llvm::Value *, 8> Args; 4072 4073 Args.emplace_back(CGF.GetAddrOfLocalVar(&WrapperArg).getPointer()); 4074 Args.emplace_back(ZeroAddr.getPointer()); 4075 4076 CGBuilderTy &Bld = CGF.Builder; 4077 auto CI = CS.capture_begin(); 4078 4079 // Use global memory for data sharing. 4080 // Handle passing of global args to workers. 4081 Address GlobalArgs = 4082 CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "global_args"); 4083 llvm::Value *GlobalArgsPtr = GlobalArgs.getPointer(); 4084 llvm::Value *DataSharingArgs[] = {GlobalArgsPtr}; 4085 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( 4086 CGM.getModule(), OMPRTL___kmpc_get_shared_variables), 4087 DataSharingArgs); 4088 4089 // Retrieve the shared variables from the list of references returned 4090 // by the runtime. Pass the variables to the outlined function. 4091 Address SharedArgListAddress = Address::invalid(); 4092 if (CS.capture_size() > 0 || 4093 isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) { 4094 SharedArgListAddress = CGF.EmitLoadOfPointer( 4095 GlobalArgs, CGF.getContext() 4096 .getPointerType(CGF.getContext().getPointerType( 4097 CGF.getContext().VoidPtrTy)) 4098 .castAs<PointerType>()); 4099 } 4100 unsigned Idx = 0; 4101 if (isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) { 4102 Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx); 4103 Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast( 4104 Src, CGF.SizeTy->getPointerTo()); 4105 llvm::Value *LB = CGF.EmitLoadOfScalar( 4106 TypedAddress, 4107 /*Volatile=*/false, 4108 CGF.getContext().getPointerType(CGF.getContext().getSizeType()), 4109 cast<OMPLoopDirective>(D).getLowerBoundVariable()->getExprLoc()); 4110 Args.emplace_back(LB); 4111 ++Idx; 4112 Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx); 4113 TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast( 4114 Src, CGF.SizeTy->getPointerTo()); 4115 llvm::Value *UB = CGF.EmitLoadOfScalar( 4116 TypedAddress, 4117 /*Volatile=*/false, 4118 CGF.getContext().getPointerType(CGF.getContext().getSizeType()), 4119 cast<OMPLoopDirective>(D).getUpperBoundVariable()->getExprLoc()); 4120 Args.emplace_back(UB); 4121 ++Idx; 4122 } 4123 if (CS.capture_size() > 0) { 4124 ASTContext &CGFContext = CGF.getContext(); 4125 for (unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CI, ++CurField) { 4126 QualType ElemTy = CurField->getType(); 4127 Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, I + Idx); 4128 Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast( 4129 Src, CGF.ConvertTypeForMem(CGFContext.getPointerType(ElemTy))); 4130 llvm::Value *Arg = CGF.EmitLoadOfScalar(TypedAddress, 4131 /*Volatile=*/false, 4132 CGFContext.getPointerType(ElemTy), 4133 CI->getLocation()); 4134 if (CI->capturesVariableByCopy() && 4135 !CI->getCapturedVar()->getType()->isAnyPointerType()) { 
4136 Arg = castValueToType(CGF, Arg, ElemTy, CGFContext.getUIntPtrType(), 4137 CI->getLocation()); 4138 } 4139 Args.emplace_back(Arg); 4140 } 4141 } 4142 4143 emitOutlinedFunctionCall(CGF, D.getBeginLoc(), OutlinedParallelFn, Args); 4144 CGF.FinishFunction(); 4145 return Fn; 4146} 4147 4148void CGOpenMPRuntimeGPU::emitFunctionProlog(CodeGenFunction &CGF, 4149 const Decl *D) { 4150 if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic) 4151 return; 4152 4153 assert(D && "Expected function or captured|block decl."); 4154 assert(FunctionGlobalizedDecls.count(CGF.CurFn) == 0 && 4155 "Function is registered already."); 4156 assert((!TeamAndReductions.first || TeamAndReductions.first == D) && 4157 "Team is set but not processed."); 4158 const Stmt *Body = nullptr; 4159 bool NeedToDelayGlobalization = false; 4160 if (const auto *FD = dyn_cast<FunctionDecl>(D)) { 4161 Body = FD->getBody(); 4162 } else if (const auto *BD = dyn_cast<BlockDecl>(D)) { 4163 Body = BD->getBody(); 4164 } else if (const auto *CD = dyn_cast<CapturedDecl>(D)) { 4165 Body = CD->getBody(); 4166 NeedToDelayGlobalization = CGF.CapturedStmtInfo->getKind() == CR_OpenMP; 4167 if (NeedToDelayGlobalization && 4168 getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) 4169 return; 4170 } 4171 if (!Body) 4172 return; 4173 CheckVarsEscapingDeclContext VarChecker(CGF, TeamAndReductions.second); 4174 VarChecker.Visit(Body); 4175 const RecordDecl *GlobalizedVarsRecord = 4176 VarChecker.getGlobalizedRecord(IsInTTDRegion); 4177 TeamAndReductions.first = nullptr; 4178 TeamAndReductions.second.clear(); 4179 ArrayRef<const ValueDecl *> EscapedVariableLengthDecls = 4180 VarChecker.getEscapedVariableLengthDecls(); 4181 if (!GlobalizedVarsRecord && EscapedVariableLengthDecls.empty()) 4182 return; 4183 auto I = FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first; 4184 I->getSecond().MappedParams = 4185 std::make_unique<CodeGenFunction::OMPMapVars>(); 4186 I->getSecond().GlobalRecord = GlobalizedVarsRecord; 4187 I->getSecond().EscapedParameters.insert( 4188 VarChecker.getEscapedParameters().begin(), 4189 VarChecker.getEscapedParameters().end()); 4190 I->getSecond().EscapedVariableLengthDecls.append( 4191 EscapedVariableLengthDecls.begin(), EscapedVariableLengthDecls.end()); 4192 DeclToAddrMapTy &Data = I->getSecond().LocalVarData; 4193 for (const ValueDecl *VD : VarChecker.getEscapedDecls()) { 4194 assert(VD->isCanonicalDecl() && "Expected canonical declaration"); 4195 const FieldDecl *FD = VarChecker.getFieldForGlobalizedVar(VD); 4196 Data.insert(std::make_pair(VD, MappedVarData(FD, IsInTTDRegion))); 4197 } 4198 if (!IsInTTDRegion && !NeedToDelayGlobalization && !IsInParallelRegion) { 4199 CheckVarsEscapingDeclContext VarChecker(CGF, llvm::None); 4200 VarChecker.Visit(Body); 4201 I->getSecond().SecondaryGlobalRecord = 4202 VarChecker.getGlobalizedRecord(/*IsInTTDRegion=*/true); 4203 I->getSecond().SecondaryLocalVarData.emplace(); 4204 DeclToAddrMapTy &Data = I->getSecond().SecondaryLocalVarData.getValue(); 4205 for (const ValueDecl *VD : VarChecker.getEscapedDecls()) { 4206 assert(VD->isCanonicalDecl() && "Expected canonical declaration"); 4207 const FieldDecl *FD = VarChecker.getFieldForGlobalizedVar(VD); 4208 Data.insert( 4209 std::make_pair(VD, MappedVarData(FD, /*IsInTTDRegion=*/true))); 4210 } 4211 } 4212 if (!NeedToDelayGlobalization) { 4213 emitGenericVarsProlog(CGF, D->getBeginLoc(), /*WithSPMDCheck=*/true); 4214 struct GlobalizationScope final : EHScopeStack::Cleanup { 4215 GlobalizationScope() = default; 4216 4217 void 
Emit(CodeGenFunction &CGF, Flags flags) override {
4218 static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime())
4219 .emitGenericVarsEpilog(CGF, /*WithSPMDCheck=*/true);
4220 }
4221 };
4222 CGF.EHStack.pushCleanup<GlobalizationScope>(NormalAndEHCleanup);
4223 }
4224}
4225
4226Address CGOpenMPRuntimeGPU::getAddressOfLocalVariable(CodeGenFunction &CGF,
4227 const VarDecl *VD) {
4228 if (VD && VD->hasAttr<OMPAllocateDeclAttr>()) {
4229 const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
4230 auto AS = LangAS::Default;
4231 switch (A->getAllocatorType()) {
4232 // Use the default allocator here as by default local vars are
4233 // thread-local.
4234 case OMPAllocateDeclAttr::OMPNullMemAlloc:
4235 case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
4236 case OMPAllocateDeclAttr::OMPThreadMemAlloc:
4237 case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
4238 case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
4239 // Follow the user decision - use default allocation.
4240 return Address::invalid();
4241 case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
4242 // TODO: implement support for user-defined allocators.
4243 return Address::invalid();
4244 case OMPAllocateDeclAttr::OMPConstMemAlloc:
4245 AS = LangAS::cuda_constant;
4246 break;
4247 case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
4248 AS = LangAS::cuda_shared;
4249 break;
4250 case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
4251 case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
4252 break;
4253 }
4254 llvm::Type *VarTy = CGF.ConvertTypeForMem(VD->getType());
4255 auto *GV = new llvm::GlobalVariable(
4256 CGM.getModule(), VarTy, /*isConstant=*/false,
4257 llvm::GlobalValue::InternalLinkage, llvm::Constant::getNullValue(VarTy),
4258 VD->getName(),
4259 /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
4260 CGM.getContext().getTargetAddressSpace(AS));
4261 CharUnits Align = CGM.getContext().getDeclAlign(VD);
4262 GV->setAlignment(Align.getAsAlign());
4263 return Address(
4264 CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
4265 GV, VarTy->getPointerTo(CGM.getContext().getTargetAddressSpace(
4266 VD->getType().getAddressSpace()))),
4267 Align);
4268 }
4269
4270 if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic)
4271 return Address::invalid();
4272
4273 VD = VD->getCanonicalDecl();
4274 auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
4275 if (I == FunctionGlobalizedDecls.end())
4276 return Address::invalid();
4277 auto VDI = I->getSecond().LocalVarData.find(VD);
4278 if (VDI != I->getSecond().LocalVarData.end())
4279 return VDI->second.PrivateAddr;
4280 if (VD->hasAttrs()) {
4281 for (specific_attr_iterator<OMPReferencedVarAttr> IT(VD->attr_begin()),
4282 E(VD->attr_end());
4283 IT != E; ++IT) {
4284 auto VDI = I->getSecond().LocalVarData.find(
4285 cast<VarDecl>(cast<DeclRefExpr>(IT->getRef())->getDecl())
4286 ->getCanonicalDecl());
4287 if (VDI != I->getSecond().LocalVarData.end())
4288 return VDI->second.PrivateAddr;
4289 }
4290 }
4291
4292 return Address::invalid();
4293}
4294
4295void CGOpenMPRuntimeGPU::functionFinished(CodeGenFunction &CGF) {
4296 FunctionGlobalizedDecls.erase(CGF.CurFn);
4297 CGOpenMPRuntime::functionFinished(CGF);
4298}
4299
4300void CGOpenMPRuntimeGPU::getDefaultDistScheduleAndChunk(
4301 CodeGenFunction &CGF, const OMPLoopDirective &S,
4302 OpenMPDistScheduleClauseKind &ScheduleKind,
4303 llvm::Value *&Chunk) const {
4304 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
4305 if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
4306 ScheduleKind = OMPC_DIST_SCHEDULE_static;
4307 Chunk =

void CGOpenMPRuntimeGPU::functionFinished(CodeGenFunction &CGF) {
  FunctionGlobalizedDecls.erase(CGF.CurFn);
  CGOpenMPRuntime::functionFinished(CGF);
}

void CGOpenMPRuntimeGPU::getDefaultDistScheduleAndChunk(
    CodeGenFunction &CGF, const OMPLoopDirective &S,
    OpenMPDistScheduleClauseKind &ScheduleKind,
    llvm::Value *&Chunk) const {
  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
  if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
    ScheduleKind = OMPC_DIST_SCHEDULE_static;
    Chunk = CGF.EmitScalarConversion(
        RT.getGPUNumThreads(CGF),
        CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
        S.getIterationVariable()->getType(), S.getBeginLoc());
    return;
  }
  CGOpenMPRuntime::getDefaultDistScheduleAndChunk(CGF, S, ScheduleKind, Chunk);
}

void CGOpenMPRuntimeGPU::getDefaultScheduleAndChunk(
    CodeGenFunction &CGF, const OMPLoopDirective &S,
    OpenMPScheduleClauseKind &ScheduleKind,
    const Expr *&ChunkExpr) const {
  ScheduleKind = OMPC_SCHEDULE_static;
  // Chunk size is 1 in this case.
  llvm::APInt ChunkSize(32, 1);
  ChunkExpr = IntegerLiteral::Create(
      CGF.getContext(), ChunkSize,
      CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
      SourceLocation());
}
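
// Worked example of the defaults above (illustrative): in SPMD mode a
// 'distribute' loop defaults to dist_schedule(static, <GPU thread count>),
// and worksharing loops default to schedule(static, 1), so consecutive
// iterations land on consecutive threads and global memory accesses coalesce.
// Conceptually:
//
//   #pragma omp target teams distribute parallel for
//   for (int I = 0; I < N; ++I)
//     Out[I] = In[I];
//
// behaves as if dist_schedule(static, NumThreads) schedule(static, 1) had
// been written explicitly.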

void CGOpenMPRuntimeGPU::adjustTargetSpecificDataForLambdas(
    CodeGenFunction &CGF, const OMPExecutableDirective &D) const {
  assert(isOpenMPTargetExecutionDirective(D.getDirectiveKind()) &&
         "Expected target-based directive.");
  const CapturedStmt *CS = D.getCapturedStmt(OMPD_target);
  for (const CapturedStmt::Capture &C : CS->captures()) {
    // Capture variables captured by reference in lambdas for target-based
    // directives.
    if (!C.capturesVariable())
      continue;
    const VarDecl *VD = C.getCapturedVar();
    const auto *RD = VD->getType()
                         .getCanonicalType()
                         .getNonReferenceType()
                         ->getAsCXXRecordDecl();
    if (!RD || !RD->isLambda())
      continue;
    Address VDAddr = CGF.GetAddrOfLocalVar(VD);
    LValue VDLVal;
    if (VD->getType().getCanonicalType()->isReferenceType())
      VDLVal = CGF.EmitLoadOfReferenceLValue(VDAddr, VD->getType());
    else
      VDLVal = CGF.MakeAddrLValue(
          VDAddr, VD->getType().getCanonicalType().getNonReferenceType());
    llvm::DenseMap<const VarDecl *, FieldDecl *> Captures;
    FieldDecl *ThisCapture = nullptr;
    RD->getCaptureFields(Captures, ThisCapture);
    if (ThisCapture && CGF.CapturedStmtInfo->isCXXThisExprCaptured()) {
      LValue ThisLVal =
          CGF.EmitLValueForFieldInitialization(VDLVal, ThisCapture);
      llvm::Value *CXXThis = CGF.LoadCXXThis();
      CGF.EmitStoreOfScalar(CXXThis, ThisLVal);
    }
    for (const LambdaCapture &LC : RD->captures()) {
      if (LC.getCaptureKind() != LCK_ByRef)
        continue;
      const VarDecl *VD = LC.getCapturedVar();
      if (!CS->capturesVariable(VD))
        continue;
      auto It = Captures.find(VD);
      assert(It != Captures.end() && "Found lambda capture without field.");
      LValue VarLVal = CGF.EmitLValueForFieldInitialization(VDLVal, It->second);
      Address VDAddr = CGF.GetAddrOfLocalVar(VD);
      if (VD->getType().getCanonicalType()->isReferenceType())
        VDAddr = CGF.EmitLoadOfReferenceLValue(VDAddr,
                                               VD->getType().getCanonicalType())
                     .getAddress(CGF);
      CGF.EmitStoreOfScalar(VDAddr.getPointer(), VarLVal);
    }
  }
}

unsigned CGOpenMPRuntimeGPU::getDefaultFirstprivateAddressSpace() const {
  return CGM.getContext().getTargetAddressSpace(LangAS::cuda_constant);
}

bool CGOpenMPRuntimeGPU::hasAllocateAttributeForGlobalVar(const VarDecl *VD,
                                                          LangAS &AS) {
  if (!VD || !VD->hasAttr<OMPAllocateDeclAttr>())
    return false;
  const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
  switch (A->getAllocatorType()) {
  case OMPAllocateDeclAttr::OMPNullMemAlloc:
  case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
    // Not supported, fall back to the default mem space.
  case OMPAllocateDeclAttr::OMPThreadMemAlloc:
  case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
  case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
  case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
  case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
    AS = LangAS::Default;
    return true;
  case OMPAllocateDeclAttr::OMPConstMemAlloc:
    AS = LangAS::cuda_constant;
    return true;
  case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
    AS = LangAS::cuda_shared;
    return true;
  case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
    llvm_unreachable("Expected predefined allocator for variables with static "
                     "storage.");
  }
  return false;
}

// Get the current CudaArch and ignore any unknown values.
static CudaArch getCudaArch(CodeGenModule &CGM) {
  if (!CGM.getTarget().hasFeature("ptx"))
    return CudaArch::UNKNOWN;
  for (const auto &Feature : CGM.getTarget().getTargetOpts().FeatureMap) {
    if (Feature.getValue()) {
      CudaArch Arch = StringToCudaArch(Feature.getKey());
      if (Arch != CudaArch::UNKNOWN)
        return Arch;
    }
  }
  return CudaArch::UNKNOWN;
}

/// Check to see if the target architecture supports unified addressing, which
/// is a requirement for the OpenMP requires clause "unified_shared_memory".
void CGOpenMPRuntimeGPU::processRequiresDirective(const OMPRequiresDecl *D) {
  for (const OMPClause *Clause : D->clauselists()) {
    if (Clause->getClauseKind() == OMPC_unified_shared_memory) {
      CudaArch Arch = getCudaArch(CGM);
      switch (Arch) {
      case CudaArch::SM_20:
      case CudaArch::SM_21:
      case CudaArch::SM_30:
      case CudaArch::SM_32:
      case CudaArch::SM_35:
      case CudaArch::SM_37:
      case CudaArch::SM_50:
      case CudaArch::SM_52:
      case CudaArch::SM_53: {
        SmallString<256> Buffer;
        llvm::raw_svector_ostream Out(Buffer);
        Out << "Target architecture " << CudaArchToString(Arch)
            << " does not support unified addressing";
        CGM.Error(Clause->getBeginLoc(), Out.str());
        return;
      }
      case CudaArch::SM_60:
      case CudaArch::SM_61:
      case CudaArch::SM_62:
      case CudaArch::SM_70:
      case CudaArch::SM_72:
      case CudaArch::SM_75:
      case CudaArch::SM_80:
      case CudaArch::SM_86:
      case CudaArch::GFX600:
      case CudaArch::GFX601:
      case CudaArch::GFX602:
      case CudaArch::GFX700:
      case CudaArch::GFX701:
      case CudaArch::GFX702:
      case CudaArch::GFX703:
      case CudaArch::GFX704:
      case CudaArch::GFX705:
      case CudaArch::GFX801:
      case CudaArch::GFX802:
      case CudaArch::GFX803:
      case CudaArch::GFX805:
      case CudaArch::GFX810:
      case CudaArch::GFX900:
      case CudaArch::GFX902:
      case CudaArch::GFX904:
      case CudaArch::GFX906:
      case CudaArch::GFX908:
      case CudaArch::GFX909:
      case CudaArch::GFX90a:
      case CudaArch::GFX90c:
      case CudaArch::GFX1010:
      case CudaArch::GFX1011:
      case CudaArch::GFX1012:
      case CudaArch::GFX1030:
      case CudaArch::GFX1031:
      case CudaArch::GFX1032:
      case CudaArch::GFX1033:
      case CudaArch::GFX1034:
      case CudaArch::UNUSED:
      case CudaArch::UNKNOWN:
        break;
      case CudaArch::LAST:
        llvm_unreachable("Unexpected Cuda arch.");
      }
    }
  }
  CGOpenMPRuntime::processRequiresDirective(D);
}
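
// Illustrative use of the check above: a translation unit containing
//
//   #pragma omp requires unified_shared_memory
//
// is rejected with "Target architecture sm_35 does not support unified
// addressing" when compiled for sm_35 (or any other pre-sm_60 target), while
// sm_60 and newer, and the listed AMDGCN targets, compile without complaint.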

/// Get the number of SMs and the number of blocks per SM.
static std::pair<unsigned, unsigned> getSMsBlocksPerSM(CodeGenModule &CGM) {
  std::pair<unsigned, unsigned> Data;
  if (CGM.getLangOpts().OpenMPCUDANumSMs)
    Data.first = CGM.getLangOpts().OpenMPCUDANumSMs;
  if (CGM.getLangOpts().OpenMPCUDABlocksPerSM)
    Data.second = CGM.getLangOpts().OpenMPCUDABlocksPerSM;
  if (Data.first && Data.second)
    return Data;
  switch (getCudaArch(CGM)) {
  case CudaArch::SM_20:
  case CudaArch::SM_21:
  case CudaArch::SM_30:
  case CudaArch::SM_32:
  case CudaArch::SM_35:
  case CudaArch::SM_37:
  case CudaArch::SM_50:
  case CudaArch::SM_52:
  case CudaArch::SM_53:
    return {16, 16};
  case CudaArch::SM_60:
  case CudaArch::SM_61:
  case CudaArch::SM_62:
    return {56, 32};
  case CudaArch::SM_70:
  case CudaArch::SM_72:
  case CudaArch::SM_75:
  case CudaArch::SM_80:
  case CudaArch::SM_86:
    return {84, 32};
  case CudaArch::GFX600:
  case CudaArch::GFX601:
  case CudaArch::GFX602:
  case CudaArch::GFX700:
  case CudaArch::GFX701:
  case CudaArch::GFX702:
  case CudaArch::GFX703:
  case CudaArch::GFX704:
  case CudaArch::GFX705:
  case CudaArch::GFX801:
  case CudaArch::GFX802:
  case CudaArch::GFX803:
  case CudaArch::GFX805:
  case CudaArch::GFX810:
  case CudaArch::GFX900:
  case CudaArch::GFX902:
  case CudaArch::GFX904:
  case CudaArch::GFX906:
  case CudaArch::GFX908:
  case CudaArch::GFX909:
  case CudaArch::GFX90a:
  case CudaArch::GFX90c:
  case CudaArch::GFX1010:
  case CudaArch::GFX1011:
  case CudaArch::GFX1012:
  case CudaArch::GFX1030:
  case CudaArch::GFX1031:
  case CudaArch::GFX1032:
  case CudaArch::GFX1033:
  case CudaArch::GFX1034:
  case CudaArch::UNUSED:
  case CudaArch::UNKNOWN:
    break;
  case CudaArch::LAST:
    llvm_unreachable("Unexpected Cuda arch.");
  }
  llvm_unreachable("Unexpected NVPTX target without ptx feature.");
}
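
// Worked example (illustrative): for sm_70 the table above yields 84 SMs and
// 32 blocks per SM, so clear() below shapes the statically allocated global
// buffer as an [84][32] array of records, one record instance per potentially
// resident thread block. The OpenMPCUDANumSMs/OpenMPCUDABlocksPerSM language
// options consulted first allow overriding either value explicitly.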

void CGOpenMPRuntimeGPU::clear() {
  if (!GlobalizedRecords.empty() &&
      !CGM.getLangOpts().OpenMPCUDATargetParallel) {
    ASTContext &C = CGM.getContext();
    llvm::SmallVector<const GlobalPtrSizeRecsTy *, 4> GlobalRecs;
    llvm::SmallVector<const GlobalPtrSizeRecsTy *, 4> SharedRecs;
    RecordDecl *StaticRD = C.buildImplicitRecord(
        "_openmp_static_memory_type_$_", RecordDecl::TagKind::TTK_Union);
    StaticRD->startDefinition();
    RecordDecl *SharedStaticRD = C.buildImplicitRecord(
        "_shared_openmp_static_memory_type_$_", RecordDecl::TagKind::TTK_Union);
    SharedStaticRD->startDefinition();
    for (const GlobalPtrSizeRecsTy &Records : GlobalizedRecords) {
      if (Records.Records.empty())
        continue;
      unsigned Size = 0;
      unsigned RecAlignment = 0;
      for (const RecordDecl *RD : Records.Records) {
        QualType RDTy = C.getRecordType(RD);
        unsigned Alignment = C.getTypeAlignInChars(RDTy).getQuantity();
        RecAlignment = std::max(RecAlignment, Alignment);
        unsigned RecSize = C.getTypeSizeInChars(RDTy).getQuantity();
        Size =
            llvm::alignTo(llvm::alignTo(Size, Alignment) + RecSize, Alignment);
      }
      Size = llvm::alignTo(Size, RecAlignment);
      llvm::APInt ArySize(/*numBits=*/64, Size);
      QualType SubTy = C.getConstantArrayType(
          C.CharTy, ArySize, nullptr, ArrayType::Normal, /*IndexTypeQuals=*/0);
      const bool UseSharedMemory = Size <= SharedMemorySize;
      auto *Field =
          FieldDecl::Create(C, UseSharedMemory ? SharedStaticRD : StaticRD,
                            SourceLocation(), SourceLocation(), nullptr, SubTy,
                            C.getTrivialTypeSourceInfo(SubTy, SourceLocation()),
                            /*BW=*/nullptr, /*Mutable=*/false,
                            /*InitStyle=*/ICIS_NoInit);
      Field->setAccess(AS_public);
      if (UseSharedMemory) {
        SharedStaticRD->addDecl(Field);
        SharedRecs.push_back(&Records);
      } else {
        StaticRD->addDecl(Field);
        GlobalRecs.push_back(&Records);
      }
      Records.RecSize->setInitializer(llvm::ConstantInt::get(CGM.SizeTy, Size));
      Records.UseSharedMemory->setInitializer(
          llvm::ConstantInt::get(CGM.Int16Ty, UseSharedMemory ? 1 : 0));
    }
    // Allocate a buffer of SharedMemorySize bytes for the shared memory.
    // FIXME: nvlink does not handle weak linkage correctly (objects with
    // different sizes are reported as erroneous).
    // Restore this code as soon as nvlink is fixed.
    if (!SharedStaticRD->field_empty()) {
      llvm::APInt ArySize(/*numBits=*/64, SharedMemorySize);
      QualType SubTy = C.getConstantArrayType(
          C.CharTy, ArySize, nullptr, ArrayType::Normal, /*IndexTypeQuals=*/0);
      auto *Field = FieldDecl::Create(
          C, SharedStaticRD, SourceLocation(), SourceLocation(), nullptr, SubTy,
          C.getTrivialTypeSourceInfo(SubTy, SourceLocation()),
          /*BW=*/nullptr, /*Mutable=*/false,
          /*InitStyle=*/ICIS_NoInit);
      Field->setAccess(AS_public);
      SharedStaticRD->addDecl(Field);
    }
    SharedStaticRD->completeDefinition();
    if (!SharedStaticRD->field_empty()) {
      QualType StaticTy = C.getRecordType(SharedStaticRD);
      llvm::Type *LLVMStaticTy = CGM.getTypes().ConvertTypeForMem(StaticTy);
      auto *GV = new llvm::GlobalVariable(
          CGM.getModule(), LLVMStaticTy,
          /*isConstant=*/false, llvm::GlobalValue::WeakAnyLinkage,
          llvm::UndefValue::get(LLVMStaticTy),
          "_openmp_shared_static_glob_rd_$_", /*InsertBefore=*/nullptr,
          llvm::GlobalValue::NotThreadLocal,
          C.getTargetAddressSpace(LangAS::cuda_shared));
      auto *Replacement = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
          GV, CGM.VoidPtrTy);
      for (const GlobalPtrSizeRecsTy *Rec : SharedRecs) {
        Rec->Buffer->replaceAllUsesWith(Replacement);
        Rec->Buffer->eraseFromParent();
      }
    }
    StaticRD->completeDefinition();
    if (!StaticRD->field_empty()) {
      QualType StaticTy = C.getRecordType(StaticRD);
      std::pair<unsigned, unsigned> SMsBlockPerSM = getSMsBlocksPerSM(CGM);
      llvm::APInt Size1(32, SMsBlockPerSM.second);
      QualType Arr1Ty =
          C.getConstantArrayType(StaticTy, Size1, nullptr, ArrayType::Normal,
                                 /*IndexTypeQuals=*/0);
      llvm::APInt Size2(32, SMsBlockPerSM.first);
      QualType Arr2Ty =
          C.getConstantArrayType(Arr1Ty, Size2, nullptr, ArrayType::Normal,
                                 /*IndexTypeQuals=*/0);
      llvm::Type *LLVMArr2Ty = CGM.getTypes().ConvertTypeForMem(Arr2Ty);
      // FIXME: nvlink does not handle weak linkage correctly (objects with
      // different sizes are reported as erroneous).
      // Restore CommonLinkage as soon as nvlink is fixed.
      auto *GV = new llvm::GlobalVariable(
          CGM.getModule(), LLVMArr2Ty,
          /*isConstant=*/false, llvm::GlobalValue::InternalLinkage,
          llvm::Constant::getNullValue(LLVMArr2Ty),
          "_openmp_static_glob_rd_$_");
      auto *Replacement = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
          GV, CGM.VoidPtrTy);
      for (const GlobalPtrSizeRecsTy *Rec : GlobalRecs) {
        Rec->Buffer->replaceAllUsesWith(Replacement);
        Rec->Buffer->eraseFromParent();
      }
    }
  }
  if (!TeamsReductions.empty()) {
    ASTContext &C = CGM.getContext();
    RecordDecl *StaticRD = C.buildImplicitRecord(
        "_openmp_teams_reduction_type_$_", RecordDecl::TagKind::TTK_Union);
    StaticRD->startDefinition();
    for (const RecordDecl *TeamReductionRec : TeamsReductions) {
      QualType RecTy = C.getRecordType(TeamReductionRec);
      auto *Field = FieldDecl::Create(
          C, StaticRD, SourceLocation(), SourceLocation(), nullptr, RecTy,
          C.getTrivialTypeSourceInfo(RecTy, SourceLocation()),
          /*BW=*/nullptr, /*Mutable=*/false,
          /*InitStyle=*/ICIS_NoInit);
      Field->setAccess(AS_public);
      StaticRD->addDecl(Field);
    }
    StaticRD->completeDefinition();
    QualType StaticTy = C.getRecordType(StaticRD);
    llvm::Type *LLVMReductionsBufferTy =
        CGM.getTypes().ConvertTypeForMem(StaticTy);
    // FIXME: nvlink does not handle weak linkage correctly (objects with
    // different sizes are reported as erroneous).
    // Restore CommonLinkage as soon as nvlink is fixed.
    auto *GV = new llvm::GlobalVariable(
        CGM.getModule(), LLVMReductionsBufferTy,
        /*isConstant=*/false, llvm::GlobalValue::InternalLinkage,
        llvm::Constant::getNullValue(LLVMReductionsBufferTy),
        "_openmp_teams_reductions_buffer_$_");
    KernelTeamsReductionPtr->setInitializer(
        llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(GV,
                                                             CGM.VoidPtrTy));
  }
  CGOpenMPRuntime::clear();
}