1//===--- Cuda.cpp - Cuda Tool and ToolChain Implementations -----*- C++ -*-===// 2// 3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4// See https://llvm.org/LICENSE.txt for license information. 5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6// 7//===----------------------------------------------------------------------===// 8 9#include "Cuda.h" 10#include "CommonArgs.h" 11#include "InputInfo.h" 12#include "clang/Basic/Cuda.h" 13#include "clang/Config/config.h" 14#include "clang/Driver/Compilation.h" 15#include "clang/Driver/Distro.h" 16#include "clang/Driver/Driver.h" 17#include "clang/Driver/DriverDiagnostic.h" 18#include "clang/Driver/Options.h" 19#include "llvm/ADT/Optional.h" 20#include "llvm/Option/ArgList.h" 21#include "llvm/Support/FileSystem.h" 22#include "llvm/Support/Host.h" 23#include "llvm/Support/Path.h" 24#include "llvm/Support/Process.h" 25#include "llvm/Support/Program.h" 26#include "llvm/Support/TargetParser.h" 27#include "llvm/Support/VirtualFileSystem.h" 28#include <system_error> 29 30using namespace clang::driver; 31using namespace clang::driver::toolchains; 32using namespace clang::driver::tools; 33using namespace clang; 34using namespace llvm::opt; 35 36namespace { 37struct CudaVersionInfo { 38 std::string DetectedVersion; 39 CudaVersion Version; 40}; 41// Parses the contents of version.txt in an CUDA installation. It should 42// contain one line of the from e.g. "CUDA Version 7.5.2". 43CudaVersionInfo parseCudaVersionFile(llvm::StringRef V) { 44 V = V.trim(); 45 if (!V.startswith("CUDA Version ")) 46 return {V.str(), CudaVersion::UNKNOWN}; 47 V = V.substr(strlen("CUDA Version ")); 48 SmallVector<StringRef,4> VersionParts; 49 V.split(VersionParts, '.'); 50 return {"version.txt: " + V.str() + ".", 51 VersionParts.size() < 2 52 ? CudaVersion::UNKNOWN 53 : CudaStringToVersion( 54 join_items(".", VersionParts[0], VersionParts[1]))}; 55} 56 57CudaVersion getCudaVersion(uint32_t raw_version) { 58 if (raw_version < 7050) 59 return CudaVersion::CUDA_70; 60 if (raw_version < 8000) 61 return CudaVersion::CUDA_75; 62 if (raw_version < 9000) 63 return CudaVersion::CUDA_80; 64 if (raw_version < 9010) 65 return CudaVersion::CUDA_90; 66 if (raw_version < 9020) 67 return CudaVersion::CUDA_91; 68 if (raw_version < 10000) 69 return CudaVersion::CUDA_92; 70 if (raw_version < 10010) 71 return CudaVersion::CUDA_100; 72 if (raw_version < 10020) 73 return CudaVersion::CUDA_101; 74 if (raw_version < 11000) 75 return CudaVersion::CUDA_102; 76 if (raw_version < 11010) 77 return CudaVersion::CUDA_110; 78 if (raw_version < 11020) 79 return CudaVersion::CUDA_111; 80 return CudaVersion::LATEST; 81} 82 83CudaVersionInfo parseCudaHFile(llvm::StringRef Input) { 84 // Helper lambda which skips the words if the line starts with them or returns 85 // None otherwise. 86 auto StartsWithWords = 87 [](llvm::StringRef Line, 88 const SmallVector<StringRef, 3> words) -> llvm::Optional<StringRef> { 89 for (StringRef word : words) { 90 if (!Line.consume_front(word)) 91 return {}; 92 Line = Line.ltrim(); 93 } 94 return Line; 95 }; 96 97 Input = Input.ltrim(); 98 while (!Input.empty()) { 99 if (auto Line = 100 StartsWithWords(Input.ltrim(), {"#", "define", "CUDA_VERSION"})) { 101 uint32_t RawVersion; 102 Line->consumeInteger(10, RawVersion); 103 return {"cuda.h: CUDA_VERSION=" + Twine(RawVersion).str() + ".", 104 getCudaVersion(RawVersion)}; 105 } 106 // Find next non-empty line. 107 Input = Input.drop_front(Input.find_first_of("\n\r")).ltrim(); 108 } 109 return {"cuda.h: CUDA_VERSION not found.", CudaVersion::UNKNOWN}; 110} 111} // namespace 112 113void CudaInstallationDetector::WarnIfUnsupportedVersion() { 114 if (DetectedVersionIsNotSupported) 115 D.Diag(diag::warn_drv_unknown_cuda_version) 116 << DetectedVersion 117 << CudaVersionToString(CudaVersion::LATEST_SUPPORTED); 118} 119 120CudaInstallationDetector::CudaInstallationDetector( 121 const Driver &D, const llvm::Triple &HostTriple, 122 const llvm::opt::ArgList &Args) 123 : D(D) { 124 struct Candidate { 125 std::string Path; 126 bool StrictChecking; 127 128 Candidate(std::string Path, bool StrictChecking = false) 129 : Path(Path), StrictChecking(StrictChecking) {} 130 }; 131 SmallVector<Candidate, 4> Candidates; 132 133 // In decreasing order so we prefer newer versions to older versions. 134 std::initializer_list<const char *> Versions = {"8.0", "7.5", "7.0"}; 135 auto &FS = D.getVFS(); 136 137 if (Args.hasArg(clang::driver::options::OPT_cuda_path_EQ)) { 138 Candidates.emplace_back( 139 Args.getLastArgValue(clang::driver::options::OPT_cuda_path_EQ).str()); 140 } else if (HostTriple.isOSWindows()) { 141 for (const char *Ver : Versions) 142 Candidates.emplace_back( 143 D.SysRoot + "/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v" + 144 Ver); 145 } else { 146 if (!Args.hasArg(clang::driver::options::OPT_cuda_path_ignore_env)) { 147 // Try to find ptxas binary. If the executable is located in a directory 148 // called 'bin/', its parent directory might be a good guess for a valid 149 // CUDA installation. 150 // However, some distributions might installs 'ptxas' to /usr/bin. In that 151 // case the candidate would be '/usr' which passes the following checks 152 // because '/usr/include' exists as well. To avoid this case, we always 153 // check for the directory potentially containing files for libdevice, 154 // even if the user passes -nocudalib. 155 if (llvm::ErrorOr<std::string> ptxas = 156 llvm::sys::findProgramByName("ptxas")) { 157 SmallString<256> ptxasAbsolutePath; 158 llvm::sys::fs::real_path(*ptxas, ptxasAbsolutePath); 159 160 StringRef ptxasDir = llvm::sys::path::parent_path(ptxasAbsolutePath); 161 if (llvm::sys::path::filename(ptxasDir) == "bin") 162 Candidates.emplace_back( 163 std::string(llvm::sys::path::parent_path(ptxasDir)), 164 /*StrictChecking=*/true); 165 } 166 } 167 168 Candidates.emplace_back(D.SysRoot + "/usr/local/cuda"); 169 for (const char *Ver : Versions) 170 Candidates.emplace_back(D.SysRoot + "/usr/local/cuda-" + Ver); 171 172 Distro Dist(FS, llvm::Triple(llvm::sys::getProcessTriple())); 173 if (Dist.IsDebian() || Dist.IsUbuntu()) 174 // Special case for Debian to have nvidia-cuda-toolkit work 175 // out of the box. More info on http://bugs.debian.org/882505 176 Candidates.emplace_back(D.SysRoot + "/usr/lib/cuda"); 177 } 178 179 bool NoCudaLib = Args.hasArg(options::OPT_nogpulib); 180 181 for (const auto &Candidate : Candidates) { 182 InstallPath = Candidate.Path; 183 if (InstallPath.empty() || !FS.exists(InstallPath)) 184 continue; 185 186 BinPath = InstallPath + "/bin"; 187 IncludePath = InstallPath + "/include"; 188 LibDevicePath = InstallPath + "/nvvm/libdevice"; 189 190 if (!(FS.exists(IncludePath) && FS.exists(BinPath))) 191 continue; 192 bool CheckLibDevice = (!NoCudaLib || Candidate.StrictChecking); 193 if (CheckLibDevice && !FS.exists(LibDevicePath)) 194 continue; 195 196 // On Linux, we have both lib and lib64 directories, and we need to choose 197 // based on our triple. On MacOS, we have only a lib directory. 198 // 199 // It's sufficient for our purposes to be flexible: If both lib and lib64 200 // exist, we choose whichever one matches our triple. Otherwise, if only 201 // lib exists, we use it. 202 if (HostTriple.isArch64Bit() && FS.exists(InstallPath + "/lib64")) 203 LibPath = InstallPath + "/lib64"; 204 else if (FS.exists(InstallPath + "/lib")) 205 LibPath = InstallPath + "/lib"; 206 else 207 continue; 208 209 CudaVersionInfo VersionInfo = {"", CudaVersion::UNKNOWN}; 210 if (auto VersionFile = FS.getBufferForFile(InstallPath + "/version.txt")) 211 VersionInfo = parseCudaVersionFile((*VersionFile)->getBuffer()); 212 // If version file didn't give us the version, try to find it in cuda.h 213 if (VersionInfo.Version == CudaVersion::UNKNOWN) 214 if (auto CudaHFile = FS.getBufferForFile(InstallPath + "/include/cuda.h")) 215 VersionInfo = parseCudaHFile((*CudaHFile)->getBuffer()); 216 // As the last resort, make an educated guess between CUDA-7.0, (which had 217 // no version.txt file and had old-style libdevice bitcode ) and an unknown 218 // recent CUDA version (no version.txt, new style bitcode). 219 if (VersionInfo.Version == CudaVersion::UNKNOWN) { 220 VersionInfo.Version = (FS.exists(LibDevicePath + "/libdevice.10.bc")) 221 ? Version = CudaVersion::LATEST 222 : Version = CudaVersion::CUDA_70; 223 VersionInfo.DetectedVersion = 224 "No version found in version.txt or cuda.h."; 225 } 226 227 Version = VersionInfo.Version; 228 DetectedVersion = VersionInfo.DetectedVersion; 229 230 // TODO(tra): remove the warning once we have all features of 10.2 231 // and 11.0 implemented. 232 DetectedVersionIsNotSupported = Version > CudaVersion::LATEST_SUPPORTED; 233 234 if (Version >= CudaVersion::CUDA_90) { 235 // CUDA-9+ uses single libdevice file for all GPU variants. 236 std::string FilePath = LibDevicePath + "/libdevice.10.bc"; 237 if (FS.exists(FilePath)) { 238 for (int Arch = (int)CudaArch::SM_30, E = (int)CudaArch::LAST; Arch < E; 239 ++Arch) { 240 CudaArch GpuArch = static_cast<CudaArch>(Arch); 241 if (!IsNVIDIAGpuArch(GpuArch)) 242 continue; 243 std::string GpuArchName(CudaArchToString(GpuArch)); 244 LibDeviceMap[GpuArchName] = FilePath; 245 } 246 } 247 } else { 248 std::error_code EC; 249 for (llvm::vfs::directory_iterator LI = FS.dir_begin(LibDevicePath, EC), 250 LE; 251 !EC && LI != LE; LI = LI.increment(EC)) { 252 StringRef FilePath = LI->path(); 253 StringRef FileName = llvm::sys::path::filename(FilePath); 254 // Process all bitcode filenames that look like 255 // libdevice.compute_XX.YY.bc 256 const StringRef LibDeviceName = "libdevice."; 257 if (!(FileName.startswith(LibDeviceName) && FileName.endswith(".bc"))) 258 continue; 259 StringRef GpuArch = FileName.slice( 260 LibDeviceName.size(), FileName.find('.', LibDeviceName.size())); 261 LibDeviceMap[GpuArch] = FilePath.str(); 262 // Insert map entries for specific devices with this compute 263 // capability. NVCC's choice of the libdevice library version is 264 // rather peculiar and depends on the CUDA version. 265 if (GpuArch == "compute_20") { 266 LibDeviceMap["sm_20"] = std::string(FilePath); 267 LibDeviceMap["sm_21"] = std::string(FilePath); 268 LibDeviceMap["sm_32"] = std::string(FilePath); 269 } else if (GpuArch == "compute_30") { 270 LibDeviceMap["sm_30"] = std::string(FilePath); 271 if (Version < CudaVersion::CUDA_80) { 272 LibDeviceMap["sm_50"] = std::string(FilePath); 273 LibDeviceMap["sm_52"] = std::string(FilePath); 274 LibDeviceMap["sm_53"] = std::string(FilePath); 275 } 276 LibDeviceMap["sm_60"] = std::string(FilePath); 277 LibDeviceMap["sm_61"] = std::string(FilePath); 278 LibDeviceMap["sm_62"] = std::string(FilePath); 279 } else if (GpuArch == "compute_35") { 280 LibDeviceMap["sm_35"] = std::string(FilePath); 281 LibDeviceMap["sm_37"] = std::string(FilePath); 282 } else if (GpuArch == "compute_50") { 283 if (Version >= CudaVersion::CUDA_80) { 284 LibDeviceMap["sm_50"] = std::string(FilePath); 285 LibDeviceMap["sm_52"] = std::string(FilePath); 286 LibDeviceMap["sm_53"] = std::string(FilePath); 287 } 288 } 289 } 290 } 291 292 // Check that we have found at least one libdevice that we can link in if 293 // -nocudalib hasn't been specified. 294 if (LibDeviceMap.empty() && !NoCudaLib) 295 continue; 296 297 IsValid = true; 298 break; 299 } 300} 301 302void CudaInstallationDetector::AddCudaIncludeArgs( 303 const ArgList &DriverArgs, ArgStringList &CC1Args) const { 304 if (!DriverArgs.hasArg(options::OPT_nobuiltininc)) { 305 // Add cuda_wrappers/* to our system include path. This lets us wrap 306 // standard library headers. 307 SmallString<128> P(D.ResourceDir); 308 llvm::sys::path::append(P, "include"); 309 llvm::sys::path::append(P, "cuda_wrappers"); 310 CC1Args.push_back("-internal-isystem"); 311 CC1Args.push_back(DriverArgs.MakeArgString(P)); 312 } 313 314 if (DriverArgs.hasArg(options::OPT_nogpuinc)) 315 return; 316 317 if (!isValid()) { 318 D.Diag(diag::err_drv_no_cuda_installation); 319 return; 320 } 321 322 CC1Args.push_back("-internal-isystem"); 323 CC1Args.push_back(DriverArgs.MakeArgString(getIncludePath())); 324 CC1Args.push_back("-include"); 325 CC1Args.push_back("__clang_cuda_runtime_wrapper.h"); 326} 327 328void CudaInstallationDetector::CheckCudaVersionSupportsArch( 329 CudaArch Arch) const { 330 if (Arch == CudaArch::UNKNOWN || Version == CudaVersion::UNKNOWN || 331 ArchsWithBadVersion[(int)Arch]) 332 return; 333 334 auto MinVersion = MinVersionForCudaArch(Arch); 335 auto MaxVersion = MaxVersionForCudaArch(Arch); 336 if (Version < MinVersion || Version > MaxVersion) { 337 ArchsWithBadVersion[(int)Arch] = true; 338 D.Diag(diag::err_drv_cuda_version_unsupported) 339 << CudaArchToString(Arch) << CudaVersionToString(MinVersion) 340 << CudaVersionToString(MaxVersion) << InstallPath 341 << CudaVersionToString(Version); 342 } 343} 344 345void CudaInstallationDetector::print(raw_ostream &OS) const { 346 if (isValid()) 347 OS << "Found CUDA installation: " << InstallPath << ", version " 348 << CudaVersionToString(Version) << "\n"; 349} 350 351namespace { 352/// Debug info level for the NVPTX devices. We may need to emit different debug 353/// info level for the host and for the device itselfi. This type controls 354/// emission of the debug info for the devices. It either prohibits disable info 355/// emission completely, or emits debug directives only, or emits same debug 356/// info as for the host. 357enum DeviceDebugInfoLevel { 358 DisableDebugInfo, /// Do not emit debug info for the devices. 359 DebugDirectivesOnly, /// Emit only debug directives. 360 EmitSameDebugInfoAsHost, /// Use the same debug info level just like for the 361 /// host. 362}; 363} // anonymous namespace 364 365/// Define debug info level for the NVPTX devices. If the debug info for both 366/// the host and device are disabled (-g0/-ggdb0 or no debug options at all). If 367/// only debug directives are requested for the both host and device 368/// (-gline-directvies-only), or the debug info only for the device is disabled 369/// (optimization is on and --cuda-noopt-device-debug was not specified), the 370/// debug directves only must be emitted for the device. Otherwise, use the same 371/// debug info level just like for the host (with the limitations of only 372/// supported DWARF2 standard). 373static DeviceDebugInfoLevel mustEmitDebugInfo(const ArgList &Args) { 374 const Arg *A = Args.getLastArg(options::OPT_O_Group); 375 bool IsDebugEnabled = !A || A->getOption().matches(options::OPT_O0) || 376 Args.hasFlag(options::OPT_cuda_noopt_device_debug, 377 options::OPT_no_cuda_noopt_device_debug, 378 /*Default=*/false); 379 if (const Arg *A = Args.getLastArg(options::OPT_g_Group)) { 380 const Option &Opt = A->getOption(); 381 if (Opt.matches(options::OPT_gN_Group)) { 382 if (Opt.matches(options::OPT_g0) || Opt.matches(options::OPT_ggdb0)) 383 return DisableDebugInfo; 384 if (Opt.matches(options::OPT_gline_directives_only)) 385 return DebugDirectivesOnly; 386 } 387 return IsDebugEnabled ? EmitSameDebugInfoAsHost : DebugDirectivesOnly; 388 } 389 return willEmitRemarks(Args) ? DebugDirectivesOnly : DisableDebugInfo; 390} 391 392void NVPTX::Assembler::ConstructJob(Compilation &C, const JobAction &JA, 393 const InputInfo &Output, 394 const InputInfoList &Inputs, 395 const ArgList &Args, 396 const char *LinkingOutput) const { 397 const auto &TC = 398 static_cast<const toolchains::CudaToolChain &>(getToolChain()); 399 assert(TC.getTriple().isNVPTX() && "Wrong platform"); 400 401 StringRef GPUArchName; 402 // If this is an OpenMP action we need to extract the device architecture 403 // from the -march=arch option. This option may come from -Xopenmp-target 404 // flag or the default value. 405 if (JA.isDeviceOffloading(Action::OFK_OpenMP)) { 406 GPUArchName = Args.getLastArgValue(options::OPT_march_EQ); 407 assert(!GPUArchName.empty() && "Must have an architecture passed in."); 408 } else 409 GPUArchName = JA.getOffloadingArch(); 410 411 // Obtain architecture from the action. 412 CudaArch gpu_arch = StringToCudaArch(GPUArchName); 413 assert(gpu_arch != CudaArch::UNKNOWN && 414 "Device action expected to have an architecture."); 415 416 // Check that our installation's ptxas supports gpu_arch. 417 if (!Args.hasArg(options::OPT_no_cuda_version_check)) { 418 TC.CudaInstallation.CheckCudaVersionSupportsArch(gpu_arch); 419 } 420 421 ArgStringList CmdArgs; 422 CmdArgs.push_back(TC.getTriple().isArch64Bit() ? "-m64" : "-m32"); 423 DeviceDebugInfoLevel DIKind = mustEmitDebugInfo(Args); 424 if (DIKind == EmitSameDebugInfoAsHost) { 425 // ptxas does not accept -g option if optimization is enabled, so 426 // we ignore the compiler's -O* options if we want debug info. 427 CmdArgs.push_back("-g"); 428 CmdArgs.push_back("--dont-merge-basicblocks"); 429 CmdArgs.push_back("--return-at-end"); 430 } else if (Arg *A = Args.getLastArg(options::OPT_O_Group)) { 431 // Map the -O we received to -O{0,1,2,3}. 432 // 433 // TODO: Perhaps we should map host -O2 to ptxas -O3. -O3 is ptxas's 434 // default, so it may correspond more closely to the spirit of clang -O2. 435 436 // -O3 seems like the least-bad option when -Osomething is specified to 437 // clang but it isn't handled below. 438 StringRef OOpt = "3"; 439 if (A->getOption().matches(options::OPT_O4) || 440 A->getOption().matches(options::OPT_Ofast)) 441 OOpt = "3"; 442 else if (A->getOption().matches(options::OPT_O0)) 443 OOpt = "0"; 444 else if (A->getOption().matches(options::OPT_O)) { 445 // -Os, -Oz, and -O(anything else) map to -O2, for lack of better options. 446 OOpt = llvm::StringSwitch<const char *>(A->getValue()) 447 .Case("1", "1") 448 .Case("2", "2") 449 .Case("3", "3") 450 .Case("s", "2") 451 .Case("z", "2") 452 .Default("2"); 453 } 454 CmdArgs.push_back(Args.MakeArgString(llvm::Twine("-O") + OOpt)); 455 } else { 456 // If no -O was passed, pass -O0 to ptxas -- no opt flag should correspond 457 // to no optimizations, but ptxas's default is -O3. 458 CmdArgs.push_back("-O0"); 459 } 460 if (DIKind == DebugDirectivesOnly) 461 CmdArgs.push_back("-lineinfo"); 462 463 // Pass -v to ptxas if it was passed to the driver. 464 if (Args.hasArg(options::OPT_v)) 465 CmdArgs.push_back("-v"); 466 467 CmdArgs.push_back("--gpu-name"); 468 CmdArgs.push_back(Args.MakeArgString(CudaArchToString(gpu_arch))); 469 CmdArgs.push_back("--output-file"); 470 CmdArgs.push_back(Args.MakeArgString(TC.getInputFilename(Output))); 471 for (const auto& II : Inputs) 472 CmdArgs.push_back(Args.MakeArgString(II.getFilename())); 473 474 for (const auto& A : Args.getAllArgValues(options::OPT_Xcuda_ptxas)) 475 CmdArgs.push_back(Args.MakeArgString(A)); 476 477 bool Relocatable = false; 478 if (JA.isOffloading(Action::OFK_OpenMP)) 479 // In OpenMP we need to generate relocatable code. 480 Relocatable = Args.hasFlag(options::OPT_fopenmp_relocatable_target, 481 options::OPT_fnoopenmp_relocatable_target, 482 /*Default=*/true); 483 else if (JA.isOffloading(Action::OFK_Cuda)) 484 Relocatable = Args.hasFlag(options::OPT_fgpu_rdc, 485 options::OPT_fno_gpu_rdc, /*Default=*/false); 486 487 if (Relocatable) 488 CmdArgs.push_back("-c"); 489 490 const char *Exec; 491 if (Arg *A = Args.getLastArg(options::OPT_ptxas_path_EQ)) 492 Exec = A->getValue(); 493 else 494 Exec = Args.MakeArgString(TC.GetProgramPath("ptxas")); 495 C.addCommand(std::make_unique<Command>( 496 JA, *this, 497 ResponseFileSupport{ResponseFileSupport::RF_Full, llvm::sys::WEM_UTF8, 498 "--options-file"}, 499 Exec, CmdArgs, Inputs, Output)); 500} 501 502static bool shouldIncludePTX(const ArgList &Args, const char *gpu_arch) { 503 bool includePTX = true; 504 for (Arg *A : Args) { 505 if (!(A->getOption().matches(options::OPT_cuda_include_ptx_EQ) || 506 A->getOption().matches(options::OPT_no_cuda_include_ptx_EQ))) 507 continue; 508 A->claim(); 509 const StringRef ArchStr = A->getValue(); 510 if (ArchStr == "all" || ArchStr == gpu_arch) { 511 includePTX = A->getOption().matches(options::OPT_cuda_include_ptx_EQ); 512 continue; 513 } 514 } 515 return includePTX; 516} 517 518// All inputs to this linker must be from CudaDeviceActions, as we need to look 519// at the Inputs' Actions in order to figure out which GPU architecture they 520// correspond to. 521void NVPTX::Linker::ConstructJob(Compilation &C, const JobAction &JA, 522 const InputInfo &Output, 523 const InputInfoList &Inputs, 524 const ArgList &Args, 525 const char *LinkingOutput) const { 526 const auto &TC = 527 static_cast<const toolchains::CudaToolChain &>(getToolChain()); 528 assert(TC.getTriple().isNVPTX() && "Wrong platform"); 529 530 ArgStringList CmdArgs; 531 if (TC.CudaInstallation.version() <= CudaVersion::CUDA_100) 532 CmdArgs.push_back("--cuda"); 533 CmdArgs.push_back(TC.getTriple().isArch64Bit() ? "-64" : "-32"); 534 CmdArgs.push_back(Args.MakeArgString("--create")); 535 CmdArgs.push_back(Args.MakeArgString(Output.getFilename())); 536 if (mustEmitDebugInfo(Args) == EmitSameDebugInfoAsHost) 537 CmdArgs.push_back("-g"); 538 539 for (const auto& II : Inputs) { 540 auto *A = II.getAction(); 541 assert(A->getInputs().size() == 1 && 542 "Device offload action is expected to have a single input"); 543 const char *gpu_arch_str = A->getOffloadingArch(); 544 assert(gpu_arch_str && 545 "Device action expected to have associated a GPU architecture!"); 546 CudaArch gpu_arch = StringToCudaArch(gpu_arch_str); 547 548 if (II.getType() == types::TY_PP_Asm && 549 !shouldIncludePTX(Args, gpu_arch_str)) 550 continue; 551 // We need to pass an Arch of the form "sm_XX" for cubin files and 552 // "compute_XX" for ptx. 553 const char *Arch = (II.getType() == types::TY_PP_Asm) 554 ? CudaArchToVirtualArchString(gpu_arch) 555 : gpu_arch_str; 556 CmdArgs.push_back(Args.MakeArgString(llvm::Twine("--image=profile=") + 557 Arch + ",file=" + II.getFilename())); 558 } 559 560 for (const auto& A : Args.getAllArgValues(options::OPT_Xcuda_fatbinary)) 561 CmdArgs.push_back(Args.MakeArgString(A)); 562 563 const char *Exec = Args.MakeArgString(TC.GetProgramPath("fatbinary")); 564 C.addCommand(std::make_unique<Command>( 565 JA, *this, 566 ResponseFileSupport{ResponseFileSupport::RF_Full, llvm::sys::WEM_UTF8, 567 "--options-file"}, 568 Exec, CmdArgs, Inputs, Output)); 569} 570 571void NVPTX::OpenMPLinker::ConstructJob(Compilation &C, const JobAction &JA, 572 const InputInfo &Output, 573 const InputInfoList &Inputs, 574 const ArgList &Args, 575 const char *LinkingOutput) const { 576 const auto &TC = 577 static_cast<const toolchains::CudaToolChain &>(getToolChain()); 578 assert(TC.getTriple().isNVPTX() && "Wrong platform"); 579 580 ArgStringList CmdArgs; 581 582 // OpenMP uses nvlink to link cubin files. The result will be embedded in the 583 // host binary by the host linker. 584 assert(!JA.isHostOffloading(Action::OFK_OpenMP) && 585 "CUDA toolchain not expected for an OpenMP host device."); 586 587 if (Output.isFilename()) { 588 CmdArgs.push_back("-o"); 589 CmdArgs.push_back(Output.getFilename()); 590 } else 591 assert(Output.isNothing() && "Invalid output."); 592 if (mustEmitDebugInfo(Args) == EmitSameDebugInfoAsHost) 593 CmdArgs.push_back("-g"); 594 595 if (Args.hasArg(options::OPT_v)) 596 CmdArgs.push_back("-v"); 597 598 StringRef GPUArch = 599 Args.getLastArgValue(options::OPT_march_EQ); 600 assert(!GPUArch.empty() && "At least one GPU Arch required for ptxas."); 601 602 CmdArgs.push_back("-arch"); 603 CmdArgs.push_back(Args.MakeArgString(GPUArch)); 604 605 // Add paths specified in LIBRARY_PATH environment variable as -L options. 606 addDirectoryList(Args, CmdArgs, "-L", "LIBRARY_PATH"); 607 608 // Add paths for the default clang library path. 609 SmallString<256> DefaultLibPath = 610 llvm::sys::path::parent_path(TC.getDriver().Dir); 611 llvm::sys::path::append(DefaultLibPath, "lib" CLANG_LIBDIR_SUFFIX); 612 CmdArgs.push_back(Args.MakeArgString(Twine("-L") + DefaultLibPath)); 613 614 for (const auto &II : Inputs) { 615 if (II.getType() == types::TY_LLVM_IR || 616 II.getType() == types::TY_LTO_IR || 617 II.getType() == types::TY_LTO_BC || 618 II.getType() == types::TY_LLVM_BC) { 619 C.getDriver().Diag(diag::err_drv_no_linker_llvm_support) 620 << getToolChain().getTripleString(); 621 continue; 622 } 623 624 // Currently, we only pass the input files to the linker, we do not pass 625 // any libraries that may be valid only for the host. 626 if (!II.isFilename()) 627 continue; 628 629 const char *CubinF = C.addTempFile( 630 C.getArgs().MakeArgString(getToolChain().getInputFilename(II))); 631 632 CmdArgs.push_back(CubinF); 633 } 634 635 const char *Exec = 636 Args.MakeArgString(getToolChain().GetProgramPath("nvlink")); 637 C.addCommand(std::make_unique<Command>( 638 JA, *this, 639 ResponseFileSupport{ResponseFileSupport::RF_Full, llvm::sys::WEM_UTF8, 640 "--options-file"}, 641 Exec, CmdArgs, Inputs, Output)); 642} 643 644/// CUDA toolchain. Our assembler is ptxas, and our "linker" is fatbinary, 645/// which isn't properly a linker but nonetheless performs the step of stitching 646/// together object files from the assembler into a single blob. 647 648CudaToolChain::CudaToolChain(const Driver &D, const llvm::Triple &Triple, 649 const ToolChain &HostTC, const ArgList &Args, 650 const Action::OffloadKind OK) 651 : ToolChain(D, Triple, Args), HostTC(HostTC), 652 CudaInstallation(D, HostTC.getTriple(), Args), OK(OK) { 653 if (CudaInstallation.isValid()) { 654 CudaInstallation.WarnIfUnsupportedVersion(); 655 getProgramPaths().push_back(std::string(CudaInstallation.getBinPath())); 656 } 657 // Lookup binaries into the driver directory, this is used to 658 // discover the clang-offload-bundler executable. 659 getProgramPaths().push_back(getDriver().Dir); 660} 661 662std::string CudaToolChain::getInputFilename(const InputInfo &Input) const { 663 // Only object files are changed, for example assembly files keep their .s 664 // extensions. CUDA also continues to use .o as they don't use nvlink but 665 // fatbinary. 666 if (!(OK == Action::OFK_OpenMP && Input.getType() == types::TY_Object)) 667 return ToolChain::getInputFilename(Input); 668 669 // Replace extension for object files with cubin because nvlink relies on 670 // these particular file names. 671 SmallString<256> Filename(ToolChain::getInputFilename(Input)); 672 llvm::sys::path::replace_extension(Filename, "cubin"); 673 return std::string(Filename.str()); 674} 675 676void CudaToolChain::addClangTargetOptions( 677 const llvm::opt::ArgList &DriverArgs, 678 llvm::opt::ArgStringList &CC1Args, 679 Action::OffloadKind DeviceOffloadingKind) const { 680 HostTC.addClangTargetOptions(DriverArgs, CC1Args, DeviceOffloadingKind); 681 682 StringRef GpuArch = DriverArgs.getLastArgValue(options::OPT_march_EQ); 683 assert(!GpuArch.empty() && "Must have an explicit GPU arch."); 684 assert((DeviceOffloadingKind == Action::OFK_OpenMP || 685 DeviceOffloadingKind == Action::OFK_Cuda) && 686 "Only OpenMP or CUDA offloading kinds are supported for NVIDIA GPUs."); 687 688 if (DeviceOffloadingKind == Action::OFK_Cuda) { 689 CC1Args.push_back("-fcuda-is-device"); 690 691 if (DriverArgs.hasFlag(options::OPT_fcuda_approx_transcendentals, 692 options::OPT_fno_cuda_approx_transcendentals, false)) 693 CC1Args.push_back("-fcuda-approx-transcendentals"); 694 } 695 696 if (DriverArgs.hasArg(options::OPT_nogpulib)) 697 return; 698 699 if (DeviceOffloadingKind == Action::OFK_OpenMP && 700 DriverArgs.hasArg(options::OPT_S)) 701 return; 702 703 std::string LibDeviceFile = CudaInstallation.getLibDeviceFile(GpuArch); 704 if (LibDeviceFile.empty()) { 705 getDriver().Diag(diag::err_drv_no_cuda_libdevice) << GpuArch; 706 return; 707 } 708 709 CC1Args.push_back("-mlink-builtin-bitcode"); 710 CC1Args.push_back(DriverArgs.MakeArgString(LibDeviceFile)); 711 712 clang::CudaVersion CudaInstallationVersion = CudaInstallation.version(); 713 714 // New CUDA versions often introduce new instructions that are only supported 715 // by new PTX version, so we need to raise PTX level to enable them in NVPTX 716 // back-end. 717 const char *PtxFeature = nullptr; 718 switch (CudaInstallationVersion) { 719#define CASE_CUDA_VERSION(CUDA_VER, PTX_VER) \ 720 case CudaVersion::CUDA_##CUDA_VER: \ 721 PtxFeature = "+ptx" #PTX_VER; \ 722 break; 723 CASE_CUDA_VERSION(112, 72); 724 CASE_CUDA_VERSION(111, 71); 725 CASE_CUDA_VERSION(110, 70); 726 CASE_CUDA_VERSION(102, 65); 727 CASE_CUDA_VERSION(101, 64); 728 CASE_CUDA_VERSION(100, 63); 729 CASE_CUDA_VERSION(92, 61); 730 CASE_CUDA_VERSION(91, 61); 731 CASE_CUDA_VERSION(90, 60); 732#undef CASE_CUDA_VERSION 733 default: 734 PtxFeature = "+ptx42"; 735 } 736 CC1Args.append({"-target-feature", PtxFeature}); 737 if (DriverArgs.hasFlag(options::OPT_fcuda_short_ptr, 738 options::OPT_fno_cuda_short_ptr, false)) 739 CC1Args.append({"-mllvm", "--nvptx-short-ptr"}); 740 741 if (CudaInstallationVersion >= CudaVersion::UNKNOWN) 742 CC1Args.push_back( 743 DriverArgs.MakeArgString(Twine("-target-sdk-version=") + 744 CudaVersionToString(CudaInstallationVersion))); 745 746 if (DeviceOffloadingKind == Action::OFK_OpenMP) { 747 if (CudaInstallationVersion < CudaVersion::CUDA_92) { 748 getDriver().Diag( 749 diag::err_drv_omp_offload_target_cuda_version_not_support) 750 << CudaVersionToString(CudaInstallationVersion); 751 return; 752 } 753 754 std::string BitcodeSuffix = "nvptx-" + GpuArch.str(); 755 addOpenMPDeviceRTL(getDriver(), DriverArgs, CC1Args, BitcodeSuffix, 756 getTriple()); 757 } 758} 759 760llvm::DenormalMode CudaToolChain::getDefaultDenormalModeForType( 761 const llvm::opt::ArgList &DriverArgs, const JobAction &JA, 762 const llvm::fltSemantics *FPType) const { 763 if (JA.getOffloadingDeviceKind() == Action::OFK_Cuda) { 764 if (FPType && FPType == &llvm::APFloat::IEEEsingle() && 765 DriverArgs.hasFlag(options::OPT_fgpu_flush_denormals_to_zero, 766 options::OPT_fno_gpu_flush_denormals_to_zero, false)) 767 return llvm::DenormalMode::getPreserveSign(); 768 } 769 770 assert(JA.getOffloadingDeviceKind() != Action::OFK_Host); 771 return llvm::DenormalMode::getIEEE(); 772} 773 774bool CudaToolChain::supportsDebugInfoOption(const llvm::opt::Arg *A) const { 775 const Option &O = A->getOption(); 776 return (O.matches(options::OPT_gN_Group) && 777 !O.matches(options::OPT_gmodules)) || 778 O.matches(options::OPT_g_Flag) || 779 O.matches(options::OPT_ggdbN_Group) || O.matches(options::OPT_ggdb) || 780 O.matches(options::OPT_gdwarf) || O.matches(options::OPT_gdwarf_2) || 781 O.matches(options::OPT_gdwarf_3) || O.matches(options::OPT_gdwarf_4) || 782 O.matches(options::OPT_gdwarf_5) || 783 O.matches(options::OPT_gcolumn_info); 784} 785 786void CudaToolChain::adjustDebugInfoKind( 787 codegenoptions::DebugInfoKind &DebugInfoKind, const ArgList &Args) const { 788 switch (mustEmitDebugInfo(Args)) { 789 case DisableDebugInfo: 790 DebugInfoKind = codegenoptions::NoDebugInfo; 791 break; 792 case DebugDirectivesOnly: 793 DebugInfoKind = codegenoptions::DebugDirectivesOnly; 794 break; 795 case EmitSameDebugInfoAsHost: 796 // Use same debug info level as the host. 797 break; 798 } 799} 800 801void CudaToolChain::AddCudaIncludeArgs(const ArgList &DriverArgs, 802 ArgStringList &CC1Args) const { 803 // Check our CUDA version if we're going to include the CUDA headers. 804 if (!DriverArgs.hasArg(options::OPT_nogpuinc) && 805 !DriverArgs.hasArg(options::OPT_no_cuda_version_check)) { 806 StringRef Arch = DriverArgs.getLastArgValue(options::OPT_march_EQ); 807 assert(!Arch.empty() && "Must have an explicit GPU arch."); 808 CudaInstallation.CheckCudaVersionSupportsArch(StringToCudaArch(Arch)); 809 } 810 CudaInstallation.AddCudaIncludeArgs(DriverArgs, CC1Args); 811} 812 813llvm::opt::DerivedArgList * 814CudaToolChain::TranslateArgs(const llvm::opt::DerivedArgList &Args, 815 StringRef BoundArch, 816 Action::OffloadKind DeviceOffloadKind) const { 817 DerivedArgList *DAL = 818 HostTC.TranslateArgs(Args, BoundArch, DeviceOffloadKind); 819 if (!DAL) 820 DAL = new DerivedArgList(Args.getBaseArgs()); 821 822 const OptTable &Opts = getDriver().getOpts(); 823 824 // For OpenMP device offloading, append derived arguments. Make sure 825 // flags are not duplicated. 826 // Also append the compute capability. 827 if (DeviceOffloadKind == Action::OFK_OpenMP) { 828 for (Arg *A : Args) { 829 bool IsDuplicate = false; 830 for (Arg *DALArg : *DAL) { 831 if (A == DALArg) { 832 IsDuplicate = true; 833 break; 834 } 835 } 836 if (!IsDuplicate) 837 DAL->append(A); 838 } 839 840 StringRef Arch = DAL->getLastArgValue(options::OPT_march_EQ); 841 if (Arch.empty()) 842 DAL->AddJoinedArg(nullptr, Opts.getOption(options::OPT_march_EQ), 843 CLANG_OPENMP_NVPTX_DEFAULT_ARCH); 844 845 return DAL; 846 } 847 848 for (Arg *A : Args) { 849 DAL->append(A); 850 } 851 852 if (!BoundArch.empty()) { 853 DAL->eraseArg(options::OPT_march_EQ); 854 DAL->AddJoinedArg(nullptr, Opts.getOption(options::OPT_march_EQ), BoundArch); 855 } 856 return DAL; 857} 858 859Tool *CudaToolChain::buildAssembler() const { 860 return new tools::NVPTX::Assembler(*this); 861} 862 863Tool *CudaToolChain::buildLinker() const { 864 if (OK == Action::OFK_OpenMP) 865 return new tools::NVPTX::OpenMPLinker(*this); 866 return new tools::NVPTX::Linker(*this); 867} 868 869void CudaToolChain::addClangWarningOptions(ArgStringList &CC1Args) const { 870 HostTC.addClangWarningOptions(CC1Args); 871} 872 873ToolChain::CXXStdlibType 874CudaToolChain::GetCXXStdlibType(const ArgList &Args) const { 875 return HostTC.GetCXXStdlibType(Args); 876} 877 878void CudaToolChain::AddClangSystemIncludeArgs(const ArgList &DriverArgs, 879 ArgStringList &CC1Args) const { 880 HostTC.AddClangSystemIncludeArgs(DriverArgs, CC1Args); 881} 882 883void CudaToolChain::AddClangCXXStdlibIncludeArgs(const ArgList &Args, 884 ArgStringList &CC1Args) const { 885 HostTC.AddClangCXXStdlibIncludeArgs(Args, CC1Args); 886} 887 888void CudaToolChain::AddIAMCUIncludeArgs(const ArgList &Args, 889 ArgStringList &CC1Args) const { 890 HostTC.AddIAMCUIncludeArgs(Args, CC1Args); 891} 892 893SanitizerMask CudaToolChain::getSupportedSanitizers() const { 894 // The CudaToolChain only supports sanitizers in the sense that it allows 895 // sanitizer arguments on the command line if they are supported by the host 896 // toolchain. The CudaToolChain will actually ignore any command line 897 // arguments for any of these "supported" sanitizers. That means that no 898 // sanitization of device code is actually supported at this time. 899 // 900 // This behavior is necessary because the host and device toolchains 901 // invocations often share the command line, so the device toolchain must 902 // tolerate flags meant only for the host toolchain. 903 return HostTC.getSupportedSanitizers(); 904} 905 906VersionTuple CudaToolChain::computeMSVCVersion(const Driver *D, 907 const ArgList &Args) const { 908 return HostTC.computeMSVCVersion(D, Args); 909} 910