__clang_cuda_intrinsics.h revision 341825
/*===--- __clang_cuda_intrinsics.h - Device-side CUDA intrinsic wrappers ---===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __CLANG_CUDA_INTRINSICS_H__
#define __CLANG_CUDA_INTRINSICS_H__
#ifndef __CUDA__
#error "This file is for CUDA compilation only."
#endif

// sm_30 intrinsics: __shfl_{up,down,xor}.

#define __SM_30_INTRINSICS_H__
#define __SM_30_INTRINSICS_HPP__

#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300

#pragma push_macro("__MAKE_SHUFFLES")
#define __MAKE_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic, __Mask, \
                        __Type) \
  inline __device__ int __FnName(int __val, __Type __offset, \
                                 int __width = warpSize) { \
    return __IntIntrinsic(__val, __offset, \
                          ((warpSize - __width) << 8) | (__Mask)); \
  } \
  inline __device__ float __FnName(float __val, __Type __offset, \
                                   int __width = warpSize) { \
    return __FloatIntrinsic(__val, __offset, \
                            ((warpSize - __width) << 8) | (__Mask)); \
  } \
  inline __device__ unsigned int __FnName(unsigned int __val, __Type __offset, \
                                          int __width = warpSize) { \
    return static_cast<unsigned int>( \
        ::__FnName(static_cast<int>(__val), __offset, __width)); \
  } \
  inline __device__ long long __FnName(long long __val, __Type __offset, \
                                       int __width = warpSize) { \
    struct __Bits { \
      int __a, __b; \
    }; \
    _Static_assert(sizeof(__val) == sizeof(__Bits)); \
    _Static_assert(sizeof(__Bits) == 2 * sizeof(int)); \
    __Bits __tmp; \
    memcpy(&__tmp, &__val, sizeof(__val)); \
    __tmp.__a = ::__FnName(__tmp.__a, __offset, __width); \
    __tmp.__b = ::__FnName(__tmp.__b, __offset, __width); \
    long long __ret; \
    memcpy(&__ret, &__tmp, sizeof(__tmp)); \
    return __ret; \
  } \
  inline __device__ long __FnName(long __val, __Type __offset, \
                                  int __width = warpSize) { \
    _Static_assert(sizeof(long) == sizeof(long long) || \
                   sizeof(long) == sizeof(int)); \
    if (sizeof(long) == sizeof(long long)) { \
      return static_cast<long>( \
          ::__FnName(static_cast<long long>(__val), __offset, __width)); \
    } else if (sizeof(long) == sizeof(int)) { \
      return static_cast<long>( \
          ::__FnName(static_cast<int>(__val), __offset, __width)); \
    } \
  } \
  inline __device__ unsigned long __FnName( \
      unsigned long __val, __Type __offset, int __width = warpSize) { \
    return static_cast<unsigned long>( \
        ::__FnName(static_cast<long>(__val), __offset, __width)); \
  } \
  inline __device__ unsigned long long __FnName( \
      unsigned long long __val, __Type __offset, int __width = warpSize) { \
    return static_cast<unsigned long long>( \
        ::__FnName(static_cast<long long>(__val), __offset, __width)); \
  } \
  inline __device__ double __FnName(double __val, __Type __offset, \
                                    int __width = warpSize) { \
    long long __tmp; \
    _Static_assert(sizeof(__tmp) == sizeof(__val)); \
    memcpy(&__tmp, &__val, sizeof(__val)); \
    __tmp = ::__FnName(__tmp, __offset, __width); \
    double __ret; \
    memcpy(&__ret, &__tmp, sizeof(__ret)); \
    return __ret; \
  }

__MAKE_SHUFFLES(__shfl, __nvvm_shfl_idx_i32, __nvvm_shfl_idx_f32, 0x1f, int);
// We use 0 rather than 31 as our mask, because shfl.up applies to lanes >=
// maxLane.
__MAKE_SHUFFLES(__shfl_up, __nvvm_shfl_up_i32, __nvvm_shfl_up_f32, 0,
                unsigned int);
__MAKE_SHUFFLES(__shfl_down, __nvvm_shfl_down_i32, __nvvm_shfl_down_f32, 0x1f,
                unsigned int);
__MAKE_SHUFFLES(__shfl_xor, __nvvm_shfl_bfly_i32, __nvvm_shfl_bfly_f32, 0x1f,
                int);
#pragma pop_macro("__MAKE_SHUFFLES")

#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
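
// Example usage (illustrative sketch only; warp_reduce_sum is a made-up name,
// not part of this header): the classic tree reduction over a warp built on
// the __shfl_down wrapper above. On CUDA 9.0 and newer the *_sync variants
// defined below are generally preferred.
//
//   __device__ int warp_reduce_sum(int val) {
//     for (int delta = warpSize / 2; delta > 0; delta /= 2)
//       val += __shfl_down(val, delta);
//     return val; // lane 0 now holds the sum across the warp
//   }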

#if CUDA_VERSION >= 9000
#if (!defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300)
// __shfl_sync_* variants available in CUDA-9
#pragma push_macro("__MAKE_SYNC_SHUFFLES")
#define __MAKE_SYNC_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic, \
                             __Mask, __Type) \
  inline __device__ int __FnName(unsigned int __mask, int __val, \
                                 __Type __offset, int __width = warpSize) { \
    return __IntIntrinsic(__mask, __val, __offset, \
                          ((warpSize - __width) << 8) | (__Mask)); \
  } \
  inline __device__ float __FnName(unsigned int __mask, float __val, \
                                   __Type __offset, int __width = warpSize) { \
    return __FloatIntrinsic(__mask, __val, __offset, \
                            ((warpSize - __width) << 8) | (__Mask)); \
  } \
  inline __device__ unsigned int __FnName(unsigned int __mask, \
                                          unsigned int __val, __Type __offset, \
                                          int __width = warpSize) { \
    return static_cast<unsigned int>( \
        ::__FnName(__mask, static_cast<int>(__val), __offset, __width)); \
  } \
  inline __device__ long long __FnName(unsigned int __mask, long long __val, \
                                       __Type __offset, \
                                       int __width = warpSize) { \
    struct __Bits { \
      int __a, __b; \
    }; \
    _Static_assert(sizeof(__val) == sizeof(__Bits)); \
    _Static_assert(sizeof(__Bits) == 2 * sizeof(int)); \
    __Bits __tmp; \
    memcpy(&__tmp, &__val, sizeof(__val)); \
    __tmp.__a = ::__FnName(__mask, __tmp.__a, __offset, __width); \
    __tmp.__b = ::__FnName(__mask, __tmp.__b, __offset, __width); \
    long long __ret; \
    memcpy(&__ret, &__tmp, sizeof(__tmp)); \
    return __ret; \
  } \
  inline __device__ unsigned long long __FnName( \
      unsigned int __mask, unsigned long long __val, __Type __offset, \
      int __width = warpSize) { \
    return static_cast<unsigned long long>( \
        ::__FnName(__mask, static_cast<long long>(__val), __offset, __width)); \
  } \
  inline __device__ long __FnName(unsigned int __mask, long __val, \
                                  __Type __offset, int __width = warpSize) { \
    _Static_assert(sizeof(long) == sizeof(long long) || \
                   sizeof(long) == sizeof(int)); \
    if (sizeof(long) == sizeof(long long)) { \
      return static_cast<long>(::__FnName( \
          __mask, static_cast<long long>(__val), __offset, __width)); \
    } else if (sizeof(long) == sizeof(int)) { \
      return static_cast<long>( \
          ::__FnName(__mask, static_cast<int>(__val), __offset, __width)); \
    } \
  } \
  inline __device__ unsigned long __FnName( \
      unsigned int __mask, unsigned long __val, __Type __offset, \
      int __width = warpSize) { \
    return static_cast<unsigned long>( \
        ::__FnName(__mask, static_cast<long>(__val), __offset, __width)); \
  } \
  inline __device__ double __FnName(unsigned int __mask, double __val, \
                                    __Type __offset, int __width = warpSize) { \
    long long __tmp; \
    _Static_assert(sizeof(__tmp) == sizeof(__val)); \
    memcpy(&__tmp, &__val, sizeof(__val)); \
    __tmp = ::__FnName(__mask, __tmp, __offset, __width); \
    double __ret; \
    memcpy(&__ret, &__tmp, sizeof(__ret)); \
    return __ret; \
  }
__MAKE_SYNC_SHUFFLES(__shfl_sync, __nvvm_shfl_sync_idx_i32,
                     __nvvm_shfl_sync_idx_f32, 0x1f, int);
// We use 0 rather than 31 as our mask, because shfl.up applies to lanes >=
// maxLane.
__MAKE_SYNC_SHUFFLES(__shfl_up_sync, __nvvm_shfl_sync_up_i32,
                     __nvvm_shfl_sync_up_f32, 0, unsigned int);
__MAKE_SYNC_SHUFFLES(__shfl_down_sync, __nvvm_shfl_sync_down_i32,
                     __nvvm_shfl_sync_down_f32, 0x1f, unsigned int);
__MAKE_SYNC_SHUFFLES(__shfl_xor_sync, __nvvm_shfl_sync_bfly_i32,
                     __nvvm_shfl_sync_bfly_f32, 0x1f, int);
#pragma pop_macro("__MAKE_SYNC_SHUFFLES")
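
// Example usage (illustrative sketch only; warp_all_reduce_sum is a made-up
// name): a butterfly all-reduce built on __shfl_xor_sync, with every lane of
// a fully active warp ending up with the same sum.
//
//   __device__ int warp_all_reduce_sum(int val) {
//     for (int lane_mask = warpSize / 2; lane_mask > 0; lane_mask /= 2)
//       val += __shfl_xor_sync(0xffffffff, val, lane_mask);
//     return val; // identical on all 32 lanes
//   }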

inline __device__ void __syncwarp(unsigned int mask = 0xffffffff) {
  return __nvvm_bar_warp_sync(mask);
}

inline __device__ void __barrier_sync(unsigned int id) {
  __nvvm_barrier_sync(id);
}

inline __device__ void __barrier_sync_count(unsigned int id,
                                            unsigned int count) {
  __nvvm_barrier_sync_cnt(id, count);
}

inline __device__ int __all_sync(unsigned int mask, int pred) {
  return __nvvm_vote_all_sync(mask, pred);
}

inline __device__ int __any_sync(unsigned int mask, int pred) {
  return __nvvm_vote_any_sync(mask, pred);
}

inline __device__ int __uni_sync(unsigned int mask, int pred) {
  return __nvvm_vote_uni_sync(mask, pred);
}

inline __device__ unsigned int __ballot_sync(unsigned int mask, int pred) {
  return __nvvm_vote_ballot_sync(mask, pred);
}

inline __device__ unsigned int __activemask() { return __nvvm_vote_ballot(1); }

inline __device__ unsigned int __fns(unsigned mask, unsigned base, int offset) {
  return __nvvm_fns(mask, base, offset);
}

#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300

// Define __match* builtins CUDA-9 headers expect to see.
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700
inline __device__ unsigned int __match32_any_sync(unsigned int mask,
                                                  unsigned int value) {
  return __nvvm_match_any_sync_i32(mask, value);
}

inline __device__ unsigned long long
__match64_any_sync(unsigned int mask, unsigned long long value) {
  return __nvvm_match_any_sync_i64(mask, value);
}

inline __device__ unsigned int
__match32_all_sync(unsigned int mask, unsigned int value, int *pred) {
  return __nvvm_match_all_sync_i32p(mask, value, pred);
}

inline __device__ unsigned long long
__match64_all_sync(unsigned int mask, unsigned long long value, int *pred) {
  return __nvvm_match_all_sync_i64p(mask, value, pred);
}
#include "crt/sm_70_rt.hpp"

#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700
#endif // __CUDA_VERSION >= 9000
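
// Example usage (illustrative sketch only; count_matching_lanes is a made-up
// name, and __popc is provided elsewhere in the CUDA headers): counting how
// many currently active lanes satisfy a predicate via __activemask and
// __ballot_sync.
//
//   __device__ int count_matching_lanes(int pred) {
//     unsigned int active = __activemask();       // lanes executing this call
//     unsigned int votes = __ballot_sync(active, pred);
//     return __popc(votes);                        // set bits == matching lanes
//   }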

// sm_32 intrinsics: __ldg and __funnelshift_{l,lc,r,rc}.

// Prevent the vanilla sm_32 intrinsics header from being included.
#define __SM_32_INTRINSICS_H__
#define __SM_32_INTRINSICS_HPP__

#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320

inline __device__ char __ldg(const char *ptr) { return __nvvm_ldg_c(ptr); }
inline __device__ short __ldg(const short *ptr) { return __nvvm_ldg_s(ptr); }
inline __device__ int __ldg(const int *ptr) { return __nvvm_ldg_i(ptr); }
inline __device__ long __ldg(const long *ptr) { return __nvvm_ldg_l(ptr); }
inline __device__ long long __ldg(const long long *ptr) {
  return __nvvm_ldg_ll(ptr);
}
inline __device__ unsigned char __ldg(const unsigned char *ptr) {
  return __nvvm_ldg_uc(ptr);
}
inline __device__ signed char __ldg(const signed char *ptr) {
  return __nvvm_ldg_uc((const unsigned char *)ptr);
}
inline __device__ unsigned short __ldg(const unsigned short *ptr) {
  return __nvvm_ldg_us(ptr);
}
inline __device__ unsigned int __ldg(const unsigned int *ptr) {
  return __nvvm_ldg_ui(ptr);
}
inline __device__ unsigned long __ldg(const unsigned long *ptr) {
  return __nvvm_ldg_ul(ptr);
}
inline __device__ unsigned long long __ldg(const unsigned long long *ptr) {
  return __nvvm_ldg_ull(ptr);
}
inline __device__ float __ldg(const float *ptr) { return __nvvm_ldg_f(ptr); }
inline __device__ double __ldg(const double *ptr) { return __nvvm_ldg_d(ptr); }
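
// Example usage (illustrative sketch only; scale_kernel, in, out, k and n are
// made-up names): __ldg loads through the read-only (non-coherent) cache, so
// the pointed-to data must not be written by any thread while the kernel runs.
//
//   __global__ void scale_kernel(const float *in, float *out, float k, int n) {
//     int i = blockIdx.x * blockDim.x + threadIdx.x;
//     if (i < n)
//       out[i] = k * __ldg(&in[i]);
//   }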

inline __device__ char2 __ldg(const char2 *ptr) {
  typedef char c2 __attribute__((ext_vector_type(2)));
  // We can assume that ptr is aligned at least to char2's alignment, but the
  // load will assume that ptr is aligned to c2's alignment.  This is only
  // safe if alignof(c2) <= alignof(char2).
  c2 rv = __nvvm_ldg_c2(reinterpret_cast<const c2 *>(ptr));
  char2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
inline __device__ char4 __ldg(const char4 *ptr) {
  typedef char c4 __attribute__((ext_vector_type(4)));
  c4 rv = __nvvm_ldg_c4(reinterpret_cast<const c4 *>(ptr));
  char4 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  ret.z = rv[2];
  ret.w = rv[3];
  return ret;
}
inline __device__ short2 __ldg(const short2 *ptr) {
  typedef short s2 __attribute__((ext_vector_type(2)));
  s2 rv = __nvvm_ldg_s2(reinterpret_cast<const s2 *>(ptr));
  short2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
inline __device__ short4 __ldg(const short4 *ptr) {
  typedef short s4 __attribute__((ext_vector_type(4)));
  s4 rv = __nvvm_ldg_s4(reinterpret_cast<const s4 *>(ptr));
  short4 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  ret.z = rv[2];
  ret.w = rv[3];
  return ret;
}
inline __device__ int2 __ldg(const int2 *ptr) {
  typedef int i2 __attribute__((ext_vector_type(2)));
  i2 rv = __nvvm_ldg_i2(reinterpret_cast<const i2 *>(ptr));
  int2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
inline __device__ int4 __ldg(const int4 *ptr) {
  typedef int i4 __attribute__((ext_vector_type(4)));
  i4 rv = __nvvm_ldg_i4(reinterpret_cast<const i4 *>(ptr));
  int4 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  ret.z = rv[2];
  ret.w = rv[3];
  return ret;
}
inline __device__ longlong2 __ldg(const longlong2 *ptr) {
  typedef long long ll2 __attribute__((ext_vector_type(2)));
  ll2 rv = __nvvm_ldg_ll2(reinterpret_cast<const ll2 *>(ptr));
  longlong2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}

inline __device__ uchar2 __ldg(const uchar2 *ptr) {
  typedef unsigned char uc2 __attribute__((ext_vector_type(2)));
  uc2 rv = __nvvm_ldg_uc2(reinterpret_cast<const uc2 *>(ptr));
  uchar2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
inline __device__ uchar4 __ldg(const uchar4 *ptr) {
  typedef unsigned char uc4 __attribute__((ext_vector_type(4)));
  uc4 rv = __nvvm_ldg_uc4(reinterpret_cast<const uc4 *>(ptr));
  uchar4 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  ret.z = rv[2];
  ret.w = rv[3];
  return ret;
}
inline __device__ ushort2 __ldg(const ushort2 *ptr) {
  typedef unsigned short us2 __attribute__((ext_vector_type(2)));
  us2 rv = __nvvm_ldg_us2(reinterpret_cast<const us2 *>(ptr));
  ushort2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
inline __device__ ushort4 __ldg(const ushort4 *ptr) {
  typedef unsigned short us4 __attribute__((ext_vector_type(4)));
  us4 rv = __nvvm_ldg_us4(reinterpret_cast<const us4 *>(ptr));
  ushort4 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  ret.z = rv[2];
  ret.w = rv[3];
  return ret;
}
inline __device__ uint2 __ldg(const uint2 *ptr) {
  typedef unsigned int ui2 __attribute__((ext_vector_type(2)));
  ui2 rv = __nvvm_ldg_ui2(reinterpret_cast<const ui2 *>(ptr));
  uint2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
inline __device__ uint4 __ldg(const uint4 *ptr) {
  typedef unsigned int ui4 __attribute__((ext_vector_type(4)));
  ui4 rv = __nvvm_ldg_ui4(reinterpret_cast<const ui4 *>(ptr));
  uint4 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  ret.z = rv[2];
  ret.w = rv[3];
  return ret;
}
inline __device__ ulonglong2 __ldg(const ulonglong2 *ptr) {
  typedef unsigned long long ull2 __attribute__((ext_vector_type(2)));
  ull2 rv = __nvvm_ldg_ull2(reinterpret_cast<const ull2 *>(ptr));
  ulonglong2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}

inline __device__ float2 __ldg(const float2 *ptr) {
  typedef float f2 __attribute__((ext_vector_type(2)));
  f2 rv = __nvvm_ldg_f2(reinterpret_cast<const f2 *>(ptr));
  float2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
inline __device__ float4 __ldg(const float4 *ptr) {
  typedef float f4 __attribute__((ext_vector_type(4)));
  f4 rv = __nvvm_ldg_f4(reinterpret_cast<const f4 *>(ptr));
  float4 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  ret.z = rv[2];
  ret.w = rv[3];
  return ret;
}
inline __device__ double2 __ldg(const double2 *ptr) {
  typedef double d2 __attribute__((ext_vector_type(2)));
  d2 rv = __nvvm_ldg_d2(reinterpret_cast<const d2 *>(ptr));
  double2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
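
// Example usage (illustrative sketch only; sum_components is a made-up name):
// the vector overloads issue a single wide read-only load, so, as with any
// CUDA vector-type access, the pointer must be aligned to the vector type
// (16 bytes for float4).
//
//   __device__ float sum_components(const float4 *p) {
//     float4 v = __ldg(p); // single 16-byte read-only cached load
//     return v.x + v.y + v.z + v.w;
//   }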

// TODO: Implement these as intrinsics, so the backend can work its magic on
// these. Alternatively, we could implement these as plain C and try to get
// llvm to recognize the relevant patterns.
inline __device__ unsigned __funnelshift_l(unsigned low32, unsigned high32,
                                           unsigned shiftWidth) {
  unsigned result;
  asm("shf.l.wrap.b32 %0, %1, %2, %3;"
      : "=r"(result)
      : "r"(low32), "r"(high32), "r"(shiftWidth));
  return result;
}
inline __device__ unsigned __funnelshift_lc(unsigned low32, unsigned high32,
                                            unsigned shiftWidth) {
  unsigned result;
  asm("shf.l.clamp.b32 %0, %1, %2, %3;"
      : "=r"(result)
      : "r"(low32), "r"(high32), "r"(shiftWidth));
  return result;
}
inline __device__ unsigned __funnelshift_r(unsigned low32, unsigned high32,
                                           unsigned shiftWidth) {
  unsigned result;
  asm("shf.r.wrap.b32 %0, %1, %2, %3;"
      : "=r"(result)
      : "r"(low32), "r"(high32), "r"(shiftWidth));
  return result;
}
inline __device__ unsigned __funnelshift_rc(unsigned low32, unsigned high32,
                                            unsigned shiftWidth) {
  unsigned ret;
  asm("shf.r.clamp.b32 %0, %1, %2, %3;"
      : "=r"(ret)
      : "r"(low32), "r"(high32), "r"(shiftWidth));
  return ret;
}

#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320

#endif // defined(__CLANG_CUDA_INTRINSICS_H__)
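
// Example usage (illustrative sketch only; rotl32 is a made-up name): a 32-bit
// left rotate expressed with the wrapping funnel shift above, which
// concatenates high32:low32, shifts left by shiftWidth mod 32, and returns the
// upper 32 bits.
//
//   __device__ unsigned int rotl32(unsigned int x, unsigned int n) {
//     return __funnelshift_l(x, x, n); // rotate left by n & 31
//   }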