// __clang_cuda_intrinsics.h — vendor import of revision 341825
/*===--- __clang_cuda_intrinsics.h - Device-side CUDA intrinsic wrappers ---===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __CLANG_CUDA_INTRINSICS_H__
#define __CLANG_CUDA_INTRINSICS_H__
#ifndef __CUDA__
#error "This file is for CUDA compilation only."
#endif

// sm_30 intrinsics: __shfl_{up,down,xor}.

// Pre-define the include guards of the vanilla sm_30 intrinsics headers so
// they become no-ops; clang provides these wrappers instead.
#define __SM_30_INTRINSICS_H__
#define __SM_30_INTRINSICS_HPP__

#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300

#pragma push_macro("__MAKE_SHUFFLES")
// __MAKE_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic, __Mask, __Type)
// expands to the full overload set of one warp-shuffle function (__FnName):
// int/float map directly onto the given __nvvm builtins; all other types are
// reinterpreted or split into 32-bit halves and routed through those two.
// The builtins' last operand packs ((warpSize - __width) << 8) | __Mask,
// matching the PTX shfl "c" operand (clamp/segmask field).
//
// Review fixes vs. the imported revision:
//  * long long overload: memcpy arguments were swapped
//    (memcpy(&__val, &__tmp, ...)), which copied the *uninitialized* __tmp
//    over the input instead of splitting the input into halves.  Corrected to
//    memcpy(&__tmp, &__val, ...)  (same fix as upstream LLVM D54309).
//  * unsigned long long overload: it cast its argument to
//    unsigned long long, which re-selected the very same overload and
//    recursed forever.  It now casts to long long so the reinterpret-through-
//    halves overload is used.
#define __MAKE_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic, __Mask,    \
                        __Type)                                                \
  inline __device__ int __FnName(int __val, __Type __offset,                   \
                                 int __width = warpSize) {                     \
    return __IntIntrinsic(__val, __offset,                                     \
                          ((warpSize - __width) << 8) | (__Mask));             \
  }                                                                            \
  inline __device__ float __FnName(float __val, __Type __offset,               \
                                   int __width = warpSize) {                   \
    return __FloatIntrinsic(__val, __offset,                                   \
                            ((warpSize - __width) << 8) | (__Mask));           \
  }                                                                            \
  inline __device__ unsigned int __FnName(unsigned int __val, __Type __offset, \
                                          int __width = warpSize) {            \
    return static_cast<unsigned int>(                                          \
        ::__FnName(static_cast<int>(__val), __offset, __width));               \
  }                                                                            \
  inline __device__ long long __FnName(long long __val, __Type __offset,       \
                                       int __width = warpSize) {               \
    struct __Bits {                                                            \
      int __a, __b;                                                            \
    };                                                                         \
    _Static_assert(sizeof(__val) == sizeof(__Bits));                           \
    _Static_assert(sizeof(__Bits) == 2 * sizeof(int));                         \
    __Bits __tmp;                                                              \
    memcpy(&__tmp, &__val, sizeof(__val));                                     \
    __tmp.__a = ::__FnName(__tmp.__a, __offset, __width);                      \
    __tmp.__b = ::__FnName(__tmp.__b, __offset, __width);                      \
    long long __ret;                                                           \
    memcpy(&__ret, &__tmp, sizeof(__tmp));                                     \
    return __ret;                                                              \
  }                                                                            \
  inline __device__ long __FnName(long __val, __Type __offset,                 \
                                  int __width = warpSize) {                    \
    _Static_assert(sizeof(long) == sizeof(long long) ||                        \
                   sizeof(long) == sizeof(int));                               \
    if (sizeof(long) == sizeof(long long)) {                                   \
      return static_cast<long>(                                                \
          ::__FnName(static_cast<long long>(__val), __offset, __width));       \
    } else if (sizeof(long) == sizeof(int)) {                                  \
      return static_cast<long>(                                                \
          ::__FnName(static_cast<int>(__val), __offset, __width));             \
    }                                                                          \
  }                                                                            \
  inline __device__ unsigned long __FnName(                                    \
      unsigned long __val, __Type __offset, int __width = warpSize) {          \
    return static_cast<unsigned long>(                                         \
        ::__FnName(static_cast<long>(__val), __offset, __width));              \
  }                                                                            \
  inline __device__ unsigned long long __FnName(                               \
      unsigned long long __val, __Type __offset, int __width = warpSize) {     \
    return static_cast<unsigned long long>(::__FnName(                         \
        static_cast<long long>(__val), __offset, __width));                    \
  }                                                                            \
  inline __device__ double __FnName(double __val, __Type __offset,             \
                                    int __width = warpSize) {                  \
    long long __tmp;                                                           \
    _Static_assert(sizeof(__tmp) == sizeof(__val));                            \
    memcpy(&__tmp, &__val, sizeof(__val));                                     \
    __tmp = ::__FnName(__tmp, __offset, __width);                              \
    double __ret;                                                              \
    memcpy(&__ret, &__tmp, sizeof(__ret));                                     \
    return __ret;                                                              \
  }

__MAKE_SHUFFLES(__shfl, __nvvm_shfl_idx_i32, __nvvm_shfl_idx_f32, 0x1f, int);
// We use 0 rather than 31 as our mask, because shfl.up applies to lanes >=
// maxLane.
__MAKE_SHUFFLES(__shfl_up, __nvvm_shfl_up_i32, __nvvm_shfl_up_f32, 0,
                unsigned int);
__MAKE_SHUFFLES(__shfl_down, __nvvm_shfl_down_i32, __nvvm_shfl_down_f32, 0x1f,
                unsigned int);
__MAKE_SHUFFLES(__shfl_xor, __nvvm_shfl_bfly_i32, __nvvm_shfl_bfly_f32, 0x1f,
                int);
#pragma pop_macro("__MAKE_SHUFFLES")

#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300

#if CUDA_VERSION >= 9000
#if (!defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300)
// __shfl_sync_* variants available in CUDA-9
#pragma push_macro("__MAKE_SYNC_SHUFFLES")
// Same expansion scheme as __MAKE_SHUFFLES above, but every overload takes
// the CUDA-9 explicit participant mask as its first argument and forwards it
// to the *_sync __nvvm builtins.
//
// Review fixes vs. the imported revision (mirroring __MAKE_SHUFFLES):
//  * long long overload: memcpy arguments were swapped, clobbering the input
//    with an uninitialized temporary; corrected to memcpy(&__tmp, &__val, ...)
//    (upstream LLVM D54309).
//  * unsigned long long overload: casting to unsigned long long re-selected
//    itself and recursed forever; it now casts to long long.
#define __MAKE_SYNC_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic,       \
                             __Mask, __Type)                                   \
  inline __device__ int __FnName(unsigned int __mask, int __val,               \
                                 __Type __offset, int __width = warpSize) {    \
    return __IntIntrinsic(__mask, __val, __offset,                             \
                          ((warpSize - __width) << 8) | (__Mask));             \
  }                                                                            \
  inline __device__ float __FnName(unsigned int __mask, float __val,           \
                                   __Type __offset, int __width = warpSize) {  \
    return __FloatIntrinsic(__mask, __val, __offset,                           \
                            ((warpSize - __width) << 8) | (__Mask));           \
  }                                                                            \
  inline __device__ unsigned int __FnName(unsigned int __mask,                 \
                                          unsigned int __val, __Type __offset, \
                                          int __width = warpSize) {            \
    return static_cast<unsigned int>(                                          \
        ::__FnName(__mask, static_cast<int>(__val), __offset, __width));       \
  }                                                                            \
  inline __device__ long long __FnName(unsigned int __mask, long long __val,   \
                                       __Type __offset,                        \
                                       int __width = warpSize) {               \
    struct __Bits {                                                            \
      int __a, __b;                                                            \
    };                                                                         \
    _Static_assert(sizeof(__val) == sizeof(__Bits));                           \
    _Static_assert(sizeof(__Bits) == 2 * sizeof(int));                         \
    __Bits __tmp;                                                              \
    memcpy(&__tmp, &__val, sizeof(__val));                                     \
    __tmp.__a = ::__FnName(__mask, __tmp.__a, __offset, __width);              \
    __tmp.__b = ::__FnName(__mask, __tmp.__b, __offset, __width);              \
    long long __ret;                                                           \
    memcpy(&__ret, &__tmp, sizeof(__tmp));                                     \
    return __ret;                                                              \
  }                                                                            \
  inline __device__ unsigned long long __FnName(                               \
      unsigned int __mask, unsigned long long __val, __Type __offset,          \
      int __width = warpSize) {                                                \
    return static_cast<unsigned long long>(                                    \
        ::__FnName(__mask, static_cast<long long>(__val), __offset, __width)); \
  }                                                                            \
  inline __device__ long __FnName(unsigned int __mask, long __val,             \
                                  __Type __offset, int __width = warpSize) {   \
    _Static_assert(sizeof(long) == sizeof(long long) ||                        \
                   sizeof(long) == sizeof(int));                               \
    if (sizeof(long) == sizeof(long long)) {                                   \
      return static_cast<long>(::__FnName(                                     \
          __mask, static_cast<long long>(__val), __offset, __width));          \
    } else if (sizeof(long) == sizeof(int)) {                                  \
      return static_cast<long>(                                                \
          ::__FnName(__mask, static_cast<int>(__val), __offset, __width));     \
    }                                                                          \
  }                                                                            \
  inline __device__ unsigned long __FnName(                                    \
      unsigned int __mask, unsigned long __val, __Type __offset,               \
      int __width = warpSize) {                                                \
    return static_cast<unsigned long>(                                         \
        ::__FnName(__mask, static_cast<long>(__val), __offset, __width));      \
  }                                                                            \
  inline __device__ double __FnName(unsigned int __mask, double __val,         \
                                    __Type __offset, int __width = warpSize) { \
    long long __tmp;                                                           \
    _Static_assert(sizeof(__tmp) == sizeof(__val));                            \
    memcpy(&__tmp, &__val, sizeof(__val));                                     \
    __tmp = ::__FnName(__mask, __tmp, __offset, __width);                      \
    double __ret;                                                              \
    memcpy(&__ret, &__tmp, sizeof(__ret));                                     \
    return __ret;                                                              \
  }
__MAKE_SYNC_SHUFFLES(__shfl_sync, __nvvm_shfl_sync_idx_i32,
                     __nvvm_shfl_sync_idx_f32, 0x1f, int);
// We use 0 rather than 31 as our mask, because shfl.up applies to lanes >=
// maxLane.
__MAKE_SYNC_SHUFFLES(__shfl_up_sync, __nvvm_shfl_sync_up_i32,
                     __nvvm_shfl_sync_up_f32, 0, unsigned int);
__MAKE_SYNC_SHUFFLES(__shfl_down_sync, __nvvm_shfl_sync_down_i32,
                     __nvvm_shfl_sync_down_f32, 0x1f, unsigned int);
__MAKE_SYNC_SHUFFLES(__shfl_xor_sync, __nvvm_shfl_sync_bfly_i32,
                     __nvvm_shfl_sync_bfly_f32, 0x1f, int);
#pragma pop_macro("__MAKE_SYNC_SHUFFLES")

// Warp-wide barrier over the lanes named in mask.
inline __device__ void __syncwarp(unsigned int mask = 0xffffffff) {
  return __nvvm_bar_warp_sync(mask);
}

// Named block barrier (PTX barrier.sync semantics via the builtin).
inline __device__ void __barrier_sync(unsigned int id) {
  __nvvm_barrier_sync(id);
}

// Named barrier for `count` threads.
inline __device__ void __barrier_sync_count(unsigned int id,
                                            unsigned int count) {
  __nvvm_barrier_sync_cnt(id, count);
}

// Warp vote functions with explicit participant masks (CUDA-9 style).
inline __device__ int __all_sync(unsigned int mask, int pred) {
  return __nvvm_vote_all_sync(mask, pred);
}

inline __device__ int __any_sync(unsigned int mask, int pred) {
  return __nvvm_vote_any_sync(mask, pred);
}

inline __device__ int __uni_sync(unsigned int mask, int pred) {
  return __nvvm_vote_uni_sync(mask, pred);
}

inline __device__ unsigned int __ballot_sync(unsigned int mask, int pred) {
  return __nvvm_vote_ballot_sync(mask, pred);
}

// Ballot with predicate 1: a set bit for each currently-active lane.
inline __device__ unsigned int __activemask() { return __nvvm_vote_ballot(1); }

inline __device__ unsigned int __fns(unsigned mask, unsigned base, int offset) {
  return __nvvm_fns(mask, base, offset);
}

#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300

// Define __match* builtins CUDA-9 headers expect to see.
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700
inline __device__ unsigned int __match32_any_sync(unsigned int mask,
                                                  unsigned int value) {
  return __nvvm_match_any_sync_i32(mask, value);
}

inline __device__ unsigned long long
__match64_any_sync(unsigned int mask, unsigned long long value) {
  return __nvvm_match_any_sync_i64(mask, value);
}

inline __device__ unsigned int
__match32_all_sync(unsigned int mask, unsigned int value, int *pred) {
  return __nvvm_match_all_sync_i32p(mask, value, pred);
}

inline __device__ unsigned long long
__match64_all_sync(unsigned int mask, unsigned long long value, int *pred) {
  return __nvvm_match_all_sync_i64p(mask, value, pred);
}
#include "crt/sm_70_rt.hpp"

#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700
#endif // CUDA_VERSION >= 9000

// sm_32 intrinsics: __ldg and __funnelshift_{l,lc,r,rc}.

// Prevent the vanilla sm_32 intrinsics header from being included.
#define __SM_32_INTRINSICS_H__
#define __SM_32_INTRINSICS_HPP__

#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320

// __ldg overloads: each forwards to the matching __nvvm_ldg_* builtin, which
// loads through the read-only data cache.  Scalars map one-to-one.
inline __device__ char __ldg(const char *ptr) { return __nvvm_ldg_c(ptr); }
inline __device__ short __ldg(const short *ptr) { return __nvvm_ldg_s(ptr); }
inline __device__ int __ldg(const int *ptr) { return __nvvm_ldg_i(ptr); }
inline __device__ long __ldg(const long *ptr) { return __nvvm_ldg_l(ptr); }
inline __device__ long long __ldg(const long long *ptr) {
  return __nvvm_ldg_ll(ptr);
}
inline __device__ unsigned char __ldg(const unsigned char *ptr) {
  return __nvvm_ldg_uc(ptr);
}
// No dedicated signed-char builtin: load via the unsigned-char one and let
// the return statement convert the value back to signed char.
inline __device__ signed char __ldg(const signed char *ptr) {
  return __nvvm_ldg_uc((const unsigned char *)ptr);
}
inline __device__ unsigned short __ldg(const unsigned short *ptr) {
  return __nvvm_ldg_us(ptr);
}
inline __device__ unsigned int __ldg(const unsigned int *ptr) {
  return __nvvm_ldg_ui(ptr);
}
inline __device__ unsigned long __ldg(const unsigned long *ptr) {
  return __nvvm_ldg_ul(ptr);
}
inline __device__ unsigned long long __ldg(const unsigned long long *ptr) {
  return __nvvm_ldg_ull(ptr);
}
inline __device__ float __ldg(const float *ptr) { return __nvvm_ldg_f(ptr); }
inline __device__ double __ldg(const double *ptr) { return __nvvm_ldg_d(ptr); }

// Vector overloads: reinterpret the CUDA vector struct as a clang
// ext_vector_type of the same element type/count, do one cached load, then
// copy the lanes back into the struct members.
inline __device__ char2 __ldg(const char2 *ptr) {
  typedef char __c2 __attribute__((ext_vector_type(2)));
  // ptr is at least char2-aligned, while the load below assumes __c2
  // alignment; that is only safe as long as alignof(__c2) <= alignof(char2).
  __c2 __v = __nvvm_ldg_c2(reinterpret_cast<const __c2 *>(ptr));
  char2 __out;
  __out.x = __v[0];
  __out.y = __v[1];
  return __out;
}
inline __device__ char4 __ldg(const char4 *ptr) {
  typedef char __c4 __attribute__((ext_vector_type(4)));
  __c4 __v = __nvvm_ldg_c4(reinterpret_cast<const __c4 *>(ptr));
  char4 __out;
  __out.x = __v[0];
  __out.y = __v[1];
  __out.z = __v[2];
  __out.w = __v[3];
  return __out;
}
inline __device__ short2 __ldg(const short2 *ptr) {
  typedef short __s2 __attribute__((ext_vector_type(2)));
  __s2 __v = __nvvm_ldg_s2(reinterpret_cast<const __s2 *>(ptr));
  short2 __out;
  __out.x = __v[0];
  __out.y = __v[1];
  return __out;
}
inline __device__ short4 __ldg(const short4 *ptr) {
  typedef short __s4 __attribute__((ext_vector_type(4)));
  __s4 __v = __nvvm_ldg_s4(reinterpret_cast<const __s4 *>(ptr));
  short4 __out;
  __out.x = __v[0];
  __out.y = __v[1];
  __out.z = __v[2];
  __out.w = __v[3];
  return __out;
}
inline __device__ int2 __ldg(const int2 *ptr) {
  typedef int __i2 __attribute__((ext_vector_type(2)));
  __i2 __v = __nvvm_ldg_i2(reinterpret_cast<const __i2 *>(ptr));
  int2 __out;
  __out.x = __v[0];
  __out.y = __v[1];
  return __out;
}
inline __device__ int4 __ldg(const int4 *ptr) {
  typedef int __i4 __attribute__((ext_vector_type(4)));
  __i4 __v = __nvvm_ldg_i4(reinterpret_cast<const __i4 *>(ptr));
  int4 __out;
  __out.x = __v[0];
  __out.y = __v[1];
  __out.z = __v[2];
  __out.w = __v[3];
  return __out;
}
inline __device__ longlong2 __ldg(const longlong2 *ptr) {
  typedef long long __ll2 __attribute__((ext_vector_type(2)));
  __ll2 __v = __nvvm_ldg_ll2(reinterpret_cast<const __ll2 *>(ptr));
  longlong2 __out;
  __out.x = __v[0];
  __out.y = __v[1];
  return __out;
}

inline __device__ uchar2 __ldg(const uchar2 *ptr) {
  typedef unsigned char __uc2 __attribute__((ext_vector_type(2)));
  __uc2 __v = __nvvm_ldg_uc2(reinterpret_cast<const __uc2 *>(ptr));
  uchar2 __out;
  __out.x = __v[0];
  __out.y = __v[1];
  return __out;
}
inline __device__ uchar4 __ldg(const uchar4 *ptr) {
  typedef unsigned char __uc4 __attribute__((ext_vector_type(4)));
  __uc4 __v = __nvvm_ldg_uc4(reinterpret_cast<const __uc4 *>(ptr));
  uchar4 __out;
  __out.x = __v[0];
  __out.y = __v[1];
  __out.z = __v[2];
  __out.w = __v[3];
  return __out;
}
inline __device__ ushort2 __ldg(const ushort2 *ptr) {
  typedef unsigned short __us2 __attribute__((ext_vector_type(2)));
  __us2 __v = __nvvm_ldg_us2(reinterpret_cast<const __us2 *>(ptr));
  ushort2 __out;
  __out.x = __v[0];
  __out.y = __v[1];
  return __out;
}
inline __device__ ushort4 __ldg(const ushort4 *ptr) {
  typedef unsigned short __us4 __attribute__((ext_vector_type(4)));
  __us4 __v = __nvvm_ldg_us4(reinterpret_cast<const __us4 *>(ptr));
  ushort4 __out;
  __out.x = __v[0];
  __out.y = __v[1];
  __out.z = __v[2];
  __out.w = __v[3];
  return __out;
}
inline __device__ uint2 __ldg(const uint2 *ptr) {
  typedef unsigned int __ui2 __attribute__((ext_vector_type(2)));
  __ui2 __v = __nvvm_ldg_ui2(reinterpret_cast<const __ui2 *>(ptr));
  uint2 __out;
  __out.x = __v[0];
  __out.y = __v[1];
  return __out;
}
inline __device__ uint4 __ldg(const uint4 *ptr) {
  typedef unsigned int __ui4 __attribute__((ext_vector_type(4)));
  __ui4 __v = __nvvm_ldg_ui4(reinterpret_cast<const __ui4 *>(ptr));
  uint4 __out;
  __out.x = __v[0];
  __out.y = __v[1];
  __out.z = __v[2];
  __out.w = __v[3];
  return __out;
}
inline __device__ ulonglong2 __ldg(const ulonglong2 *ptr) {
  typedef unsigned long long __ull2 __attribute__((ext_vector_type(2)));
  __ull2 __v = __nvvm_ldg_ull2(reinterpret_cast<const __ull2 *>(ptr));
  ulonglong2 __out;
  __out.x = __v[0];
  __out.y = __v[1];
  return __out;
}

inline __device__ float2 __ldg(const float2 *ptr) {
  typedef float __f2 __attribute__((ext_vector_type(2)));
  __f2 __v = __nvvm_ldg_f2(reinterpret_cast<const __f2 *>(ptr));
  float2 __out;
  __out.x = __v[0];
  __out.y = __v[1];
  return __out;
}
inline __device__ float4 __ldg(const float4 *ptr) {
  typedef float __f4 __attribute__((ext_vector_type(4)));
  __f4 __v = __nvvm_ldg_f4(reinterpret_cast<const __f4 *>(ptr));
  float4 __out;
  __out.x = __v[0];
  __out.y = __v[1];
  __out.z = __v[2];
  __out.w = __v[3];
  return __out;
}
inline __device__ double2 __ldg(const double2 *ptr) {
  typedef double __d2 __attribute__((ext_vector_type(2)));
  __d2 __v = __nvvm_ldg_d2(reinterpret_cast<const __d2 *>(ptr));
  double2 __out;
  __out.x = __v[0];
  __out.y = __v[1];
  return __out;
}

// Funnel shifts, emitted as inline PTX `shf` instructions: the "wrap"
// variants take the shift amount mod 32, the "clamp" variants clamp it to 32
// (see the PTX ISA description of shf.{l,r}.{wrap,clamp}.b32).
// TODO: Implement these as intrinsics, so the backend can work its magic on
// these. Alternatively, we could implement these as plain C and try to get
// llvm to recognize the relevant patterns.
inline __device__ unsigned __funnelshift_l(unsigned low32, unsigned high32,
                                           unsigned shiftWidth) {
  unsigned __ret;
  asm("shf.l.wrap.b32 %0, %1, %2, %3;"
      : "=r"(__ret)
      : "r"(low32), "r"(high32), "r"(shiftWidth));
  return __ret;
}
inline __device__ unsigned __funnelshift_lc(unsigned low32, unsigned high32,
                                            unsigned shiftWidth) {
  unsigned __ret;
  asm("shf.l.clamp.b32 %0, %1, %2, %3;"
      : "=r"(__ret)
      : "r"(low32), "r"(high32), "r"(shiftWidth));
  return __ret;
}
inline __device__ unsigned __funnelshift_r(unsigned low32, unsigned high32,
                                           unsigned shiftWidth) {
  unsigned __ret;
  asm("shf.r.wrap.b32 %0, %1, %2, %3;"
      : "=r"(__ret)
      : "r"(low32), "r"(high32), "r"(shiftWidth));
  return __ret;
}
inline __device__ unsigned __funnelshift_rc(unsigned low32, unsigned high32,
                                            unsigned shiftWidth) {
  unsigned __ret;
  asm("shf.r.clamp.b32 %0, %1, %2, %3;"
      : "=r"(__ret)
      : "r"(low32), "r"(high32), "r"(shiftWidth));
  return __ret;
}

#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320

#endif // defined(__CLANG_CUDA_INTRINSICS_H__)