/*===--- __clang_cuda_intrinsics.h - Device-side CUDA intrinsic wrappers ---===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __CLANG_CUDA_INTRINSICS_H__
#define __CLANG_CUDA_INTRINSICS_H__
#ifndef __CUDA__
#error "This file is for CUDA compilation only."
#endif

// sm_30 intrinsics: __shfl_{up,down,xor}.

#define __SM_30_INTRINSICS_H__
#define __SM_30_INTRINSICS_HPP__

#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300

// __MAKE_SHUFFLES(__FnName, ...) emits the full overload set for one shuffle
// flavor on top of the 32-bit integer/float NVVM intrinsics:
//   - int/float map directly onto __IntIntrinsic/__FloatIntrinsic;
//   - unsigned int is shuffled via the int overload and cast back;
//   - 64-bit types (long long, double) are split into two 32-bit halves with
//     memcpy, shuffled independently, and reassembled;
//   - long/unsigned long dispatch to the int or long long overload depending
//     on the target's sizeof(long).
// The last intrinsic operand packs (warpSize - __width) into bits [12:8] and
// the clamp value __Mask into the low bits, per the PTX shfl control word.
#pragma push_macro("__MAKE_SHUFFLES")
#define __MAKE_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic, __Mask,    \
                        __Type)                                                \
  inline __device__ int __FnName(int __val, __Type __offset,                   \
                                 int __width = warpSize) {                     \
    return __IntIntrinsic(__val, __offset,                                     \
                          ((warpSize - __width) << 8) | (__Mask));             \
  }                                                                            \
  inline __device__ float __FnName(float __val, __Type __offset,               \
                                   int __width = warpSize) {                   \
    return __FloatIntrinsic(__val, __offset,                                   \
                            ((warpSize - __width) << 8) | (__Mask));           \
  }                                                                            \
  inline __device__ unsigned int __FnName(unsigned int __val, __Type __offset, \
                                          int __width = warpSize) {            \
    return static_cast<unsigned int>(                                          \
        ::__FnName(static_cast<int>(__val), __offset, __width));               \
  }                                                                            \
  inline __device__ long long __FnName(long long __val, __Type __offset,       \
                                       int __width = warpSize) {               \
    struct __Bits {                                                            \
      int __a, __b;                                                            \
    };                                                                         \
    _Static_assert(sizeof(__val) == sizeof(__Bits));                           \
    _Static_assert(sizeof(__Bits) == 2 * sizeof(int));                         \
    __Bits __tmp;                                                              \
    memcpy(&__tmp, &__val, sizeof(__val));                                     \
    __tmp.__a = ::__FnName(__tmp.__a, __offset, __width);                      \
    __tmp.__b = ::__FnName(__tmp.__b, __offset, __width);                      \
    long long __ret;                                                           \
    memcpy(&__ret, &__tmp, sizeof(__tmp));                                     \
    return __ret;                                                              \
  }                                                                            \
  inline __device__ long __FnName(long __val, __Type __offset,                 \
                                  int __width = warpSize) {                    \
    _Static_assert(sizeof(long) == sizeof(long long) ||                        \
                   sizeof(long) == sizeof(int));                               \
    if (sizeof(long) == sizeof(long long)) {                                   \
      return static_cast<long>(                                                \
          ::__FnName(static_cast<long long>(__val), __offset, __width));       \
    } else if (sizeof(long) == sizeof(int)) {                                  \
      return static_cast<long>(                                                \
          ::__FnName(static_cast<int>(__val), __offset, __width));             \
    }                                                                          \
  }                                                                            \
  inline __device__ unsigned long __FnName(                                    \
      unsigned long __val, __Type __offset, int __width = warpSize) {          \
    return static_cast<unsigned long>(                                         \
        ::__FnName(static_cast<long>(__val), __offset, __width));              \
  }                                                                            \
  inline __device__ unsigned long long __FnName(                               \
      unsigned long long __val, __Type __offset, int __width = warpSize) {     \
    return static_cast<unsigned long long>(::__FnName(                         \
        static_cast<unsigned long long>(__val), __offset, __width));           \
  }                                                                            \
  inline __device__ double __FnName(double __val, __Type __offset,             \
                                    int __width = warpSize) {                  \
    long long __tmp;                                                           \
    _Static_assert(sizeof(__tmp) == sizeof(__val));                            \
    memcpy(&__tmp, &__val, sizeof(__val));                                     \
    __tmp = ::__FnName(__tmp, __offset, __width);                              \
    double __ret;                                                              \
    memcpy(&__ret, &__tmp, sizeof(__ret));                                     \
    return __ret;                                                              \
  }

__MAKE_SHUFFLES(__shfl, __nvvm_shfl_idx_i32, __nvvm_shfl_idx_f32, 0x1f, int);
// We use 0 rather than 31 as our mask, because shfl.up applies to lanes >=
// maxLane.
__MAKE_SHUFFLES(__shfl_up, __nvvm_shfl_up_i32, __nvvm_shfl_up_f32, 0,
                unsigned int);
__MAKE_SHUFFLES(__shfl_down, __nvvm_shfl_down_i32, __nvvm_shfl_down_f32, 0x1f,
                unsigned int);
__MAKE_SHUFFLES(__shfl_xor, __nvvm_shfl_bfly_i32, __nvvm_shfl_bfly_f32, 0x1f,
                int);
#pragma pop_macro("__MAKE_SHUFFLES")

#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300

#if CUDA_VERSION >= 9000
#if (!defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300)
// __shfl_sync_* variants available in CUDA-9.  Same overload-generation
// scheme as __MAKE_SHUFFLES above, with an explicit participating-lane
// __mask argument threaded through as the first intrinsic operand.
#pragma push_macro("__MAKE_SYNC_SHUFFLES")
#define __MAKE_SYNC_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic,       \
                             __Mask, __Type)                                   \
  inline __device__ int __FnName(unsigned int __mask, int __val,               \
                                 __Type __offset, int __width = warpSize) {    \
    return __IntIntrinsic(__mask, __val, __offset,                             \
                          ((warpSize - __width) << 8) | (__Mask));             \
  }                                                                            \
  inline __device__ float __FnName(unsigned int __mask, float __val,           \
                                   __Type __offset, int __width = warpSize) {  \
    return __FloatIntrinsic(__mask, __val, __offset,                           \
                            ((warpSize - __width) << 8) | (__Mask));           \
  }                                                                            \
  inline __device__ unsigned int __FnName(unsigned int __mask,                 \
                                          unsigned int __val, __Type __offset, \
                                          int __width = warpSize) {            \
    return static_cast<unsigned int>(                                          \
        ::__FnName(__mask, static_cast<int>(__val), __offset, __width));       \
  }                                                                            \
  inline __device__ long long __FnName(unsigned int __mask, long long __val,   \
                                       __Type __offset,                        \
                                       int __width = warpSize) {               \
    struct __Bits {                                                            \
      int __a, __b;                                                            \
    };                                                                         \
    _Static_assert(sizeof(__val) == sizeof(__Bits));                           \
    _Static_assert(sizeof(__Bits) == 2 * sizeof(int));                         \
    __Bits __tmp;                                                              \
    memcpy(&__tmp, &__val, sizeof(__val));                                     \
    __tmp.__a = ::__FnName(__mask, __tmp.__a, __offset, __width);              \
    __tmp.__b = ::__FnName(__mask, __tmp.__b, __offset, __width);              \
    long long __ret;                                                           \
    memcpy(&__ret, &__tmp, sizeof(__tmp));                                     \
    return __ret;                                                              \
  }                                                                            \
  inline __device__ unsigned long long __FnName(                               \
      unsigned int __mask, unsigned long long __val, __Type __offset,          \
      int __width = warpSize) {                                                \
    return static_cast<unsigned long long>(::__FnName(                         \
        __mask, static_cast<unsigned long long>(__val), __offset, __width));   \
  }                                                                            \
  inline __device__ long __FnName(unsigned int __mask, long __val,             \
                                  __Type __offset, int __width = warpSize) {   \
    _Static_assert(sizeof(long) == sizeof(long long) ||                        \
                   sizeof(long) == sizeof(int));                               \
    if (sizeof(long) == sizeof(long long)) {                                   \
      return static_cast<long>(::__FnName(                                     \
          __mask, static_cast<long long>(__val), __offset, __width));          \
    } else if (sizeof(long) == sizeof(int)) {                                  \
      return static_cast<long>(                                                \
          ::__FnName(__mask, static_cast<int>(__val), __offset, __width));     \
    }                                                                          \
  }                                                                            \
  inline __device__ unsigned long __FnName(                                    \
      unsigned int __mask, unsigned long __val, __Type __offset,               \
      int __width = warpSize) {                                                \
    return static_cast<unsigned long>(                                         \
        ::__FnName(__mask, static_cast<long>(__val), __offset, __width));      \
  }                                                                            \
  inline __device__ double __FnName(unsigned int __mask, double __val,         \
                                    __Type __offset, int __width = warpSize) { \
    long long __tmp;                                                           \
    _Static_assert(sizeof(__tmp) == sizeof(__val));                            \
    memcpy(&__tmp, &__val, sizeof(__val));                                     \
    __tmp = ::__FnName(__mask, __tmp, __offset, __width);                      \
    double __ret;                                                              \
    memcpy(&__ret, &__tmp, sizeof(__ret));                                     \
    return __ret;                                                              \
  }
__MAKE_SYNC_SHUFFLES(__shfl_sync, __nvvm_shfl_sync_idx_i32,
                     __nvvm_shfl_sync_idx_f32, 0x1f, int);
// We use 0 rather than 31 as our mask, because shfl.up applies to lanes >=
// maxLane.
__MAKE_SYNC_SHUFFLES(__shfl_up_sync, __nvvm_shfl_sync_up_i32,
                     __nvvm_shfl_sync_up_f32, 0, unsigned int);
__MAKE_SYNC_SHUFFLES(__shfl_down_sync, __nvvm_shfl_sync_down_i32,
                     __nvvm_shfl_sync_down_f32, 0x1f, unsigned int);
__MAKE_SYNC_SHUFFLES(__shfl_xor_sync, __nvvm_shfl_sync_bfly_i32,
                     __nvvm_shfl_sync_bfly_f32, 0x1f, int);
#pragma pop_macro("__MAKE_SYNC_SHUFFLES")

// Warp-level barrier over the lanes named in mask.
inline __device__ void __syncwarp(unsigned int mask = 0xffffffff) {
  return __nvvm_bar_warp_sync(mask);
}

// Named block-level barriers.
inline __device__ void __barrier_sync(unsigned int id) {
  __nvvm_barrier_sync(id);
}

inline __device__ void __barrier_sync_count(unsigned int id,
                                            unsigned int count) {
  __nvvm_barrier_sync_cnt(id, count);
}

// Warp vote functions with explicit participating-lane masks.
inline __device__ int __all_sync(unsigned int mask, int pred) {
  return __nvvm_vote_all_sync(mask, pred);
}

inline __device__ int __any_sync(unsigned int mask, int pred) {
  return __nvvm_vote_any_sync(mask, pred);
}

inline __device__ int __uni_sync(unsigned int mask, int pred) {
  return __nvvm_vote_uni_sync(mask, pred);
}

inline __device__ unsigned int __ballot_sync(unsigned int mask, int pred) {
  return __nvvm_vote_ballot_sync(mask, pred);
}

// Returns a bitmask of the currently active lanes.  Before CUDA 9.2 the
// activemask PTX instruction is unavailable, so fall back to a ballot of 1.
inline __device__ unsigned int __activemask() {
#if CUDA_VERSION < 9020
  return __nvvm_vote_ballot(1);
#else
  unsigned int mask;
  asm volatile("activemask.b32 %0;" : "=r"(mask));
  return mask;
#endif
}

// Find the n-th set lane: wraps the nvvm fns intrinsic.
inline __device__ unsigned int __fns(unsigned mask, unsigned base, int offset) {
  return __nvvm_fns(mask, base, offset);
}

#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300

// Define __match* builtins CUDA-9 headers expect to see.
231327952Sdim#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 232327952Sdiminline __device__ unsigned int __match32_any_sync(unsigned int mask, 233327952Sdim unsigned int value) { 234327952Sdim return __nvvm_match_any_sync_i32(mask, value); 235327952Sdim} 236327952Sdim 237327952Sdiminline __device__ unsigned long long 238327952Sdim__match64_any_sync(unsigned int mask, unsigned long long value) { 239327952Sdim return __nvvm_match_any_sync_i64(mask, value); 240327952Sdim} 241327952Sdim 242327952Sdiminline __device__ unsigned int 243327952Sdim__match32_all_sync(unsigned int mask, unsigned int value, int *pred) { 244327952Sdim return __nvvm_match_all_sync_i32p(mask, value, pred); 245327952Sdim} 246327952Sdim 247327952Sdiminline __device__ unsigned long long 248327952Sdim__match64_all_sync(unsigned int mask, unsigned long long value, int *pred) { 249327952Sdim return __nvvm_match_all_sync_i64p(mask, value, pred); 250327952Sdim} 251327952Sdim#include "crt/sm_70_rt.hpp" 252327952Sdim 253327952Sdim#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 254327952Sdim#endif // __CUDA_VERSION >= 9000 255327952Sdim 256303233Sdim// sm_32 intrinsics: __ldg and __funnelshift_{l,lc,r,rc}. 257303233Sdim 258303233Sdim// Prevent the vanilla sm_32 intrinsics header from being included. 
#define __SM_32_INTRINSICS_H__
#define __SM_32_INTRINSICS_HPP__

#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320

// Read-only data-cache loads (__ldg), one overload per scalar type, each
// mapped onto the corresponding nvvm ldg intrinsic.
inline __device__ char __ldg(const char *ptr) { return __nvvm_ldg_c(ptr); }
inline __device__ short __ldg(const short *ptr) { return __nvvm_ldg_s(ptr); }
inline __device__ int __ldg(const int *ptr) { return __nvvm_ldg_i(ptr); }
inline __device__ long __ldg(const long *ptr) { return __nvvm_ldg_l(ptr); }
inline __device__ long long __ldg(const long long *ptr) {
  return __nvvm_ldg_ll(ptr);
}
inline __device__ unsigned char __ldg(const unsigned char *ptr) {
  return __nvvm_ldg_uc(ptr);
}
inline __device__ signed char __ldg(const signed char *ptr) {
  return __nvvm_ldg_uc((const unsigned char *)ptr);
}
inline __device__ unsigned short __ldg(const unsigned short *ptr) {
  return __nvvm_ldg_us(ptr);
}
inline __device__ unsigned int __ldg(const unsigned int *ptr) {
  return __nvvm_ldg_ui(ptr);
}
inline __device__ unsigned long __ldg(const unsigned long *ptr) {
  return __nvvm_ldg_ul(ptr);
}
inline __device__ unsigned long long __ldg(const unsigned long long *ptr) {
  return __nvvm_ldg_ull(ptr);
}
inline __device__ float __ldg(const float *ptr) { return __nvvm_ldg_f(ptr); }
inline __device__ double __ldg(const double *ptr) { return __nvvm_ldg_d(ptr); }

// Vector overloads: the CUDA vector structs (char2, int4, ...) are bridged to
// clang ext_vector_type temporaries for the intrinsic call, then the lanes are
// copied back element-by-element into the CUDA struct.
inline __device__ char2 __ldg(const char2 *ptr) {
  typedef char c2 __attribute__((ext_vector_type(2)));
  // We can assume that ptr is aligned at least to char2's alignment, but the
  // load will assume that ptr is aligned to char2's alignment.  This is only
  // safe if alignof(c2) <= alignof(char2).
  c2 rv = __nvvm_ldg_c2(reinterpret_cast<const c2 *>(ptr));
  char2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
inline __device__ char4 __ldg(const char4 *ptr) {
  typedef char c4 __attribute__((ext_vector_type(4)));
  c4 rv = __nvvm_ldg_c4(reinterpret_cast<const c4 *>(ptr));
  char4 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  ret.z = rv[2];
  ret.w = rv[3];
  return ret;
}
inline __device__ short2 __ldg(const short2 *ptr) {
  typedef short s2 __attribute__((ext_vector_type(2)));
  s2 rv = __nvvm_ldg_s2(reinterpret_cast<const s2 *>(ptr));
  short2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
inline __device__ short4 __ldg(const short4 *ptr) {
  typedef short s4 __attribute__((ext_vector_type(4)));
  s4 rv = __nvvm_ldg_s4(reinterpret_cast<const s4 *>(ptr));
  short4 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  ret.z = rv[2];
  ret.w = rv[3];
  return ret;
}
inline __device__ int2 __ldg(const int2 *ptr) {
  typedef int i2 __attribute__((ext_vector_type(2)));
  i2 rv = __nvvm_ldg_i2(reinterpret_cast<const i2 *>(ptr));
  int2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
inline __device__ int4 __ldg(const int4 *ptr) {
  typedef int i4 __attribute__((ext_vector_type(4)));
  i4 rv = __nvvm_ldg_i4(reinterpret_cast<const i4 *>(ptr));
  int4 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  ret.z = rv[2];
  ret.w = rv[3];
  return ret;
}
inline __device__ longlong2 __ldg(const longlong2 *ptr) {
  typedef long long ll2 __attribute__((ext_vector_type(2)));
  ll2 rv = __nvvm_ldg_ll2(reinterpret_cast<const ll2 *>(ptr));
  longlong2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}

inline __device__ uchar2 __ldg(const uchar2 *ptr) {
  typedef unsigned char uc2 __attribute__((ext_vector_type(2)));
  uc2 rv = __nvvm_ldg_uc2(reinterpret_cast<const uc2 *>(ptr));
  uchar2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
inline __device__ uchar4 __ldg(const uchar4 *ptr) {
  typedef unsigned char uc4 __attribute__((ext_vector_type(4)));
  uc4 rv = __nvvm_ldg_uc4(reinterpret_cast<const uc4 *>(ptr));
  uchar4 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  ret.z = rv[2];
  ret.w = rv[3];
  return ret;
}
inline __device__ ushort2 __ldg(const ushort2 *ptr) {
  typedef unsigned short us2 __attribute__((ext_vector_type(2)));
  us2 rv = __nvvm_ldg_us2(reinterpret_cast<const us2 *>(ptr));
  ushort2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
inline __device__ ushort4 __ldg(const ushort4 *ptr) {
  typedef unsigned short us4 __attribute__((ext_vector_type(4)));
  us4 rv = __nvvm_ldg_us4(reinterpret_cast<const us4 *>(ptr));
  ushort4 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  ret.z = rv[2];
  ret.w = rv[3];
  return ret;
}
inline __device__ uint2 __ldg(const uint2 *ptr) {
  typedef unsigned int ui2 __attribute__((ext_vector_type(2)));
  ui2 rv = __nvvm_ldg_ui2(reinterpret_cast<const ui2 *>(ptr));
  uint2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
inline __device__ uint4 __ldg(const uint4 *ptr) {
  typedef unsigned int ui4 __attribute__((ext_vector_type(4)));
  ui4 rv = __nvvm_ldg_ui4(reinterpret_cast<const ui4 *>(ptr));
  uint4 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  ret.z = rv[2];
  ret.w = rv[3];
  return ret;
}
inline __device__ ulonglong2 __ldg(const ulonglong2 *ptr) {
  typedef unsigned long long ull2 __attribute__((ext_vector_type(2)));
  ull2 rv = __nvvm_ldg_ull2(reinterpret_cast<const ull2 *>(ptr));
  ulonglong2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}

inline __device__ float2 __ldg(const float2 *ptr) {
  typedef float f2 __attribute__((ext_vector_type(2)));
  f2 rv = __nvvm_ldg_f2(reinterpret_cast<const f2 *>(ptr));
  float2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
inline __device__ float4 __ldg(const float4 *ptr) {
  typedef float f4 __attribute__((ext_vector_type(4)));
  f4 rv = __nvvm_ldg_f4(reinterpret_cast<const f4 *>(ptr));
  float4 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  ret.z = rv[2];
  ret.w = rv[3];
  return ret;
}
inline __device__ double2 __ldg(const double2 *ptr) {
  typedef double d2 __attribute__((ext_vector_type(2)));
  d2 rv = __nvvm_ldg_d2(reinterpret_cast<const d2 *>(ptr));
  double2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}

// TODO: Implement these as intrinsics, so the backend can work its magic on
// these.  Alternatively, we could implement these as plain C and try to get
// llvm to recognize the relevant patterns.
// Funnel shifts concatenate high32:low32 into a 64-bit value and shift left
// or right; .wrap takes shiftWidth mod 32, .clamp saturates it at 32.
inline __device__ unsigned __funnelshift_l(unsigned low32, unsigned high32,
                                           unsigned shiftWidth) {
  unsigned result;
  asm("shf.l.wrap.b32 %0, %1, %2, %3;"
      : "=r"(result)
      : "r"(low32), "r"(high32), "r"(shiftWidth));
  return result;
}
inline __device__ unsigned __funnelshift_lc(unsigned low32, unsigned high32,
                                            unsigned shiftWidth) {
  unsigned result;
  asm("shf.l.clamp.b32 %0, %1, %2, %3;"
      : "=r"(result)
      : "r"(low32), "r"(high32), "r"(shiftWidth));
  return result;
}
inline __device__ unsigned __funnelshift_r(unsigned low32, unsigned high32,
                                           unsigned shiftWidth) {
  unsigned result;
  asm("shf.r.wrap.b32 %0, %1, %2, %3;"
      : "=r"(result)
      : "r"(low32), "r"(high32), "r"(shiftWidth));
  return result;
}
inline __device__ unsigned __funnelshift_rc(unsigned low32, unsigned high32,
                                            unsigned shiftWidth) {
  unsigned ret;
  asm("shf.r.clamp.b32 %0, %1, %2, %3;"
      : "=r"(ret)
      : "r"(low32), "r"(high32), "r"(shiftWidth));
  return ret;
}

#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320

#endif // defined(__CLANG_CUDA_INTRINSICS_H__)