1/*===---- __clang_cuda_device_functions.h - CUDA runtime support -----------=== 2 * 3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 * See https://llvm.org/LICENSE.txt for license information. 5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 * 7 *===-----------------------------------------------------------------------=== 8 */ 9 10#ifndef __CLANG_CUDA_DEVICE_FUNCTIONS_H__ 11#define __CLANG_CUDA_DEVICE_FUNCTIONS_H__ 12 13#ifndef __OPENMP_NVPTX__ 14#if CUDA_VERSION < 9000 15#error This file is intended to be used with CUDA-9+ only. 16#endif 17#endif 18 19// __DEVICE__ is a helper macro with common set of attributes for the wrappers 20// we implement in this file. We need static in order to avoid emitting unused 21// functions and __forceinline__ helps inlining these wrappers at -O1. 22#pragma push_macro("__DEVICE__") 23#ifdef __OPENMP_NVPTX__ 24#define __DEVICE__ static __attribute__((always_inline, nothrow)) 25#else 26#define __DEVICE__ static __device__ __forceinline__ 27#endif 28 29__DEVICE__ int __all(int __a) { return __nvvm_vote_all(__a); } 30__DEVICE__ int __any(int __a) { return __nvvm_vote_any(__a); } 31__DEVICE__ unsigned int __ballot(int __a) { return __nvvm_vote_ballot(__a); } 32__DEVICE__ unsigned int __brev(unsigned int __a) { return __nv_brev(__a); } 33__DEVICE__ unsigned long long __brevll(unsigned long long __a) { 34 return __nv_brevll(__a); 35} 36#if defined(__cplusplus) 37__DEVICE__ void __brkpt() { asm volatile("brkpt;"); } 38__DEVICE__ void __brkpt(int __a) { __brkpt(); } 39#else 40__DEVICE__ void __attribute__((overloadable)) __brkpt(void) { asm volatile("brkpt;"); } 41__DEVICE__ void __attribute__((overloadable)) __brkpt(int __a) { __brkpt(); } 42#endif 43__DEVICE__ unsigned int __byte_perm(unsigned int __a, unsigned int __b, 44 unsigned int __c) { 45 return __nv_byte_perm(__a, __b, __c); 46} 47__DEVICE__ int __clz(int __a) { return __nv_clz(__a); } 48__DEVICE__ int __clzll(long long __a) { return __nv_clzll(__a); } 49__DEVICE__ float __cosf(float __a) { return __nv_fast_cosf(__a); } 50__DEVICE__ double __dAtomicAdd(double *__p, double __v) { 51 return __nvvm_atom_add_gen_d(__p, __v); 52} 53__DEVICE__ double __dAtomicAdd_block(double *__p, double __v) { 54 return __nvvm_atom_cta_add_gen_d(__p, __v); 55} 56__DEVICE__ double __dAtomicAdd_system(double *__p, double __v) { 57 return __nvvm_atom_sys_add_gen_d(__p, __v); 58} 59__DEVICE__ double __dadd_rd(double __a, double __b) { 60 return __nv_dadd_rd(__a, __b); 61} 62__DEVICE__ double __dadd_rn(double __a, double __b) { 63 return __nv_dadd_rn(__a, __b); 64} 65__DEVICE__ double __dadd_ru(double __a, double __b) { 66 return __nv_dadd_ru(__a, __b); 67} 68__DEVICE__ double __dadd_rz(double __a, double __b) { 69 return __nv_dadd_rz(__a, __b); 70} 71__DEVICE__ double __ddiv_rd(double __a, double __b) { 72 return __nv_ddiv_rd(__a, __b); 73} 74__DEVICE__ double __ddiv_rn(double __a, double __b) { 75 return __nv_ddiv_rn(__a, __b); 76} 77__DEVICE__ double __ddiv_ru(double __a, double __b) { 78 return __nv_ddiv_ru(__a, __b); 79} 80__DEVICE__ double __ddiv_rz(double __a, double __b) { 81 return __nv_ddiv_rz(__a, __b); 82} 83__DEVICE__ double __dmul_rd(double __a, double __b) { 84 return __nv_dmul_rd(__a, __b); 85} 86__DEVICE__ double __dmul_rn(double __a, double __b) { 87 return __nv_dmul_rn(__a, __b); 88} 89__DEVICE__ double __dmul_ru(double __a, double __b) { 90 return __nv_dmul_ru(__a, __b); 91} 92__DEVICE__ double __dmul_rz(double __a, double __b) { 93 return __nv_dmul_rz(__a, __b); 94} 95__DEVICE__ float __double2float_rd(double __a) { 96 return __nv_double2float_rd(__a); 97} 98__DEVICE__ float __double2float_rn(double __a) { 99 return __nv_double2float_rn(__a); 100} 101__DEVICE__ float __double2float_ru(double __a) { 102 return __nv_double2float_ru(__a); 103} 104__DEVICE__ float __double2float_rz(double __a) { 105 return __nv_double2float_rz(__a); 106} 107__DEVICE__ int __double2hiint(double __a) { return __nv_double2hiint(__a); } 108__DEVICE__ int __double2int_rd(double __a) { return __nv_double2int_rd(__a); } 109__DEVICE__ int __double2int_rn(double __a) { return __nv_double2int_rn(__a); } 110__DEVICE__ int __double2int_ru(double __a) { return __nv_double2int_ru(__a); } 111__DEVICE__ int __double2int_rz(double __a) { return __nv_double2int_rz(__a); } 112__DEVICE__ long long __double2ll_rd(double __a) { 113 return __nv_double2ll_rd(__a); 114} 115__DEVICE__ long long __double2ll_rn(double __a) { 116 return __nv_double2ll_rn(__a); 117} 118__DEVICE__ long long __double2ll_ru(double __a) { 119 return __nv_double2ll_ru(__a); 120} 121__DEVICE__ long long __double2ll_rz(double __a) { 122 return __nv_double2ll_rz(__a); 123} 124__DEVICE__ int __double2loint(double __a) { return __nv_double2loint(__a); } 125__DEVICE__ unsigned int __double2uint_rd(double __a) { 126 return __nv_double2uint_rd(__a); 127} 128__DEVICE__ unsigned int __double2uint_rn(double __a) { 129 return __nv_double2uint_rn(__a); 130} 131__DEVICE__ unsigned int __double2uint_ru(double __a) { 132 return __nv_double2uint_ru(__a); 133} 134__DEVICE__ unsigned int __double2uint_rz(double __a) { 135 return __nv_double2uint_rz(__a); 136} 137__DEVICE__ unsigned long long __double2ull_rd(double __a) { 138 return __nv_double2ull_rd(__a); 139} 140__DEVICE__ unsigned long long __double2ull_rn(double __a) { 141 return __nv_double2ull_rn(__a); 142} 143__DEVICE__ unsigned long long __double2ull_ru(double __a) { 144 return __nv_double2ull_ru(__a); 145} 146__DEVICE__ unsigned long long __double2ull_rz(double __a) { 147 return __nv_double2ull_rz(__a); 148} 149__DEVICE__ long long __double_as_longlong(double __a) { 150 return __nv_double_as_longlong(__a); 151} 152__DEVICE__ double __drcp_rd(double __a) { return __nv_drcp_rd(__a); } 153__DEVICE__ double __drcp_rn(double __a) { return __nv_drcp_rn(__a); } 154__DEVICE__ double __drcp_ru(double __a) { return __nv_drcp_ru(__a); } 155__DEVICE__ double __drcp_rz(double __a) { return __nv_drcp_rz(__a); } 156__DEVICE__ double __dsqrt_rd(double __a) { return __nv_dsqrt_rd(__a); } 157__DEVICE__ double __dsqrt_rn(double __a) { return __nv_dsqrt_rn(__a); } 158__DEVICE__ double __dsqrt_ru(double __a) { return __nv_dsqrt_ru(__a); } 159__DEVICE__ double __dsqrt_rz(double __a) { return __nv_dsqrt_rz(__a); } 160__DEVICE__ double __dsub_rd(double __a, double __b) { 161 return __nv_dsub_rd(__a, __b); 162} 163__DEVICE__ double __dsub_rn(double __a, double __b) { 164 return __nv_dsub_rn(__a, __b); 165} 166__DEVICE__ double __dsub_ru(double __a, double __b) { 167 return __nv_dsub_ru(__a, __b); 168} 169__DEVICE__ double __dsub_rz(double __a, double __b) { 170 return __nv_dsub_rz(__a, __b); 171} 172__DEVICE__ float __exp10f(float __a) { return __nv_fast_exp10f(__a); } 173__DEVICE__ float __expf(float __a) { return __nv_fast_expf(__a); } 174__DEVICE__ float __fAtomicAdd(float *__p, float __v) { 175 return __nvvm_atom_add_gen_f(__p, __v); 176} 177__DEVICE__ float __fAtomicAdd_block(float *__p, float __v) { 178 return __nvvm_atom_cta_add_gen_f(__p, __v); 179} 180__DEVICE__ float __fAtomicAdd_system(float *__p, float __v) { 181 return __nvvm_atom_sys_add_gen_f(__p, __v); 182} 183__DEVICE__ float __fAtomicExch(float *__p, float __v) { 184 return __nv_int_as_float( 185 __nvvm_atom_xchg_gen_i((int *)__p, __nv_float_as_int(__v))); 186} 187__DEVICE__ float __fAtomicExch_block(float *__p, float __v) { 188 return __nv_int_as_float( 189 __nvvm_atom_cta_xchg_gen_i((int *)__p, __nv_float_as_int(__v))); 190} 191__DEVICE__ float __fAtomicExch_system(float *__p, float __v) { 192 return __nv_int_as_float( 193 __nvvm_atom_sys_xchg_gen_i((int *)__p, __nv_float_as_int(__v))); 194} 195__DEVICE__ float __fadd_rd(float __a, float __b) { 196 return __nv_fadd_rd(__a, __b); 197} 198__DEVICE__ float __fadd_rn(float __a, float __b) { 199 return __nv_fadd_rn(__a, __b); 200} 201__DEVICE__ float __fadd_ru(float __a, float __b) { 202 return __nv_fadd_ru(__a, __b); 203} 204__DEVICE__ float __fadd_rz(float __a, float __b) { 205 return __nv_fadd_rz(__a, __b); 206} 207__DEVICE__ float __fdiv_rd(float __a, float __b) { 208 return __nv_fdiv_rd(__a, __b); 209} 210__DEVICE__ float __fdiv_rn(float __a, float __b) { 211 return __nv_fdiv_rn(__a, __b); 212} 213__DEVICE__ float __fdiv_ru(float __a, float __b) { 214 return __nv_fdiv_ru(__a, __b); 215} 216__DEVICE__ float __fdiv_rz(float __a, float __b) { 217 return __nv_fdiv_rz(__a, __b); 218} 219__DEVICE__ float __fdividef(float __a, float __b) { 220 return __nv_fast_fdividef(__a, __b); 221} 222__DEVICE__ int __ffs(int __a) { return __nv_ffs(__a); } 223__DEVICE__ int __ffsll(long long __a) { return __nv_ffsll(__a); } 224__DEVICE__ int __finite(double __a) { return __nv_isfinited(__a); } 225__DEVICE__ int __finitef(float __a) { return __nv_finitef(__a); } 226#ifdef _MSC_VER 227__DEVICE__ int __finitel(long double __a); 228#endif 229__DEVICE__ int __float2int_rd(float __a) { return __nv_float2int_rd(__a); } 230__DEVICE__ int __float2int_rn(float __a) { return __nv_float2int_rn(__a); } 231__DEVICE__ int __float2int_ru(float __a) { return __nv_float2int_ru(__a); } 232__DEVICE__ int __float2int_rz(float __a) { return __nv_float2int_rz(__a); } 233__DEVICE__ long long __float2ll_rd(float __a) { return __nv_float2ll_rd(__a); } 234__DEVICE__ long long __float2ll_rn(float __a) { return __nv_float2ll_rn(__a); } 235__DEVICE__ long long __float2ll_ru(float __a) { return __nv_float2ll_ru(__a); } 236__DEVICE__ long long __float2ll_rz(float __a) { return __nv_float2ll_rz(__a); } 237__DEVICE__ unsigned int __float2uint_rd(float __a) { 238 return __nv_float2uint_rd(__a); 239} 240__DEVICE__ unsigned int __float2uint_rn(float __a) { 241 return __nv_float2uint_rn(__a); 242} 243__DEVICE__ unsigned int __float2uint_ru(float __a) { 244 return __nv_float2uint_ru(__a); 245} 246__DEVICE__ unsigned int __float2uint_rz(float __a) { 247 return __nv_float2uint_rz(__a); 248} 249__DEVICE__ unsigned long long __float2ull_rd(float __a) { 250 return __nv_float2ull_rd(__a); 251} 252__DEVICE__ unsigned long long __float2ull_rn(float __a) { 253 return __nv_float2ull_rn(__a); 254} 255__DEVICE__ unsigned long long __float2ull_ru(float __a) { 256 return __nv_float2ull_ru(__a); 257} 258__DEVICE__ unsigned long long __float2ull_rz(float __a) { 259 return __nv_float2ull_rz(__a); 260} 261__DEVICE__ int __float_as_int(float __a) { return __nv_float_as_int(__a); } 262__DEVICE__ unsigned int __float_as_uint(float __a) { 263 return __nv_float_as_uint(__a); 264} 265__DEVICE__ double __fma_rd(double __a, double __b, double __c) { 266 return __nv_fma_rd(__a, __b, __c); 267} 268__DEVICE__ double __fma_rn(double __a, double __b, double __c) { 269 return __nv_fma_rn(__a, __b, __c); 270} 271__DEVICE__ double __fma_ru(double __a, double __b, double __c) { 272 return __nv_fma_ru(__a, __b, __c); 273} 274__DEVICE__ double __fma_rz(double __a, double __b, double __c) { 275 return __nv_fma_rz(__a, __b, __c); 276} 277__DEVICE__ float __fmaf_ieee_rd(float __a, float __b, float __c) { 278 return __nv_fmaf_ieee_rd(__a, __b, __c); 279} 280__DEVICE__ float __fmaf_ieee_rn(float __a, float __b, float __c) { 281 return __nv_fmaf_ieee_rn(__a, __b, __c); 282} 283__DEVICE__ float __fmaf_ieee_ru(float __a, float __b, float __c) { 284 return __nv_fmaf_ieee_ru(__a, __b, __c); 285} 286__DEVICE__ float __fmaf_ieee_rz(float __a, float __b, float __c) { 287 return __nv_fmaf_ieee_rz(__a, __b, __c); 288} 289__DEVICE__ float __fmaf_rd(float __a, float __b, float __c) { 290 return __nv_fmaf_rd(__a, __b, __c); 291} 292__DEVICE__ float __fmaf_rn(float __a, float __b, float __c) { 293 return __nv_fmaf_rn(__a, __b, __c); 294} 295__DEVICE__ float __fmaf_ru(float __a, float __b, float __c) { 296 return __nv_fmaf_ru(__a, __b, __c); 297} 298__DEVICE__ float __fmaf_rz(float __a, float __b, float __c) { 299 return __nv_fmaf_rz(__a, __b, __c); 300} 301__DEVICE__ float __fmul_rd(float __a, float __b) { 302 return __nv_fmul_rd(__a, __b); 303} 304__DEVICE__ float __fmul_rn(float __a, float __b) { 305 return __nv_fmul_rn(__a, __b); 306} 307__DEVICE__ float __fmul_ru(float __a, float __b) { 308 return __nv_fmul_ru(__a, __b); 309} 310__DEVICE__ float __fmul_rz(float __a, float __b) { 311 return __nv_fmul_rz(__a, __b); 312} 313__DEVICE__ float __frcp_rd(float __a) { return __nv_frcp_rd(__a); } 314__DEVICE__ float __frcp_rn(float __a) { return __nv_frcp_rn(__a); } 315__DEVICE__ float __frcp_ru(float __a) { return __nv_frcp_ru(__a); } 316__DEVICE__ float __frcp_rz(float __a) { return __nv_frcp_rz(__a); } 317__DEVICE__ float __frsqrt_rn(float __a) { return __nv_frsqrt_rn(__a); } 318__DEVICE__ float __fsqrt_rd(float __a) { return __nv_fsqrt_rd(__a); } 319__DEVICE__ float __fsqrt_rn(float __a) { return __nv_fsqrt_rn(__a); } 320__DEVICE__ float __fsqrt_ru(float __a) { return __nv_fsqrt_ru(__a); } 321__DEVICE__ float __fsqrt_rz(float __a) { return __nv_fsqrt_rz(__a); } 322__DEVICE__ float __fsub_rd(float __a, float __b) { 323 return __nv_fsub_rd(__a, __b); 324} 325__DEVICE__ float __fsub_rn(float __a, float __b) { 326 return __nv_fsub_rn(__a, __b); 327} 328__DEVICE__ float __fsub_ru(float __a, float __b) { 329 return __nv_fsub_ru(__a, __b); 330} 331__DEVICE__ float __fsub_rz(float __a, float __b) { 332 return __nv_fsub_rz(__a, __b); 333} 334__DEVICE__ int __hadd(int __a, int __b) { return __nv_hadd(__a, __b); } 335__DEVICE__ double __hiloint2double(int __a, int __b) { 336 return __nv_hiloint2double(__a, __b); 337} 338__DEVICE__ int __iAtomicAdd(int *__p, int __v) { 339 return __nvvm_atom_add_gen_i(__p, __v); 340} 341__DEVICE__ int __iAtomicAdd_block(int *__p, int __v) { 342 return __nvvm_atom_cta_add_gen_i(__p, __v); 343} 344__DEVICE__ int __iAtomicAdd_system(int *__p, int __v) { 345 return __nvvm_atom_sys_add_gen_i(__p, __v); 346} 347__DEVICE__ int __iAtomicAnd(int *__p, int __v) { 348 return __nvvm_atom_and_gen_i(__p, __v); 349} 350__DEVICE__ int __iAtomicAnd_block(int *__p, int __v) { 351 return __nvvm_atom_cta_and_gen_i(__p, __v); 352} 353__DEVICE__ int __iAtomicAnd_system(int *__p, int __v) { 354 return __nvvm_atom_sys_and_gen_i(__p, __v); 355} 356__DEVICE__ int __iAtomicCAS(int *__p, int __cmp, int __v) { 357 return __nvvm_atom_cas_gen_i(__p, __cmp, __v); 358} 359__DEVICE__ int __iAtomicCAS_block(int *__p, int __cmp, int __v) { 360 return __nvvm_atom_cta_cas_gen_i(__p, __cmp, __v); 361} 362__DEVICE__ int __iAtomicCAS_system(int *__p, int __cmp, int __v) { 363 return __nvvm_atom_sys_cas_gen_i(__p, __cmp, __v); 364} 365__DEVICE__ int __iAtomicExch(int *__p, int __v) { 366 return __nvvm_atom_xchg_gen_i(__p, __v); 367} 368__DEVICE__ int __iAtomicExch_block(int *__p, int __v) { 369 return __nvvm_atom_cta_xchg_gen_i(__p, __v); 370} 371__DEVICE__ int __iAtomicExch_system(int *__p, int __v) { 372 return __nvvm_atom_sys_xchg_gen_i(__p, __v); 373} 374__DEVICE__ int __iAtomicMax(int *__p, int __v) { 375 return __nvvm_atom_max_gen_i(__p, __v); 376} 377__DEVICE__ int __iAtomicMax_block(int *__p, int __v) { 378 return __nvvm_atom_cta_max_gen_i(__p, __v); 379} 380__DEVICE__ int __iAtomicMax_system(int *__p, int __v) { 381 return __nvvm_atom_sys_max_gen_i(__p, __v); 382} 383__DEVICE__ int __iAtomicMin(int *__p, int __v) { 384 return __nvvm_atom_min_gen_i(__p, __v); 385} 386__DEVICE__ int __iAtomicMin_block(int *__p, int __v) { 387 return __nvvm_atom_cta_min_gen_i(__p, __v); 388} 389__DEVICE__ int __iAtomicMin_system(int *__p, int __v) { 390 return __nvvm_atom_sys_min_gen_i(__p, __v); 391} 392__DEVICE__ int __iAtomicOr(int *__p, int __v) { 393 return __nvvm_atom_or_gen_i(__p, __v); 394} 395__DEVICE__ int __iAtomicOr_block(int *__p, int __v) { 396 return __nvvm_atom_cta_or_gen_i(__p, __v); 397} 398__DEVICE__ int __iAtomicOr_system(int *__p, int __v) { 399 return __nvvm_atom_sys_or_gen_i(__p, __v); 400} 401__DEVICE__ int __iAtomicXor(int *__p, int __v) { 402 return __nvvm_atom_xor_gen_i(__p, __v); 403} 404__DEVICE__ int __iAtomicXor_block(int *__p, int __v) { 405 return __nvvm_atom_cta_xor_gen_i(__p, __v); 406} 407__DEVICE__ int __iAtomicXor_system(int *__p, int __v) { 408 return __nvvm_atom_sys_xor_gen_i(__p, __v); 409} 410__DEVICE__ long long __illAtomicMax(long long *__p, long long __v) { 411 return __nvvm_atom_max_gen_ll(__p, __v); 412} 413__DEVICE__ long long __illAtomicMax_block(long long *__p, long long __v) { 414 return __nvvm_atom_cta_max_gen_ll(__p, __v); 415} 416__DEVICE__ long long __illAtomicMax_system(long long *__p, long long __v) { 417 return __nvvm_atom_sys_max_gen_ll(__p, __v); 418} 419__DEVICE__ long long __illAtomicMin(long long *__p, long long __v) { 420 return __nvvm_atom_min_gen_ll(__p, __v); 421} 422__DEVICE__ long long __illAtomicMin_block(long long *__p, long long __v) { 423 return __nvvm_atom_cta_min_gen_ll(__p, __v); 424} 425__DEVICE__ long long __illAtomicMin_system(long long *__p, long long __v) { 426 return __nvvm_atom_sys_min_gen_ll(__p, __v); 427} 428__DEVICE__ double __int2double_rn(int __a) { return __nv_int2double_rn(__a); } 429__DEVICE__ float __int2float_rd(int __a) { return __nv_int2float_rd(__a); } 430__DEVICE__ float __int2float_rn(int __a) { return __nv_int2float_rn(__a); } 431__DEVICE__ float __int2float_ru(int __a) { return __nv_int2float_ru(__a); } 432__DEVICE__ float __int2float_rz(int __a) { return __nv_int2float_rz(__a); } 433__DEVICE__ float __int_as_float(int __a) { return __nv_int_as_float(__a); } 434__DEVICE__ int __isfinited(double __a) { return __nv_isfinited(__a); } 435__DEVICE__ int __isinf(double __a) { return __nv_isinfd(__a); } 436__DEVICE__ int __isinff(float __a) { return __nv_isinff(__a); } 437#ifdef _MSC_VER 438__DEVICE__ int __isinfl(long double __a); 439#endif 440__DEVICE__ int __isnan(double __a) { return __nv_isnand(__a); } 441__DEVICE__ int __isnanf(float __a) { return __nv_isnanf(__a); } 442#ifdef _MSC_VER 443__DEVICE__ int __isnanl(long double __a); 444#endif 445__DEVICE__ double __ll2double_rd(long long __a) { 446 return __nv_ll2double_rd(__a); 447} 448__DEVICE__ double __ll2double_rn(long long __a) { 449 return __nv_ll2double_rn(__a); 450} 451__DEVICE__ double __ll2double_ru(long long __a) { 452 return __nv_ll2double_ru(__a); 453} 454__DEVICE__ double __ll2double_rz(long long __a) { 455 return __nv_ll2double_rz(__a); 456} 457__DEVICE__ float __ll2float_rd(long long __a) { return __nv_ll2float_rd(__a); } 458__DEVICE__ float __ll2float_rn(long long __a) { return __nv_ll2float_rn(__a); } 459__DEVICE__ float __ll2float_ru(long long __a) { return __nv_ll2float_ru(__a); } 460__DEVICE__ float __ll2float_rz(long long __a) { return __nv_ll2float_rz(__a); } 461__DEVICE__ long long __llAtomicAnd(long long *__p, long long __v) { 462 return __nvvm_atom_and_gen_ll(__p, __v); 463} 464__DEVICE__ long long __llAtomicAnd_block(long long *__p, long long __v) { 465 return __nvvm_atom_cta_and_gen_ll(__p, __v); 466} 467__DEVICE__ long long __llAtomicAnd_system(long long *__p, long long __v) { 468 return __nvvm_atom_sys_and_gen_ll(__p, __v); 469} 470__DEVICE__ long long __llAtomicOr(long long *__p, long long __v) { 471 return __nvvm_atom_or_gen_ll(__p, __v); 472} 473__DEVICE__ long long __llAtomicOr_block(long long *__p, long long __v) { 474 return __nvvm_atom_cta_or_gen_ll(__p, __v); 475} 476__DEVICE__ long long __llAtomicOr_system(long long *__p, long long __v) { 477 return __nvvm_atom_sys_or_gen_ll(__p, __v); 478} 479__DEVICE__ long long __llAtomicXor(long long *__p, long long __v) { 480 return __nvvm_atom_xor_gen_ll(__p, __v); 481} 482__DEVICE__ long long __llAtomicXor_block(long long *__p, long long __v) { 483 return __nvvm_atom_cta_xor_gen_ll(__p, __v); 484} 485__DEVICE__ long long __llAtomicXor_system(long long *__p, long long __v) { 486 return __nvvm_atom_sys_xor_gen_ll(__p, __v); 487} 488__DEVICE__ float __log10f(float __a) { return __nv_fast_log10f(__a); } 489__DEVICE__ float __log2f(float __a) { return __nv_fast_log2f(__a); } 490__DEVICE__ float __logf(float __a) { return __nv_fast_logf(__a); } 491__DEVICE__ double __longlong_as_double(long long __a) { 492 return __nv_longlong_as_double(__a); 493} 494__DEVICE__ int __mul24(int __a, int __b) { return __nv_mul24(__a, __b); } 495__DEVICE__ long long __mul64hi(long long __a, long long __b) { 496 return __nv_mul64hi(__a, __b); 497} 498__DEVICE__ int __mulhi(int __a, int __b) { return __nv_mulhi(__a, __b); } 499__DEVICE__ unsigned int __pm0(void) { return __nvvm_read_ptx_sreg_pm0(); } 500__DEVICE__ unsigned int __pm1(void) { return __nvvm_read_ptx_sreg_pm1(); } 501__DEVICE__ unsigned int __pm2(void) { return __nvvm_read_ptx_sreg_pm2(); } 502__DEVICE__ unsigned int __pm3(void) { return __nvvm_read_ptx_sreg_pm3(); } 503__DEVICE__ int __popc(int __a) { return __nv_popc(__a); } 504__DEVICE__ int __popcll(long long __a) { return __nv_popcll(__a); } 505__DEVICE__ float __powf(float __a, float __b) { 506 return __nv_fast_powf(__a, __b); 507} 508 509// Parameter must have a known integer value. 510#define __prof_trigger(__a) asm __volatile__("pmevent \t%0;" ::"i"(__a)) 511__DEVICE__ int __rhadd(int __a, int __b) { return __nv_rhadd(__a, __b); } 512__DEVICE__ unsigned int __sad(int __a, int __b, unsigned int __c) { 513 return __nv_sad(__a, __b, __c); 514} 515__DEVICE__ float __saturatef(float __a) { return __nv_saturatef(__a); } 516__DEVICE__ int __signbitd(double __a) { return __nv_signbitd(__a); } 517__DEVICE__ int __signbitf(float __a) { return __nv_signbitf(__a); } 518__DEVICE__ void __sincosf(float __a, float *__s, float *__c) { 519 return __nv_fast_sincosf(__a, __s, __c); 520} 521__DEVICE__ float __sinf(float __a) { return __nv_fast_sinf(__a); } 522__DEVICE__ int __syncthreads_and(int __a) { return __nvvm_bar0_and(__a); } 523__DEVICE__ int __syncthreads_count(int __a) { return __nvvm_bar0_popc(__a); } 524__DEVICE__ int __syncthreads_or(int __a) { return __nvvm_bar0_or(__a); } 525__DEVICE__ float __tanf(float __a) { return __nv_fast_tanf(__a); } 526__DEVICE__ void __threadfence(void) { __nvvm_membar_gl(); } 527__DEVICE__ void __threadfence_block(void) { __nvvm_membar_cta(); }; 528__DEVICE__ void __threadfence_system(void) { __nvvm_membar_sys(); }; 529__DEVICE__ void __trap(void) { asm volatile("trap;"); } 530__DEVICE__ unsigned int __uAtomicAdd(unsigned int *__p, unsigned int __v) { 531 return __nvvm_atom_add_gen_i((int *)__p, __v); 532} 533__DEVICE__ unsigned int __uAtomicAdd_block(unsigned int *__p, 534 unsigned int __v) { 535 return __nvvm_atom_cta_add_gen_i((int *)__p, __v); 536} 537__DEVICE__ unsigned int __uAtomicAdd_system(unsigned int *__p, 538 unsigned int __v) { 539 return __nvvm_atom_sys_add_gen_i((int *)__p, __v); 540} 541__DEVICE__ unsigned int __uAtomicAnd(unsigned int *__p, unsigned int __v) { 542 return __nvvm_atom_and_gen_i((int *)__p, __v); 543} 544__DEVICE__ unsigned int __uAtomicAnd_block(unsigned int *__p, 545 unsigned int __v) { 546 return __nvvm_atom_cta_and_gen_i((int *)__p, __v); 547} 548__DEVICE__ unsigned int __uAtomicAnd_system(unsigned int *__p, 549 unsigned int __v) { 550 return __nvvm_atom_sys_and_gen_i((int *)__p, __v); 551} 552__DEVICE__ unsigned int __uAtomicCAS(unsigned int *__p, unsigned int __cmp, 553 unsigned int __v) { 554 return __nvvm_atom_cas_gen_i((int *)__p, __cmp, __v); 555} 556__DEVICE__ unsigned int 557__uAtomicCAS_block(unsigned int *__p, unsigned int __cmp, unsigned int __v) { 558 return __nvvm_atom_cta_cas_gen_i((int *)__p, __cmp, __v); 559} 560__DEVICE__ unsigned int 561__uAtomicCAS_system(unsigned int *__p, unsigned int __cmp, unsigned int __v) { 562 return __nvvm_atom_sys_cas_gen_i((int *)__p, __cmp, __v); 563} 564__DEVICE__ unsigned int __uAtomicDec(unsigned int *__p, unsigned int __v) { 565 return __nvvm_atom_dec_gen_ui(__p, __v); 566} 567__DEVICE__ unsigned int __uAtomicDec_block(unsigned int *__p, 568 unsigned int __v) { 569 return __nvvm_atom_cta_dec_gen_ui(__p, __v); 570} 571__DEVICE__ unsigned int __uAtomicDec_system(unsigned int *__p, 572 unsigned int __v) { 573 return __nvvm_atom_sys_dec_gen_ui(__p, __v); 574} 575__DEVICE__ unsigned int __uAtomicExch(unsigned int *__p, unsigned int __v) { 576 return __nvvm_atom_xchg_gen_i((int *)__p, __v); 577} 578__DEVICE__ unsigned int __uAtomicExch_block(unsigned int *__p, 579 unsigned int __v) { 580 return __nvvm_atom_cta_xchg_gen_i((int *)__p, __v); 581} 582__DEVICE__ unsigned int __uAtomicExch_system(unsigned int *__p, 583 unsigned int __v) { 584 return __nvvm_atom_sys_xchg_gen_i((int *)__p, __v); 585} 586__DEVICE__ unsigned int __uAtomicInc(unsigned int *__p, unsigned int __v) { 587 return __nvvm_atom_inc_gen_ui(__p, __v); 588} 589__DEVICE__ unsigned int __uAtomicInc_block(unsigned int *__p, 590 unsigned int __v) { 591 return __nvvm_atom_cta_inc_gen_ui(__p, __v); 592} 593__DEVICE__ unsigned int __uAtomicInc_system(unsigned int *__p, 594 unsigned int __v) { 595 return __nvvm_atom_sys_inc_gen_ui(__p, __v); 596} 597__DEVICE__ unsigned int __uAtomicMax(unsigned int *__p, unsigned int __v) { 598 return __nvvm_atom_max_gen_ui(__p, __v); 599} 600__DEVICE__ unsigned int __uAtomicMax_block(unsigned int *__p, 601 unsigned int __v) { 602 return __nvvm_atom_cta_max_gen_ui(__p, __v); 603} 604__DEVICE__ unsigned int __uAtomicMax_system(unsigned int *__p, 605 unsigned int __v) { 606 return __nvvm_atom_sys_max_gen_ui(__p, __v); 607} 608__DEVICE__ unsigned int __uAtomicMin(unsigned int *__p, unsigned int __v) { 609 return __nvvm_atom_min_gen_ui(__p, __v); 610} 611__DEVICE__ unsigned int __uAtomicMin_block(unsigned int *__p, 612 unsigned int __v) { 613 return __nvvm_atom_cta_min_gen_ui(__p, __v); 614} 615__DEVICE__ unsigned int __uAtomicMin_system(unsigned int *__p, 616 unsigned int __v) { 617 return __nvvm_atom_sys_min_gen_ui(__p, __v); 618} 619__DEVICE__ unsigned int __uAtomicOr(unsigned int *__p, unsigned int __v) { 620 return __nvvm_atom_or_gen_i((int *)__p, __v); 621} 622__DEVICE__ unsigned int __uAtomicOr_block(unsigned int *__p, unsigned int __v) { 623 return __nvvm_atom_cta_or_gen_i((int *)__p, __v); 624} 625__DEVICE__ unsigned int __uAtomicOr_system(unsigned int *__p, 626 unsigned int __v) { 627 return __nvvm_atom_sys_or_gen_i((int *)__p, __v); 628} 629__DEVICE__ unsigned int __uAtomicXor(unsigned int *__p, unsigned int __v) { 630 return __nvvm_atom_xor_gen_i((int *)__p, __v); 631} 632__DEVICE__ unsigned int __uAtomicXor_block(unsigned int *__p, 633 unsigned int __v) { 634 return __nvvm_atom_cta_xor_gen_i((int *)__p, __v); 635} 636__DEVICE__ unsigned int __uAtomicXor_system(unsigned int *__p, 637 unsigned int __v) { 638 return __nvvm_atom_sys_xor_gen_i((int *)__p, __v); 639} 640__DEVICE__ unsigned int __uhadd(unsigned int __a, unsigned int __b) { 641 return __nv_uhadd(__a, __b); 642} 643__DEVICE__ double __uint2double_rn(unsigned int __a) { 644 return __nv_uint2double_rn(__a); 645} 646__DEVICE__ float __uint2float_rd(unsigned int __a) { 647 return __nv_uint2float_rd(__a); 648} 649__DEVICE__ float __uint2float_rn(unsigned int __a) { 650 return __nv_uint2float_rn(__a); 651} 652__DEVICE__ float __uint2float_ru(unsigned int __a) { 653 return __nv_uint2float_ru(__a); 654} 655__DEVICE__ float __uint2float_rz(unsigned int __a) { 656 return __nv_uint2float_rz(__a); 657} 658__DEVICE__ float __uint_as_float(unsigned int __a) { 659 return __nv_uint_as_float(__a); 660} // 661__DEVICE__ double __ull2double_rd(unsigned long long __a) { 662 return __nv_ull2double_rd(__a); 663} 664__DEVICE__ double __ull2double_rn(unsigned long long __a) { 665 return __nv_ull2double_rn(__a); 666} 667__DEVICE__ double __ull2double_ru(unsigned long long __a) { 668 return __nv_ull2double_ru(__a); 669} 670__DEVICE__ double __ull2double_rz(unsigned long long __a) { 671 return __nv_ull2double_rz(__a); 672} 673__DEVICE__ float __ull2float_rd(unsigned long long __a) { 674 return __nv_ull2float_rd(__a); 675} 676__DEVICE__ float __ull2float_rn(unsigned long long __a) { 677 return __nv_ull2float_rn(__a); 678} 679__DEVICE__ float __ull2float_ru(unsigned long long __a) { 680 return __nv_ull2float_ru(__a); 681} 682__DEVICE__ float __ull2float_rz(unsigned long long __a) { 683 return __nv_ull2float_rz(__a); 684} 685__DEVICE__ unsigned long long __ullAtomicAdd(unsigned long long *__p, 686 unsigned long long __v) { 687 return __nvvm_atom_add_gen_ll((long long *)__p, __v); 688} 689__DEVICE__ unsigned long long __ullAtomicAdd_block(unsigned long long *__p, 690 unsigned long long __v) { 691 return __nvvm_atom_cta_add_gen_ll((long long *)__p, __v); 692} 693__DEVICE__ unsigned long long __ullAtomicAdd_system(unsigned long long *__p, 694 unsigned long long __v) { 695 return __nvvm_atom_sys_add_gen_ll((long long *)__p, __v); 696} 697__DEVICE__ unsigned long long __ullAtomicAnd(unsigned long long *__p, 698 unsigned long long __v) { 699 return __nvvm_atom_and_gen_ll((long long *)__p, __v); 700} 701__DEVICE__ unsigned long long __ullAtomicAnd_block(unsigned long long *__p, 702 unsigned long long __v) { 703 return __nvvm_atom_cta_and_gen_ll((long long *)__p, __v); 704} 705__DEVICE__ unsigned long long __ullAtomicAnd_system(unsigned long long *__p, 706 unsigned long long __v) { 707 return __nvvm_atom_sys_and_gen_ll((long long *)__p, __v); 708} 709__DEVICE__ unsigned long long __ullAtomicCAS(unsigned long long *__p, 710 unsigned long long __cmp, 711 unsigned long long __v) { 712 return __nvvm_atom_cas_gen_ll((long long *)__p, __cmp, __v); 713} 714__DEVICE__ unsigned long long __ullAtomicCAS_block(unsigned long long *__p, 715 unsigned long long __cmp, 716 unsigned long long __v) { 717 return __nvvm_atom_cta_cas_gen_ll((long long *)__p, __cmp, __v); 718} 719__DEVICE__ unsigned long long __ullAtomicCAS_system(unsigned long long *__p, 720 unsigned long long __cmp, 721 unsigned long long __v) { 722 return __nvvm_atom_sys_cas_gen_ll((long long *)__p, __cmp, __v); 723} 724__DEVICE__ unsigned long long __ullAtomicExch(unsigned long long *__p, 725 unsigned long long __v) { 726 return __nvvm_atom_xchg_gen_ll((long long *)__p, __v); 727} 728__DEVICE__ unsigned long long __ullAtomicExch_block(unsigned long long *__p, 729 unsigned long long __v) { 730 return __nvvm_atom_cta_xchg_gen_ll((long long *)__p, __v); 731} 732__DEVICE__ unsigned long long __ullAtomicExch_system(unsigned long long *__p, 733 unsigned long long __v) { 734 return __nvvm_atom_sys_xchg_gen_ll((long long *)__p, __v); 735} 736__DEVICE__ unsigned long long __ullAtomicMax(unsigned long long *__p, 737 unsigned long long __v) { 738 return __nvvm_atom_max_gen_ull(__p, __v); 739} 740__DEVICE__ unsigned long long __ullAtomicMax_block(unsigned long long *__p, 741 unsigned long long __v) { 742 return __nvvm_atom_cta_max_gen_ull(__p, __v); 743} 744__DEVICE__ unsigned long long __ullAtomicMax_system(unsigned long long *__p, 745 unsigned long long __v) { 746 return __nvvm_atom_sys_max_gen_ull(__p, __v); 747} 748__DEVICE__ unsigned long long __ullAtomicMin(unsigned long long *__p, 749 unsigned long long __v) { 750 return __nvvm_atom_min_gen_ull(__p, __v); 751} 752__DEVICE__ unsigned long long __ullAtomicMin_block(unsigned long long *__p, 753 unsigned long long __v) { 754 return __nvvm_atom_cta_min_gen_ull(__p, __v); 755} 756__DEVICE__ unsigned long long __ullAtomicMin_system(unsigned long long *__p, 757 unsigned long long __v) { 758 return __nvvm_atom_sys_min_gen_ull(__p, __v); 759} 760__DEVICE__ unsigned long long __ullAtomicOr(unsigned long long *__p, 761 unsigned long long __v) { 762 return __nvvm_atom_or_gen_ll((long long *)__p, __v); 763} 764__DEVICE__ unsigned long long __ullAtomicOr_block(unsigned long long *__p, 765 unsigned long long __v) { 766 return __nvvm_atom_cta_or_gen_ll((long long *)__p, __v); 767} 768__DEVICE__ unsigned long long __ullAtomicOr_system(unsigned long long *__p, 769 unsigned long long __v) { 770 return __nvvm_atom_sys_or_gen_ll((long long *)__p, __v); 771} 772__DEVICE__ unsigned long long __ullAtomicXor(unsigned long long *__p, 773 unsigned long long __v) { 774 return __nvvm_atom_xor_gen_ll((long long *)__p, __v); 775} 776__DEVICE__ unsigned long long __ullAtomicXor_block(unsigned long long *__p, 777 unsigned long long __v) { 778 return __nvvm_atom_cta_xor_gen_ll((long long *)__p, __v); 779} 780__DEVICE__ unsigned long long __ullAtomicXor_system(unsigned long long *__p, 781 unsigned long long __v) { 782 return __nvvm_atom_sys_xor_gen_ll((long long *)__p, __v); 783} 784__DEVICE__ unsigned int __umul24(unsigned int __a, unsigned int __b) { 785 return __nv_umul24(__a, __b); 786} 787__DEVICE__ unsigned long long __umul64hi(unsigned long long __a, 788 unsigned long long __b) { 789 return __nv_umul64hi(__a, __b); 790} 791__DEVICE__ unsigned int __umulhi(unsigned int __a, unsigned int __b) { 792 return __nv_umulhi(__a, __b); 793} 794__DEVICE__ unsigned int __urhadd(unsigned int __a, unsigned int __b) { 795 return __nv_urhadd(__a, __b); 796} 797__DEVICE__ unsigned int __usad(unsigned int __a, unsigned int __b, 798 unsigned int __c) { 799 return __nv_usad(__a, __b, __c); 800} 801 802#if CUDA_VERSION >= 9000 && CUDA_VERSION < 9020 803__DEVICE__ unsigned int __vabs2(unsigned int __a) { return __nv_vabs2(__a); } 804__DEVICE__ unsigned int __vabs4(unsigned int __a) { return __nv_vabs4(__a); } 805__DEVICE__ unsigned int __vabsdiffs2(unsigned int __a, unsigned int __b) { 806 return __nv_vabsdiffs2(__a, __b); 807} 808__DEVICE__ unsigned int __vabsdiffs4(unsigned int __a, unsigned int __b) { 809 return __nv_vabsdiffs4(__a, __b); 810} 811__DEVICE__ unsigned int __vabsdiffu2(unsigned int __a, unsigned int __b) { 812 return __nv_vabsdiffu2(__a, __b); 813} 814__DEVICE__ unsigned int __vabsdiffu4(unsigned int __a, unsigned int __b) { 815 return __nv_vabsdiffu4(__a, __b); 816} 817__DEVICE__ unsigned int __vabsss2(unsigned int __a) { 818 return __nv_vabsss2(__a); 819} 820__DEVICE__ unsigned int __vabsss4(unsigned int __a) { 821 return __nv_vabsss4(__a); 822} 823__DEVICE__ unsigned int __vadd2(unsigned int __a, unsigned int __b) { 824 return __nv_vadd2(__a, __b); 825} 826__DEVICE__ unsigned int __vadd4(unsigned int __a, unsigned int __b) { 827 return __nv_vadd4(__a, __b); 828} 829__DEVICE__ unsigned int __vaddss2(unsigned int __a, unsigned int __b) { 830 return __nv_vaddss2(__a, __b); 831} 832__DEVICE__ unsigned int __vaddss4(unsigned int __a, unsigned int __b) { 833 return __nv_vaddss4(__a, __b); 834} 835__DEVICE__ unsigned int __vaddus2(unsigned int __a, unsigned int __b) { 836 return __nv_vaddus2(__a, __b); 837} 838__DEVICE__ unsigned int __vaddus4(unsigned int __a, unsigned int __b) { 839 return __nv_vaddus4(__a, __b); 840} 841__DEVICE__ unsigned int __vavgs2(unsigned int __a, unsigned int __b) { 842 return __nv_vavgs2(__a, __b); 843} 844__DEVICE__ unsigned int __vavgs4(unsigned int __a, unsigned int __b) { 845 return __nv_vavgs4(__a, __b); 846} 847__DEVICE__ unsigned int __vavgu2(unsigned int __a, unsigned int __b) { 848 return __nv_vavgu2(__a, __b); 849} 850__DEVICE__ unsigned int __vavgu4(unsigned int __a, unsigned int __b) { 851 return __nv_vavgu4(__a, __b); 852} 853__DEVICE__ unsigned int __vcmpeq2(unsigned int __a, unsigned int __b) { 854 return __nv_vcmpeq2(__a, __b); 855} 856__DEVICE__ unsigned int __vcmpeq4(unsigned int __a, unsigned int __b) { 857 return __nv_vcmpeq4(__a, __b); 858} 859__DEVICE__ unsigned int __vcmpges2(unsigned int __a, unsigned int __b) { 860 return __nv_vcmpges2(__a, __b); 861} 862__DEVICE__ unsigned int __vcmpges4(unsigned int __a, unsigned int __b) { 863 return __nv_vcmpges4(__a, __b); 864} 865__DEVICE__ unsigned int __vcmpgeu2(unsigned int __a, unsigned int __b) { 866 return __nv_vcmpgeu2(__a, __b); 867} 868__DEVICE__ unsigned int __vcmpgeu4(unsigned int __a, unsigned int __b) { 869 return __nv_vcmpgeu4(__a, __b); 870} 871__DEVICE__ unsigned int __vcmpgts2(unsigned int __a, unsigned int __b) { 872 return __nv_vcmpgts2(__a, __b); 873} 874__DEVICE__ unsigned int __vcmpgts4(unsigned int __a, unsigned int __b) { 875 return __nv_vcmpgts4(__a, __b); 876} 877__DEVICE__ unsigned int __vcmpgtu2(unsigned int __a, unsigned int __b) { 878 return __nv_vcmpgtu2(__a, __b); 879} 880__DEVICE__ unsigned int __vcmpgtu4(unsigned int __a, unsigned int __b) { 881 return __nv_vcmpgtu4(__a, __b); 882} 883__DEVICE__ unsigned int __vcmples2(unsigned int __a, unsigned int __b) { 884 return __nv_vcmples2(__a, __b); 885} 886__DEVICE__ unsigned int __vcmples4(unsigned int __a, unsigned int __b) { 887 return __nv_vcmples4(__a, __b); 888} 889__DEVICE__ unsigned int __vcmpleu2(unsigned int __a, unsigned int __b) { 890 return __nv_vcmpleu2(__a, __b); 891} 892__DEVICE__ unsigned int __vcmpleu4(unsigned int __a, unsigned int __b) { 893 return __nv_vcmpleu4(__a, __b); 894} 895__DEVICE__ unsigned int __vcmplts2(unsigned int __a, unsigned int __b) { 896 return __nv_vcmplts2(__a, __b); 897} 898__DEVICE__ unsigned int __vcmplts4(unsigned int __a, unsigned int __b) { 899 return __nv_vcmplts4(__a, __b); 900} 901__DEVICE__ unsigned int __vcmpltu2(unsigned int __a, unsigned int __b) { 902 return __nv_vcmpltu2(__a, __b); 903} 904__DEVICE__ unsigned int __vcmpltu4(unsigned int __a, unsigned int __b) { 905 return __nv_vcmpltu4(__a, __b); 906} 907__DEVICE__ unsigned int __vcmpne2(unsigned int __a, unsigned int __b) { 908 return __nv_vcmpne2(__a, __b); 909} 910__DEVICE__ unsigned int __vcmpne4(unsigned int __a, unsigned int __b) { 911 return __nv_vcmpne4(__a, __b); 912} 913__DEVICE__ unsigned int __vhaddu2(unsigned int __a, unsigned int __b) { 914 return __nv_vhaddu2(__a, __b); 915} 916__DEVICE__ unsigned int __vhaddu4(unsigned int __a, unsigned int __b) { 917 return __nv_vhaddu4(__a, __b); 918} 919__DEVICE__ unsigned int __vmaxs2(unsigned int __a, unsigned int __b) { 920 return __nv_vmaxs2(__a, __b); 921} 922__DEVICE__ unsigned int __vmaxs4(unsigned int __a, unsigned int __b) { 923 return __nv_vmaxs4(__a, __b); 924} 925__DEVICE__ unsigned int __vmaxu2(unsigned int __a, unsigned int __b) { 926 return __nv_vmaxu2(__a, __b); 927} 928__DEVICE__ unsigned int __vmaxu4(unsigned int __a, unsigned int __b) { 929 return __nv_vmaxu4(__a, __b); 930} 931__DEVICE__ unsigned int __vmins2(unsigned int __a, unsigned int __b) { 932 return __nv_vmins2(__a, __b); 933} 934__DEVICE__ unsigned int __vmins4(unsigned int __a, unsigned int __b) { 935 return __nv_vmins4(__a, __b); 936} 937__DEVICE__ unsigned int __vminu2(unsigned int __a, unsigned int __b) { 938 return __nv_vminu2(__a, __b); 939} 940__DEVICE__ unsigned int __vminu4(unsigned int __a, unsigned int __b) { 941 return __nv_vminu4(__a, __b); 942} 943__DEVICE__ unsigned int __vneg2(unsigned int __a) { return __nv_vneg2(__a); } 944__DEVICE__ unsigned int __vneg4(unsigned int __a) { return __nv_vneg4(__a); } 945__DEVICE__ unsigned int __vnegss2(unsigned int __a) { 946 return __nv_vnegss2(__a); 947} 948__DEVICE__ unsigned int __vnegss4(unsigned int __a) { 949 return __nv_vnegss4(__a); 950} 951__DEVICE__ unsigned int __vsads2(unsigned int __a, unsigned int __b) { 952 return __nv_vsads2(__a, __b); 953} 954__DEVICE__ unsigned int __vsads4(unsigned int __a, unsigned int __b) { 955 return __nv_vsads4(__a, __b); 956} 957__DEVICE__ unsigned int __vsadu2(unsigned int __a, unsigned int __b) { 958 return __nv_vsadu2(__a, __b); 959} 960__DEVICE__ unsigned int __vsadu4(unsigned int __a, unsigned int __b) { 961 return __nv_vsadu4(__a, __b); 962} 963__DEVICE__ unsigned int __vseteq2(unsigned int __a, unsigned int __b) { 964 return __nv_vseteq2(__a, __b); 965} 966__DEVICE__ unsigned int __vseteq4(unsigned int __a, unsigned int __b) { 967 return __nv_vseteq4(__a, __b); 968} 969__DEVICE__ unsigned int __vsetges2(unsigned int __a, unsigned int __b) { 970 return __nv_vsetges2(__a, __b); 971} 972__DEVICE__ unsigned int __vsetges4(unsigned int __a, unsigned int __b) { 973 return __nv_vsetges4(__a, __b); 974} 975__DEVICE__ unsigned int __vsetgeu2(unsigned int __a, unsigned int __b) { 976 return __nv_vsetgeu2(__a, __b); 977} 978__DEVICE__ unsigned int __vsetgeu4(unsigned int __a, unsigned int __b) { 979 return __nv_vsetgeu4(__a, __b); 980} 981__DEVICE__ unsigned int __vsetgts2(unsigned int __a, unsigned int __b) { 982 return __nv_vsetgts2(__a, __b); 983} 984__DEVICE__ unsigned int __vsetgts4(unsigned int __a, unsigned int __b) { 985 return __nv_vsetgts4(__a, __b); 986} 987__DEVICE__ unsigned int __vsetgtu2(unsigned int __a, unsigned int __b) { 988 return __nv_vsetgtu2(__a, __b); 989} 990__DEVICE__ unsigned int __vsetgtu4(unsigned int __a, unsigned int __b) { 991 return __nv_vsetgtu4(__a, __b); 992} 993__DEVICE__ unsigned int __vsetles2(unsigned int __a, unsigned int __b) { 994 return __nv_vsetles2(__a, __b); 995} 996__DEVICE__ unsigned int __vsetles4(unsigned int __a, unsigned int __b) { 997 return __nv_vsetles4(__a, __b); 998} 999__DEVICE__ unsigned int __vsetleu2(unsigned int __a, unsigned int __b) { 1000 return __nv_vsetleu2(__a, __b); 1001} 1002__DEVICE__ unsigned int __vsetleu4(unsigned int __a, unsigned int __b) { 1003 return __nv_vsetleu4(__a, __b); 1004} 1005__DEVICE__ unsigned int __vsetlts2(unsigned int __a, unsigned int __b) { 1006 return __nv_vsetlts2(__a, __b); 1007} 1008__DEVICE__ unsigned int __vsetlts4(unsigned int __a, unsigned int __b) { 1009 return __nv_vsetlts4(__a, __b); 1010} 1011__DEVICE__ unsigned int __vsetltu2(unsigned int __a, unsigned int __b) { 1012 return __nv_vsetltu2(__a, __b); 1013} 1014__DEVICE__ unsigned int __vsetltu4(unsigned int __a, unsigned int __b) { 1015 return __nv_vsetltu4(__a, __b); 1016} 1017__DEVICE__ unsigned int __vsetne2(unsigned int __a, unsigned int __b) { 1018 return __nv_vsetne2(__a, __b); 1019} 1020__DEVICE__ unsigned int __vsetne4(unsigned int __a, unsigned int __b) { 1021 return __nv_vsetne4(__a, __b); 1022} 1023__DEVICE__ unsigned int __vsub2(unsigned int __a, unsigned int __b) { 1024 return __nv_vsub2(__a, __b); 1025} 1026__DEVICE__ unsigned int __vsub4(unsigned int __a, unsigned int __b) { 1027 return __nv_vsub4(__a, __b); 1028} 1029__DEVICE__ unsigned int __vsubss2(unsigned int __a, unsigned int __b) { 1030 return __nv_vsubss2(__a, __b); 1031} 1032__DEVICE__ unsigned int __vsubss4(unsigned int __a, unsigned int __b) { 1033 return __nv_vsubss4(__a, __b); 1034} 1035__DEVICE__ unsigned int __vsubus2(unsigned int __a, unsigned int __b) { 1036 return __nv_vsubus2(__a, __b); 1037} 1038__DEVICE__ unsigned int __vsubus4(unsigned int __a, unsigned int __b) { 1039 return __nv_vsubus4(__a, __b); 1040} 1041#else // CUDA_VERSION >= 9020 1042// CUDA no longer provides inline assembly (or bitcode) implementation of these 1043// functions, so we have to reimplment them. The implementation is naive and is 1044// not optimized for performance. 1045 1046// Helper function to convert N-bit boolean subfields into all-0 or all-1. 1047// E.g. __bool2mask(0x01000100,8) -> 0xff00ff00 1048// __bool2mask(0x00010000,16) -> 0xffff0000 1049__DEVICE__ unsigned int __bool2mask(unsigned int __a, int shift) { 1050 return (__a << shift) - __a; 1051} 1052__DEVICE__ unsigned int __vabs2(unsigned int __a) { 1053 unsigned int r; 1054 asm("vabsdiff2.s32.s32.s32 %0,%1,%2,%3;" 1055 : "=r"(r) 1056 : "r"(__a), "r"(0), "r"(0)); 1057 return r; 1058} 1059__DEVICE__ unsigned int __vabs4(unsigned int __a) { 1060 unsigned int r; 1061 asm("vabsdiff4.s32.s32.s32 %0,%1,%2,%3;" 1062 : "=r"(r) 1063 : "r"(__a), "r"(0), "r"(0)); 1064 return r; 1065} 1066__DEVICE__ unsigned int __vabsdiffs2(unsigned int __a, unsigned int __b) { 1067 unsigned int r; 1068 asm("vabsdiff2.s32.s32.s32 %0,%1,%2,%3;" 1069 : "=r"(r) 1070 : "r"(__a), "r"(__b), "r"(0)); 1071 return r; 1072} 1073 1074__DEVICE__ unsigned int __vabsdiffs4(unsigned int __a, unsigned int __b) { 1075 unsigned int r; 1076 asm("vabsdiff4.s32.s32.s32 %0,%1,%2,%3;" 1077 : "=r"(r) 1078 : "r"(__a), "r"(__b), "r"(0)); 1079 return r; 1080} 1081__DEVICE__ unsigned int __vabsdiffu2(unsigned int __a, unsigned int __b) { 1082 unsigned int r; 1083 asm("vabsdiff2.u32.u32.u32 %0,%1,%2,%3;" 1084 : "=r"(r) 1085 : "r"(__a), "r"(__b), "r"(0)); 1086 return r; 1087} 1088__DEVICE__ unsigned int __vabsdiffu4(unsigned int __a, unsigned int __b) { 1089 unsigned int r; 1090 asm("vabsdiff4.u32.u32.u32 %0,%1,%2,%3;" 1091 : "=r"(r) 1092 : "r"(__a), "r"(__b), "r"(0)); 1093 return r; 1094} 1095__DEVICE__ unsigned int __vabsss2(unsigned int __a) { 1096 unsigned int r; 1097 asm("vabsdiff2.s32.s32.s32.sat %0,%1,%2,%3;" 1098 : "=r"(r) 1099 : "r"(__a), "r"(0), "r"(0)); 1100 return r; 1101} 1102__DEVICE__ unsigned int __vabsss4(unsigned int __a) { 1103 unsigned int r; 1104 asm("vabsdiff4.s32.s32.s32.sat %0,%1,%2,%3;" 1105 : "=r"(r) 1106 : "r"(__a), "r"(0), "r"(0)); 1107 return r; 1108} 1109__DEVICE__ unsigned int __vadd2(unsigned int __a, unsigned int __b) { 1110 unsigned int r; 1111 asm("vadd2.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); 1112 return r; 1113} 1114__DEVICE__ unsigned int __vadd4(unsigned int __a, unsigned int __b) { 1115 unsigned int r; 1116 asm("vadd4.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); 1117 return r; 1118} 1119__DEVICE__ unsigned int __vaddss2(unsigned int __a, unsigned int __b) { 1120 unsigned int r; 1121 asm("vadd2.s32.s32.s32.sat %0,%1,%2,%3;" 1122 : "=r"(r) 1123 : "r"(__a), "r"(__b), "r"(0)); 1124 return r; 1125} 1126__DEVICE__ unsigned int __vaddss4(unsigned int __a, unsigned int __b) { 1127 unsigned int r; 1128 asm("vadd4.s32.s32.s32.sat %0,%1,%2,%3;" 1129 : "=r"(r) 1130 : "r"(__a), "r"(__b), "r"(0)); 1131 return r; 1132} 1133__DEVICE__ unsigned int __vaddus2(unsigned int __a, unsigned int __b) { 1134 unsigned int r; 1135 asm("vadd2.u32.u32.u32.sat %0,%1,%2,%3;" 1136 : "=r"(r) 1137 : "r"(__a), "r"(__b), "r"(0)); 1138 return r; 1139} 1140__DEVICE__ unsigned int __vaddus4(unsigned int __a, unsigned int __b) { 1141 unsigned int r; 1142 asm("vadd4.u32.u32.u32.sat %0,%1,%2,%3;" 1143 : "=r"(r) 1144 : "r"(__a), "r"(__b), "r"(0)); 1145 return r; 1146} 1147__DEVICE__ unsigned int __vavgs2(unsigned int __a, unsigned int __b) { 1148 unsigned int r; 1149 asm("vavrg2.s32.s32.s32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); 1150 return r; 1151} 1152__DEVICE__ unsigned int __vavgs4(unsigned int __a, unsigned int __b) { 1153 unsigned int r; 1154 asm("vavrg4.s32.s32.s32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); 1155 return r; 1156} 1157__DEVICE__ unsigned int __vavgu2(unsigned int __a, unsigned int __b) { 1158 unsigned int r; 1159 asm("vavrg2.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); 1160 return r; 1161} 1162__DEVICE__ unsigned int __vavgu4(unsigned int __a, unsigned int __b) { 1163 unsigned int r; 1164 asm("vavrg4.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); 1165 return r; 1166} 1167__DEVICE__ unsigned int __vseteq2(unsigned int __a, unsigned int __b) { 1168 unsigned int r; 1169 asm("vset2.u32.u32.eq %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); 1170 return r; 1171} 1172__DEVICE__ unsigned int __vcmpeq2(unsigned int __a, unsigned int __b) { 1173 return __bool2mask(__vseteq2(__a, __b), 16); 1174} 1175__DEVICE__ unsigned int __vseteq4(unsigned int __a, unsigned int __b) { 1176 unsigned int r; 1177 asm("vset4.u32.u32.eq %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); 1178 return r; 1179} 1180__DEVICE__ unsigned int __vcmpeq4(unsigned int __a, unsigned int __b) { 1181 return __bool2mask(__vseteq4(__a, __b), 8); 1182} 1183__DEVICE__ unsigned int __vsetges2(unsigned int __a, unsigned int __b) { 1184 unsigned int r; 1185 asm("vset2.s32.s32.ge %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); 1186 return r; 1187} 1188__DEVICE__ unsigned int __vcmpges2(unsigned int __a, unsigned int __b) { 1189 return __bool2mask(__vsetges2(__a, __b), 16); 1190} 1191__DEVICE__ unsigned int __vsetges4(unsigned int __a, unsigned int __b) { 1192 unsigned int r; 1193 asm("vset4.s32.s32.ge %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); 1194 return r; 1195} 1196__DEVICE__ unsigned int __vcmpges4(unsigned int __a, unsigned int __b) { 1197 return __bool2mask(__vsetges4(__a, __b), 8); 1198} 1199__DEVICE__ unsigned int __vsetgeu2(unsigned int __a, unsigned int __b) { 1200 unsigned int r; 1201 asm("vset2.u32.u32.ge %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); 1202 return r; 1203} 1204__DEVICE__ unsigned int __vcmpgeu2(unsigned int __a, unsigned int __b) { 1205 return __bool2mask(__vsetgeu2(__a, __b), 16); 1206} 1207__DEVICE__ unsigned int __vsetgeu4(unsigned int __a, unsigned int __b) { 1208 unsigned int r; 1209 asm("vset4.u32.u32.ge %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); 1210 return r; 1211} 1212__DEVICE__ unsigned int __vcmpgeu4(unsigned int __a, unsigned int __b) { 1213 return __bool2mask(__vsetgeu4(__a, __b), 8); 1214} 1215__DEVICE__ unsigned int __vsetgts2(unsigned int __a, unsigned int __b) { 1216 unsigned int r; 1217 asm("vset2.s32.s32.gt %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); 1218 return r; 1219} 1220__DEVICE__ unsigned int __vcmpgts2(unsigned int __a, unsigned int __b) { 1221 return __bool2mask(__vsetgts2(__a, __b), 16); 1222} 1223__DEVICE__ unsigned int __vsetgts4(unsigned int __a, unsigned int __b) { 1224 unsigned int r; 1225 asm("vset4.s32.s32.gt %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); 1226 return r; 1227} 1228__DEVICE__ unsigned int __vcmpgts4(unsigned int __a, unsigned int __b) { 1229 return __bool2mask(__vsetgts4(__a, __b), 8); 1230} 1231__DEVICE__ unsigned int __vsetgtu2(unsigned int __a, unsigned int __b) { 1232 unsigned int r; 1233 asm("vset2.u32.u32.gt %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); 1234 return r; 1235} 1236__DEVICE__ unsigned int __vcmpgtu2(unsigned int __a, unsigned int __b) { 1237 return __bool2mask(__vsetgtu2(__a, __b), 16); 1238} 1239__DEVICE__ unsigned int __vsetgtu4(unsigned int __a, unsigned int __b) { 1240 unsigned int r; 1241 asm("vset4.u32.u32.gt %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); 1242 return r; 1243} 1244__DEVICE__ unsigned int __vcmpgtu4(unsigned int __a, unsigned int __b) { 1245 return __bool2mask(__vsetgtu4(__a, __b), 8); 1246} 1247__DEVICE__ unsigned int __vsetles2(unsigned int __a, unsigned int __b) { 1248 unsigned int r; 1249 asm("vset2.s32.s32.le %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); 1250 return r; 1251} 1252__DEVICE__ unsigned int __vcmples2(unsigned int __a, unsigned int __b) { 1253 return __bool2mask(__vsetles2(__a, __b), 16); 1254} 1255__DEVICE__ unsigned int __vsetles4(unsigned int __a, unsigned int __b) { 1256 unsigned int r; 1257 asm("vset4.s32.s32.le %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); 1258 return r; 1259} 1260__DEVICE__ unsigned int __vcmples4(unsigned int __a, unsigned int __b) { 1261 return __bool2mask(__vsetles4(__a, __b), 8); 1262} 1263__DEVICE__ unsigned int __vsetleu2(unsigned int __a, unsigned int __b) { 1264 unsigned int r; 1265 asm("vset2.u32.u32.le %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); 1266 return r; 1267} 1268__DEVICE__ unsigned int __vcmpleu2(unsigned int __a, unsigned int __b) { 1269 return __bool2mask(__vsetleu2(__a, __b), 16); 1270} 1271__DEVICE__ unsigned int __vsetleu4(unsigned int __a, unsigned int __b) { 1272 unsigned int r; 1273 asm("vset4.u32.u32.le %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); 1274 return r; 1275} 1276__DEVICE__ unsigned int __vcmpleu4(unsigned int __a, unsigned int __b) { 1277 return __bool2mask(__vsetleu4(__a, __b), 8); 1278} 1279__DEVICE__ unsigned int __vsetlts2(unsigned int __a, unsigned int __b) { 1280 unsigned int r; 1281 asm("vset2.s32.s32.lt %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); 1282 return r; 1283} 1284__DEVICE__ unsigned int __vcmplts2(unsigned int __a, unsigned int __b) { 1285 return __bool2mask(__vsetlts2(__a, __b), 16); 1286} 1287__DEVICE__ unsigned int __vsetlts4(unsigned int __a, unsigned int __b) { 1288 unsigned int r; 1289 asm("vset4.s32.s32.lt %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); 1290 return r; 1291} 1292__DEVICE__ unsigned int __vcmplts4(unsigned int __a, unsigned int __b) { 1293 return __bool2mask(__vsetlts4(__a, __b), 8); 1294} 1295__DEVICE__ unsigned int __vsetltu2(unsigned int __a, unsigned int __b) { 1296 unsigned int r; 1297 asm("vset2.u32.u32.lt %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); 1298 return r; 1299} 1300__DEVICE__ unsigned int __vcmpltu2(unsigned int __a, unsigned int __b) { 1301 return __bool2mask(__vsetltu2(__a, __b), 16); 1302} 1303__DEVICE__ unsigned int __vsetltu4(unsigned int __a, unsigned int __b) { 1304 unsigned int r; 1305 asm("vset4.u32.u32.lt %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); 1306 return r; 1307} 1308__DEVICE__ unsigned int __vcmpltu4(unsigned int __a, unsigned int __b) { 1309 return __bool2mask(__vsetltu4(__a, __b), 8); 1310} 1311__DEVICE__ unsigned int __vsetne2(unsigned int __a, unsigned int __b) { 1312 unsigned int r; 1313 asm("vset2.u32.u32.ne %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); 1314 return r; 1315} 1316__DEVICE__ unsigned int __vcmpne2(unsigned int __a, unsigned int __b) { 1317 return __bool2mask(__vsetne2(__a, __b), 16); 1318} 1319__DEVICE__ unsigned int __vsetne4(unsigned int __a, unsigned int __b) { 1320 unsigned int r; 1321 asm("vset4.u32.u32.ne %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); 1322 return r; 1323} 1324__DEVICE__ unsigned int __vcmpne4(unsigned int __a, unsigned int __b) { 1325 return __bool2mask(__vsetne4(__a, __b), 8); 1326} 1327 1328// Based on ITEM 23 in AIM-239: http://dspace.mit.edu/handle/1721.1/6086 1329// (a & b) + (a | b) = a + b = (a ^ b) + 2 * (a & b) => 1330// (a + b) / 2 = ((a ^ b) >> 1) + (a & b) 1331// To operate on multiple sub-elements we need to make sure to mask out bits 1332// that crossed over into adjacent elements during the shift. 1333__DEVICE__ unsigned int __vhaddu2(unsigned int __a, unsigned int __b) { 1334 return (((__a ^ __b) >> 1) & ~0x80008000u) + (__a & __b); 1335} 1336__DEVICE__ unsigned int __vhaddu4(unsigned int __a, unsigned int __b) { 1337 return (((__a ^ __b) >> 1) & ~0x80808080u) + (__a & __b); 1338} 1339 1340__DEVICE__ unsigned int __vmaxs2(unsigned int __a, unsigned int __b) { 1341 unsigned int r; 1342 if ((__a & 0x8000) && (__b & 0x8000)) { 1343 // Work around a bug in ptxas which produces invalid result if low element 1344 // is negative. 1345 unsigned mask = __vcmpgts2(__a, __b); 1346 r = (__a & mask) | (__b & ~mask); 1347 } else { 1348 asm("vmax2.s32.s32.s32 %0,%1,%2,%3;" 1349 : "=r"(r) 1350 : "r"(__a), "r"(__b), "r"(0)); 1351 } 1352 return r; 1353} 1354__DEVICE__ unsigned int __vmaxs4(unsigned int __a, unsigned int __b) { 1355 unsigned int r; 1356 asm("vmax4.s32.s32.s32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); 1357 return r; 1358} 1359__DEVICE__ unsigned int __vmaxu2(unsigned int __a, unsigned int __b) { 1360 unsigned int r; 1361 asm("vmax2.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); 1362 return r; 1363} 1364__DEVICE__ unsigned int __vmaxu4(unsigned int __a, unsigned int __b) { 1365 unsigned int r; 1366 asm("vmax4.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); 1367 return r; 1368} 1369__DEVICE__ unsigned int __vmins2(unsigned int __a, unsigned int __b) { 1370 unsigned int r; 1371 asm("vmin2.s32.s32.s32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); 1372 return r; 1373} 1374__DEVICE__ unsigned int __vmins4(unsigned int __a, unsigned int __b) { 1375 unsigned int r; 1376 asm("vmin4.s32.s32.s32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); 1377 return r; 1378} 1379__DEVICE__ unsigned int __vminu2(unsigned int __a, unsigned int __b) { 1380 unsigned int r; 1381 asm("vmin2.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); 1382 return r; 1383} 1384__DEVICE__ unsigned int __vminu4(unsigned int __a, unsigned int __b) { 1385 unsigned int r; 1386 asm("vmin4.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); 1387 return r; 1388} 1389__DEVICE__ unsigned int __vsads2(unsigned int __a, unsigned int __b) { 1390 unsigned int r; 1391 asm("vabsdiff2.s32.s32.s32.add %0,%1,%2,%3;" 1392 : "=r"(r) 1393 : "r"(__a), "r"(__b), "r"(0)); 1394 return r; 1395} 1396__DEVICE__ unsigned int __vsads4(unsigned int __a, unsigned int __b) { 1397 unsigned int r; 1398 asm("vabsdiff4.s32.s32.s32.add %0,%1,%2,%3;" 1399 : "=r"(r) 1400 : "r"(__a), "r"(__b), "r"(0)); 1401 return r; 1402} 1403__DEVICE__ unsigned int __vsadu2(unsigned int __a, unsigned int __b) { 1404 unsigned int r; 1405 asm("vabsdiff2.u32.u32.u32.add %0,%1,%2,%3;" 1406 : "=r"(r) 1407 : "r"(__a), "r"(__b), "r"(0)); 1408 return r; 1409} 1410__DEVICE__ unsigned int __vsadu4(unsigned int __a, unsigned int __b) { 1411 unsigned int r; 1412 asm("vabsdiff4.u32.u32.u32.add %0,%1,%2,%3;" 1413 : "=r"(r) 1414 : "r"(__a), "r"(__b), "r"(0)); 1415 return r; 1416} 1417 1418__DEVICE__ unsigned int __vsub2(unsigned int __a, unsigned int __b) { 1419 unsigned int r; 1420 asm("vsub2.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); 1421 return r; 1422} 1423__DEVICE__ unsigned int __vneg2(unsigned int __a) { return __vsub2(0, __a); } 1424 1425__DEVICE__ unsigned int __vsub4(unsigned int __a, unsigned int __b) { 1426 unsigned int r; 1427 asm("vsub4.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); 1428 return r; 1429} 1430__DEVICE__ unsigned int __vneg4(unsigned int __a) { return __vsub4(0, __a); } 1431__DEVICE__ unsigned int __vsubss2(unsigned int __a, unsigned int __b) { 1432 unsigned int r; 1433 asm("vsub2.s32.s32.s32.sat %0,%1,%2,%3;" 1434 : "=r"(r) 1435 : "r"(__a), "r"(__b), "r"(0)); 1436 return r; 1437} 1438__DEVICE__ unsigned int __vnegss2(unsigned int __a) { 1439 return __vsubss2(0, __a); 1440} 1441__DEVICE__ unsigned int __vsubss4(unsigned int __a, unsigned int __b) { 1442 unsigned int r; 1443 asm("vsub4.s32.s32.s32.sat %0,%1,%2,%3;" 1444 : "=r"(r) 1445 : "r"(__a), "r"(__b), "r"(0)); 1446 return r; 1447} 1448__DEVICE__ unsigned int __vnegss4(unsigned int __a) { 1449 return __vsubss4(0, __a); 1450} 1451__DEVICE__ unsigned int __vsubus2(unsigned int __a, unsigned int __b) { 1452 unsigned int r; 1453 asm("vsub2.u32.u32.u32.sat %0,%1,%2,%3;" 1454 : "=r"(r) 1455 : "r"(__a), "r"(__b), "r"(0)); 1456 return r; 1457} 1458__DEVICE__ unsigned int __vsubus4(unsigned int __a, unsigned int __b) { 1459 unsigned int r; 1460 asm("vsub4.u32.u32.u32.sat %0,%1,%2,%3;" 1461 : "=r"(r) 1462 : "r"(__a), "r"(__b), "r"(0)); 1463 return r; 1464} 1465#endif // CUDA_VERSION >= 9020 1466 1467// For OpenMP we require the user to include <time.h> as we need to know what 1468// clock_t is on the system. 1469#ifndef __OPENMP_NVPTX__ 1470__DEVICE__ /* clock_t= */ int clock() { return __nvvm_read_ptx_sreg_clock(); } 1471#endif 1472__DEVICE__ long long clock64() { return __nvvm_read_ptx_sreg_clock64(); } 1473 1474// These functions shouldn't be declared when including this header 1475// for math function resolution purposes. 1476#ifndef __OPENMP_NVPTX__ 1477__DEVICE__ void *memcpy(void *__a, const void *__b, size_t __c) { 1478 return __builtin_memcpy(__a, __b, __c); 1479} 1480__DEVICE__ void *memset(void *__a, int __b, size_t __c) { 1481 return __builtin_memset(__a, __b, __c); 1482} 1483#endif 1484 1485#pragma pop_macro("__DEVICE__") 1486#endif // __CLANG_CUDA_DEVICE_FUNCTIONS_H__ 1487