1/*===---- __clang_hip_math.h - Device-side HIP math support ----------------=== 2 * 3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 * See https://llvm.org/LICENSE.txt for license information. 5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 * 7 *===-----------------------------------------------------------------------=== 8 */ 9#ifndef __CLANG_HIP_MATH_H__ 10#define __CLANG_HIP_MATH_H__ 11 12#if !defined(__HIP__) && !defined(__OPENMP_AMDGCN__) 13#error "This file is for HIP and OpenMP AMDGCN device compilation only." 14#endif 15 16#if !defined(__HIPCC_RTC__) 17#include <limits.h> 18#include <stdint.h> 19#ifdef __OPENMP_AMDGCN__ 20#include <omp.h> 21#endif 22#endif // !defined(__HIPCC_RTC__) 23 24#pragma push_macro("__DEVICE__") 25 26#ifdef __OPENMP_AMDGCN__ 27#define __DEVICE__ static inline __attribute__((always_inline, nothrow)) 28#else 29#define __DEVICE__ static __device__ inline __attribute__((always_inline)) 30#endif 31 32// Device library provides fast low precision and slow full-recision 33// implementations for some functions. Which one gets selected depends on 34// __CLANG_GPU_APPROX_TRANSCENDENTALS__ which gets defined by clang if 35// -ffast-math or -fgpu-approx-transcendentals are in effect. 36#pragma push_macro("__FAST_OR_SLOW") 37#if defined(__CLANG_GPU_APPROX_TRANSCENDENTALS__) 38#define __FAST_OR_SLOW(fast, slow) fast 39#else 40#define __FAST_OR_SLOW(fast, slow) slow 41#endif 42 43// A few functions return bool type starting only in C++11. 44#pragma push_macro("__RETURN_TYPE") 45#ifdef __OPENMP_AMDGCN__ 46#define __RETURN_TYPE int 47#else 48#if defined(__cplusplus) 49#define __RETURN_TYPE bool 50#else 51#define __RETURN_TYPE int 52#endif 53#endif // __OPENMP_AMDGCN__ 54 55#if defined (__cplusplus) && __cplusplus < 201103L 56// emulate static_assert on type sizes 57template<bool> 58struct __compare_result{}; 59template<> 60struct __compare_result<true> { 61 static const __device__ bool valid; 62}; 63 64__DEVICE__ 65void __suppress_unused_warning(bool b){}; 66template <unsigned int S, unsigned int T> 67__DEVICE__ void __static_assert_equal_size() { 68 __suppress_unused_warning(__compare_result<S == T>::valid); 69} 70 71#define __static_assert_type_size_equal(A, B) \ 72 __static_assert_equal_size<A,B>() 73 74#else 75#define __static_assert_type_size_equal(A,B) \ 76 static_assert((A) == (B), "") 77 78#endif 79 80__DEVICE__ 81uint64_t __make_mantissa_base8(const char *__tagp __attribute__((nonnull))) { 82 uint64_t __r = 0; 83 while (*__tagp != '\0') { 84 char __tmp = *__tagp; 85 86 if (__tmp >= '0' && __tmp <= '7') 87 __r = (__r * 8u) + __tmp - '0'; 88 else 89 return 0; 90 91 ++__tagp; 92 } 93 94 return __r; 95} 96 97__DEVICE__ 98uint64_t __make_mantissa_base10(const char *__tagp __attribute__((nonnull))) { 99 uint64_t __r = 0; 100 while (*__tagp != '\0') { 101 char __tmp = *__tagp; 102 103 if (__tmp >= '0' && __tmp <= '9') 104 __r = (__r * 10u) + __tmp - '0'; 105 else 106 return 0; 107 108 ++__tagp; 109 } 110 111 return __r; 112} 113 114__DEVICE__ 115uint64_t __make_mantissa_base16(const char *__tagp __attribute__((nonnull))) { 116 uint64_t __r = 0; 117 while (*__tagp != '\0') { 118 char __tmp = *__tagp; 119 120 if (__tmp >= '0' && __tmp <= '9') 121 __r = (__r * 16u) + __tmp - '0'; 122 else if (__tmp >= 'a' && __tmp <= 'f') 123 __r = (__r * 16u) + __tmp - 'a' + 10; 124 else if (__tmp >= 'A' && __tmp <= 'F') 125 __r = (__r * 16u) + __tmp - 'A' + 10; 126 else 127 return 0; 128 129 ++__tagp; 130 } 131 132 return __r; 133} 134 135__DEVICE__ 136uint64_t __make_mantissa(const char *__tagp __attribute__((nonnull))) { 137 if (*__tagp == '0') { 138 ++__tagp; 139 140 if (*__tagp == 'x' || *__tagp == 'X') 141 return __make_mantissa_base16(__tagp); 142 else 143 return __make_mantissa_base8(__tagp); 144 } 145 146 return __make_mantissa_base10(__tagp); 147} 148 149// BEGIN FLOAT 150 151// BEGIN INTRINSICS 152 153__DEVICE__ 154float __cosf(float __x) { return __ocml_native_cos_f32(__x); } 155 156__DEVICE__ 157float __exp10f(float __x) { 158 const float __log2_10 = 0x1.a934f0p+1f; 159 return __builtin_amdgcn_exp2f(__log2_10 * __x); 160} 161 162__DEVICE__ 163float __expf(float __x) { 164 const float __log2_e = 0x1.715476p+0; 165 return __builtin_amdgcn_exp2f(__log2_e * __x); 166} 167 168#if defined OCML_BASIC_ROUNDED_OPERATIONS 169__DEVICE__ 170float __fadd_rd(float __x, float __y) { return __ocml_add_rtn_f32(__x, __y); } 171__DEVICE__ 172float __fadd_rn(float __x, float __y) { return __ocml_add_rte_f32(__x, __y); } 173__DEVICE__ 174float __fadd_ru(float __x, float __y) { return __ocml_add_rtp_f32(__x, __y); } 175__DEVICE__ 176float __fadd_rz(float __x, float __y) { return __ocml_add_rtz_f32(__x, __y); } 177#else 178__DEVICE__ 179float __fadd_rn(float __x, float __y) { return __x + __y; } 180#endif 181 182#if defined OCML_BASIC_ROUNDED_OPERATIONS 183__DEVICE__ 184float __fdiv_rd(float __x, float __y) { return __ocml_div_rtn_f32(__x, __y); } 185__DEVICE__ 186float __fdiv_rn(float __x, float __y) { return __ocml_div_rte_f32(__x, __y); } 187__DEVICE__ 188float __fdiv_ru(float __x, float __y) { return __ocml_div_rtp_f32(__x, __y); } 189__DEVICE__ 190float __fdiv_rz(float __x, float __y) { return __ocml_div_rtz_f32(__x, __y); } 191#else 192__DEVICE__ 193float __fdiv_rn(float __x, float __y) { return __x / __y; } 194#endif 195 196__DEVICE__ 197float __fdividef(float __x, float __y) { return __x / __y; } 198 199#if defined OCML_BASIC_ROUNDED_OPERATIONS 200__DEVICE__ 201float __fmaf_rd(float __x, float __y, float __z) { 202 return __ocml_fma_rtn_f32(__x, __y, __z); 203} 204__DEVICE__ 205float __fmaf_rn(float __x, float __y, float __z) { 206 return __ocml_fma_rte_f32(__x, __y, __z); 207} 208__DEVICE__ 209float __fmaf_ru(float __x, float __y, float __z) { 210 return __ocml_fma_rtp_f32(__x, __y, __z); 211} 212__DEVICE__ 213float __fmaf_rz(float __x, float __y, float __z) { 214 return __ocml_fma_rtz_f32(__x, __y, __z); 215} 216#else 217__DEVICE__ 218float __fmaf_rn(float __x, float __y, float __z) { 219 return __builtin_fmaf(__x, __y, __z); 220} 221#endif 222 223#if defined OCML_BASIC_ROUNDED_OPERATIONS 224__DEVICE__ 225float __fmul_rd(float __x, float __y) { return __ocml_mul_rtn_f32(__x, __y); } 226__DEVICE__ 227float __fmul_rn(float __x, float __y) { return __ocml_mul_rte_f32(__x, __y); } 228__DEVICE__ 229float __fmul_ru(float __x, float __y) { return __ocml_mul_rtp_f32(__x, __y); } 230__DEVICE__ 231float __fmul_rz(float __x, float __y) { return __ocml_mul_rtz_f32(__x, __y); } 232#else 233__DEVICE__ 234float __fmul_rn(float __x, float __y) { return __x * __y; } 235#endif 236 237#if defined OCML_BASIC_ROUNDED_OPERATIONS 238__DEVICE__ 239float __frcp_rd(float __x) { return __ocml_div_rtn_f32(1.0f, __x); } 240__DEVICE__ 241float __frcp_rn(float __x) { return __ocml_div_rte_f32(1.0f, __x); } 242__DEVICE__ 243float __frcp_ru(float __x) { return __ocml_div_rtp_f32(1.0f, __x); } 244__DEVICE__ 245float __frcp_rz(float __x) { return __ocml_div_rtz_f32(1.0f, __x); } 246#else 247__DEVICE__ 248float __frcp_rn(float __x) { return 1.0f / __x; } 249#endif 250 251__DEVICE__ 252float __frsqrt_rn(float __x) { return __builtin_amdgcn_rsqf(__x); } 253 254#if defined OCML_BASIC_ROUNDED_OPERATIONS 255__DEVICE__ 256float __fsqrt_rd(float __x) { return __ocml_sqrt_rtn_f32(__x); } 257__DEVICE__ 258float __fsqrt_rn(float __x) { return __ocml_sqrt_rte_f32(__x); } 259__DEVICE__ 260float __fsqrt_ru(float __x) { return __ocml_sqrt_rtp_f32(__x); } 261__DEVICE__ 262float __fsqrt_rz(float __x) { return __ocml_sqrt_rtz_f32(__x); } 263#else 264__DEVICE__ 265float __fsqrt_rn(float __x) { return __ocml_native_sqrt_f32(__x); } 266#endif 267 268#if defined OCML_BASIC_ROUNDED_OPERATIONS 269__DEVICE__ 270float __fsub_rd(float __x, float __y) { return __ocml_sub_rtn_f32(__x, __y); } 271__DEVICE__ 272float __fsub_rn(float __x, float __y) { return __ocml_sub_rte_f32(__x, __y); } 273__DEVICE__ 274float __fsub_ru(float __x, float __y) { return __ocml_sub_rtp_f32(__x, __y); } 275__DEVICE__ 276float __fsub_rz(float __x, float __y) { return __ocml_sub_rtz_f32(__x, __y); } 277#else 278__DEVICE__ 279float __fsub_rn(float __x, float __y) { return __x - __y; } 280#endif 281 282__DEVICE__ 283float __log10f(float __x) { return __builtin_log10f(__x); } 284 285__DEVICE__ 286float __log2f(float __x) { return __builtin_amdgcn_logf(__x); } 287 288__DEVICE__ 289float __logf(float __x) { return __builtin_logf(__x); } 290 291__DEVICE__ 292float __powf(float __x, float __y) { return __ocml_pow_f32(__x, __y); } 293 294__DEVICE__ 295float __saturatef(float __x) { return (__x < 0) ? 0 : ((__x > 1) ? 1 : __x); } 296 297__DEVICE__ 298void __sincosf(float __x, float *__sinptr, float *__cosptr) { 299 *__sinptr = __ocml_native_sin_f32(__x); 300 *__cosptr = __ocml_native_cos_f32(__x); 301} 302 303__DEVICE__ 304float __sinf(float __x) { return __ocml_native_sin_f32(__x); } 305 306__DEVICE__ 307float __tanf(float __x) { 308 return __sinf(__x) * __builtin_amdgcn_rcpf(__cosf(__x)); 309} 310// END INTRINSICS 311 312#if defined(__cplusplus) 313__DEVICE__ 314int abs(int __x) { 315 return __builtin_abs(__x); 316} 317__DEVICE__ 318long labs(long __x) { 319 return __builtin_labs(__x); 320} 321__DEVICE__ 322long long llabs(long long __x) { 323 return __builtin_llabs(__x); 324} 325#endif 326 327__DEVICE__ 328float acosf(float __x) { return __ocml_acos_f32(__x); } 329 330__DEVICE__ 331float acoshf(float __x) { return __ocml_acosh_f32(__x); } 332 333__DEVICE__ 334float asinf(float __x) { return __ocml_asin_f32(__x); } 335 336__DEVICE__ 337float asinhf(float __x) { return __ocml_asinh_f32(__x); } 338 339__DEVICE__ 340float atan2f(float __x, float __y) { return __ocml_atan2_f32(__x, __y); } 341 342__DEVICE__ 343float atanf(float __x) { return __ocml_atan_f32(__x); } 344 345__DEVICE__ 346float atanhf(float __x) { return __ocml_atanh_f32(__x); } 347 348__DEVICE__ 349float cbrtf(float __x) { return __ocml_cbrt_f32(__x); } 350 351__DEVICE__ 352float ceilf(float __x) { return __builtin_ceilf(__x); } 353 354__DEVICE__ 355float copysignf(float __x, float __y) { return __builtin_copysignf(__x, __y); } 356 357__DEVICE__ 358float cosf(float __x) { return __FAST_OR_SLOW(__cosf, __ocml_cos_f32)(__x); } 359 360__DEVICE__ 361float coshf(float __x) { return __ocml_cosh_f32(__x); } 362 363__DEVICE__ 364float cospif(float __x) { return __ocml_cospi_f32(__x); } 365 366__DEVICE__ 367float cyl_bessel_i0f(float __x) { return __ocml_i0_f32(__x); } 368 369__DEVICE__ 370float cyl_bessel_i1f(float __x) { return __ocml_i1_f32(__x); } 371 372__DEVICE__ 373float erfcf(float __x) { return __ocml_erfc_f32(__x); } 374 375__DEVICE__ 376float erfcinvf(float __x) { return __ocml_erfcinv_f32(__x); } 377 378__DEVICE__ 379float erfcxf(float __x) { return __ocml_erfcx_f32(__x); } 380 381__DEVICE__ 382float erff(float __x) { return __ocml_erf_f32(__x); } 383 384__DEVICE__ 385float erfinvf(float __x) { return __ocml_erfinv_f32(__x); } 386 387__DEVICE__ 388float exp10f(float __x) { return __ocml_exp10_f32(__x); } 389 390__DEVICE__ 391float exp2f(float __x) { return __builtin_exp2f(__x); } 392 393__DEVICE__ 394float expf(float __x) { return __builtin_expf(__x); } 395 396__DEVICE__ 397float expm1f(float __x) { return __ocml_expm1_f32(__x); } 398 399__DEVICE__ 400float fabsf(float __x) { return __builtin_fabsf(__x); } 401 402__DEVICE__ 403float fdimf(float __x, float __y) { return __ocml_fdim_f32(__x, __y); } 404 405__DEVICE__ 406float fdividef(float __x, float __y) { return __x / __y; } 407 408__DEVICE__ 409float floorf(float __x) { return __builtin_floorf(__x); } 410 411__DEVICE__ 412float fmaf(float __x, float __y, float __z) { 413 return __builtin_fmaf(__x, __y, __z); 414} 415 416__DEVICE__ 417float fmaxf(float __x, float __y) { return __builtin_fmaxf(__x, __y); } 418 419__DEVICE__ 420float fminf(float __x, float __y) { return __builtin_fminf(__x, __y); } 421 422__DEVICE__ 423float fmodf(float __x, float __y) { return __ocml_fmod_f32(__x, __y); } 424 425__DEVICE__ 426float frexpf(float __x, int *__nptr) { 427 return __builtin_frexpf(__x, __nptr); 428} 429 430__DEVICE__ 431float hypotf(float __x, float __y) { return __ocml_hypot_f32(__x, __y); } 432 433__DEVICE__ 434int ilogbf(float __x) { return __ocml_ilogb_f32(__x); } 435 436__DEVICE__ 437__RETURN_TYPE __finitef(float __x) { return __builtin_isfinite(__x); } 438 439__DEVICE__ 440__RETURN_TYPE __isinff(float __x) { return __builtin_isinf(__x); } 441 442__DEVICE__ 443__RETURN_TYPE __isnanf(float __x) { return __builtin_isnan(__x); } 444 445__DEVICE__ 446float j0f(float __x) { return __ocml_j0_f32(__x); } 447 448__DEVICE__ 449float j1f(float __x) { return __ocml_j1_f32(__x); } 450 451__DEVICE__ 452float jnf(int __n, float __x) { // TODO: we could use Ahmes multiplication 453 // and the Miller & Brown algorithm 454 // for linear recurrences to get O(log n) steps, but it's unclear if 455 // it'd be beneficial in this case. 456 if (__n == 0) 457 return j0f(__x); 458 if (__n == 1) 459 return j1f(__x); 460 461 float __x0 = j0f(__x); 462 float __x1 = j1f(__x); 463 for (int __i = 1; __i < __n; ++__i) { 464 float __x2 = (2 * __i) / __x * __x1 - __x0; 465 __x0 = __x1; 466 __x1 = __x2; 467 } 468 469 return __x1; 470} 471 472__DEVICE__ 473float ldexpf(float __x, int __e) { return __builtin_amdgcn_ldexpf(__x, __e); } 474 475__DEVICE__ 476float lgammaf(float __x) { return __ocml_lgamma_f32(__x); } 477 478__DEVICE__ 479long long int llrintf(float __x) { return __builtin_rintf(__x); } 480 481__DEVICE__ 482long long int llroundf(float __x) { return __builtin_roundf(__x); } 483 484__DEVICE__ 485float log10f(float __x) { return __builtin_log10f(__x); } 486 487__DEVICE__ 488float log1pf(float __x) { return __ocml_log1p_f32(__x); } 489 490__DEVICE__ 491float log2f(float __x) { return __FAST_OR_SLOW(__log2f, __ocml_log2_f32)(__x); } 492 493__DEVICE__ 494float logbf(float __x) { return __ocml_logb_f32(__x); } 495 496__DEVICE__ 497float logf(float __x) { return __FAST_OR_SLOW(__logf, __ocml_log_f32)(__x); } 498 499__DEVICE__ 500long int lrintf(float __x) { return __builtin_rintf(__x); } 501 502__DEVICE__ 503long int lroundf(float __x) { return __builtin_roundf(__x); } 504 505__DEVICE__ 506float modff(float __x, float *__iptr) { 507 float __tmp; 508#ifdef __OPENMP_AMDGCN__ 509#pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc) 510#endif 511 float __r = 512 __ocml_modf_f32(__x, (__attribute__((address_space(5))) float *)&__tmp); 513 *__iptr = __tmp; 514 return __r; 515} 516 517__DEVICE__ 518float nanf(const char *__tagp __attribute__((nonnull))) { 519 union { 520 float val; 521 struct ieee_float { 522 unsigned int mantissa : 22; 523 unsigned int quiet : 1; 524 unsigned int exponent : 8; 525 unsigned int sign : 1; 526 } bits; 527 } __tmp; 528 __static_assert_type_size_equal(sizeof(__tmp.val), sizeof(__tmp.bits)); 529 530 __tmp.bits.sign = 0u; 531 __tmp.bits.exponent = ~0u; 532 __tmp.bits.quiet = 1u; 533 __tmp.bits.mantissa = __make_mantissa(__tagp); 534 535 return __tmp.val; 536} 537 538__DEVICE__ 539float nearbyintf(float __x) { return __builtin_nearbyintf(__x); } 540 541__DEVICE__ 542float nextafterf(float __x, float __y) { 543 return __ocml_nextafter_f32(__x, __y); 544} 545 546__DEVICE__ 547float norm3df(float __x, float __y, float __z) { 548 return __ocml_len3_f32(__x, __y, __z); 549} 550 551__DEVICE__ 552float norm4df(float __x, float __y, float __z, float __w) { 553 return __ocml_len4_f32(__x, __y, __z, __w); 554} 555 556__DEVICE__ 557float normcdff(float __x) { return __ocml_ncdf_f32(__x); } 558 559__DEVICE__ 560float normcdfinvf(float __x) { return __ocml_ncdfinv_f32(__x); } 561 562__DEVICE__ 563float normf(int __dim, 564 const float *__a) { // TODO: placeholder until OCML adds support. 565 float __r = 0; 566 while (__dim--) { 567 __r += __a[0] * __a[0]; 568 ++__a; 569 } 570 571 return __builtin_sqrtf(__r); 572} 573 574__DEVICE__ 575float powf(float __x, float __y) { return __ocml_pow_f32(__x, __y); } 576 577__DEVICE__ 578float powif(float __x, int __y) { return __ocml_pown_f32(__x, __y); } 579 580__DEVICE__ 581float rcbrtf(float __x) { return __ocml_rcbrt_f32(__x); } 582 583__DEVICE__ 584float remainderf(float __x, float __y) { 585 return __ocml_remainder_f32(__x, __y); 586} 587 588__DEVICE__ 589float remquof(float __x, float __y, int *__quo) { 590 int __tmp; 591#ifdef __OPENMP_AMDGCN__ 592#pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc) 593#endif 594 float __r = __ocml_remquo_f32( 595 __x, __y, (__attribute__((address_space(5))) int *)&__tmp); 596 *__quo = __tmp; 597 598 return __r; 599} 600 601__DEVICE__ 602float rhypotf(float __x, float __y) { return __ocml_rhypot_f32(__x, __y); } 603 604__DEVICE__ 605float rintf(float __x) { return __builtin_rintf(__x); } 606 607__DEVICE__ 608float rnorm3df(float __x, float __y, float __z) { 609 return __ocml_rlen3_f32(__x, __y, __z); 610} 611 612__DEVICE__ 613float rnorm4df(float __x, float __y, float __z, float __w) { 614 return __ocml_rlen4_f32(__x, __y, __z, __w); 615} 616 617__DEVICE__ 618float rnormf(int __dim, 619 const float *__a) { // TODO: placeholder until OCML adds support. 620 float __r = 0; 621 while (__dim--) { 622 __r += __a[0] * __a[0]; 623 ++__a; 624 } 625 626 return __ocml_rsqrt_f32(__r); 627} 628 629__DEVICE__ 630float roundf(float __x) { return __builtin_roundf(__x); } 631 632__DEVICE__ 633float rsqrtf(float __x) { return __ocml_rsqrt_f32(__x); } 634 635__DEVICE__ 636float scalblnf(float __x, long int __n) { 637 return (__n < INT_MAX) ? __builtin_amdgcn_ldexpf(__x, __n) 638 : __ocml_scalb_f32(__x, __n); 639} 640 641__DEVICE__ 642float scalbnf(float __x, int __n) { return __builtin_amdgcn_ldexpf(__x, __n); } 643 644__DEVICE__ 645__RETURN_TYPE __signbitf(float __x) { return __builtin_signbitf(__x); } 646 647__DEVICE__ 648void sincosf(float __x, float *__sinptr, float *__cosptr) { 649 float __tmp; 650#ifdef __OPENMP_AMDGCN__ 651#pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc) 652#endif 653#ifdef __CLANG_CUDA_APPROX_TRANSCENDENTALS__ 654 __sincosf(__x, __sinptr, __cosptr); 655#else 656 *__sinptr = 657 __ocml_sincos_f32(__x, (__attribute__((address_space(5))) float *)&__tmp); 658 *__cosptr = __tmp; 659#endif 660} 661 662__DEVICE__ 663void sincospif(float __x, float *__sinptr, float *__cosptr) { 664 float __tmp; 665#ifdef __OPENMP_AMDGCN__ 666#pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc) 667#endif 668 *__sinptr = __ocml_sincospi_f32( 669 __x, (__attribute__((address_space(5))) float *)&__tmp); 670 *__cosptr = __tmp; 671} 672 673__DEVICE__ 674float sinf(float __x) { return __FAST_OR_SLOW(__sinf, __ocml_sin_f32)(__x); } 675 676__DEVICE__ 677float sinhf(float __x) { return __ocml_sinh_f32(__x); } 678 679__DEVICE__ 680float sinpif(float __x) { return __ocml_sinpi_f32(__x); } 681 682__DEVICE__ 683float sqrtf(float __x) { return __builtin_sqrtf(__x); } 684 685__DEVICE__ 686float tanf(float __x) { return __ocml_tan_f32(__x); } 687 688__DEVICE__ 689float tanhf(float __x) { return __ocml_tanh_f32(__x); } 690 691__DEVICE__ 692float tgammaf(float __x) { return __ocml_tgamma_f32(__x); } 693 694__DEVICE__ 695float truncf(float __x) { return __builtin_truncf(__x); } 696 697__DEVICE__ 698float y0f(float __x) { return __ocml_y0_f32(__x); } 699 700__DEVICE__ 701float y1f(float __x) { return __ocml_y1_f32(__x); } 702 703__DEVICE__ 704float ynf(int __n, float __x) { // TODO: we could use Ahmes multiplication 705 // and the Miller & Brown algorithm 706 // for linear recurrences to get O(log n) steps, but it's unclear if 707 // it'd be beneficial in this case. Placeholder until OCML adds 708 // support. 709 if (__n == 0) 710 return y0f(__x); 711 if (__n == 1) 712 return y1f(__x); 713 714 float __x0 = y0f(__x); 715 float __x1 = y1f(__x); 716 for (int __i = 1; __i < __n; ++__i) { 717 float __x2 = (2 * __i) / __x * __x1 - __x0; 718 __x0 = __x1; 719 __x1 = __x2; 720 } 721 722 return __x1; 723} 724 725 726// END FLOAT 727 728// BEGIN DOUBLE 729__DEVICE__ 730double acos(double __x) { return __ocml_acos_f64(__x); } 731 732__DEVICE__ 733double acosh(double __x) { return __ocml_acosh_f64(__x); } 734 735__DEVICE__ 736double asin(double __x) { return __ocml_asin_f64(__x); } 737 738__DEVICE__ 739double asinh(double __x) { return __ocml_asinh_f64(__x); } 740 741__DEVICE__ 742double atan(double __x) { return __ocml_atan_f64(__x); } 743 744__DEVICE__ 745double atan2(double __x, double __y) { return __ocml_atan2_f64(__x, __y); } 746 747__DEVICE__ 748double atanh(double __x) { return __ocml_atanh_f64(__x); } 749 750__DEVICE__ 751double cbrt(double __x) { return __ocml_cbrt_f64(__x); } 752 753__DEVICE__ 754double ceil(double __x) { return __builtin_ceil(__x); } 755 756__DEVICE__ 757double copysign(double __x, double __y) { 758 return __builtin_copysign(__x, __y); 759} 760 761__DEVICE__ 762double cos(double __x) { return __ocml_cos_f64(__x); } 763 764__DEVICE__ 765double cosh(double __x) { return __ocml_cosh_f64(__x); } 766 767__DEVICE__ 768double cospi(double __x) { return __ocml_cospi_f64(__x); } 769 770__DEVICE__ 771double cyl_bessel_i0(double __x) { return __ocml_i0_f64(__x); } 772 773__DEVICE__ 774double cyl_bessel_i1(double __x) { return __ocml_i1_f64(__x); } 775 776__DEVICE__ 777double erf(double __x) { return __ocml_erf_f64(__x); } 778 779__DEVICE__ 780double erfc(double __x) { return __ocml_erfc_f64(__x); } 781 782__DEVICE__ 783double erfcinv(double __x) { return __ocml_erfcinv_f64(__x); } 784 785__DEVICE__ 786double erfcx(double __x) { return __ocml_erfcx_f64(__x); } 787 788__DEVICE__ 789double erfinv(double __x) { return __ocml_erfinv_f64(__x); } 790 791__DEVICE__ 792double exp(double __x) { return __ocml_exp_f64(__x); } 793 794__DEVICE__ 795double exp10(double __x) { return __ocml_exp10_f64(__x); } 796 797__DEVICE__ 798double exp2(double __x) { return __ocml_exp2_f64(__x); } 799 800__DEVICE__ 801double expm1(double __x) { return __ocml_expm1_f64(__x); } 802 803__DEVICE__ 804double fabs(double __x) { return __builtin_fabs(__x); } 805 806__DEVICE__ 807double fdim(double __x, double __y) { return __ocml_fdim_f64(__x, __y); } 808 809__DEVICE__ 810double floor(double __x) { return __builtin_floor(__x); } 811 812__DEVICE__ 813double fma(double __x, double __y, double __z) { 814 return __builtin_fma(__x, __y, __z); 815} 816 817__DEVICE__ 818double fmax(double __x, double __y) { return __builtin_fmax(__x, __y); } 819 820__DEVICE__ 821double fmin(double __x, double __y) { return __builtin_fmin(__x, __y); } 822 823__DEVICE__ 824double fmod(double __x, double __y) { return __ocml_fmod_f64(__x, __y); } 825 826__DEVICE__ 827double frexp(double __x, int *__nptr) { 828 return __builtin_frexp(__x, __nptr); 829} 830 831__DEVICE__ 832double hypot(double __x, double __y) { return __ocml_hypot_f64(__x, __y); } 833 834__DEVICE__ 835int ilogb(double __x) { return __ocml_ilogb_f64(__x); } 836 837__DEVICE__ 838__RETURN_TYPE __finite(double __x) { return __builtin_isfinite(__x); } 839 840__DEVICE__ 841__RETURN_TYPE __isinf(double __x) { return __builtin_isinf(__x); } 842 843__DEVICE__ 844__RETURN_TYPE __isnan(double __x) { return __builtin_isnan(__x); } 845 846__DEVICE__ 847double j0(double __x) { return __ocml_j0_f64(__x); } 848 849__DEVICE__ 850double j1(double __x) { return __ocml_j1_f64(__x); } 851 852__DEVICE__ 853double jn(int __n, double __x) { // TODO: we could use Ahmes multiplication 854 // and the Miller & Brown algorithm 855 // for linear recurrences to get O(log n) steps, but it's unclear if 856 // it'd be beneficial in this case. Placeholder until OCML adds 857 // support. 858 if (__n == 0) 859 return j0(__x); 860 if (__n == 1) 861 return j1(__x); 862 863 double __x0 = j0(__x); 864 double __x1 = j1(__x); 865 for (int __i = 1; __i < __n; ++__i) { 866 double __x2 = (2 * __i) / __x * __x1 - __x0; 867 __x0 = __x1; 868 __x1 = __x2; 869 } 870 return __x1; 871} 872 873__DEVICE__ 874double ldexp(double __x, int __e) { return __builtin_amdgcn_ldexp(__x, __e); } 875 876__DEVICE__ 877double lgamma(double __x) { return __ocml_lgamma_f64(__x); } 878 879__DEVICE__ 880long long int llrint(double __x) { return __builtin_rint(__x); } 881 882__DEVICE__ 883long long int llround(double __x) { return __builtin_round(__x); } 884 885__DEVICE__ 886double log(double __x) { return __ocml_log_f64(__x); } 887 888__DEVICE__ 889double log10(double __x) { return __ocml_log10_f64(__x); } 890 891__DEVICE__ 892double log1p(double __x) { return __ocml_log1p_f64(__x); } 893 894__DEVICE__ 895double log2(double __x) { return __ocml_log2_f64(__x); } 896 897__DEVICE__ 898double logb(double __x) { return __ocml_logb_f64(__x); } 899 900__DEVICE__ 901long int lrint(double __x) { return __builtin_rint(__x); } 902 903__DEVICE__ 904long int lround(double __x) { return __builtin_round(__x); } 905 906__DEVICE__ 907double modf(double __x, double *__iptr) { 908 double __tmp; 909#ifdef __OPENMP_AMDGCN__ 910#pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc) 911#endif 912 double __r = 913 __ocml_modf_f64(__x, (__attribute__((address_space(5))) double *)&__tmp); 914 *__iptr = __tmp; 915 916 return __r; 917} 918 919__DEVICE__ 920double nan(const char *__tagp) { 921#if !_WIN32 922 union { 923 double val; 924 struct ieee_double { 925 uint64_t mantissa : 51; 926 uint32_t quiet : 1; 927 uint32_t exponent : 11; 928 uint32_t sign : 1; 929 } bits; 930 } __tmp; 931 __static_assert_type_size_equal(sizeof(__tmp.val), sizeof(__tmp.bits)); 932 933 __tmp.bits.sign = 0u; 934 __tmp.bits.exponent = ~0u; 935 __tmp.bits.quiet = 1u; 936 __tmp.bits.mantissa = __make_mantissa(__tagp); 937 938 return __tmp.val; 939#else 940 __static_assert_type_size_equal(sizeof(uint64_t), sizeof(double)); 941 uint64_t __val = __make_mantissa(__tagp); 942 __val |= 0xFFF << 51; 943 return *reinterpret_cast<double *>(&__val); 944#endif 945} 946 947__DEVICE__ 948double nearbyint(double __x) { return __builtin_nearbyint(__x); } 949 950__DEVICE__ 951double nextafter(double __x, double __y) { 952 return __ocml_nextafter_f64(__x, __y); 953} 954 955__DEVICE__ 956double norm(int __dim, 957 const double *__a) { // TODO: placeholder until OCML adds support. 958 double __r = 0; 959 while (__dim--) { 960 __r += __a[0] * __a[0]; 961 ++__a; 962 } 963 964 return __builtin_sqrt(__r); 965} 966 967__DEVICE__ 968double norm3d(double __x, double __y, double __z) { 969 return __ocml_len3_f64(__x, __y, __z); 970} 971 972__DEVICE__ 973double norm4d(double __x, double __y, double __z, double __w) { 974 return __ocml_len4_f64(__x, __y, __z, __w); 975} 976 977__DEVICE__ 978double normcdf(double __x) { return __ocml_ncdf_f64(__x); } 979 980__DEVICE__ 981double normcdfinv(double __x) { return __ocml_ncdfinv_f64(__x); } 982 983__DEVICE__ 984double pow(double __x, double __y) { return __ocml_pow_f64(__x, __y); } 985 986__DEVICE__ 987double powi(double __x, int __y) { return __ocml_pown_f64(__x, __y); } 988 989__DEVICE__ 990double rcbrt(double __x) { return __ocml_rcbrt_f64(__x); } 991 992__DEVICE__ 993double remainder(double __x, double __y) { 994 return __ocml_remainder_f64(__x, __y); 995} 996 997__DEVICE__ 998double remquo(double __x, double __y, int *__quo) { 999 int __tmp; 1000#ifdef __OPENMP_AMDGCN__ 1001#pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc) 1002#endif 1003 double __r = __ocml_remquo_f64( 1004 __x, __y, (__attribute__((address_space(5))) int *)&__tmp); 1005 *__quo = __tmp; 1006 1007 return __r; 1008} 1009 1010__DEVICE__ 1011double rhypot(double __x, double __y) { return __ocml_rhypot_f64(__x, __y); } 1012 1013__DEVICE__ 1014double rint(double __x) { return __builtin_rint(__x); } 1015 1016__DEVICE__ 1017double rnorm(int __dim, 1018 const double *__a) { // TODO: placeholder until OCML adds support. 1019 double __r = 0; 1020 while (__dim--) { 1021 __r += __a[0] * __a[0]; 1022 ++__a; 1023 } 1024 1025 return __ocml_rsqrt_f64(__r); 1026} 1027 1028__DEVICE__ 1029double rnorm3d(double __x, double __y, double __z) { 1030 return __ocml_rlen3_f64(__x, __y, __z); 1031} 1032 1033__DEVICE__ 1034double rnorm4d(double __x, double __y, double __z, double __w) { 1035 return __ocml_rlen4_f64(__x, __y, __z, __w); 1036} 1037 1038__DEVICE__ 1039double round(double __x) { return __builtin_round(__x); } 1040 1041__DEVICE__ 1042double rsqrt(double __x) { return __ocml_rsqrt_f64(__x); } 1043 1044__DEVICE__ 1045double scalbln(double __x, long int __n) { 1046 return (__n < INT_MAX) ? __builtin_amdgcn_ldexp(__x, __n) 1047 : __ocml_scalb_f64(__x, __n); 1048} 1049__DEVICE__ 1050double scalbn(double __x, int __n) { return __builtin_amdgcn_ldexp(__x, __n); } 1051 1052__DEVICE__ 1053__RETURN_TYPE __signbit(double __x) { return __builtin_signbit(__x); } 1054 1055__DEVICE__ 1056double sin(double __x) { return __ocml_sin_f64(__x); } 1057 1058__DEVICE__ 1059void sincos(double __x, double *__sinptr, double *__cosptr) { 1060 double __tmp; 1061#ifdef __OPENMP_AMDGCN__ 1062#pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc) 1063#endif 1064 *__sinptr = __ocml_sincos_f64( 1065 __x, (__attribute__((address_space(5))) double *)&__tmp); 1066 *__cosptr = __tmp; 1067} 1068 1069__DEVICE__ 1070void sincospi(double __x, double *__sinptr, double *__cosptr) { 1071 double __tmp; 1072#ifdef __OPENMP_AMDGCN__ 1073#pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc) 1074#endif 1075 *__sinptr = __ocml_sincospi_f64( 1076 __x, (__attribute__((address_space(5))) double *)&__tmp); 1077 *__cosptr = __tmp; 1078} 1079 1080__DEVICE__ 1081double sinh(double __x) { return __ocml_sinh_f64(__x); } 1082 1083__DEVICE__ 1084double sinpi(double __x) { return __ocml_sinpi_f64(__x); } 1085 1086__DEVICE__ 1087double sqrt(double __x) { return __builtin_sqrt(__x); } 1088 1089__DEVICE__ 1090double tan(double __x) { return __ocml_tan_f64(__x); } 1091 1092__DEVICE__ 1093double tanh(double __x) { return __ocml_tanh_f64(__x); } 1094 1095__DEVICE__ 1096double tgamma(double __x) { return __ocml_tgamma_f64(__x); } 1097 1098__DEVICE__ 1099double trunc(double __x) { return __builtin_trunc(__x); } 1100 1101__DEVICE__ 1102double y0(double __x) { return __ocml_y0_f64(__x); } 1103 1104__DEVICE__ 1105double y1(double __x) { return __ocml_y1_f64(__x); } 1106 1107__DEVICE__ 1108double yn(int __n, double __x) { // TODO: we could use Ahmes multiplication 1109 // and the Miller & Brown algorithm 1110 // for linear recurrences to get O(log n) steps, but it's unclear if 1111 // it'd be beneficial in this case. Placeholder until OCML adds 1112 // support. 1113 if (__n == 0) 1114 return y0(__x); 1115 if (__n == 1) 1116 return y1(__x); 1117 1118 double __x0 = y0(__x); 1119 double __x1 = y1(__x); 1120 for (int __i = 1; __i < __n; ++__i) { 1121 double __x2 = (2 * __i) / __x * __x1 - __x0; 1122 __x0 = __x1; 1123 __x1 = __x2; 1124 } 1125 1126 return __x1; 1127} 1128 1129// BEGIN INTRINSICS 1130#if defined OCML_BASIC_ROUNDED_OPERATIONS 1131__DEVICE__ 1132double __dadd_rd(double __x, double __y) { 1133 return __ocml_add_rtn_f64(__x, __y); 1134} 1135__DEVICE__ 1136double __dadd_rn(double __x, double __y) { 1137 return __ocml_add_rte_f64(__x, __y); 1138} 1139__DEVICE__ 1140double __dadd_ru(double __x, double __y) { 1141 return __ocml_add_rtp_f64(__x, __y); 1142} 1143__DEVICE__ 1144double __dadd_rz(double __x, double __y) { 1145 return __ocml_add_rtz_f64(__x, __y); 1146} 1147#else 1148__DEVICE__ 1149double __dadd_rn(double __x, double __y) { return __x + __y; } 1150#endif 1151 1152#if defined OCML_BASIC_ROUNDED_OPERATIONS 1153__DEVICE__ 1154double __ddiv_rd(double __x, double __y) { 1155 return __ocml_div_rtn_f64(__x, __y); 1156} 1157__DEVICE__ 1158double __ddiv_rn(double __x, double __y) { 1159 return __ocml_div_rte_f64(__x, __y); 1160} 1161__DEVICE__ 1162double __ddiv_ru(double __x, double __y) { 1163 return __ocml_div_rtp_f64(__x, __y); 1164} 1165__DEVICE__ 1166double __ddiv_rz(double __x, double __y) { 1167 return __ocml_div_rtz_f64(__x, __y); 1168} 1169#else 1170__DEVICE__ 1171double __ddiv_rn(double __x, double __y) { return __x / __y; } 1172#endif 1173 1174#if defined OCML_BASIC_ROUNDED_OPERATIONS 1175__DEVICE__ 1176double __dmul_rd(double __x, double __y) { 1177 return __ocml_mul_rtn_f64(__x, __y); 1178} 1179__DEVICE__ 1180double __dmul_rn(double __x, double __y) { 1181 return __ocml_mul_rte_f64(__x, __y); 1182} 1183__DEVICE__ 1184double __dmul_ru(double __x, double __y) { 1185 return __ocml_mul_rtp_f64(__x, __y); 1186} 1187__DEVICE__ 1188double __dmul_rz(double __x, double __y) { 1189 return __ocml_mul_rtz_f64(__x, __y); 1190} 1191#else 1192__DEVICE__ 1193double __dmul_rn(double __x, double __y) { return __x * __y; } 1194#endif 1195 1196#if defined OCML_BASIC_ROUNDED_OPERATIONS 1197__DEVICE__ 1198double __drcp_rd(double __x) { return __ocml_div_rtn_f64(1.0, __x); } 1199__DEVICE__ 1200double __drcp_rn(double __x) { return __ocml_div_rte_f64(1.0, __x); } 1201__DEVICE__ 1202double __drcp_ru(double __x) { return __ocml_div_rtp_f64(1.0, __x); } 1203__DEVICE__ 1204double __drcp_rz(double __x) { return __ocml_div_rtz_f64(1.0, __x); } 1205#else 1206__DEVICE__ 1207double __drcp_rn(double __x) { return 1.0 / __x; } 1208#endif 1209 1210#if defined OCML_BASIC_ROUNDED_OPERATIONS 1211__DEVICE__ 1212double __dsqrt_rd(double __x) { return __ocml_sqrt_rtn_f64(__x); } 1213__DEVICE__ 1214double __dsqrt_rn(double __x) { return __ocml_sqrt_rte_f64(__x); } 1215__DEVICE__ 1216double __dsqrt_ru(double __x) { return __ocml_sqrt_rtp_f64(__x); } 1217__DEVICE__ 1218double __dsqrt_rz(double __x) { return __ocml_sqrt_rtz_f64(__x); } 1219#else 1220__DEVICE__ 1221double __dsqrt_rn(double __x) { return __builtin_sqrt(__x); } 1222#endif 1223 1224#if defined OCML_BASIC_ROUNDED_OPERATIONS 1225__DEVICE__ 1226double __dsub_rd(double __x, double __y) { 1227 return __ocml_sub_rtn_f64(__x, __y); 1228} 1229__DEVICE__ 1230double __dsub_rn(double __x, double __y) { 1231 return __ocml_sub_rte_f64(__x, __y); 1232} 1233__DEVICE__ 1234double __dsub_ru(double __x, double __y) { 1235 return __ocml_sub_rtp_f64(__x, __y); 1236} 1237__DEVICE__ 1238double __dsub_rz(double __x, double __y) { 1239 return __ocml_sub_rtz_f64(__x, __y); 1240} 1241#else 1242__DEVICE__ 1243double __dsub_rn(double __x, double __y) { return __x - __y; } 1244#endif 1245 1246#if defined OCML_BASIC_ROUNDED_OPERATIONS 1247__DEVICE__ 1248double __fma_rd(double __x, double __y, double __z) { 1249 return __ocml_fma_rtn_f64(__x, __y, __z); 1250} 1251__DEVICE__ 1252double __fma_rn(double __x, double __y, double __z) { 1253 return __ocml_fma_rte_f64(__x, __y, __z); 1254} 1255__DEVICE__ 1256double __fma_ru(double __x, double __y, double __z) { 1257 return __ocml_fma_rtp_f64(__x, __y, __z); 1258} 1259__DEVICE__ 1260double __fma_rz(double __x, double __y, double __z) { 1261 return __ocml_fma_rtz_f64(__x, __y, __z); 1262} 1263#else 1264__DEVICE__ 1265double __fma_rn(double __x, double __y, double __z) { 1266 return __builtin_fma(__x, __y, __z); 1267} 1268#endif 1269// END INTRINSICS 1270// END DOUBLE 1271 1272// C only macros 1273#if !defined(__cplusplus) && __STDC_VERSION__ >= 201112L 1274#define isfinite(__x) _Generic((__x), float : __finitef, double : __finite)(__x) 1275#define isinf(__x) _Generic((__x), float : __isinff, double : __isinf)(__x) 1276#define isnan(__x) _Generic((__x), float : __isnanf, double : __isnan)(__x) 1277#define signbit(__x) \ 1278 _Generic((__x), float : __signbitf, double : __signbit)(__x) 1279#endif // !defined(__cplusplus) && __STDC_VERSION__ >= 201112L 1280 1281#if defined(__cplusplus) 1282template <class T> __DEVICE__ T min(T __arg1, T __arg2) { 1283 return (__arg1 < __arg2) ? __arg1 : __arg2; 1284} 1285 1286template <class T> __DEVICE__ T max(T __arg1, T __arg2) { 1287 return (__arg1 > __arg2) ? __arg1 : __arg2; 1288} 1289 1290__DEVICE__ int min(int __arg1, int __arg2) { 1291 return (__arg1 < __arg2) ? __arg1 : __arg2; 1292} 1293__DEVICE__ int max(int __arg1, int __arg2) { 1294 return (__arg1 > __arg2) ? __arg1 : __arg2; 1295} 1296 1297__DEVICE__ 1298float max(float __x, float __y) { return __builtin_fmaxf(__x, __y); } 1299 1300__DEVICE__ 1301double max(double __x, double __y) { return __builtin_fmax(__x, __y); } 1302 1303__DEVICE__ 1304float min(float __x, float __y) { return __builtin_fminf(__x, __y); } 1305 1306__DEVICE__ 1307double min(double __x, double __y) { return __builtin_fmin(__x, __y); } 1308 1309#if !defined(__HIPCC_RTC__) && !defined(__OPENMP_AMDGCN__) 1310__host__ inline static int min(int __arg1, int __arg2) { 1311 return __arg1 < __arg2 ? __arg1 : __arg2; 1312} 1313 1314__host__ inline static int max(int __arg1, int __arg2) { 1315 return __arg1 > __arg2 ? __arg1 : __arg2; 1316} 1317#endif // !defined(__HIPCC_RTC__) && !defined(__OPENMP_AMDGCN__) 1318#endif 1319 1320#pragma pop_macro("__DEVICE__") 1321#pragma pop_macro("__RETURN_TYPE") 1322#pragma pop_macro("__FAST_OR_SLOW") 1323 1324#endif // __CLANG_HIP_MATH_H__ 1325