1/*===---- __clang_hip_math.h - Device-side HIP math support ----------------===
2 *
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 *
7 *===-----------------------------------------------------------------------===
8 */
9#ifndef __CLANG_HIP_MATH_H__
10#define __CLANG_HIP_MATH_H__
11
12#if !defined(__HIP__) && !defined(__OPENMP_AMDGCN__)
13#error "This file is for HIP and OpenMP AMDGCN device compilation only."
14#endif
15
16#if !defined(__HIPCC_RTC__)
17#if defined(__cplusplus)
18#include <algorithm>
19#endif
20#include <limits.h>
21#include <stdint.h>
22#ifdef __OPENMP_AMDGCN__
23#include <omp.h>
24#endif
25#endif // !defined(__HIPCC_RTC__)
26
27#pragma push_macro("__DEVICE__")
28
29#ifdef __OPENMP_AMDGCN__
30#define __DEVICE__ static inline __attribute__((always_inline, nothrow))
31#else
32#define __DEVICE__ static __device__ inline __attribute__((always_inline))
33#endif
34
35// A few functions return bool type starting only in C++11.
36#pragma push_macro("__RETURN_TYPE")
37#ifdef __OPENMP_AMDGCN__
38#define __RETURN_TYPE int
39#else
40#if defined(__cplusplus)
41#define __RETURN_TYPE bool
42#else
43#define __RETURN_TYPE int
44#endif
45#endif // __OPENMP_AMDGCN__
46
47#if defined (__cplusplus) && __cplusplus < 201103L
48// emulate static_assert on type sizes
49template<bool>
50struct __compare_result{};
51template<>
52struct __compare_result<true> {
53  static const __device__ bool valid;
54};
55
56__DEVICE__
57void __suppress_unused_warning(bool b){};
58template <unsigned int S, unsigned int T>
59__DEVICE__ void __static_assert_equal_size() {
60  __suppress_unused_warning(__compare_result<S == T>::valid);
61}
62
63#define __static_assert_type_size_equal(A, B) \
64  __static_assert_equal_size<A,B>()
65
66#else
67#define __static_assert_type_size_equal(A,B) \
68  static_assert((A) == (B), "")
69
70#endif
71
72__DEVICE__
73uint64_t __make_mantissa_base8(const char *__tagp __attribute__((nonnull))) {
74  uint64_t __r = 0;
75  while (*__tagp != '\0') {
76    char __tmp = *__tagp;
77
78    if (__tmp >= '0' && __tmp <= '7')
79      __r = (__r * 8u) + __tmp - '0';
80    else
81      return 0;
82
83    ++__tagp;
84  }
85
86  return __r;
87}
88
89__DEVICE__
90uint64_t __make_mantissa_base10(const char *__tagp __attribute__((nonnull))) {
91  uint64_t __r = 0;
92  while (*__tagp != '\0') {
93    char __tmp = *__tagp;
94
95    if (__tmp >= '0' && __tmp <= '9')
96      __r = (__r * 10u) + __tmp - '0';
97    else
98      return 0;
99
100    ++__tagp;
101  }
102
103  return __r;
104}
105
106__DEVICE__
107uint64_t __make_mantissa_base16(const char *__tagp __attribute__((nonnull))) {
108  uint64_t __r = 0;
109  while (*__tagp != '\0') {
110    char __tmp = *__tagp;
111
112    if (__tmp >= '0' && __tmp <= '9')
113      __r = (__r * 16u) + __tmp - '0';
114    else if (__tmp >= 'a' && __tmp <= 'f')
115      __r = (__r * 16u) + __tmp - 'a' + 10;
116    else if (__tmp >= 'A' && __tmp <= 'F')
117      __r = (__r * 16u) + __tmp - 'A' + 10;
118    else
119      return 0;
120
121    ++__tagp;
122  }
123
124  return __r;
125}
126
127__DEVICE__
128uint64_t __make_mantissa(const char *__tagp __attribute__((nonnull))) {
129  if (*__tagp == '0') {
130    ++__tagp;
131
132    if (*__tagp == 'x' || *__tagp == 'X')
133      return __make_mantissa_base16(__tagp);
134    else
135      return __make_mantissa_base8(__tagp);
136  }
137
138  return __make_mantissa_base10(__tagp);
139}
140
141// BEGIN FLOAT
142#if defined(__cplusplus)
143__DEVICE__
144int abs(int __x) {
145  int __sgn = __x >> (sizeof(int) * CHAR_BIT - 1);
146  return (__x ^ __sgn) - __sgn;
147}
148__DEVICE__
149long labs(long __x) {
150  long __sgn = __x >> (sizeof(long) * CHAR_BIT - 1);
151  return (__x ^ __sgn) - __sgn;
152}
153__DEVICE__
154long long llabs(long long __x) {
155  long long __sgn = __x >> (sizeof(long long) * CHAR_BIT - 1);
156  return (__x ^ __sgn) - __sgn;
157}
158#endif
159
160__DEVICE__
161float acosf(float __x) { return __ocml_acos_f32(__x); }
162
163__DEVICE__
164float acoshf(float __x) { return __ocml_acosh_f32(__x); }
165
166__DEVICE__
167float asinf(float __x) { return __ocml_asin_f32(__x); }
168
169__DEVICE__
170float asinhf(float __x) { return __ocml_asinh_f32(__x); }
171
172__DEVICE__
173float atan2f(float __x, float __y) { return __ocml_atan2_f32(__x, __y); }
174
175__DEVICE__
176float atanf(float __x) { return __ocml_atan_f32(__x); }
177
178__DEVICE__
179float atanhf(float __x) { return __ocml_atanh_f32(__x); }
180
181__DEVICE__
182float cbrtf(float __x) { return __ocml_cbrt_f32(__x); }
183
184__DEVICE__
185float ceilf(float __x) { return __ocml_ceil_f32(__x); }
186
187__DEVICE__
188float copysignf(float __x, float __y) { return __ocml_copysign_f32(__x, __y); }
189
190__DEVICE__
191float cosf(float __x) { return __ocml_cos_f32(__x); }
192
193__DEVICE__
194float coshf(float __x) { return __ocml_cosh_f32(__x); }
195
196__DEVICE__
197float cospif(float __x) { return __ocml_cospi_f32(__x); }
198
199__DEVICE__
200float cyl_bessel_i0f(float __x) { return __ocml_i0_f32(__x); }
201
202__DEVICE__
203float cyl_bessel_i1f(float __x) { return __ocml_i1_f32(__x); }
204
205__DEVICE__
206float erfcf(float __x) { return __ocml_erfc_f32(__x); }
207
208__DEVICE__
209float erfcinvf(float __x) { return __ocml_erfcinv_f32(__x); }
210
211__DEVICE__
212float erfcxf(float __x) { return __ocml_erfcx_f32(__x); }
213
214__DEVICE__
215float erff(float __x) { return __ocml_erf_f32(__x); }
216
217__DEVICE__
218float erfinvf(float __x) { return __ocml_erfinv_f32(__x); }
219
220__DEVICE__
221float exp10f(float __x) { return __ocml_exp10_f32(__x); }
222
223__DEVICE__
224float exp2f(float __x) { return __ocml_exp2_f32(__x); }
225
226__DEVICE__
227float expf(float __x) { return __ocml_exp_f32(__x); }
228
229__DEVICE__
230float expm1f(float __x) { return __ocml_expm1_f32(__x); }
231
232__DEVICE__
233float fabsf(float __x) { return __builtin_fabsf(__x); }
234
235__DEVICE__
236float fdimf(float __x, float __y) { return __ocml_fdim_f32(__x, __y); }
237
238__DEVICE__
239float fdividef(float __x, float __y) { return __x / __y; }
240
241__DEVICE__
242float floorf(float __x) { return __ocml_floor_f32(__x); }
243
244__DEVICE__
245float fmaf(float __x, float __y, float __z) {
246  return __ocml_fma_f32(__x, __y, __z);
247}
248
249__DEVICE__
250float fmaxf(float __x, float __y) { return __ocml_fmax_f32(__x, __y); }
251
252__DEVICE__
253float fminf(float __x, float __y) { return __ocml_fmin_f32(__x, __y); }
254
255__DEVICE__
256float fmodf(float __x, float __y) { return __ocml_fmod_f32(__x, __y); }
257
258__DEVICE__
259float frexpf(float __x, int *__nptr) {
260  int __tmp;
261#ifdef __OPENMP_AMDGCN__
262#pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc)
263#endif
264  float __r =
265      __ocml_frexp_f32(__x, (__attribute__((address_space(5))) int *)&__tmp);
266  *__nptr = __tmp;
267
268  return __r;
269}
270
271__DEVICE__
272float hypotf(float __x, float __y) { return __ocml_hypot_f32(__x, __y); }
273
274__DEVICE__
275int ilogbf(float __x) { return __ocml_ilogb_f32(__x); }
276
277__DEVICE__
278__RETURN_TYPE __finitef(float __x) { return __ocml_isfinite_f32(__x); }
279
280__DEVICE__
281__RETURN_TYPE __isinff(float __x) { return __ocml_isinf_f32(__x); }
282
283__DEVICE__
284__RETURN_TYPE __isnanf(float __x) { return __ocml_isnan_f32(__x); }
285
286__DEVICE__
287float j0f(float __x) { return __ocml_j0_f32(__x); }
288
289__DEVICE__
290float j1f(float __x) { return __ocml_j1_f32(__x); }
291
292__DEVICE__
293float jnf(int __n, float __x) { // TODO: we could use Ahmes multiplication
294                                // and the Miller & Brown algorithm
295  //       for linear recurrences to get O(log n) steps, but it's unclear if
296  //       it'd be beneficial in this case.
297  if (__n == 0)
298    return j0f(__x);
299  if (__n == 1)
300    return j1f(__x);
301
302  float __x0 = j0f(__x);
303  float __x1 = j1f(__x);
304  for (int __i = 1; __i < __n; ++__i) {
305    float __x2 = (2 * __i) / __x * __x1 - __x0;
306    __x0 = __x1;
307    __x1 = __x2;
308  }
309
310  return __x1;
311}
312
313__DEVICE__
314float ldexpf(float __x, int __e) { return __ocml_ldexp_f32(__x, __e); }
315
316__DEVICE__
317float lgammaf(float __x) { return __ocml_lgamma_f32(__x); }
318
319__DEVICE__
320long long int llrintf(float __x) { return __ocml_rint_f32(__x); }
321
322__DEVICE__
323long long int llroundf(float __x) { return __ocml_round_f32(__x); }
324
325__DEVICE__
326float log10f(float __x) { return __ocml_log10_f32(__x); }
327
328__DEVICE__
329float log1pf(float __x) { return __ocml_log1p_f32(__x); }
330
331__DEVICE__
332float log2f(float __x) { return __ocml_log2_f32(__x); }
333
334__DEVICE__
335float logbf(float __x) { return __ocml_logb_f32(__x); }
336
337__DEVICE__
338float logf(float __x) { return __ocml_log_f32(__x); }
339
340__DEVICE__
341long int lrintf(float __x) { return __ocml_rint_f32(__x); }
342
343__DEVICE__
344long int lroundf(float __x) { return __ocml_round_f32(__x); }
345
346__DEVICE__
347float modff(float __x, float *__iptr) {
348  float __tmp;
349#ifdef __OPENMP_AMDGCN__
350#pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc)
351#endif
352  float __r =
353      __ocml_modf_f32(__x, (__attribute__((address_space(5))) float *)&__tmp);
354  *__iptr = __tmp;
355  return __r;
356}
357
358__DEVICE__
359float nanf(const char *__tagp __attribute__((nonnull))) {
360  union {
361    float val;
362    struct ieee_float {
363      unsigned int mantissa : 22;
364      unsigned int quiet : 1;
365      unsigned int exponent : 8;
366      unsigned int sign : 1;
367    } bits;
368  } __tmp;
369  __static_assert_type_size_equal(sizeof(__tmp.val), sizeof(__tmp.bits));
370
371  __tmp.bits.sign = 0u;
372  __tmp.bits.exponent = ~0u;
373  __tmp.bits.quiet = 1u;
374  __tmp.bits.mantissa = __make_mantissa(__tagp);
375
376  return __tmp.val;
377}
378
379__DEVICE__
380float nearbyintf(float __x) { return __ocml_nearbyint_f32(__x); }
381
382__DEVICE__
383float nextafterf(float __x, float __y) {
384  return __ocml_nextafter_f32(__x, __y);
385}
386
387__DEVICE__
388float norm3df(float __x, float __y, float __z) {
389  return __ocml_len3_f32(__x, __y, __z);
390}
391
392__DEVICE__
393float norm4df(float __x, float __y, float __z, float __w) {
394  return __ocml_len4_f32(__x, __y, __z, __w);
395}
396
397__DEVICE__
398float normcdff(float __x) { return __ocml_ncdf_f32(__x); }
399
400__DEVICE__
401float normcdfinvf(float __x) { return __ocml_ncdfinv_f32(__x); }
402
403__DEVICE__
404float normf(int __dim,
405            const float *__a) { // TODO: placeholder until OCML adds support.
406  float __r = 0;
407  while (__dim--) {
408    __r += __a[0] * __a[0];
409    ++__a;
410  }
411
412  return __ocml_sqrt_f32(__r);
413}
414
415__DEVICE__
416float powf(float __x, float __y) { return __ocml_pow_f32(__x, __y); }
417
418__DEVICE__
419float powif(float __x, int __y) { return __ocml_pown_f32(__x, __y); }
420
421__DEVICE__
422float rcbrtf(float __x) { return __ocml_rcbrt_f32(__x); }
423
424__DEVICE__
425float remainderf(float __x, float __y) {
426  return __ocml_remainder_f32(__x, __y);
427}
428
429__DEVICE__
430float remquof(float __x, float __y, int *__quo) {
431  int __tmp;
432#ifdef __OPENMP_AMDGCN__
433#pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc)
434#endif
435  float __r = __ocml_remquo_f32(
436      __x, __y, (__attribute__((address_space(5))) int *)&__tmp);
437  *__quo = __tmp;
438
439  return __r;
440}
441
442__DEVICE__
443float rhypotf(float __x, float __y) { return __ocml_rhypot_f32(__x, __y); }
444
445__DEVICE__
446float rintf(float __x) { return __ocml_rint_f32(__x); }
447
448__DEVICE__
449float rnorm3df(float __x, float __y, float __z) {
450  return __ocml_rlen3_f32(__x, __y, __z);
451}
452
453__DEVICE__
454float rnorm4df(float __x, float __y, float __z, float __w) {
455  return __ocml_rlen4_f32(__x, __y, __z, __w);
456}
457
458__DEVICE__
459float rnormf(int __dim,
460             const float *__a) { // TODO: placeholder until OCML adds support.
461  float __r = 0;
462  while (__dim--) {
463    __r += __a[0] * __a[0];
464    ++__a;
465  }
466
467  return __ocml_rsqrt_f32(__r);
468}
469
470__DEVICE__
471float roundf(float __x) { return __ocml_round_f32(__x); }
472
473__DEVICE__
474float rsqrtf(float __x) { return __ocml_rsqrt_f32(__x); }
475
476__DEVICE__
477float scalblnf(float __x, long int __n) {
478  return (__n < INT_MAX) ? __ocml_scalbn_f32(__x, __n)
479                         : __ocml_scalb_f32(__x, __n);
480}
481
482__DEVICE__
483float scalbnf(float __x, int __n) { return __ocml_scalbn_f32(__x, __n); }
484
485__DEVICE__
486__RETURN_TYPE __signbitf(float __x) { return __ocml_signbit_f32(__x); }
487
488__DEVICE__
489void sincosf(float __x, float *__sinptr, float *__cosptr) {
490  float __tmp;
491#ifdef __OPENMP_AMDGCN__
492#pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc)
493#endif
494  *__sinptr =
495      __ocml_sincos_f32(__x, (__attribute__((address_space(5))) float *)&__tmp);
496  *__cosptr = __tmp;
497}
498
499__DEVICE__
500void sincospif(float __x, float *__sinptr, float *__cosptr) {
501  float __tmp;
502#ifdef __OPENMP_AMDGCN__
503#pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc)
504#endif
505  *__sinptr = __ocml_sincospi_f32(
506      __x, (__attribute__((address_space(5))) float *)&__tmp);
507  *__cosptr = __tmp;
508}
509
510__DEVICE__
511float sinf(float __x) { return __ocml_sin_f32(__x); }
512
513__DEVICE__
514float sinhf(float __x) { return __ocml_sinh_f32(__x); }
515
516__DEVICE__
517float sinpif(float __x) { return __ocml_sinpi_f32(__x); }
518
519__DEVICE__
520float sqrtf(float __x) { return __ocml_sqrt_f32(__x); }
521
522__DEVICE__
523float tanf(float __x) { return __ocml_tan_f32(__x); }
524
525__DEVICE__
526float tanhf(float __x) { return __ocml_tanh_f32(__x); }
527
528__DEVICE__
529float tgammaf(float __x) { return __ocml_tgamma_f32(__x); }
530
531__DEVICE__
532float truncf(float __x) { return __ocml_trunc_f32(__x); }
533
534__DEVICE__
535float y0f(float __x) { return __ocml_y0_f32(__x); }
536
537__DEVICE__
538float y1f(float __x) { return __ocml_y1_f32(__x); }
539
540__DEVICE__
541float ynf(int __n, float __x) { // TODO: we could use Ahmes multiplication
542                                // and the Miller & Brown algorithm
543  //       for linear recurrences to get O(log n) steps, but it's unclear if
544  //       it'd be beneficial in this case. Placeholder until OCML adds
545  //       support.
546  if (__n == 0)
547    return y0f(__x);
548  if (__n == 1)
549    return y1f(__x);
550
551  float __x0 = y0f(__x);
552  float __x1 = y1f(__x);
553  for (int __i = 1; __i < __n; ++__i) {
554    float __x2 = (2 * __i) / __x * __x1 - __x0;
555    __x0 = __x1;
556    __x1 = __x2;
557  }
558
559  return __x1;
560}
561
562// BEGIN INTRINSICS
563
564__DEVICE__
565float __cosf(float __x) { return __ocml_native_cos_f32(__x); }
566
567__DEVICE__
568float __exp10f(float __x) { return __ocml_native_exp10_f32(__x); }
569
570__DEVICE__
571float __expf(float __x) { return __ocml_native_exp_f32(__x); }
572
573#if defined OCML_BASIC_ROUNDED_OPERATIONS
574__DEVICE__
575float __fadd_rd(float __x, float __y) { return __ocml_add_rtn_f32(__x, __y); }
576__DEVICE__
577float __fadd_rn(float __x, float __y) { return __ocml_add_rte_f32(__x, __y); }
578__DEVICE__
579float __fadd_ru(float __x, float __y) { return __ocml_add_rtp_f32(__x, __y); }
580__DEVICE__
581float __fadd_rz(float __x, float __y) { return __ocml_add_rtz_f32(__x, __y); }
582#else
583__DEVICE__
584float __fadd_rn(float __x, float __y) { return __x + __y; }
585#endif
586
587#if defined OCML_BASIC_ROUNDED_OPERATIONS
588__DEVICE__
589float __fdiv_rd(float __x, float __y) { return __ocml_div_rtn_f32(__x, __y); }
590__DEVICE__
591float __fdiv_rn(float __x, float __y) { return __ocml_div_rte_f32(__x, __y); }
592__DEVICE__
593float __fdiv_ru(float __x, float __y) { return __ocml_div_rtp_f32(__x, __y); }
594__DEVICE__
595float __fdiv_rz(float __x, float __y) { return __ocml_div_rtz_f32(__x, __y); }
596#else
597__DEVICE__
598float __fdiv_rn(float __x, float __y) { return __x / __y; }
599#endif
600
601__DEVICE__
602float __fdividef(float __x, float __y) { return __x / __y; }
603
604#if defined OCML_BASIC_ROUNDED_OPERATIONS
605__DEVICE__
606float __fmaf_rd(float __x, float __y, float __z) {
607  return __ocml_fma_rtn_f32(__x, __y, __z);
608}
609__DEVICE__
610float __fmaf_rn(float __x, float __y, float __z) {
611  return __ocml_fma_rte_f32(__x, __y, __z);
612}
613__DEVICE__
614float __fmaf_ru(float __x, float __y, float __z) {
615  return __ocml_fma_rtp_f32(__x, __y, __z);
616}
617__DEVICE__
618float __fmaf_rz(float __x, float __y, float __z) {
619  return __ocml_fma_rtz_f32(__x, __y, __z);
620}
621#else
622__DEVICE__
623float __fmaf_rn(float __x, float __y, float __z) {
624  return __ocml_fma_f32(__x, __y, __z);
625}
626#endif
627
628#if defined OCML_BASIC_ROUNDED_OPERATIONS
629__DEVICE__
630float __fmul_rd(float __x, float __y) { return __ocml_mul_rtn_f32(__x, __y); }
631__DEVICE__
632float __fmul_rn(float __x, float __y) { return __ocml_mul_rte_f32(__x, __y); }
633__DEVICE__
634float __fmul_ru(float __x, float __y) { return __ocml_mul_rtp_f32(__x, __y); }
635__DEVICE__
636float __fmul_rz(float __x, float __y) { return __ocml_mul_rtz_f32(__x, __y); }
637#else
638__DEVICE__
639float __fmul_rn(float __x, float __y) { return __x * __y; }
640#endif
641
642#if defined OCML_BASIC_ROUNDED_OPERATIONS
643__DEVICE__
644float __frcp_rd(float __x) { return __ocml_div_rtn_f32(1.0f, __x); }
645__DEVICE__
646float __frcp_rn(float __x) { return __ocml_div_rte_f32(1.0f, __x); }
647__DEVICE__
648float __frcp_ru(float __x) { return __ocml_div_rtp_f32(1.0f, __x); }
649__DEVICE__
650float __frcp_rz(float __x) { return __ocml_div_rtz_f32(1.0f, __x); }
651#else
652__DEVICE__
653float __frcp_rn(float __x) { return 1.0f / __x; }
654#endif
655
656__DEVICE__
657float __frsqrt_rn(float __x) { return __llvm_amdgcn_rsq_f32(__x); }
658
659#if defined OCML_BASIC_ROUNDED_OPERATIONS
660__DEVICE__
661float __fsqrt_rd(float __x) { return __ocml_sqrt_rtn_f32(__x); }
662__DEVICE__
663float __fsqrt_rn(float __x) { return __ocml_sqrt_rte_f32(__x); }
664__DEVICE__
665float __fsqrt_ru(float __x) { return __ocml_sqrt_rtp_f32(__x); }
666__DEVICE__
667float __fsqrt_rz(float __x) { return __ocml_sqrt_rtz_f32(__x); }
668#else
669__DEVICE__
670float __fsqrt_rn(float __x) { return __ocml_native_sqrt_f32(__x); }
671#endif
672
673#if defined OCML_BASIC_ROUNDED_OPERATIONS
674__DEVICE__
675float __fsub_rd(float __x, float __y) { return __ocml_sub_rtn_f32(__x, __y); }
676__DEVICE__
677float __fsub_rn(float __x, float __y) { return __ocml_sub_rte_f32(__x, __y); }
678__DEVICE__
679float __fsub_ru(float __x, float __y) { return __ocml_sub_rtp_f32(__x, __y); }
680__DEVICE__
681float __fsub_rz(float __x, float __y) { return __ocml_sub_rtz_f32(__x, __y); }
682#else
683__DEVICE__
684float __fsub_rn(float __x, float __y) { return __x - __y; }
685#endif
686
687__DEVICE__
688float __log10f(float __x) { return __ocml_native_log10_f32(__x); }
689
690__DEVICE__
691float __log2f(float __x) { return __ocml_native_log2_f32(__x); }
692
693__DEVICE__
694float __logf(float __x) { return __ocml_native_log_f32(__x); }
695
696__DEVICE__
697float __powf(float __x, float __y) { return __ocml_pow_f32(__x, __y); }
698
699__DEVICE__
700float __saturatef(float __x) { return (__x < 0) ? 0 : ((__x > 1) ? 1 : __x); }
701
702__DEVICE__
703void __sincosf(float __x, float *__sinptr, float *__cosptr) {
704  *__sinptr = __ocml_native_sin_f32(__x);
705  *__cosptr = __ocml_native_cos_f32(__x);
706}
707
708__DEVICE__
709float __sinf(float __x) { return __ocml_native_sin_f32(__x); }
710
711__DEVICE__
712float __tanf(float __x) { return __ocml_tan_f32(__x); }
713// END INTRINSICS
714// END FLOAT
715
716// BEGIN DOUBLE
717__DEVICE__
718double acos(double __x) { return __ocml_acos_f64(__x); }
719
720__DEVICE__
721double acosh(double __x) { return __ocml_acosh_f64(__x); }
722
723__DEVICE__
724double asin(double __x) { return __ocml_asin_f64(__x); }
725
726__DEVICE__
727double asinh(double __x) { return __ocml_asinh_f64(__x); }
728
729__DEVICE__
730double atan(double __x) { return __ocml_atan_f64(__x); }
731
732__DEVICE__
733double atan2(double __x, double __y) { return __ocml_atan2_f64(__x, __y); }
734
735__DEVICE__
736double atanh(double __x) { return __ocml_atanh_f64(__x); }
737
738__DEVICE__
739double cbrt(double __x) { return __ocml_cbrt_f64(__x); }
740
741__DEVICE__
742double ceil(double __x) { return __ocml_ceil_f64(__x); }
743
744__DEVICE__
745double copysign(double __x, double __y) {
746  return __ocml_copysign_f64(__x, __y);
747}
748
749__DEVICE__
750double cos(double __x) { return __ocml_cos_f64(__x); }
751
752__DEVICE__
753double cosh(double __x) { return __ocml_cosh_f64(__x); }
754
755__DEVICE__
756double cospi(double __x) { return __ocml_cospi_f64(__x); }
757
758__DEVICE__
759double cyl_bessel_i0(double __x) { return __ocml_i0_f64(__x); }
760
761__DEVICE__
762double cyl_bessel_i1(double __x) { return __ocml_i1_f64(__x); }
763
764__DEVICE__
765double erf(double __x) { return __ocml_erf_f64(__x); }
766
767__DEVICE__
768double erfc(double __x) { return __ocml_erfc_f64(__x); }
769
770__DEVICE__
771double erfcinv(double __x) { return __ocml_erfcinv_f64(__x); }
772
773__DEVICE__
774double erfcx(double __x) { return __ocml_erfcx_f64(__x); }
775
776__DEVICE__
777double erfinv(double __x) { return __ocml_erfinv_f64(__x); }
778
779__DEVICE__
780double exp(double __x) { return __ocml_exp_f64(__x); }
781
782__DEVICE__
783double exp10(double __x) { return __ocml_exp10_f64(__x); }
784
785__DEVICE__
786double exp2(double __x) { return __ocml_exp2_f64(__x); }
787
788__DEVICE__
789double expm1(double __x) { return __ocml_expm1_f64(__x); }
790
791__DEVICE__
792double fabs(double __x) { return __builtin_fabs(__x); }
793
794__DEVICE__
795double fdim(double __x, double __y) { return __ocml_fdim_f64(__x, __y); }
796
797__DEVICE__
798double floor(double __x) { return __ocml_floor_f64(__x); }
799
800__DEVICE__
801double fma(double __x, double __y, double __z) {
802  return __ocml_fma_f64(__x, __y, __z);
803}
804
805__DEVICE__
806double fmax(double __x, double __y) { return __ocml_fmax_f64(__x, __y); }
807
808__DEVICE__
809double fmin(double __x, double __y) { return __ocml_fmin_f64(__x, __y); }
810
811__DEVICE__
812double fmod(double __x, double __y) { return __ocml_fmod_f64(__x, __y); }
813
814__DEVICE__
815double frexp(double __x, int *__nptr) {
816  int __tmp;
817#ifdef __OPENMP_AMDGCN__
818#pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc)
819#endif
820  double __r =
821      __ocml_frexp_f64(__x, (__attribute__((address_space(5))) int *)&__tmp);
822  *__nptr = __tmp;
823  return __r;
824}
825
826__DEVICE__
827double hypot(double __x, double __y) { return __ocml_hypot_f64(__x, __y); }
828
829__DEVICE__
830int ilogb(double __x) { return __ocml_ilogb_f64(__x); }
831
832__DEVICE__
833__RETURN_TYPE __finite(double __x) { return __ocml_isfinite_f64(__x); }
834
835__DEVICE__
836__RETURN_TYPE __isinf(double __x) { return __ocml_isinf_f64(__x); }
837
838__DEVICE__
839__RETURN_TYPE __isnan(double __x) { return __ocml_isnan_f64(__x); }
840
841__DEVICE__
842double j0(double __x) { return __ocml_j0_f64(__x); }
843
844__DEVICE__
845double j1(double __x) { return __ocml_j1_f64(__x); }
846
847__DEVICE__
848double jn(int __n, double __x) { // TODO: we could use Ahmes multiplication
849                                 // and the Miller & Brown algorithm
850  //       for linear recurrences to get O(log n) steps, but it's unclear if
851  //       it'd be beneficial in this case. Placeholder until OCML adds
852  //       support.
853  if (__n == 0)
854    return j0(__x);
855  if (__n == 1)
856    return j1(__x);
857
858  double __x0 = j0(__x);
859  double __x1 = j1(__x);
860  for (int __i = 1; __i < __n; ++__i) {
861    double __x2 = (2 * __i) / __x * __x1 - __x0;
862    __x0 = __x1;
863    __x1 = __x2;
864  }
865  return __x1;
866}
867
868__DEVICE__
869double ldexp(double __x, int __e) { return __ocml_ldexp_f64(__x, __e); }
870
871__DEVICE__
872double lgamma(double __x) { return __ocml_lgamma_f64(__x); }
873
874__DEVICE__
875long long int llrint(double __x) { return __ocml_rint_f64(__x); }
876
877__DEVICE__
878long long int llround(double __x) { return __ocml_round_f64(__x); }
879
880__DEVICE__
881double log(double __x) { return __ocml_log_f64(__x); }
882
883__DEVICE__
884double log10(double __x) { return __ocml_log10_f64(__x); }
885
886__DEVICE__
887double log1p(double __x) { return __ocml_log1p_f64(__x); }
888
889__DEVICE__
890double log2(double __x) { return __ocml_log2_f64(__x); }
891
892__DEVICE__
893double logb(double __x) { return __ocml_logb_f64(__x); }
894
895__DEVICE__
896long int lrint(double __x) { return __ocml_rint_f64(__x); }
897
898__DEVICE__
899long int lround(double __x) { return __ocml_round_f64(__x); }
900
901__DEVICE__
902double modf(double __x, double *__iptr) {
903  double __tmp;
904#ifdef __OPENMP_AMDGCN__
905#pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc)
906#endif
907  double __r =
908      __ocml_modf_f64(__x, (__attribute__((address_space(5))) double *)&__tmp);
909  *__iptr = __tmp;
910
911  return __r;
912}
913
914__DEVICE__
915double nan(const char *__tagp) {
916#if !_WIN32
917  union {
918    double val;
919    struct ieee_double {
920      uint64_t mantissa : 51;
921      uint32_t quiet : 1;
922      uint32_t exponent : 11;
923      uint32_t sign : 1;
924    } bits;
925  } __tmp;
926  __static_assert_type_size_equal(sizeof(__tmp.val), sizeof(__tmp.bits));
927
928  __tmp.bits.sign = 0u;
929  __tmp.bits.exponent = ~0u;
930  __tmp.bits.quiet = 1u;
931  __tmp.bits.mantissa = __make_mantissa(__tagp);
932
933  return __tmp.val;
934#else
935  __static_assert_type_size_equal(sizeof(uint64_t), sizeof(double));
936  uint64_t __val = __make_mantissa(__tagp);
937  __val |= 0xFFF << 51;
938  return *reinterpret_cast<double *>(&__val);
939#endif
940}
941
942__DEVICE__
943double nearbyint(double __x) { return __ocml_nearbyint_f64(__x); }
944
945__DEVICE__
946double nextafter(double __x, double __y) {
947  return __ocml_nextafter_f64(__x, __y);
948}
949
950__DEVICE__
951double norm(int __dim,
952            const double *__a) { // TODO: placeholder until OCML adds support.
953  double __r = 0;
954  while (__dim--) {
955    __r += __a[0] * __a[0];
956    ++__a;
957  }
958
959  return __ocml_sqrt_f64(__r);
960}
961
962__DEVICE__
963double norm3d(double __x, double __y, double __z) {
964  return __ocml_len3_f64(__x, __y, __z);
965}
966
967__DEVICE__
968double norm4d(double __x, double __y, double __z, double __w) {
969  return __ocml_len4_f64(__x, __y, __z, __w);
970}
971
972__DEVICE__
973double normcdf(double __x) { return __ocml_ncdf_f64(__x); }
974
975__DEVICE__
976double normcdfinv(double __x) { return __ocml_ncdfinv_f64(__x); }
977
978__DEVICE__
979double pow(double __x, double __y) { return __ocml_pow_f64(__x, __y); }
980
981__DEVICE__
982double powi(double __x, int __y) { return __ocml_pown_f64(__x, __y); }
983
984__DEVICE__
985double rcbrt(double __x) { return __ocml_rcbrt_f64(__x); }
986
987__DEVICE__
988double remainder(double __x, double __y) {
989  return __ocml_remainder_f64(__x, __y);
990}
991
992__DEVICE__
993double remquo(double __x, double __y, int *__quo) {
994  int __tmp;
995#ifdef __OPENMP_AMDGCN__
996#pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc)
997#endif
998  double __r = __ocml_remquo_f64(
999      __x, __y, (__attribute__((address_space(5))) int *)&__tmp);
1000  *__quo = __tmp;
1001
1002  return __r;
1003}
1004
1005__DEVICE__
1006double rhypot(double __x, double __y) { return __ocml_rhypot_f64(__x, __y); }
1007
1008__DEVICE__
1009double rint(double __x) { return __ocml_rint_f64(__x); }
1010
1011__DEVICE__
1012double rnorm(int __dim,
1013             const double *__a) { // TODO: placeholder until OCML adds support.
1014  double __r = 0;
1015  while (__dim--) {
1016    __r += __a[0] * __a[0];
1017    ++__a;
1018  }
1019
1020  return __ocml_rsqrt_f64(__r);
1021}
1022
1023__DEVICE__
1024double rnorm3d(double __x, double __y, double __z) {
1025  return __ocml_rlen3_f64(__x, __y, __z);
1026}
1027
1028__DEVICE__
1029double rnorm4d(double __x, double __y, double __z, double __w) {
1030  return __ocml_rlen4_f64(__x, __y, __z, __w);
1031}
1032
1033__DEVICE__
1034double round(double __x) { return __ocml_round_f64(__x); }
1035
1036__DEVICE__
1037double rsqrt(double __x) { return __ocml_rsqrt_f64(__x); }
1038
1039__DEVICE__
1040double scalbln(double __x, long int __n) {
1041  return (__n < INT_MAX) ? __ocml_scalbn_f64(__x, __n)
1042                         : __ocml_scalb_f64(__x, __n);
1043}
1044__DEVICE__
1045double scalbn(double __x, int __n) { return __ocml_scalbn_f64(__x, __n); }
1046
1047__DEVICE__
1048__RETURN_TYPE __signbit(double __x) { return __ocml_signbit_f64(__x); }
1049
1050__DEVICE__
1051double sin(double __x) { return __ocml_sin_f64(__x); }
1052
1053__DEVICE__
1054void sincos(double __x, double *__sinptr, double *__cosptr) {
1055  double __tmp;
1056#ifdef __OPENMP_AMDGCN__
1057#pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc)
1058#endif
1059  *__sinptr = __ocml_sincos_f64(
1060      __x, (__attribute__((address_space(5))) double *)&__tmp);
1061  *__cosptr = __tmp;
1062}
1063
1064__DEVICE__
1065void sincospi(double __x, double *__sinptr, double *__cosptr) {
1066  double __tmp;
1067#ifdef __OPENMP_AMDGCN__
1068#pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc)
1069#endif
1070  *__sinptr = __ocml_sincospi_f64(
1071      __x, (__attribute__((address_space(5))) double *)&__tmp);
1072  *__cosptr = __tmp;
1073}
1074
1075__DEVICE__
1076double sinh(double __x) { return __ocml_sinh_f64(__x); }
1077
1078__DEVICE__
1079double sinpi(double __x) { return __ocml_sinpi_f64(__x); }
1080
1081__DEVICE__
1082double sqrt(double __x) { return __ocml_sqrt_f64(__x); }
1083
1084__DEVICE__
1085double tan(double __x) { return __ocml_tan_f64(__x); }
1086
1087__DEVICE__
1088double tanh(double __x) { return __ocml_tanh_f64(__x); }
1089
1090__DEVICE__
1091double tgamma(double __x) { return __ocml_tgamma_f64(__x); }
1092
1093__DEVICE__
1094double trunc(double __x) { return __ocml_trunc_f64(__x); }
1095
1096__DEVICE__
1097double y0(double __x) { return __ocml_y0_f64(__x); }
1098
1099__DEVICE__
1100double y1(double __x) { return __ocml_y1_f64(__x); }
1101
1102__DEVICE__
1103double yn(int __n, double __x) { // TODO: we could use Ahmes multiplication
1104                                 // and the Miller & Brown algorithm
1105  //       for linear recurrences to get O(log n) steps, but it's unclear if
1106  //       it'd be beneficial in this case. Placeholder until OCML adds
1107  //       support.
1108  if (__n == 0)
1109    return y0(__x);
1110  if (__n == 1)
1111    return y1(__x);
1112
1113  double __x0 = y0(__x);
1114  double __x1 = y1(__x);
1115  for (int __i = 1; __i < __n; ++__i) {
1116    double __x2 = (2 * __i) / __x * __x1 - __x0;
1117    __x0 = __x1;
1118    __x1 = __x2;
1119  }
1120
1121  return __x1;
1122}
1123
1124// BEGIN INTRINSICS
1125#if defined OCML_BASIC_ROUNDED_OPERATIONS
1126__DEVICE__
1127double __dadd_rd(double __x, double __y) {
1128  return __ocml_add_rtn_f64(__x, __y);
1129}
1130__DEVICE__
1131double __dadd_rn(double __x, double __y) {
1132  return __ocml_add_rte_f64(__x, __y);
1133}
1134__DEVICE__
1135double __dadd_ru(double __x, double __y) {
1136  return __ocml_add_rtp_f64(__x, __y);
1137}
1138__DEVICE__
1139double __dadd_rz(double __x, double __y) {
1140  return __ocml_add_rtz_f64(__x, __y);
1141}
1142#else
1143__DEVICE__
1144double __dadd_rn(double __x, double __y) { return __x + __y; }
1145#endif
1146
1147#if defined OCML_BASIC_ROUNDED_OPERATIONS
1148__DEVICE__
1149double __ddiv_rd(double __x, double __y) {
1150  return __ocml_div_rtn_f64(__x, __y);
1151}
1152__DEVICE__
1153double __ddiv_rn(double __x, double __y) {
1154  return __ocml_div_rte_f64(__x, __y);
1155}
1156__DEVICE__
1157double __ddiv_ru(double __x, double __y) {
1158  return __ocml_div_rtp_f64(__x, __y);
1159}
1160__DEVICE__
1161double __ddiv_rz(double __x, double __y) {
1162  return __ocml_div_rtz_f64(__x, __y);
1163}
1164#else
1165__DEVICE__
1166double __ddiv_rn(double __x, double __y) { return __x / __y; }
1167#endif
1168
1169#if defined OCML_BASIC_ROUNDED_OPERATIONS
1170__DEVICE__
1171double __dmul_rd(double __x, double __y) {
1172  return __ocml_mul_rtn_f64(__x, __y);
1173}
1174__DEVICE__
1175double __dmul_rn(double __x, double __y) {
1176  return __ocml_mul_rte_f64(__x, __y);
1177}
1178__DEVICE__
1179double __dmul_ru(double __x, double __y) {
1180  return __ocml_mul_rtp_f64(__x, __y);
1181}
1182__DEVICE__
1183double __dmul_rz(double __x, double __y) {
1184  return __ocml_mul_rtz_f64(__x, __y);
1185}
1186#else
1187__DEVICE__
1188double __dmul_rn(double __x, double __y) { return __x * __y; }
1189#endif
1190
1191#if defined OCML_BASIC_ROUNDED_OPERATIONS
1192__DEVICE__
1193double __drcp_rd(double __x) { return __ocml_div_rtn_f64(1.0, __x); }
1194__DEVICE__
1195double __drcp_rn(double __x) { return __ocml_div_rte_f64(1.0, __x); }
1196__DEVICE__
1197double __drcp_ru(double __x) { return __ocml_div_rtp_f64(1.0, __x); }
1198__DEVICE__
1199double __drcp_rz(double __x) { return __ocml_div_rtz_f64(1.0, __x); }
1200#else
1201__DEVICE__
1202double __drcp_rn(double __x) { return 1.0 / __x; }
1203#endif
1204
1205#if defined OCML_BASIC_ROUNDED_OPERATIONS
1206__DEVICE__
1207double __dsqrt_rd(double __x) { return __ocml_sqrt_rtn_f64(__x); }
1208__DEVICE__
1209double __dsqrt_rn(double __x) { return __ocml_sqrt_rte_f64(__x); }
1210__DEVICE__
1211double __dsqrt_ru(double __x) { return __ocml_sqrt_rtp_f64(__x); }
1212__DEVICE__
1213double __dsqrt_rz(double __x) { return __ocml_sqrt_rtz_f64(__x); }
1214#else
1215__DEVICE__
1216double __dsqrt_rn(double __x) { return __ocml_sqrt_f64(__x); }
1217#endif
1218
1219#if defined OCML_BASIC_ROUNDED_OPERATIONS
1220__DEVICE__
1221double __dsub_rd(double __x, double __y) {
1222  return __ocml_sub_rtn_f64(__x, __y);
1223}
1224__DEVICE__
1225double __dsub_rn(double __x, double __y) {
1226  return __ocml_sub_rte_f64(__x, __y);
1227}
1228__DEVICE__
1229double __dsub_ru(double __x, double __y) {
1230  return __ocml_sub_rtp_f64(__x, __y);
1231}
1232__DEVICE__
1233double __dsub_rz(double __x, double __y) {
1234  return __ocml_sub_rtz_f64(__x, __y);
1235}
1236#else
1237__DEVICE__
1238double __dsub_rn(double __x, double __y) { return __x - __y; }
1239#endif
1240
1241#if defined OCML_BASIC_ROUNDED_OPERATIONS
1242__DEVICE__
1243double __fma_rd(double __x, double __y, double __z) {
1244  return __ocml_fma_rtn_f64(__x, __y, __z);
1245}
1246__DEVICE__
1247double __fma_rn(double __x, double __y, double __z) {
1248  return __ocml_fma_rte_f64(__x, __y, __z);
1249}
1250__DEVICE__
1251double __fma_ru(double __x, double __y, double __z) {
1252  return __ocml_fma_rtp_f64(__x, __y, __z);
1253}
1254__DEVICE__
1255double __fma_rz(double __x, double __y, double __z) {
1256  return __ocml_fma_rtz_f64(__x, __y, __z);
1257}
1258#else
1259__DEVICE__
1260double __fma_rn(double __x, double __y, double __z) {
1261  return __ocml_fma_f64(__x, __y, __z);
1262}
1263#endif
1264// END INTRINSICS
1265// END DOUBLE
1266
1267// C only macros
1268#if !defined(__cplusplus) && __STDC_VERSION__ >= 201112L
1269#define isfinite(__x) _Generic((__x), float : __finitef, double : __finite)(__x)
1270#define isinf(__x) _Generic((__x), float : __isinff, double : __isinf)(__x)
1271#define isnan(__x) _Generic((__x), float : __isnanf, double : __isnan)(__x)
1272#define signbit(__x)                                                           \
1273  _Generic((__x), float : __signbitf, double : __signbit)(__x)
1274#endif // !defined(__cplusplus) && __STDC_VERSION__ >= 201112L
1275
1276#if defined(__cplusplus)
1277template <class T> __DEVICE__ T min(T __arg1, T __arg2) {
1278  return (__arg1 < __arg2) ? __arg1 : __arg2;
1279}
1280
1281template <class T> __DEVICE__ T max(T __arg1, T __arg2) {
1282  return (__arg1 > __arg2) ? __arg1 : __arg2;
1283}
1284
1285__DEVICE__ int min(int __arg1, int __arg2) {
1286  return (__arg1 < __arg2) ? __arg1 : __arg2;
1287}
1288__DEVICE__ int max(int __arg1, int __arg2) {
1289  return (__arg1 > __arg2) ? __arg1 : __arg2;
1290}
1291
1292__DEVICE__
1293float max(float __x, float __y) { return fmaxf(__x, __y); }
1294
1295__DEVICE__
1296double max(double __x, double __y) { return fmax(__x, __y); }
1297
1298__DEVICE__
1299float min(float __x, float __y) { return fminf(__x, __y); }
1300
1301__DEVICE__
1302double min(double __x, double __y) { return fmin(__x, __y); }
1303
1304#if !defined(__HIPCC_RTC__) && !defined(__OPENMP_AMDGCN__)
1305__host__ inline static int min(int __arg1, int __arg2) {
1306  return std::min(__arg1, __arg2);
1307}
1308
1309__host__ inline static int max(int __arg1, int __arg2) {
1310  return std::max(__arg1, __arg2);
1311}
1312#endif // !defined(__HIPCC_RTC__) && !defined(__OPENMP_AMDGCN__)
1313#endif
1314
1315#pragma pop_macro("__DEVICE__")
1316#pragma pop_macro("__RETURN_TYPE")
1317
1318#endif // __CLANG_HIP_MATH_H__
1319