sharedRuntimeTrig.cpp revision 1472:c18cbe5936b8
1/*
2 * Copyright (c) 2001, 2005, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
12 * version 2 for more details (a copy is included in the LICENSE file that
13 * accompanied this code).
14 *
15 * You should have received a copy of the GNU General Public License version
16 * 2 along with this work; if not, write to the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20 * or visit www.oracle.com if you need additional information or have any
21 * questions.
22 *
23 */
24
25#include "incls/_precompiled.incl"
26#include "incls/_sharedRuntimeTrig.cpp.incl"
27
28// This file contains copies of the fdlibm routines used by
29// StrictMath. It turns out that it is almost always required to use
30// these runtime routines; the Intel CPU doesn't meet the Java
31// specification for sin/cos outside a certain limited argument range,
32// and the SPARC CPU doesn't appear to have sin/cos instructions. It
33// also turns out that avoiding the indirect call through function
34// pointer out to libjava.so in SharedRuntime speeds these routines up
35// by roughly 15% on both Win32/x86 and Solaris/SPARC.
36
37// Enabling optimizations in this file causes incorrect code to be
38// generated; can not figure out how to turn down optimization for one
39// file in the IDE on Windows
40#ifdef WIN32
41# pragma optimize ( "", off )
42#endif
43
44/* The above workaround now causes more problems with the latest MS compiler.
45 * Visual Studio 2010's /GS option tries to guard against buffer overruns.
46 * /GS is on by default if you specify optimizations, which we do globally
47 * via /W3 /O2. However the above selective turning off of optimizations means
48 * that /GS issues a warning "4748". And since we treat warnings as errors (/WX)
49 * then the compilation fails. There are several possible solutions
50 * (1) Remove that pragma above as obsolete with VS2010 - requires testing.
51 * (2) Stop treating warnings as errors - would be a backward step
52 * (3) Disable /GS - may help performance but you lose the security checks
53 * (4) Disable the warning with "#pragma warning( disable : 4748 )"
54 * (5) Disable planting the code with  __declspec(safebuffers)
55 * I've opted for (5) although we should investigate the local performance
56 * benefits of (1) and global performance benefit of (3).
57 */
58#if defined(WIN32) && (defined(_MSC_VER) && (_MSC_VER >= 1600))
59#define SAFEBUF __declspec(safebuffers)
60#else
61#define SAFEBUF
62#endif
63
64#include <math.h>
65
66// VM_LITTLE_ENDIAN is #defined appropriately in the Makefiles
67// [jk] this is not 100% correct because the float word order may different
68// from the byte order (e.g. on ARM)
69#ifdef VM_LITTLE_ENDIAN
70# define __HI(x) *(1+(int*)&x)
71# define __LO(x) *(int*)&x
72#else
73# define __HI(x) *(int*)&x
74# define __LO(x) *(1+(int*)&x)
75#endif
76
77static double copysignA(double x, double y) {
78  __HI(x) = (__HI(x)&0x7fffffff)|(__HI(y)&0x80000000);
79  return x;
80}
81
82/*
83 * scalbn (double x, int n)
84 * scalbn(x,n) returns x* 2**n  computed by  exponent
85 * manipulation rather than by actually performing an
86 * exponentiation or a multiplication.
87 */
88
89static const double
90two54   =  1.80143985094819840000e+16, /* 0x43500000, 0x00000000 */
91twom54  =  5.55111512312578270212e-17, /* 0x3C900000, 0x00000000 */
92hugeX  = 1.0e+300,
93tiny   = 1.0e-300;
94
95static double scalbnA (double x, int n) {
96  int  k,hx,lx;
97  hx = __HI(x);
98  lx = __LO(x);
99  k = (hx&0x7ff00000)>>20;              /* extract exponent */
100  if (k==0) {                           /* 0 or subnormal x */
101    if ((lx|(hx&0x7fffffff))==0) return x; /* +-0 */
102    x *= two54;
103    hx = __HI(x);
104    k = ((hx&0x7ff00000)>>20) - 54;
105    if (n< -50000) return tiny*x;       /*underflow*/
106  }
107  if (k==0x7ff) return x+x;             /* NaN or Inf */
108  k = k+n;
109  if (k >  0x7fe) return hugeX*copysignA(hugeX,x); /* overflow  */
110  if (k > 0)                            /* normal result */
111    {__HI(x) = (hx&0x800fffff)|(k<<20); return x;}
112  if (k <= -54) {
113    if (n > 50000)      /* in case integer overflow in n+k */
114      return hugeX*copysignA(hugeX,x);  /*overflow*/
115    else return tiny*copysignA(tiny,x); /*underflow*/
116  }
117  k += 54;                              /* subnormal result */
118  __HI(x) = (hx&0x800fffff)|(k<<20);
119  return x*twom54;
120}
121
122/*
123 * __kernel_rem_pio2(x,y,e0,nx,prec,ipio2)
124 * double x[],y[]; int e0,nx,prec; int ipio2[];
125 *
126 * __kernel_rem_pio2 return the last three digits of N with
127 *              y = x - N*pi/2
128 * so that |y| < pi/2.
129 *
130 * The method is to compute the integer (mod 8) and fraction parts of
131 * (2/pi)*x without doing the full multiplication. In general we
132 * skip the part of the product that are known to be a huge integer (
133 * more accurately, = 0 mod 8 ). Thus the number of operations are
134 * independent of the exponent of the input.
135 *
136 * (2/pi) is represented by an array of 24-bit integers in ipio2[].
137 *
138 * Input parameters:
139 *      x[]     The input value (must be positive) is broken into nx
140 *              pieces of 24-bit integers in double precision format.
141 *              x[i] will be the i-th 24 bit of x. The scaled exponent
142 *              of x[0] is given in input parameter e0 (i.e., x[0]*2^e0
143 *              match x's up to 24 bits.
144 *
145 *              Example of breaking a double positive z into x[0]+x[1]+x[2]:
146 *                      e0 = ilogb(z)-23
147 *                      z  = scalbn(z,-e0)
148 *              for i = 0,1,2
149 *                      x[i] = floor(z)
150 *                      z    = (z-x[i])*2**24
151 *
152 *
153 *      y[]     ouput result in an array of double precision numbers.
154 *              The dimension of y[] is:
155 *                      24-bit  precision       1
156 *                      53-bit  precision       2
157 *                      64-bit  precision       2
158 *                      113-bit precision       3
159 *              The actual value is the sum of them. Thus for 113-bit
160 *              precsion, one may have to do something like:
161 *
162 *              long double t,w,r_head, r_tail;
163 *              t = (long double)y[2] + (long double)y[1];
164 *              w = (long double)y[0];
165 *              r_head = t+w;
166 *              r_tail = w - (r_head - t);
167 *
168 *      e0      The exponent of x[0]
169 *
170 *      nx      dimension of x[]
171 *
172 *      prec    an interger indicating the precision:
173 *                      0       24  bits (single)
174 *                      1       53  bits (double)
175 *                      2       64  bits (extended)
176 *                      3       113 bits (quad)
177 *
178 *      ipio2[]
179 *              integer array, contains the (24*i)-th to (24*i+23)-th
180 *              bit of 2/pi after binary point. The corresponding
181 *              floating value is
182 *
183 *                      ipio2[i] * 2^(-24(i+1)).
184 *
185 * External function:
186 *      double scalbn(), floor();
187 *
188 *
189 * Here is the description of some local variables:
190 *
191 *      jk      jk+1 is the initial number of terms of ipio2[] needed
192 *              in the computation. The recommended value is 2,3,4,
193 *              6 for single, double, extended,and quad.
194 *
195 *      jz      local integer variable indicating the number of
196 *              terms of ipio2[] used.
197 *
198 *      jx      nx - 1
199 *
200 *      jv      index for pointing to the suitable ipio2[] for the
201 *              computation. In general, we want
202 *                      ( 2^e0*x[0] * ipio2[jv-1]*2^(-24jv) )/8
203 *              is an integer. Thus
204 *                      e0-3-24*jv >= 0 or (e0-3)/24 >= jv
205 *              Hence jv = max(0,(e0-3)/24).
206 *
207 *      jp      jp+1 is the number of terms in PIo2[] needed, jp = jk.
208 *
209 *      q[]     double array with integral value, representing the
210 *              24-bits chunk of the product of x and 2/pi.
211 *
212 *      q0      the corresponding exponent of q[0]. Note that the
213 *              exponent for q[i] would be q0-24*i.
214 *
215 *      PIo2[]  double precision array, obtained by cutting pi/2
216 *              into 24 bits chunks.
217 *
218 *      f[]     ipio2[] in floating point
219 *
220 *      iq[]    integer array by breaking up q[] in 24-bits chunk.
221 *
222 *      fq[]    final product of x*(2/pi) in fq[0],..,fq[jk]
223 *
224 *      ih      integer. If >0 it indicats q[] is >= 0.5, hence
225 *              it also indicates the *sign* of the result.
226 *
227 */
228
229
230/*
231 * Constants:
232 * The hexadecimal values are the intended ones for the following
233 * constants. The decimal values may be used, provided that the
234 * compiler will convert from decimal to binary accurately enough
235 * to produce the hexadecimal values shown.
236 */
237
238
239static const int init_jk[] = {2,3,4,6}; /* initial value for jk */
240
241static const double PIo2[] = {
242  1.57079625129699707031e+00, /* 0x3FF921FB, 0x40000000 */
243  7.54978941586159635335e-08, /* 0x3E74442D, 0x00000000 */
244  5.39030252995776476554e-15, /* 0x3CF84698, 0x80000000 */
245  3.28200341580791294123e-22, /* 0x3B78CC51, 0x60000000 */
246  1.27065575308067607349e-29, /* 0x39F01B83, 0x80000000 */
247  1.22933308981111328932e-36, /* 0x387A2520, 0x40000000 */
248  2.73370053816464559624e-44, /* 0x36E38222, 0x80000000 */
249  2.16741683877804819444e-51, /* 0x3569F31D, 0x00000000 */
250};
251
252static const double
253zeroB   = 0.0,
254one     = 1.0,
255two24B  = 1.67772160000000000000e+07, /* 0x41700000, 0x00000000 */
256twon24  = 5.96046447753906250000e-08; /* 0x3E700000, 0x00000000 */
257
258static SAFEBUF int __kernel_rem_pio2(double *x, double *y, int e0, int nx, int prec, const int *ipio2) {
259  int jz,jx,jv,jp,jk,carry,n,iq[20],i,j,k,m,q0,ih;
260  double z,fw,f[20],fq[20],q[20];
261
262  /* initialize jk*/
263  jk = init_jk[prec];
264  jp = jk;
265
266  /* determine jx,jv,q0, note that 3>q0 */
267  jx =  nx-1;
268  jv = (e0-3)/24; if(jv<0) jv=0;
269  q0 =  e0-24*(jv+1);
270
271  /* set up f[0] to f[jx+jk] where f[jx+jk] = ipio2[jv+jk] */
272  j = jv-jx; m = jx+jk;
273  for(i=0;i<=m;i++,j++) f[i] = (j<0)? zeroB : (double) ipio2[j];
274
275  /* compute q[0],q[1],...q[jk] */
276  for (i=0;i<=jk;i++) {
277    for(j=0,fw=0.0;j<=jx;j++) fw += x[j]*f[jx+i-j]; q[i] = fw;
278  }
279
280  jz = jk;
281recompute:
282  /* distill q[] into iq[] reversingly */
283  for(i=0,j=jz,z=q[jz];j>0;i++,j--) {
284    fw    =  (double)((int)(twon24* z));
285    iq[i] =  (int)(z-two24B*fw);
286    z     =  q[j-1]+fw;
287  }
288
289  /* compute n */
290  z  = scalbnA(z,q0);           /* actual value of z */
291  z -= 8.0*floor(z*0.125);              /* trim off integer >= 8 */
292  n  = (int) z;
293  z -= (double)n;
294  ih = 0;
295  if(q0>0) {    /* need iq[jz-1] to determine n */
296    i  = (iq[jz-1]>>(24-q0)); n += i;
297    iq[jz-1] -= i<<(24-q0);
298    ih = iq[jz-1]>>(23-q0);
299  }
300  else if(q0==0) ih = iq[jz-1]>>23;
301  else if(z>=0.5) ih=2;
302
303  if(ih>0) {    /* q > 0.5 */
304    n += 1; carry = 0;
305    for(i=0;i<jz ;i++) {        /* compute 1-q */
306      j = iq[i];
307      if(carry==0) {
308        if(j!=0) {
309          carry = 1; iq[i] = 0x1000000- j;
310        }
311      } else  iq[i] = 0xffffff - j;
312    }
313    if(q0>0) {          /* rare case: chance is 1 in 12 */
314      switch(q0) {
315      case 1:
316        iq[jz-1] &= 0x7fffff; break;
317      case 2:
318        iq[jz-1] &= 0x3fffff; break;
319      }
320    }
321    if(ih==2) {
322      z = one - z;
323      if(carry!=0) z -= scalbnA(one,q0);
324    }
325  }
326
327  /* check if recomputation is needed */
328  if(z==zeroB) {
329    j = 0;
330    for (i=jz-1;i>=jk;i--) j |= iq[i];
331    if(j==0) { /* need recomputation */
332      for(k=1;iq[jk-k]==0;k++);   /* k = no. of terms needed */
333
334      for(i=jz+1;i<=jz+k;i++) {   /* add q[jz+1] to q[jz+k] */
335        f[jx+i] = (double) ipio2[jv+i];
336        for(j=0,fw=0.0;j<=jx;j++) fw += x[j]*f[jx+i-j];
337        q[i] = fw;
338      }
339      jz += k;
340      goto recompute;
341    }
342  }
343
344  /* chop off zero terms */
345  if(z==0.0) {
346    jz -= 1; q0 -= 24;
347    while(iq[jz]==0) { jz--; q0-=24;}
348  } else { /* break z into 24-bit if neccessary */
349    z = scalbnA(z,-q0);
350    if(z>=two24B) {
351      fw = (double)((int)(twon24*z));
352      iq[jz] = (int)(z-two24B*fw);
353      jz += 1; q0 += 24;
354      iq[jz] = (int) fw;
355    } else iq[jz] = (int) z ;
356  }
357
358  /* convert integer "bit" chunk to floating-point value */
359  fw = scalbnA(one,q0);
360  for(i=jz;i>=0;i--) {
361    q[i] = fw*(double)iq[i]; fw*=twon24;
362  }
363
364  /* compute PIo2[0,...,jp]*q[jz,...,0] */
365  for(i=jz;i>=0;i--) {
366    for(fw=0.0,k=0;k<=jp&&k<=jz-i;k++) fw += PIo2[k]*q[i+k];
367    fq[jz-i] = fw;
368  }
369
370  /* compress fq[] into y[] */
371  switch(prec) {
372  case 0:
373    fw = 0.0;
374    for (i=jz;i>=0;i--) fw += fq[i];
375    y[0] = (ih==0)? fw: -fw;
376    break;
377  case 1:
378  case 2:
379    fw = 0.0;
380    for (i=jz;i>=0;i--) fw += fq[i];
381    y[0] = (ih==0)? fw: -fw;
382    fw = fq[0]-fw;
383    for (i=1;i<=jz;i++) fw += fq[i];
384    y[1] = (ih==0)? fw: -fw;
385    break;
386  case 3:       /* painful */
387    for (i=jz;i>0;i--) {
388      fw      = fq[i-1]+fq[i];
389      fq[i]  += fq[i-1]-fw;
390      fq[i-1] = fw;
391    }
392    for (i=jz;i>1;i--) {
393      fw      = fq[i-1]+fq[i];
394      fq[i]  += fq[i-1]-fw;
395      fq[i-1] = fw;
396    }
397    for (fw=0.0,i=jz;i>=2;i--) fw += fq[i];
398    if(ih==0) {
399      y[0] =  fq[0]; y[1] =  fq[1]; y[2] =  fw;
400    } else {
401      y[0] = -fq[0]; y[1] = -fq[1]; y[2] = -fw;
402    }
403  }
404  return n&7;
405}
406
407
408/*
409 * ====================================================
410 * Copyright (c) 1993 Oracle and/or its affilates. All rights reserved.
411 *
412 * Developed at SunPro, a Sun Microsystems, Inc. business.
413 * Permission to use, copy, modify, and distribute this
414 * software is freely granted, provided that this notice
415 * is preserved.
416 * ====================================================
417 *
418 */
419
420/* __ieee754_rem_pio2(x,y)
421 *
422 * return the remainder of x rem pi/2 in y[0]+y[1]
423 * use __kernel_rem_pio2()
424 */
425
426/*
427 * Table of constants for 2/pi, 396 Hex digits (476 decimal) of 2/pi
428 */
429static const int two_over_pi[] = {
430  0xA2F983, 0x6E4E44, 0x1529FC, 0x2757D1, 0xF534DD, 0xC0DB62,
431  0x95993C, 0x439041, 0xFE5163, 0xABDEBB, 0xC561B7, 0x246E3A,
432  0x424DD2, 0xE00649, 0x2EEA09, 0xD1921C, 0xFE1DEB, 0x1CB129,
433  0xA73EE8, 0x8235F5, 0x2EBB44, 0x84E99C, 0x7026B4, 0x5F7E41,
434  0x3991D6, 0x398353, 0x39F49C, 0x845F8B, 0xBDF928, 0x3B1FF8,
435  0x97FFDE, 0x05980F, 0xEF2F11, 0x8B5A0A, 0x6D1F6D, 0x367ECF,
436  0x27CB09, 0xB74F46, 0x3F669E, 0x5FEA2D, 0x7527BA, 0xC7EBE5,
437  0xF17B3D, 0x0739F7, 0x8A5292, 0xEA6BFB, 0x5FB11F, 0x8D5D08,
438  0x560330, 0x46FC7B, 0x6BABF0, 0xCFBC20, 0x9AF436, 0x1DA9E3,
439  0x91615E, 0xE61B08, 0x659985, 0x5F14A0, 0x68408D, 0xFFD880,
440  0x4D7327, 0x310606, 0x1556CA, 0x73A8C9, 0x60E27B, 0xC08C6B,
441};
442
443static const int npio2_hw[] = {
444  0x3FF921FB, 0x400921FB, 0x4012D97C, 0x401921FB, 0x401F6A7A, 0x4022D97C,
445  0x4025FDBB, 0x402921FB, 0x402C463A, 0x402F6A7A, 0x4031475C, 0x4032D97C,
446  0x40346B9C, 0x4035FDBB, 0x40378FDB, 0x403921FB, 0x403AB41B, 0x403C463A,
447  0x403DD85A, 0x403F6A7A, 0x40407E4C, 0x4041475C, 0x4042106C, 0x4042D97C,
448  0x4043A28C, 0x40446B9C, 0x404534AC, 0x4045FDBB, 0x4046C6CB, 0x40478FDB,
449  0x404858EB, 0x404921FB,
450};
451
452/*
453 * invpio2:  53 bits of 2/pi
454 * pio2_1:   first  33 bit of pi/2
455 * pio2_1t:  pi/2 - pio2_1
456 * pio2_2:   second 33 bit of pi/2
457 * pio2_2t:  pi/2 - (pio2_1+pio2_2)
458 * pio2_3:   third  33 bit of pi/2
459 * pio2_3t:  pi/2 - (pio2_1+pio2_2+pio2_3)
460 */
461
462static const double
463zeroA =  0.00000000000000000000e+00, /* 0x00000000, 0x00000000 */
464half =  5.00000000000000000000e-01, /* 0x3FE00000, 0x00000000 */
465two24A =  1.67772160000000000000e+07, /* 0x41700000, 0x00000000 */
466invpio2 =  6.36619772367581382433e-01, /* 0x3FE45F30, 0x6DC9C883 */
467pio2_1  =  1.57079632673412561417e+00, /* 0x3FF921FB, 0x54400000 */
468pio2_1t =  6.07710050650619224932e-11, /* 0x3DD0B461, 0x1A626331 */
469pio2_2  =  6.07710050630396597660e-11, /* 0x3DD0B461, 0x1A600000 */
470pio2_2t =  2.02226624879595063154e-21, /* 0x3BA3198A, 0x2E037073 */
471pio2_3  =  2.02226624871116645580e-21, /* 0x3BA3198A, 0x2E000000 */
472pio2_3t =  8.47842766036889956997e-32; /* 0x397B839A, 0x252049C1 */
473
474static SAFEBUF int __ieee754_rem_pio2(double x, double *y) {
475  double z,w,t,r,fn;
476  double tx[3];
477  int e0,i,j,nx,n,ix,hx,i0;
478
479  i0 = ((*(int*)&two24A)>>30)^1;        /* high word index */
480  hx = *(i0+(int*)&x);          /* high word of x */
481  ix = hx&0x7fffffff;
482  if(ix<=0x3fe921fb)   /* |x| ~<= pi/4 , no need for reduction */
483    {y[0] = x; y[1] = 0; return 0;}
484  if(ix<0x4002d97c) {  /* |x| < 3pi/4, special case with n=+-1 */
485    if(hx>0) {
486      z = x - pio2_1;
487      if(ix!=0x3ff921fb) {    /* 33+53 bit pi is good enough */
488        y[0] = z - pio2_1t;
489        y[1] = (z-y[0])-pio2_1t;
490      } else {                /* near pi/2, use 33+33+53 bit pi */
491        z -= pio2_2;
492        y[0] = z - pio2_2t;
493        y[1] = (z-y[0])-pio2_2t;
494      }
495      return 1;
496    } else {    /* negative x */
497      z = x + pio2_1;
498      if(ix!=0x3ff921fb) {    /* 33+53 bit pi is good enough */
499        y[0] = z + pio2_1t;
500        y[1] = (z-y[0])+pio2_1t;
501      } else {                /* near pi/2, use 33+33+53 bit pi */
502        z += pio2_2;
503        y[0] = z + pio2_2t;
504        y[1] = (z-y[0])+pio2_2t;
505      }
506      return -1;
507    }
508  }
509  if(ix<=0x413921fb) { /* |x| ~<= 2^19*(pi/2), medium size */
510    t  = fabsd(x);
511    n  = (int) (t*invpio2+half);
512    fn = (double)n;
513    r  = t-fn*pio2_1;
514    w  = fn*pio2_1t;    /* 1st round good to 85 bit */
515    if(n<32&&ix!=npio2_hw[n-1]) {
516      y[0] = r-w;       /* quick check no cancellation */
517    } else {
518      j  = ix>>20;
519      y[0] = r-w;
520      i = j-(((*(i0+(int*)&y[0]))>>20)&0x7ff);
521      if(i>16) {  /* 2nd iteration needed, good to 118 */
522        t  = r;
523        w  = fn*pio2_2;
524        r  = t-w;
525        w  = fn*pio2_2t-((t-r)-w);
526        y[0] = r-w;
527        i = j-(((*(i0+(int*)&y[0]))>>20)&0x7ff);
528        if(i>49)  {     /* 3rd iteration need, 151 bits acc */
529          t  = r;       /* will cover all possible cases */
530          w  = fn*pio2_3;
531          r  = t-w;
532          w  = fn*pio2_3t-((t-r)-w);
533          y[0] = r-w;
534        }
535      }
536    }
537    y[1] = (r-y[0])-w;
538    if(hx<0)    {y[0] = -y[0]; y[1] = -y[1]; return -n;}
539    else         return n;
540  }
541  /*
542   * all other (large) arguments
543   */
544  if(ix>=0x7ff00000) {          /* x is inf or NaN */
545    y[0]=y[1]=x-x; return 0;
546  }
547  /* set z = scalbn(|x|,ilogb(x)-23) */
548  *(1-i0+(int*)&z) = *(1-i0+(int*)&x);
549  e0    = (ix>>20)-1046;        /* e0 = ilogb(z)-23; */
550  *(i0+(int*)&z) = ix - (e0<<20);
551  for(i=0;i<2;i++) {
552    tx[i] = (double)((int)(z));
553    z     = (z-tx[i])*two24A;
554  }
555  tx[2] = z;
556  nx = 3;
557  while(tx[nx-1]==zeroA) nx--;  /* skip zero term */
558  n  =  __kernel_rem_pio2(tx,y,e0,nx,2,two_over_pi);
559  if(hx<0) {y[0] = -y[0]; y[1] = -y[1]; return -n;}
560  return n;
561}
562
563
564/* __kernel_sin( x, y, iy)
565 * kernel sin function on [-pi/4, pi/4], pi/4 ~ 0.7854
566 * Input x is assumed to be bounded by ~pi/4 in magnitude.
567 * Input y is the tail of x.
568 * Input iy indicates whether y is 0. (if iy=0, y assume to be 0).
569 *
570 * Algorithm
571 *      1. Since sin(-x) = -sin(x), we need only to consider positive x.
572 *      2. if x < 2^-27 (hx<0x3e400000 0), return x with inexact if x!=0.
573 *      3. sin(x) is approximated by a polynomial of degree 13 on
574 *         [0,pi/4]
575 *                               3            13
576 *              sin(x) ~ x + S1*x + ... + S6*x
577 *         where
578 *
579 *      |sin(x)         2     4     6     8     10     12  |     -58
580 *      |----- - (1+S1*x +S2*x +S3*x +S4*x +S5*x  +S6*x   )| <= 2
581 *      |  x                                               |
582 *
583 *      4. sin(x+y) = sin(x) + sin'(x')*y
584 *                  ~ sin(x) + (1-x*x/2)*y
585 *         For better accuracy, let
586 *                   3      2      2      2      2
587 *              r = x *(S2+x *(S3+x *(S4+x *(S5+x *S6))))
588 *         then                   3    2
589 *              sin(x) = x + (S1*x + (x *(r-y/2)+y))
590 */
591
592static const double
593S1  = -1.66666666666666324348e-01, /* 0xBFC55555, 0x55555549 */
594S2  =  8.33333333332248946124e-03, /* 0x3F811111, 0x1110F8A6 */
595S3  = -1.98412698298579493134e-04, /* 0xBF2A01A0, 0x19C161D5 */
596S4  =  2.75573137070700676789e-06, /* 0x3EC71DE3, 0x57B1FE7D */
597S5  = -2.50507602534068634195e-08, /* 0xBE5AE5E6, 0x8A2B9CEB */
598S6  =  1.58969099521155010221e-10; /* 0x3DE5D93A, 0x5ACFD57C */
599
600static double __kernel_sin(double x, double y, int iy)
601{
602        double z,r,v;
603        int ix;
604        ix = __HI(x)&0x7fffffff;        /* high word of x */
605        if(ix<0x3e400000)                       /* |x| < 2**-27 */
606           {if((int)x==0) return x;}            /* generate inexact */
607        z       =  x*x;
608        v       =  z*x;
609        r       =  S2+z*(S3+z*(S4+z*(S5+z*S6)));
610        if(iy==0) return x+v*(S1+z*r);
611        else      return x-((z*(half*y-v*r)-y)-v*S1);
612}
613
614/*
615 * __kernel_cos( x,  y )
616 * kernel cos function on [-pi/4, pi/4], pi/4 ~ 0.785398164
617 * Input x is assumed to be bounded by ~pi/4 in magnitude.
618 * Input y is the tail of x.
619 *
620 * Algorithm
621 *      1. Since cos(-x) = cos(x), we need only to consider positive x.
622 *      2. if x < 2^-27 (hx<0x3e400000 0), return 1 with inexact if x!=0.
623 *      3. cos(x) is approximated by a polynomial of degree 14 on
624 *         [0,pi/4]
625 *                                       4            14
626 *              cos(x) ~ 1 - x*x/2 + C1*x + ... + C6*x
627 *         where the remez error is
628 *
629 *      |              2     4     6     8     10    12     14 |     -58
630 *      |cos(x)-(1-.5*x +C1*x +C2*x +C3*x +C4*x +C5*x  +C6*x  )| <= 2
631 *      |                                                      |
632 *
633 *                     4     6     8     10    12     14
634 *      4. let r = C1*x +C2*x +C3*x +C4*x +C5*x  +C6*x  , then
635 *             cos(x) = 1 - x*x/2 + r
636 *         since cos(x+y) ~ cos(x) - sin(x)*y
637 *                        ~ cos(x) - x*y,
638 *         a correction term is necessary in cos(x) and hence
639 *              cos(x+y) = 1 - (x*x/2 - (r - x*y))
640 *         For better accuracy when x > 0.3, let qx = |x|/4 with
641 *         the last 32 bits mask off, and if x > 0.78125, let qx = 0.28125.
642 *         Then
643 *              cos(x+y) = (1-qx) - ((x*x/2-qx) - (r-x*y)).
644 *         Note that 1-qx and (x*x/2-qx) is EXACT here, and the
645 *         magnitude of the latter is at least a quarter of x*x/2,
646 *         thus, reducing the rounding error in the subtraction.
647 */
648
649static const double
650C1  =  4.16666666666666019037e-02, /* 0x3FA55555, 0x5555554C */
651C2  = -1.38888888888741095749e-03, /* 0xBF56C16C, 0x16C15177 */
652C3  =  2.48015872894767294178e-05, /* 0x3EFA01A0, 0x19CB1590 */
653C4  = -2.75573143513906633035e-07, /* 0xBE927E4F, 0x809C52AD */
654C5  =  2.08757232129817482790e-09, /* 0x3E21EE9E, 0xBDB4B1C4 */
655C6  = -1.13596475577881948265e-11; /* 0xBDA8FAE9, 0xBE8838D4 */
656
657static double __kernel_cos(double x, double y)
658{
659  double a,hz,z,r,qx;
660  int ix;
661  ix = __HI(x)&0x7fffffff;      /* ix = |x|'s high word*/
662  if(ix<0x3e400000) {                   /* if x < 2**27 */
663    if(((int)x)==0) return one;         /* generate inexact */
664  }
665  z  = x*x;
666  r  = z*(C1+z*(C2+z*(C3+z*(C4+z*(C5+z*C6)))));
667  if(ix < 0x3FD33333)                   /* if |x| < 0.3 */
668    return one - (0.5*z - (z*r - x*y));
669  else {
670    if(ix > 0x3fe90000) {               /* x > 0.78125 */
671      qx = 0.28125;
672    } else {
673      __HI(qx) = ix-0x00200000; /* x/4 */
674      __LO(qx) = 0;
675    }
676    hz = 0.5*z-qx;
677    a  = one-qx;
678    return a - (hz - (z*r-x*y));
679  }
680}
681
682/* __kernel_tan( x, y, k )
683 * kernel tan function on [-pi/4, pi/4], pi/4 ~ 0.7854
684 * Input x is assumed to be bounded by ~pi/4 in magnitude.
685 * Input y is the tail of x.
686 * Input k indicates whether tan (if k=1) or
687 * -1/tan (if k= -1) is returned.
688 *
689 * Algorithm
690 *      1. Since tan(-x) = -tan(x), we need only to consider positive x.
691 *      2. if x < 2^-28 (hx<0x3e300000 0), return x with inexact if x!=0.
692 *      3. tan(x) is approximated by a odd polynomial of degree 27 on
693 *         [0,0.67434]
694 *                               3             27
695 *              tan(x) ~ x + T1*x + ... + T13*x
696 *         where
697 *
698 *              |tan(x)         2     4            26   |     -59.2
699 *              |----- - (1+T1*x +T2*x +.... +T13*x    )| <= 2
700 *              |  x                                    |
701 *
702 *         Note: tan(x+y) = tan(x) + tan'(x)*y
703 *                        ~ tan(x) + (1+x*x)*y
704 *         Therefore, for better accuracy in computing tan(x+y), let
705 *                   3      2      2       2       2
706 *              r = x *(T2+x *(T3+x *(...+x *(T12+x *T13))))
707 *         then
708 *                                  3    2
709 *              tan(x+y) = x + (T1*x + (x *(r+y)+y))
710 *
711 *      4. For x in [0.67434,pi/4],  let y = pi/4 - x, then
712 *              tan(x) = tan(pi/4-y) = (1-tan(y))/(1+tan(y))
713 *                     = 1 - 2*(tan(y) - (tan(y)^2)/(1+tan(y)))
714 */
715
716static const double
717pio4  =  7.85398163397448278999e-01, /* 0x3FE921FB, 0x54442D18 */
718pio4lo=  3.06161699786838301793e-17, /* 0x3C81A626, 0x33145C07 */
719T[] =  {
720  3.33333333333334091986e-01, /* 0x3FD55555, 0x55555563 */
721  1.33333333333201242699e-01, /* 0x3FC11111, 0x1110FE7A */
722  5.39682539762260521377e-02, /* 0x3FABA1BA, 0x1BB341FE */
723  2.18694882948595424599e-02, /* 0x3F9664F4, 0x8406D637 */
724  8.86323982359930005737e-03, /* 0x3F8226E3, 0xE96E8493 */
725  3.59207910759131235356e-03, /* 0x3F6D6D22, 0xC9560328 */
726  1.45620945432529025516e-03, /* 0x3F57DBC8, 0xFEE08315 */
727  5.88041240820264096874e-04, /* 0x3F4344D8, 0xF2F26501 */
728  2.46463134818469906812e-04, /* 0x3F3026F7, 0x1A8D1068 */
729  7.81794442939557092300e-05, /* 0x3F147E88, 0xA03792A6 */
730  7.14072491382608190305e-05, /* 0x3F12B80F, 0x32F0A7E9 */
731 -1.85586374855275456654e-05, /* 0xBEF375CB, 0xDB605373 */
732  2.59073051863633712884e-05, /* 0x3EFB2A70, 0x74BF7AD4 */
733};
734
735static double __kernel_tan(double x, double y, int iy)
736{
737  double z,r,v,w,s;
738  int ix,hx;
739  hx = __HI(x);   /* high word of x */
740  ix = hx&0x7fffffff;     /* high word of |x| */
741  if(ix<0x3e300000) {                     /* x < 2**-28 */
742    if((int)x==0) {                       /* generate inexact */
743      if (((ix | __LO(x)) | (iy + 1)) == 0)
744        return one / fabsd(x);
745      else {
746        if (iy == 1)
747          return x;
748        else {    /* compute -1 / (x+y) carefully */
749          double a, t;
750
751          z = w = x + y;
752          __LO(z) = 0;
753          v = y - (z - x);
754          t = a = -one / w;
755          __LO(t) = 0;
756          s = one + t * z;
757          return t + a * (s + t * v);
758        }
759      }
760    }
761  }
762  if(ix>=0x3FE59428) {                    /* |x|>=0.6744 */
763    if(hx<0) {x = -x; y = -y;}
764    z = pio4-x;
765    w = pio4lo-y;
766    x = z+w; y = 0.0;
767  }
768  z       =  x*x;
769  w       =  z*z;
770  /* Break x^5*(T[1]+x^2*T[2]+...) into
771   *    x^5(T[1]+x^4*T[3]+...+x^20*T[11]) +
772   *    x^5(x^2*(T[2]+x^4*T[4]+...+x^22*[T12]))
773   */
774  r = T[1]+w*(T[3]+w*(T[5]+w*(T[7]+w*(T[9]+w*T[11]))));
775  v = z*(T[2]+w*(T[4]+w*(T[6]+w*(T[8]+w*(T[10]+w*T[12])))));
776  s = z*x;
777  r = y + z*(s*(r+v)+y);
778  r += T[0]*s;
779  w = x+r;
780  if(ix>=0x3FE59428) {
781    v = (double)iy;
782    return (double)(1-((hx>>30)&2))*(v-2.0*(x-(w*w/(w+v)-r)));
783  }
784  if(iy==1) return w;
785  else {          /* if allow error up to 2 ulp,
786                     simply return -1.0/(x+r) here */
787    /*  compute -1.0/(x+r) accurately */
788    double a,t;
789    z  = w;
790    __LO(z) = 0;
791    v  = r-(z - x);     /* z+v = r+x */
792    t = a  = -1.0/w;    /* a = -1.0/w */
793    __LO(t) = 0;
794    s  = 1.0+t*z;
795    return t+a*(s+t*v);
796  }
797}
798
799
800//----------------------------------------------------------------------
801//
802// Routines for new sin/cos implementation
803//
804//----------------------------------------------------------------------
805
806/* sin(x)
807 * Return sine function of x.
808 *
809 * kernel function:
810 *      __kernel_sin            ... sine function on [-pi/4,pi/4]
811 *      __kernel_cos            ... cose function on [-pi/4,pi/4]
812 *      __ieee754_rem_pio2      ... argument reduction routine
813 *
814 * Method.
815 *      Let S,C and T denote the sin, cos and tan respectively on
816 *      [-PI/4, +PI/4]. Reduce the argument x to y1+y2 = x-k*pi/2
817 *      in [-pi/4 , +pi/4], and let n = k mod 4.
818 *      We have
819 *
820 *          n        sin(x)      cos(x)        tan(x)
821 *     ----------------------------------------------------------
822 *          0          S           C             T
823 *          1          C          -S            -1/T
824 *          2         -S          -C             T
825 *          3         -C           S            -1/T
826 *     ----------------------------------------------------------
827 *
828 * Special cases:
829 *      Let trig be any of sin, cos, or tan.
830 *      trig(+-INF)  is NaN, with signals;
831 *      trig(NaN)    is that NaN;
832 *
833 * Accuracy:
834 *      TRIG(x) returns trig(x) nearly rounded
835 */
836
837JRT_LEAF(jdouble, SharedRuntime::dsin(jdouble x))
838  double y[2],z=0.0;
839  int n, ix;
840
841  /* High word of x. */
842  ix = __HI(x);
843
844  /* |x| ~< pi/4 */
845  ix &= 0x7fffffff;
846  if(ix <= 0x3fe921fb) return __kernel_sin(x,z,0);
847
848  /* sin(Inf or NaN) is NaN */
849  else if (ix>=0x7ff00000) return x-x;
850
851  /* argument reduction needed */
852  else {
853    n = __ieee754_rem_pio2(x,y);
854    switch(n&3) {
855    case 0: return  __kernel_sin(y[0],y[1],1);
856    case 1: return  __kernel_cos(y[0],y[1]);
857    case 2: return -__kernel_sin(y[0],y[1],1);
858    default:
859      return -__kernel_cos(y[0],y[1]);
860    }
861  }
862JRT_END
863
864/* cos(x)
865 * Return cosine function of x.
866 *
867 * kernel function:
868 *      __kernel_sin            ... sine function on [-pi/4,pi/4]
869 *      __kernel_cos            ... cosine function on [-pi/4,pi/4]
870 *      __ieee754_rem_pio2      ... argument reduction routine
871 *
872 * Method.
873 *      Let S,C and T denote the sin, cos and tan respectively on
874 *      [-PI/4, +PI/4]. Reduce the argument x to y1+y2 = x-k*pi/2
875 *      in [-pi/4 , +pi/4], and let n = k mod 4.
876 *      We have
877 *
878 *          n        sin(x)      cos(x)        tan(x)
879 *     ----------------------------------------------------------
880 *          0          S           C             T
881 *          1          C          -S            -1/T
882 *          2         -S          -C             T
883 *          3         -C           S            -1/T
884 *     ----------------------------------------------------------
885 *
886 * Special cases:
887 *      Let trig be any of sin, cos, or tan.
888 *      trig(+-INF)  is NaN, with signals;
889 *      trig(NaN)    is that NaN;
890 *
891 * Accuracy:
892 *      TRIG(x) returns trig(x) nearly rounded
893 */
894
895JRT_LEAF(jdouble, SharedRuntime::dcos(jdouble x))
896  double y[2],z=0.0;
897  int n, ix;
898
899  /* High word of x. */
900  ix = __HI(x);
901
902  /* |x| ~< pi/4 */
903  ix &= 0x7fffffff;
904  if(ix <= 0x3fe921fb) return __kernel_cos(x,z);
905
906  /* cos(Inf or NaN) is NaN */
907  else if (ix>=0x7ff00000) return x-x;
908
909  /* argument reduction needed */
910  else {
911    n = __ieee754_rem_pio2(x,y);
912    switch(n&3) {
913    case 0: return  __kernel_cos(y[0],y[1]);
914    case 1: return -__kernel_sin(y[0],y[1],1);
915    case 2: return -__kernel_cos(y[0],y[1]);
916    default:
917      return  __kernel_sin(y[0],y[1],1);
918    }
919  }
920JRT_END
921
922/* tan(x)
923 * Return tangent function of x.
924 *
925 * kernel function:
926 *      __kernel_tan            ... tangent function on [-pi/4,pi/4]
927 *      __ieee754_rem_pio2      ... argument reduction routine
928 *
929 * Method.
930 *      Let S,C and T denote the sin, cos and tan respectively on
931 *      [-PI/4, +PI/4]. Reduce the argument x to y1+y2 = x-k*pi/2
932 *      in [-pi/4 , +pi/4], and let n = k mod 4.
933 *      We have
934 *
935 *          n        sin(x)      cos(x)        tan(x)
936 *     ----------------------------------------------------------
937 *          0          S           C             T
938 *          1          C          -S            -1/T
939 *          2         -S          -C             T
940 *          3         -C           S            -1/T
941 *     ----------------------------------------------------------
942 *
943 * Special cases:
944 *      Let trig be any of sin, cos, or tan.
945 *      trig(+-INF)  is NaN, with signals;
946 *      trig(NaN)    is that NaN;
947 *
948 * Accuracy:
949 *      TRIG(x) returns trig(x) nearly rounded
950 */
951
952JRT_LEAF(jdouble, SharedRuntime::dtan(jdouble x))
953  double y[2],z=0.0;
954  int n, ix;
955
956  /* High word of x. */
957  ix = __HI(x);
958
959  /* |x| ~< pi/4 */
960  ix &= 0x7fffffff;
961  if(ix <= 0x3fe921fb) return __kernel_tan(x,z,1);
962
963  /* tan(Inf or NaN) is NaN */
964  else if (ix>=0x7ff00000) return x-x;            /* NaN */
965
966  /* argument reduction needed */
967  else {
968    n = __ieee754_rem_pio2(x,y);
969    return __kernel_tan(y[0],y[1],1-((n&1)<<1)); /*   1 -- n even
970                                                     -1 -- n odd */
971  }
972JRT_END
973
974
975#ifdef WIN32
976# pragma optimize ( "", on )
977#endif
978