sharedRuntimeTrig.cpp revision 6014:8a9bb7821e28
1/*
2 * Copyright (c) 2001, 2010, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
12 * version 2 for more details (a copy is included in the LICENSE file that
13 * accompanied this code).
14 *
15 * You should have received a copy of the GNU General Public License version
16 * 2 along with this work; if not, write to the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20 * or visit www.oracle.com if you need additional information or have any
21 * questions.
22 *
23 */
24
25#include "precompiled.hpp"
26#include "prims/jni.h"
27#include "runtime/interfaceSupport.hpp"
28#include "runtime/sharedRuntime.hpp"
29
30// This file contains copies of the fdlibm routines used by
31// StrictMath. It turns out that it is almost always required to use
32// these runtime routines; the Intel CPU doesn't meet the Java
33// specification for sin/cos outside a certain limited argument range,
34// and the SPARC CPU doesn't appear to have sin/cos instructions. It
35// also turns out that avoiding the indirect call through function
36// pointer out to libjava.so in SharedRuntime speeds these routines up
37// by roughly 15% on both Win32/x86 and Solaris/SPARC.
38
39// Enabling optimizations in this file causes incorrect code to be
40// generated; can not figure out how to turn down optimization for one
41// file in the IDE on Windows
42#ifdef WIN32
43# pragma optimize ( "", off )
44#endif
45
46/* The above workaround now causes more problems with the latest MS compiler.
47 * Visual Studio 2010's /GS option tries to guard against buffer overruns.
48 * /GS is on by default if you specify optimizations, which we do globally
49 * via /W3 /O2. However the above selective turning off of optimizations means
50 * that /GS issues a warning "4748". And since we treat warnings as errors (/WX)
51 * then the compilation fails. There are several possible solutions
52 * (1) Remove that pragma above as obsolete with VS2010 - requires testing.
53 * (2) Stop treating warnings as errors - would be a backward step
54 * (3) Disable /GS - may help performance but you lose the security checks
55 * (4) Disable the warning with "#pragma warning( disable : 4748 )"
56 * (5) Disable planting the code with  __declspec(safebuffers)
57 * I've opted for (5) although we should investigate the local performance
58 * benefits of (1) and global performance benefit of (3).
59 */
60#if defined(WIN32) && (defined(_MSC_VER) && (_MSC_VER >= 1600))
61#define SAFEBUF __declspec(safebuffers)
62#else
63#define SAFEBUF
64#endif
65
66#include <math.h>
67
68// VM_LITTLE_ENDIAN is #defined appropriately in the Makefiles
69// [jk] this is not 100% correct because the float word order may different
70// from the byte order (e.g. on ARM)
71#ifdef VM_LITTLE_ENDIAN
72# define __HI(x) *(1+(int*)&x)
73# define __LO(x) *(int*)&x
74#else
75# define __HI(x) *(int*)&x
76# define __LO(x) *(1+(int*)&x)
77#endif
78
79static double copysignA(double x, double y) {
80  __HI(x) = (__HI(x)&0x7fffffff)|(__HI(y)&0x80000000);
81  return x;
82}
83
84/*
85 * scalbn (double x, int n)
86 * scalbn(x,n) returns x* 2**n  computed by  exponent
87 * manipulation rather than by actually performing an
88 * exponentiation or a multiplication.
89 */
90
91static const double
92two54   =  1.80143985094819840000e+16, /* 0x43500000, 0x00000000 */
93twom54  =  5.55111512312578270212e-17, /* 0x3C900000, 0x00000000 */
94hugeX  = 1.0e+300,
95tiny   = 1.0e-300;
96
97static double scalbnA (double x, int n) {
98  int  k,hx,lx;
99  hx = __HI(x);
100  lx = __LO(x);
101  k = (hx&0x7ff00000)>>20;              /* extract exponent */
102  if (k==0) {                           /* 0 or subnormal x */
103    if ((lx|(hx&0x7fffffff))==0) return x; /* +-0 */
104    x *= two54;
105    hx = __HI(x);
106    k = ((hx&0x7ff00000)>>20) - 54;
107    if (n< -50000) return tiny*x;       /*underflow*/
108  }
109  if (k==0x7ff) return x+x;             /* NaN or Inf */
110  k = k+n;
111  if (k >  0x7fe) return hugeX*copysignA(hugeX,x); /* overflow  */
112  if (k > 0)                            /* normal result */
113    {__HI(x) = (hx&0x800fffff)|(k<<20); return x;}
114  if (k <= -54) {
115    if (n > 50000)      /* in case integer overflow in n+k */
116      return hugeX*copysignA(hugeX,x);  /*overflow*/
117    else return tiny*copysignA(tiny,x); /*underflow*/
118  }
119  k += 54;                              /* subnormal result */
120  __HI(x) = (hx&0x800fffff)|(k<<20);
121  return x*twom54;
122}
123
124/*
125 * __kernel_rem_pio2(x,y,e0,nx,prec,ipio2)
126 * double x[],y[]; int e0,nx,prec; int ipio2[];
127 *
128 * __kernel_rem_pio2 return the last three digits of N with
129 *              y = x - N*pi/2
130 * so that |y| < pi/2.
131 *
132 * The method is to compute the integer (mod 8) and fraction parts of
133 * (2/pi)*x without doing the full multiplication. In general we
134 * skip the part of the product that are known to be a huge integer (
135 * more accurately, = 0 mod 8 ). Thus the number of operations are
136 * independent of the exponent of the input.
137 *
138 * (2/pi) is represented by an array of 24-bit integers in ipio2[].
139 *
140 * Input parameters:
141 *      x[]     The input value (must be positive) is broken into nx
142 *              pieces of 24-bit integers in double precision format.
143 *              x[i] will be the i-th 24 bit of x. The scaled exponent
144 *              of x[0] is given in input parameter e0 (i.e., x[0]*2^e0
145 *              match x's up to 24 bits.
146 *
147 *              Example of breaking a double positive z into x[0]+x[1]+x[2]:
148 *                      e0 = ilogb(z)-23
149 *                      z  = scalbn(z,-e0)
150 *              for i = 0,1,2
151 *                      x[i] = floor(z)
152 *                      z    = (z-x[i])*2**24
153 *
154 *
155 *      y[]     ouput result in an array of double precision numbers.
156 *              The dimension of y[] is:
157 *                      24-bit  precision       1
158 *                      53-bit  precision       2
159 *                      64-bit  precision       2
160 *                      113-bit precision       3
161 *              The actual value is the sum of them. Thus for 113-bit
162 *              precsion, one may have to do something like:
163 *
164 *              long double t,w,r_head, r_tail;
165 *              t = (long double)y[2] + (long double)y[1];
166 *              w = (long double)y[0];
167 *              r_head = t+w;
168 *              r_tail = w - (r_head - t);
169 *
170 *      e0      The exponent of x[0]
171 *
172 *      nx      dimension of x[]
173 *
174 *      prec    an interger indicating the precision:
175 *                      0       24  bits (single)
176 *                      1       53  bits (double)
177 *                      2       64  bits (extended)
178 *                      3       113 bits (quad)
179 *
180 *      ipio2[]
181 *              integer array, contains the (24*i)-th to (24*i+23)-th
182 *              bit of 2/pi after binary point. The corresponding
183 *              floating value is
184 *
185 *                      ipio2[i] * 2^(-24(i+1)).
186 *
187 * External function:
188 *      double scalbn(), floor();
189 *
190 *
191 * Here is the description of some local variables:
192 *
193 *      jk      jk+1 is the initial number of terms of ipio2[] needed
194 *              in the computation. The recommended value is 2,3,4,
195 *              6 for single, double, extended,and quad.
196 *
197 *      jz      local integer variable indicating the number of
198 *              terms of ipio2[] used.
199 *
200 *      jx      nx - 1
201 *
202 *      jv      index for pointing to the suitable ipio2[] for the
203 *              computation. In general, we want
204 *                      ( 2^e0*x[0] * ipio2[jv-1]*2^(-24jv) )/8
205 *              is an integer. Thus
206 *                      e0-3-24*jv >= 0 or (e0-3)/24 >= jv
207 *              Hence jv = max(0,(e0-3)/24).
208 *
209 *      jp      jp+1 is the number of terms in PIo2[] needed, jp = jk.
210 *
211 *      q[]     double array with integral value, representing the
212 *              24-bits chunk of the product of x and 2/pi.
213 *
214 *      q0      the corresponding exponent of q[0]. Note that the
215 *              exponent for q[i] would be q0-24*i.
216 *
217 *      PIo2[]  double precision array, obtained by cutting pi/2
218 *              into 24 bits chunks.
219 *
220 *      f[]     ipio2[] in floating point
221 *
222 *      iq[]    integer array by breaking up q[] in 24-bits chunk.
223 *
224 *      fq[]    final product of x*(2/pi) in fq[0],..,fq[jk]
225 *
226 *      ih      integer. If >0 it indicates q[] is >= 0.5, hence
227 *              it also indicates the *sign* of the result.
228 *
229 */
230
231
232/*
233 * Constants:
234 * The hexadecimal values are the intended ones for the following
235 * constants. The decimal values may be used, provided that the
236 * compiler will convert from decimal to binary accurately enough
237 * to produce the hexadecimal values shown.
238 */
239
240
241static const int init_jk[] = {2,3,4,6}; /* initial value for jk */
242
243static const double PIo2[] = {
244  1.57079625129699707031e+00, /* 0x3FF921FB, 0x40000000 */
245  7.54978941586159635335e-08, /* 0x3E74442D, 0x00000000 */
246  5.39030252995776476554e-15, /* 0x3CF84698, 0x80000000 */
247  3.28200341580791294123e-22, /* 0x3B78CC51, 0x60000000 */
248  1.27065575308067607349e-29, /* 0x39F01B83, 0x80000000 */
249  1.22933308981111328932e-36, /* 0x387A2520, 0x40000000 */
250  2.73370053816464559624e-44, /* 0x36E38222, 0x80000000 */
251  2.16741683877804819444e-51, /* 0x3569F31D, 0x00000000 */
252};
253
254static const double
255zeroB   = 0.0,
256one     = 1.0,
257two24B  = 1.67772160000000000000e+07, /* 0x41700000, 0x00000000 */
258twon24  = 5.96046447753906250000e-08; /* 0x3E700000, 0x00000000 */
259
260static SAFEBUF int __kernel_rem_pio2(double *x, double *y, int e0, int nx, int prec, const int *ipio2) {
261  int jz,jx,jv,jp,jk,carry,n,iq[20],i,j,k,m,q0,ih;
262  double z,fw,f[20],fq[20],q[20];
263
264  /* initialize jk*/
265  jk = init_jk[prec];
266  jp = jk;
267
268  /* determine jx,jv,q0, note that 3>q0 */
269  jx =  nx-1;
270  jv = (e0-3)/24; if(jv<0) jv=0;
271  q0 =  e0-24*(jv+1);
272
273  /* set up f[0] to f[jx+jk] where f[jx+jk] = ipio2[jv+jk] */
274  j = jv-jx; m = jx+jk;
275  for(i=0;i<=m;i++,j++) f[i] = (j<0)? zeroB : (double) ipio2[j];
276
277  /* compute q[0],q[1],...q[jk] */
278  for (i=0;i<=jk;i++) {
279    for(j=0,fw=0.0;j<=jx;j++) fw += x[j]*f[jx+i-j]; q[i] = fw;
280  }
281
282  jz = jk;
283recompute:
284  /* distill q[] into iq[] reversingly */
285  for(i=0,j=jz,z=q[jz];j>0;i++,j--) {
286    fw    =  (double)((int)(twon24* z));
287    iq[i] =  (int)(z-two24B*fw);
288    z     =  q[j-1]+fw;
289  }
290
291  /* compute n */
292  z  = scalbnA(z,q0);           /* actual value of z */
293  z -= 8.0*floor(z*0.125);              /* trim off integer >= 8 */
294  n  = (int) z;
295  z -= (double)n;
296  ih = 0;
297  if(q0>0) {    /* need iq[jz-1] to determine n */
298    i  = (iq[jz-1]>>(24-q0)); n += i;
299    iq[jz-1] -= i<<(24-q0);
300    ih = iq[jz-1]>>(23-q0);
301  }
302  else if(q0==0) ih = iq[jz-1]>>23;
303  else if(z>=0.5) ih=2;
304
305  if(ih>0) {    /* q > 0.5 */
306    n += 1; carry = 0;
307    for(i=0;i<jz ;i++) {        /* compute 1-q */
308      j = iq[i];
309      if(carry==0) {
310        if(j!=0) {
311          carry = 1; iq[i] = 0x1000000- j;
312        }
313      } else  iq[i] = 0xffffff - j;
314    }
315    if(q0>0) {          /* rare case: chance is 1 in 12 */
316      switch(q0) {
317      case 1:
318        iq[jz-1] &= 0x7fffff; break;
319      case 2:
320        iq[jz-1] &= 0x3fffff; break;
321      }
322    }
323    if(ih==2) {
324      z = one - z;
325      if(carry!=0) z -= scalbnA(one,q0);
326    }
327  }
328
329  /* check if recomputation is needed */
330  if(z==zeroB) {
331    j = 0;
332    for (i=jz-1;i>=jk;i--) j |= iq[i];
333    if(j==0) { /* need recomputation */
334      for(k=1;iq[jk-k]==0;k++);   /* k = no. of terms needed */
335
336      for(i=jz+1;i<=jz+k;i++) {   /* add q[jz+1] to q[jz+k] */
337        f[jx+i] = (double) ipio2[jv+i];
338        for(j=0,fw=0.0;j<=jx;j++) fw += x[j]*f[jx+i-j];
339        q[i] = fw;
340      }
341      jz += k;
342      goto recompute;
343    }
344  }
345
346  /* chop off zero terms */
347  if(z==0.0) {
348    jz -= 1; q0 -= 24;
349    while(iq[jz]==0) { jz--; q0-=24;}
350  } else { /* break z into 24-bit if necessary */
351    z = scalbnA(z,-q0);
352    if(z>=two24B) {
353      fw = (double)((int)(twon24*z));
354      iq[jz] = (int)(z-two24B*fw);
355      jz += 1; q0 += 24;
356      iq[jz] = (int) fw;
357    } else iq[jz] = (int) z ;
358  }
359
360  /* convert integer "bit" chunk to floating-point value */
361  fw = scalbnA(one,q0);
362  for(i=jz;i>=0;i--) {
363    q[i] = fw*(double)iq[i]; fw*=twon24;
364  }
365
366  /* compute PIo2[0,...,jp]*q[jz,...,0] */
367  for(i=jz;i>=0;i--) {
368    for(fw=0.0,k=0;k<=jp&&k<=jz-i;k++) fw += PIo2[k]*q[i+k];
369    fq[jz-i] = fw;
370  }
371
372  /* compress fq[] into y[] */
373  switch(prec) {
374  case 0:
375    fw = 0.0;
376    for (i=jz;i>=0;i--) fw += fq[i];
377    y[0] = (ih==0)? fw: -fw;
378    break;
379  case 1:
380  case 2:
381    fw = 0.0;
382    for (i=jz;i>=0;i--) fw += fq[i];
383    y[0] = (ih==0)? fw: -fw;
384    fw = fq[0]-fw;
385    for (i=1;i<=jz;i++) fw += fq[i];
386    y[1] = (ih==0)? fw: -fw;
387    break;
388  case 3:       /* painful */
389    for (i=jz;i>0;i--) {
390      fw      = fq[i-1]+fq[i];
391      fq[i]  += fq[i-1]-fw;
392      fq[i-1] = fw;
393    }
394    for (i=jz;i>1;i--) {
395      fw      = fq[i-1]+fq[i];
396      fq[i]  += fq[i-1]-fw;
397      fq[i-1] = fw;
398    }
399    for (fw=0.0,i=jz;i>=2;i--) fw += fq[i];
400    if(ih==0) {
401      y[0] =  fq[0]; y[1] =  fq[1]; y[2] =  fw;
402    } else {
403      y[0] = -fq[0]; y[1] = -fq[1]; y[2] = -fw;
404    }
405  }
406  return n&7;
407}
408
409
410/*
411 * ====================================================
412 * Copyright (c) 1993 Oracle and/or its affiliates. All rights reserved.
413 *
414 * Developed at SunPro, a Sun Microsystems, Inc. business.
415 * Permission to use, copy, modify, and distribute this
416 * software is freely granted, provided that this notice
417 * is preserved.
418 * ====================================================
419 *
420 */
421
422/* __ieee754_rem_pio2(x,y)
423 *
424 * return the remainder of x rem pi/2 in y[0]+y[1]
425 * use __kernel_rem_pio2()
426 */
427
428/*
429 * Table of constants for 2/pi, 396 Hex digits (476 decimal) of 2/pi
430 */
431static const int two_over_pi[] = {
432  0xA2F983, 0x6E4E44, 0x1529FC, 0x2757D1, 0xF534DD, 0xC0DB62,
433  0x95993C, 0x439041, 0xFE5163, 0xABDEBB, 0xC561B7, 0x246E3A,
434  0x424DD2, 0xE00649, 0x2EEA09, 0xD1921C, 0xFE1DEB, 0x1CB129,
435  0xA73EE8, 0x8235F5, 0x2EBB44, 0x84E99C, 0x7026B4, 0x5F7E41,
436  0x3991D6, 0x398353, 0x39F49C, 0x845F8B, 0xBDF928, 0x3B1FF8,
437  0x97FFDE, 0x05980F, 0xEF2F11, 0x8B5A0A, 0x6D1F6D, 0x367ECF,
438  0x27CB09, 0xB74F46, 0x3F669E, 0x5FEA2D, 0x7527BA, 0xC7EBE5,
439  0xF17B3D, 0x0739F7, 0x8A5292, 0xEA6BFB, 0x5FB11F, 0x8D5D08,
440  0x560330, 0x46FC7B, 0x6BABF0, 0xCFBC20, 0x9AF436, 0x1DA9E3,
441  0x91615E, 0xE61B08, 0x659985, 0x5F14A0, 0x68408D, 0xFFD880,
442  0x4D7327, 0x310606, 0x1556CA, 0x73A8C9, 0x60E27B, 0xC08C6B,
443};
444
445static const int npio2_hw[] = {
446  0x3FF921FB, 0x400921FB, 0x4012D97C, 0x401921FB, 0x401F6A7A, 0x4022D97C,
447  0x4025FDBB, 0x402921FB, 0x402C463A, 0x402F6A7A, 0x4031475C, 0x4032D97C,
448  0x40346B9C, 0x4035FDBB, 0x40378FDB, 0x403921FB, 0x403AB41B, 0x403C463A,
449  0x403DD85A, 0x403F6A7A, 0x40407E4C, 0x4041475C, 0x4042106C, 0x4042D97C,
450  0x4043A28C, 0x40446B9C, 0x404534AC, 0x4045FDBB, 0x4046C6CB, 0x40478FDB,
451  0x404858EB, 0x404921FB,
452};
453
454/*
455 * invpio2:  53 bits of 2/pi
456 * pio2_1:   first  33 bit of pi/2
457 * pio2_1t:  pi/2 - pio2_1
458 * pio2_2:   second 33 bit of pi/2
459 * pio2_2t:  pi/2 - (pio2_1+pio2_2)
460 * pio2_3:   third  33 bit of pi/2
461 * pio2_3t:  pi/2 - (pio2_1+pio2_2+pio2_3)
462 */
463
464static const double
465zeroA =  0.00000000000000000000e+00, /* 0x00000000, 0x00000000 */
466half =  5.00000000000000000000e-01, /* 0x3FE00000, 0x00000000 */
467two24A =  1.67772160000000000000e+07, /* 0x41700000, 0x00000000 */
468invpio2 =  6.36619772367581382433e-01, /* 0x3FE45F30, 0x6DC9C883 */
469pio2_1  =  1.57079632673412561417e+00, /* 0x3FF921FB, 0x54400000 */
470pio2_1t =  6.07710050650619224932e-11, /* 0x3DD0B461, 0x1A626331 */
471pio2_2  =  6.07710050630396597660e-11, /* 0x3DD0B461, 0x1A600000 */
472pio2_2t =  2.02226624879595063154e-21, /* 0x3BA3198A, 0x2E037073 */
473pio2_3  =  2.02226624871116645580e-21, /* 0x3BA3198A, 0x2E000000 */
474pio2_3t =  8.47842766036889956997e-32; /* 0x397B839A, 0x252049C1 */
475
476static SAFEBUF int __ieee754_rem_pio2(double x, double *y) {
477  double z,w,t,r,fn;
478  double tx[3];
479  int e0,i,j,nx,n,ix,hx,i0;
480
481  i0 = ((*(int*)&two24A)>>30)^1;        /* high word index */
482  hx = *(i0+(int*)&x);          /* high word of x */
483  ix = hx&0x7fffffff;
484  if(ix<=0x3fe921fb)   /* |x| ~<= pi/4 , no need for reduction */
485    {y[0] = x; y[1] = 0; return 0;}
486  if(ix<0x4002d97c) {  /* |x| < 3pi/4, special case with n=+-1 */
487    if(hx>0) {
488      z = x - pio2_1;
489      if(ix!=0x3ff921fb) {    /* 33+53 bit pi is good enough */
490        y[0] = z - pio2_1t;
491        y[1] = (z-y[0])-pio2_1t;
492      } else {                /* near pi/2, use 33+33+53 bit pi */
493        z -= pio2_2;
494        y[0] = z - pio2_2t;
495        y[1] = (z-y[0])-pio2_2t;
496      }
497      return 1;
498    } else {    /* negative x */
499      z = x + pio2_1;
500      if(ix!=0x3ff921fb) {    /* 33+53 bit pi is good enough */
501        y[0] = z + pio2_1t;
502        y[1] = (z-y[0])+pio2_1t;
503      } else {                /* near pi/2, use 33+33+53 bit pi */
504        z += pio2_2;
505        y[0] = z + pio2_2t;
506        y[1] = (z-y[0])+pio2_2t;
507      }
508      return -1;
509    }
510  }
511  if(ix<=0x413921fb) { /* |x| ~<= 2^19*(pi/2), medium size */
512    t  = fabsd(x);
513    n  = (int) (t*invpio2+half);
514    fn = (double)n;
515    r  = t-fn*pio2_1;
516    w  = fn*pio2_1t;    /* 1st round good to 85 bit */
517    if(n<32&&ix!=npio2_hw[n-1]) {
518      y[0] = r-w;       /* quick check no cancellation */
519    } else {
520      j  = ix>>20;
521      y[0] = r-w;
522      i = j-(((*(i0+(int*)&y[0]))>>20)&0x7ff);
523      if(i>16) {  /* 2nd iteration needed, good to 118 */
524        t  = r;
525        w  = fn*pio2_2;
526        r  = t-w;
527        w  = fn*pio2_2t-((t-r)-w);
528        y[0] = r-w;
529        i = j-(((*(i0+(int*)&y[0]))>>20)&0x7ff);
530        if(i>49)  {     /* 3rd iteration need, 151 bits acc */
531          t  = r;       /* will cover all possible cases */
532          w  = fn*pio2_3;
533          r  = t-w;
534          w  = fn*pio2_3t-((t-r)-w);
535          y[0] = r-w;
536        }
537      }
538    }
539    y[1] = (r-y[0])-w;
540    if(hx<0)    {y[0] = -y[0]; y[1] = -y[1]; return -n;}
541    else         return n;
542  }
543  /*
544   * all other (large) arguments
545   */
546  if(ix>=0x7ff00000) {          /* x is inf or NaN */
547    y[0]=y[1]=x-x; return 0;
548  }
549  /* set z = scalbn(|x|,ilogb(x)-23) */
550  *(1-i0+(int*)&z) = *(1-i0+(int*)&x);
551  e0    = (ix>>20)-1046;        /* e0 = ilogb(z)-23; */
552  *(i0+(int*)&z) = ix - (e0<<20);
553  for(i=0;i<2;i++) {
554    tx[i] = (double)((int)(z));
555    z     = (z-tx[i])*two24A;
556  }
557  tx[2] = z;
558  nx = 3;
559  while(tx[nx-1]==zeroA) nx--;  /* skip zero term */
560  n  =  __kernel_rem_pio2(tx,y,e0,nx,2,two_over_pi);
561  if(hx<0) {y[0] = -y[0]; y[1] = -y[1]; return -n;}
562  return n;
563}
564
565
566/* __kernel_sin( x, y, iy)
567 * kernel sin function on [-pi/4, pi/4], pi/4 ~ 0.7854
568 * Input x is assumed to be bounded by ~pi/4 in magnitude.
569 * Input y is the tail of x.
570 * Input iy indicates whether y is 0. (if iy=0, y assume to be 0).
571 *
572 * Algorithm
573 *      1. Since sin(-x) = -sin(x), we need only to consider positive x.
574 *      2. if x < 2^-27 (hx<0x3e400000 0), return x with inexact if x!=0.
575 *      3. sin(x) is approximated by a polynomial of degree 13 on
576 *         [0,pi/4]
577 *                               3            13
578 *              sin(x) ~ x + S1*x + ... + S6*x
579 *         where
580 *
581 *      |sin(x)         2     4     6     8     10     12  |     -58
582 *      |----- - (1+S1*x +S2*x +S3*x +S4*x +S5*x  +S6*x   )| <= 2
583 *      |  x                                               |
584 *
585 *      4. sin(x+y) = sin(x) + sin'(x')*y
586 *                  ~ sin(x) + (1-x*x/2)*y
587 *         For better accuracy, let
588 *                   3      2      2      2      2
589 *              r = x *(S2+x *(S3+x *(S4+x *(S5+x *S6))))
590 *         then                   3    2
591 *              sin(x) = x + (S1*x + (x *(r-y/2)+y))
592 */
593
594static const double
595S1  = -1.66666666666666324348e-01, /* 0xBFC55555, 0x55555549 */
596S2  =  8.33333333332248946124e-03, /* 0x3F811111, 0x1110F8A6 */
597S3  = -1.98412698298579493134e-04, /* 0xBF2A01A0, 0x19C161D5 */
598S4  =  2.75573137070700676789e-06, /* 0x3EC71DE3, 0x57B1FE7D */
599S5  = -2.50507602534068634195e-08, /* 0xBE5AE5E6, 0x8A2B9CEB */
600S6  =  1.58969099521155010221e-10; /* 0x3DE5D93A, 0x5ACFD57C */
601
602static double __kernel_sin(double x, double y, int iy)
603{
604        double z,r,v;
605        int ix;
606        ix = __HI(x)&0x7fffffff;        /* high word of x */
607        if(ix<0x3e400000)                       /* |x| < 2**-27 */
608           {if((int)x==0) return x;}            /* generate inexact */
609        z       =  x*x;
610        v       =  z*x;
611        r       =  S2+z*(S3+z*(S4+z*(S5+z*S6)));
612        if(iy==0) return x+v*(S1+z*r);
613        else      return x-((z*(half*y-v*r)-y)-v*S1);
614}
615
616/*
617 * __kernel_cos( x,  y )
618 * kernel cos function on [-pi/4, pi/4], pi/4 ~ 0.785398164
619 * Input x is assumed to be bounded by ~pi/4 in magnitude.
620 * Input y is the tail of x.
621 *
622 * Algorithm
623 *      1. Since cos(-x) = cos(x), we need only to consider positive x.
624 *      2. if x < 2^-27 (hx<0x3e400000 0), return 1 with inexact if x!=0.
625 *      3. cos(x) is approximated by a polynomial of degree 14 on
626 *         [0,pi/4]
627 *                                       4            14
628 *              cos(x) ~ 1 - x*x/2 + C1*x + ... + C6*x
629 *         where the remez error is
630 *
631 *      |              2     4     6     8     10    12     14 |     -58
632 *      |cos(x)-(1-.5*x +C1*x +C2*x +C3*x +C4*x +C5*x  +C6*x  )| <= 2
633 *      |                                                      |
634 *
635 *                     4     6     8     10    12     14
636 *      4. let r = C1*x +C2*x +C3*x +C4*x +C5*x  +C6*x  , then
637 *             cos(x) = 1 - x*x/2 + r
638 *         since cos(x+y) ~ cos(x) - sin(x)*y
639 *                        ~ cos(x) - x*y,
640 *         a correction term is necessary in cos(x) and hence
641 *              cos(x+y) = 1 - (x*x/2 - (r - x*y))
642 *         For better accuracy when x > 0.3, let qx = |x|/4 with
643 *         the last 32 bits mask off, and if x > 0.78125, let qx = 0.28125.
644 *         Then
645 *              cos(x+y) = (1-qx) - ((x*x/2-qx) - (r-x*y)).
646 *         Note that 1-qx and (x*x/2-qx) is EXACT here, and the
647 *         magnitude of the latter is at least a quarter of x*x/2,
648 *         thus, reducing the rounding error in the subtraction.
649 */
650
651static const double
652C1  =  4.16666666666666019037e-02, /* 0x3FA55555, 0x5555554C */
653C2  = -1.38888888888741095749e-03, /* 0xBF56C16C, 0x16C15177 */
654C3  =  2.48015872894767294178e-05, /* 0x3EFA01A0, 0x19CB1590 */
655C4  = -2.75573143513906633035e-07, /* 0xBE927E4F, 0x809C52AD */
656C5  =  2.08757232129817482790e-09, /* 0x3E21EE9E, 0xBDB4B1C4 */
657C6  = -1.13596475577881948265e-11; /* 0xBDA8FAE9, 0xBE8838D4 */
658
659static double __kernel_cos(double x, double y)
660{
661  double a,h,z,r,qx;
662  int ix;
663  ix = __HI(x)&0x7fffffff;      /* ix = |x|'s high word*/
664  if(ix<0x3e400000) {                   /* if x < 2**27 */
665    if(((int)x)==0) return one;         /* generate inexact */
666  }
667  z  = x*x;
668  r  = z*(C1+z*(C2+z*(C3+z*(C4+z*(C5+z*C6)))));
669  if(ix < 0x3FD33333)                   /* if |x| < 0.3 */
670    return one - (0.5*z - (z*r - x*y));
671  else {
672    if(ix > 0x3fe90000) {               /* x > 0.78125 */
673      qx = 0.28125;
674    } else {
675      __HI(qx) = ix-0x00200000; /* x/4 */
676      __LO(qx) = 0;
677    }
678    h = 0.5*z-qx;
679    a = one-qx;
680    return a - (h - (z*r-x*y));
681  }
682}
683
684/* __kernel_tan( x, y, k )
685 * kernel tan function on [-pi/4, pi/4], pi/4 ~ 0.7854
686 * Input x is assumed to be bounded by ~pi/4 in magnitude.
687 * Input y is the tail of x.
688 * Input k indicates whether tan (if k=1) or
689 * -1/tan (if k= -1) is returned.
690 *
691 * Algorithm
692 *      1. Since tan(-x) = -tan(x), we need only to consider positive x.
693 *      2. if x < 2^-28 (hx<0x3e300000 0), return x with inexact if x!=0.
694 *      3. tan(x) is approximated by a odd polynomial of degree 27 on
695 *         [0,0.67434]
696 *                               3             27
697 *              tan(x) ~ x + T1*x + ... + T13*x
698 *         where
699 *
700 *              |tan(x)         2     4            26   |     -59.2
701 *              |----- - (1+T1*x +T2*x +.... +T13*x    )| <= 2
702 *              |  x                                    |
703 *
704 *         Note: tan(x+y) = tan(x) + tan'(x)*y
705 *                        ~ tan(x) + (1+x*x)*y
706 *         Therefore, for better accuracy in computing tan(x+y), let
707 *                   3      2      2       2       2
708 *              r = x *(T2+x *(T3+x *(...+x *(T12+x *T13))))
709 *         then
710 *                                  3    2
711 *              tan(x+y) = x + (T1*x + (x *(r+y)+y))
712 *
713 *      4. For x in [0.67434,pi/4],  let y = pi/4 - x, then
714 *              tan(x) = tan(pi/4-y) = (1-tan(y))/(1+tan(y))
715 *                     = 1 - 2*(tan(y) - (tan(y)^2)/(1+tan(y)))
716 */
717
718static const double
719pio4  =  7.85398163397448278999e-01, /* 0x3FE921FB, 0x54442D18 */
720pio4lo=  3.06161699786838301793e-17, /* 0x3C81A626, 0x33145C07 */
721T[] =  {
722  3.33333333333334091986e-01, /* 0x3FD55555, 0x55555563 */
723  1.33333333333201242699e-01, /* 0x3FC11111, 0x1110FE7A */
724  5.39682539762260521377e-02, /* 0x3FABA1BA, 0x1BB341FE */
725  2.18694882948595424599e-02, /* 0x3F9664F4, 0x8406D637 */
726  8.86323982359930005737e-03, /* 0x3F8226E3, 0xE96E8493 */
727  3.59207910759131235356e-03, /* 0x3F6D6D22, 0xC9560328 */
728  1.45620945432529025516e-03, /* 0x3F57DBC8, 0xFEE08315 */
729  5.88041240820264096874e-04, /* 0x3F4344D8, 0xF2F26501 */
730  2.46463134818469906812e-04, /* 0x3F3026F7, 0x1A8D1068 */
731  7.81794442939557092300e-05, /* 0x3F147E88, 0xA03792A6 */
732  7.14072491382608190305e-05, /* 0x3F12B80F, 0x32F0A7E9 */
733 -1.85586374855275456654e-05, /* 0xBEF375CB, 0xDB605373 */
734  2.59073051863633712884e-05, /* 0x3EFB2A70, 0x74BF7AD4 */
735};
736
737static double __kernel_tan(double x, double y, int iy)
738{
739  double z,r,v,w,s;
740  int ix,hx;
741  hx = __HI(x);   /* high word of x */
742  ix = hx&0x7fffffff;     /* high word of |x| */
743  if(ix<0x3e300000) {                     /* x < 2**-28 */
744    if((int)x==0) {                       /* generate inexact */
745      if (((ix | __LO(x)) | (iy + 1)) == 0)
746        return one / fabsd(x);
747      else {
748        if (iy == 1)
749          return x;
750        else {    /* compute -1 / (x+y) carefully */
751          double a, t;
752
753          z = w = x + y;
754          __LO(z) = 0;
755          v = y - (z - x);
756          t = a = -one / w;
757          __LO(t) = 0;
758          s = one + t * z;
759          return t + a * (s + t * v);
760        }
761      }
762    }
763  }
764  if(ix>=0x3FE59428) {                    /* |x|>=0.6744 */
765    if(hx<0) {x = -x; y = -y;}
766    z = pio4-x;
767    w = pio4lo-y;
768    x = z+w; y = 0.0;
769  }
770  z       =  x*x;
771  w       =  z*z;
772  /* Break x^5*(T[1]+x^2*T[2]+...) into
773   *    x^5(T[1]+x^4*T[3]+...+x^20*T[11]) +
774   *    x^5(x^2*(T[2]+x^4*T[4]+...+x^22*[T12]))
775   */
776  r = T[1]+w*(T[3]+w*(T[5]+w*(T[7]+w*(T[9]+w*T[11]))));
777  v = z*(T[2]+w*(T[4]+w*(T[6]+w*(T[8]+w*(T[10]+w*T[12])))));
778  s = z*x;
779  r = y + z*(s*(r+v)+y);
780  r += T[0]*s;
781  w = x+r;
782  if(ix>=0x3FE59428) {
783    v = (double)iy;
784    return (double)(1-((hx>>30)&2))*(v-2.0*(x-(w*w/(w+v)-r)));
785  }
786  if(iy==1) return w;
787  else {          /* if allow error up to 2 ulp,
788                     simply return -1.0/(x+r) here */
789    /*  compute -1.0/(x+r) accurately */
790    double a,t;
791    z  = w;
792    __LO(z) = 0;
793    v  = r-(z - x);     /* z+v = r+x */
794    t = a  = -1.0/w;    /* a = -1.0/w */
795    __LO(t) = 0;
796    s  = 1.0+t*z;
797    return t+a*(s+t*v);
798  }
799}
800
801
802//----------------------------------------------------------------------
803//
804// Routines for new sin/cos implementation
805//
806//----------------------------------------------------------------------
807
808/* sin(x)
809 * Return sine function of x.
810 *
811 * kernel function:
812 *      __kernel_sin            ... sine function on [-pi/4,pi/4]
813 *      __kernel_cos            ... cose function on [-pi/4,pi/4]
814 *      __ieee754_rem_pio2      ... argument reduction routine
815 *
816 * Method.
817 *      Let S,C and T denote the sin, cos and tan respectively on
818 *      [-PI/4, +PI/4]. Reduce the argument x to y1+y2 = x-k*pi/2
819 *      in [-pi/4 , +pi/4], and let n = k mod 4.
820 *      We have
821 *
822 *          n        sin(x)      cos(x)        tan(x)
823 *     ----------------------------------------------------------
824 *          0          S           C             T
825 *          1          C          -S            -1/T
826 *          2         -S          -C             T
827 *          3         -C           S            -1/T
828 *     ----------------------------------------------------------
829 *
830 * Special cases:
831 *      Let trig be any of sin, cos, or tan.
832 *      trig(+-INF)  is NaN, with signals;
833 *      trig(NaN)    is that NaN;
834 *
835 * Accuracy:
836 *      TRIG(x) returns trig(x) nearly rounded
837 */
838
839JRT_LEAF(jdouble, SharedRuntime::dsin(jdouble x))
840  double y[2],z=0.0;
841  int n, ix;
842
843  /* High word of x. */
844  ix = __HI(x);
845
846  /* |x| ~< pi/4 */
847  ix &= 0x7fffffff;
848  if(ix <= 0x3fe921fb) return __kernel_sin(x,z,0);
849
850  /* sin(Inf or NaN) is NaN */
851  else if (ix>=0x7ff00000) return x-x;
852
853  /* argument reduction needed */
854  else {
855    n = __ieee754_rem_pio2(x,y);
856    switch(n&3) {
857    case 0: return  __kernel_sin(y[0],y[1],1);
858    case 1: return  __kernel_cos(y[0],y[1]);
859    case 2: return -__kernel_sin(y[0],y[1],1);
860    default:
861      return -__kernel_cos(y[0],y[1]);
862    }
863  }
864JRT_END
865
866/* cos(x)
867 * Return cosine function of x.
868 *
869 * kernel function:
870 *      __kernel_sin            ... sine function on [-pi/4,pi/4]
871 *      __kernel_cos            ... cosine function on [-pi/4,pi/4]
872 *      __ieee754_rem_pio2      ... argument reduction routine
873 *
874 * Method.
875 *      Let S,C and T denote the sin, cos and tan respectively on
876 *      [-PI/4, +PI/4]. Reduce the argument x to y1+y2 = x-k*pi/2
877 *      in [-pi/4 , +pi/4], and let n = k mod 4.
878 *      We have
879 *
880 *          n        sin(x)      cos(x)        tan(x)
881 *     ----------------------------------------------------------
882 *          0          S           C             T
883 *          1          C          -S            -1/T
884 *          2         -S          -C             T
885 *          3         -C           S            -1/T
886 *     ----------------------------------------------------------
887 *
888 * Special cases:
889 *      Let trig be any of sin, cos, or tan.
890 *      trig(+-INF)  is NaN, with signals;
891 *      trig(NaN)    is that NaN;
892 *
893 * Accuracy:
894 *      TRIG(x) returns trig(x) nearly rounded
895 */
896
897JRT_LEAF(jdouble, SharedRuntime::dcos(jdouble x))
898  double y[2],z=0.0;
899  int n, ix;
900
901  /* High word of x. */
902  ix = __HI(x);
903
904  /* |x| ~< pi/4 */
905  ix &= 0x7fffffff;
906  if(ix <= 0x3fe921fb) return __kernel_cos(x,z);
907
908  /* cos(Inf or NaN) is NaN */
909  else if (ix>=0x7ff00000) return x-x;
910
911  /* argument reduction needed */
912  else {
913    n = __ieee754_rem_pio2(x,y);
914    switch(n&3) {
915    case 0: return  __kernel_cos(y[0],y[1]);
916    case 1: return -__kernel_sin(y[0],y[1],1);
917    case 2: return -__kernel_cos(y[0],y[1]);
918    default:
919      return  __kernel_sin(y[0],y[1],1);
920    }
921  }
922JRT_END
923
924/* tan(x)
925 * Return tangent function of x.
926 *
927 * kernel function:
928 *      __kernel_tan            ... tangent function on [-pi/4,pi/4]
929 *      __ieee754_rem_pio2      ... argument reduction routine
930 *
931 * Method.
932 *      Let S,C and T denote the sin, cos and tan respectively on
933 *      [-PI/4, +PI/4]. Reduce the argument x to y1+y2 = x-k*pi/2
934 *      in [-pi/4 , +pi/4], and let n = k mod 4.
935 *      We have
936 *
937 *          n        sin(x)      cos(x)        tan(x)
938 *     ----------------------------------------------------------
939 *          0          S           C             T
940 *          1          C          -S            -1/T
941 *          2         -S          -C             T
942 *          3         -C           S            -1/T
943 *     ----------------------------------------------------------
944 *
945 * Special cases:
946 *      Let trig be any of sin, cos, or tan.
947 *      trig(+-INF)  is NaN, with signals;
948 *      trig(NaN)    is that NaN;
949 *
950 * Accuracy:
951 *      TRIG(x) returns trig(x) nearly rounded
952 */
953
954JRT_LEAF(jdouble, SharedRuntime::dtan(jdouble x))
955  double y[2],z=0.0;
956  int n, ix;
957
958  /* High word of x. */
959  ix = __HI(x);
960
961  /* |x| ~< pi/4 */
962  ix &= 0x7fffffff;
963  if(ix <= 0x3fe921fb) return __kernel_tan(x,z,1);
964
965  /* tan(Inf or NaN) is NaN */
966  else if (ix>=0x7ff00000) return x-x;            /* NaN */
967
968  /* argument reduction needed */
969  else {
970    n = __ieee754_rem_pio2(x,y);
971    return __kernel_tan(y[0],y[1],1-((n&1)<<1)); /*   1 -- n even
972                                                     -1 -- n odd */
973  }
974JRT_END
975
976
977#ifdef WIN32
978# pragma optimize ( "", on )
979#endif
980