1/* Copyright (c) 1998,2011,2014 Apple Inc.  All Rights Reserved.
2 *
3 * NOTICE: USE OF THE MATERIALS ACCOMPANYING THIS NOTICE IS SUBJECT
4 * TO THE TERMS OF THE SIGNED "FAST ELLIPTIC ENCRYPTION (FEE) REFERENCE
5 * SOURCE CODE EVALUATION AGREEMENT" BETWEEN APPLE, INC. AND THE
6 * ORIGINAL LICENSEE THAT OBTAINED THESE MATERIALS FROM APPLE,
7 * INC.  ANY USE OF THESE MATERIALS NOT PERMITTED BY SUCH AGREEMENT WILL
8 * EXPOSE YOU TO LIABILITY.
9 ***************************************************************************
10
11   giantFFT.c
12   Library for large-integer arithmetic via FFT. Currently unused
13   in CryptKit.
14
15
16 Revision History
17 ----------------
18 19 Jan 1998 at Apple
19 	Split off from NSGiantIntegers.c.
20
21*/
22
23/*
24 * FIXME - make sure platform-specific math lib has floor(), fmod(),
25 *         sin(), pow()
26 */
27#include <math.h>
28#include "NSGiantIntegers.h"
29
30#define AUTO_MUL 	0
31#define GRAMMAR_MUL 	1
32#define FFT_MUL 	2
33
34#define TWOPI 		(double)(2*3.1415926535897932384626433)
35#define SQRT2 		(double)(1.414213562373095048801688724209)
36#define SQRTHALF 	(double)(0.707106781186547524400844362104)
37#define TWO16 		(double)(65536.0)
38#define TWOM16 		(double)(0.0000152587890625)
39#define BREAK_SHORTS 	400    // Number of shorts at which FFT breaks over.
40
41static int lpt(int n, int *lambda);
42static void mul_hermitian(double *a, double *b, int n) ;
43static void square_hermitian(double *b, int n);
44static void addsignal(giant x, double *zs, int n);
45static void scramble_real(double *x, int n);
46static void fft_real_to_hermitian(double *zs, int n);
47static void fftinv_hermitian_to_real(double *zs, int n);
48static void GiantFFTSquare(giant gx);
49static void GiantFFTMul(giant,giant);
50static void giant_to_double(giant x, int sizex, double *zs, int L);
51
52static int mulmode = AUTO_MUL;
53
54void mulg(giant a, giant b) { /* b becomes a*b. */
55	PROF_START;
56	INCR_MULGS;
57	GiantAuxMul(a,b);
58	#if	FEE_DEBUG
59        (void)bitlen(b); // XXX
60	#endif	FEE_DEBUG
61        PROF_END(mulgTime);
62	PROF_INCR(numMulg);
63}
64
65static void GiantAuxMul(giant a, giant b) {
66/* Optimized general multiply, b becomes a*b. Modes are:
67   AUTO_MUL: switch according to empirical speed criteria.
68   GRAMMAR_MUL: force grammar-school algorithm.
69   FFT_MUL: force floating point FFT method.
70*/
71    int square = (a==b);
72
73    if (isZero(b)) return;
74    if (isZero(a)) {
75        gtog(a, b);
76        return;
77    }
78    switch(mulmode) {
79    case GRAMMAR_MUL:
80        GiantGrammarMul(a,b);
81        break;
82    case FFT_MUL:
83        if (square) {
84            GiantFFTSquare(b);
85        }
86        else {
87            GiantFFTMul(a,b);
88        }
89        break;
90    case AUTO_MUL: {
91        int sizea, sizeb;
92        float grammartime;
93        sizea = abs(a->sign);
94        sizeb = abs(b->sign);
95        grammartime = sizea; grammartime *= sizeb;
96        if(grammartime < BREAK_SHORTS*BREAK_SHORTS) {
97                GiantGrammarMul(a,b);
98        }
99        else {
100            if (square) GiantFFTSquare(b);
101            else GiantFFTMul(a,b);
102        }
103        break;
104      }
105   }
106}
107
108/***************** Commence FFT multiply routines ****************/
109
110static int CurrentRun = 0;
111double *sincos = NULL;
112static void init_sincos(int n) {
113    int j;
114    double e = TWOPI/n;
115
116    if (n <= CurrentRun) return;
117    CurrentRun = n;
118    if (sincos) free(sincos);
119    sincos = (double *)malloc(sizeof(double)*(1+(n>>2)));
120    for(j=0;j<=(n>>2);j++) {
121        sincos[j] = sin(e*j);
122    }
123}
124
125static double s_sin(int n) {
126    int seg = n/(CurrentRun>>2);
127
128    switch(seg) {
129    case 0: return(sincos[n]);
130    case 1: return(sincos[(CurrentRun>>1)-n]);
131    case 2: return(-sincos[n-(CurrentRun>>1)]);
132    case 3:
133    default: return(-sincos[CurrentRun-n]);
134    }
135}
136
137static double s_cos(int n) {
138    int quart = (CurrentRun>>2);
139
140    if (n < quart) return(s_sin(n+quart));
141    return(-s_sin(n-quart));
142}
143
144
145static int lpt(int n, int *lambda) {
146/* returns least power of two greater than n */
147    register int i = 1;
148
149    *lambda = 0;
150    while(i<n) {
151        i<<=1;
152        ++(*lambda);
153    }
154    return(i);
155}
156
157static void addsignal(giant x, double *zs, int n) {
158   register int j, k, m, car;
159   register double f, g;
160   /*double  err,  maxerr = 0.0;*/
161
162   for(j=0;j<n;j++) {
163   	f = floor(zs[j]+0.5);
164
165	/* err = fabs(zs[j]-f);
166	if(err>maxerr) maxerr = err;
167	*/
168
169	zs[j] =0;
170	k = 0;
171	do{
172           g = floor(f*TWOM16);
173	   zs[j+k] += f-g*TWO16;
174	   ++k;
175	   f=g;
176	} while(f != 0.0);
177   }
178   car = 0;
179   for(j=0;j<n;j++) {
180   	m = zs[j]+car;
181	x->n[j] = m & 0xffff;
182	car = (m>>16);
183   }
184   if(car) x->n[j] = car;
185      else --j;
186   while(!(x->n[j])) --j;
187   x->sign = j+1;
188   if (abs(x->sign) > x->capacity) NSGiantRaise("addsignal overflow");
189}
190
191static void GiantFFTSquare(giant gx) {
192    int j,size = abs(gx->sign);
193    register int L;
194
195    if(size<4) { GiantGrammarMul(gx,gx); return; }
196    L = lpt(size+size, &j);
197    {
198        //was...double doubles[L];
199	//is...
200	double *doubles = malloc(sizeof(double) * L);
201	// end
202        giant_to_double(gx, size, doubles, L);
203        fft_real_to_hermitian(doubles, L);
204        square_hermitian(doubles, L);
205        fftinv_hermitian_to_real(doubles, L);
206        addsignal(gx, doubles, L);
207	// new
208	free(doubles);
209    }
210    gx->sign = abs(gx->sign);
211    bitlen(gx); // XXX
212    if (abs(gx->sign) > gx->capacity) NSGiantRaise("GiantFFTSquare overflow");
213}
214
215static void GiantFFTMul(giant y, giant x) { /* x becomes y*x. */
216    int lambda, size, sizex = abs(x->sign), sizey = abs(y->sign);
217    int finalsign = gsign(x)*gsign(y);
218    register int L;
219
220    if((sizex<=4)||(sizey<=4)) { GiantGrammarMul(y,x); return; }
221    size = sizex; if(size<sizey) size=sizey;
222    L = lpt(size+size, &lambda);
223    {
224        //double doubles1[L];
225        //double doubles2[L];
226       	double *doubles1 = malloc(sizeof(double) * L);
227	double *doubles2 = malloc(sizeof(double) * L);
228
229        giant_to_double(x, sizex, doubles1, L);
230        giant_to_double(y, sizey, doubles2, L);
231        fft_real_to_hermitian(doubles1, L);
232        fft_real_to_hermitian(doubles2, L);
233        mul_hermitian(doubles2, doubles1, L);
234        fftinv_hermitian_to_real(doubles1, L);
235        addsignal(x, doubles1, L);
236
237	free(doubles1);
238	free(doubles2);
239    }
240    x->sign = finalsign*abs(x->sign);
241    bitlen(x); // XXX
242    if (abs(x->sign) > x->capacity) NSGiantRaise("GiantFFTMul overflow");
243}
244
245static void scramble_real(double *x, int n) {
246    register int i,j,k;
247    register double tmp;
248
249    for(i=0,j=0;i<n-1;i++) {
250        if(i<j) {
251            tmp = x[j];
252            x[j]=x[i];
253            x[i]=tmp;
254        }
255        k = n/2;
256        while(k<=j) {
257            j -= k;
258            k>>=1;
259        }
260        j += k;
261    }
262}
263
264static void fft_real_to_hermitian(double *zs, int n) {
265/* Output is {Re(z^[0]),...,Re(z^[n/2),Im(z^[n/2-1]),...,Im(z^[1]).
266   This is a decimation-in-time, split-radix algorithm.
267 */
268	register double cc1, ss1, cc3, ss3;
269	register int is, iD, i0, i1, i2, i3, i4, i5, i6, i7, i8,
270		     a, a3, b, b3, nminus = n-1, dil, expand;
271	register double *x, e;
272	int nn = n>>1;
273	double t1, t2, t3, t4, t5, t6;
274	register int n2, n4, n8, i, j;
275
276        init_sincos(n);
277	expand = CurrentRun/n;
278	scramble_real(zs, n);
279	x = zs-1;  /* FORTRAN compatibility. */
280	is = 1;
281	iD = 4;
282	do{
283	   for(i0=is;i0<=n;i0+=iD) {
284		i1 = i0+1;
285		e = x[i0];
286		x[i0] = e + x[i1];
287		x[i1] = e - x[i1];
288	   }
289	   is = (iD<<1)-1;
290	   iD <<= 2;
291	} while(is<n);
292	n2 = 2;
293	while(nn>>=1) {
294		n2 <<= 1;
295		n4 = n2>>2;
296		n8 = n2>>3;
297		is = 0;
298		iD = n2<<1;
299		do {
300			for(i=is;i<n;i+=iD) {
301				i1 = i+1;
302				i2 = i1 + n4;
303				i3 = i2 + n4;
304				i4 = i3 + n4;
305				t1 = x[i4]+x[i3];
306				x[i4] -= x[i3];
307				x[i3] = x[i1] - t1;
308				x[i1] += t1;
309				if(n4==1) continue;
310				i1 += n8;
311				i2 += n8;
312				i3 += n8;
313				i4 += n8;
314				t1 = (x[i3]+x[i4])*SQRTHALF;
315				t2 = (x[i3]-x[i4])*SQRTHALF;
316				x[i4] = x[i2] - t1;
317				x[i3] = -x[i2] - t1;
318				x[i2] = x[i1] - t2;
319				x[i1] += t2;
320			}
321			is = (iD<<1) - n2;
322			iD <<= 2;
323		} while(is<n);
324		dil = n/n2;
325		a = dil;
326		for(j=2;j<=n8;j++) {
327		    	a3 = (a+(a<<1))&nminus;
328			b = a*expand;
329			b3 = a3*expand;
330			cc1 = s_cos(b);
331			ss1 = s_sin(b);
332			cc3 = s_cos(b3);
333			ss3 = s_sin(b3);
334			a = (a+dil)&nminus;
335			is = 0;
336			iD = n2<<1;
337		        do {
338				for(i=is;i<n;i+=iD) {
339					i1 = i+j;
340					i2 = i1 + n4;
341					i3 = i2 + n4;
342					i4 = i3 + n4;
343					i5 = i + n4 - j + 2;
344					i6 = i5 + n4;
345					i7 = i6 + n4;
346					i8 = i7 + n4;
347					t1 = x[i3]*cc1 + x[i7]*ss1;
348					t2 = x[i7]*cc1 - x[i3]*ss1;
349					t3 = x[i4]*cc3 + x[i8]*ss3;
350					t4 = x[i8]*cc3 - x[i4]*ss3;
351					t5 = t1 + t3;
352					t6 = t2 + t4;
353					t3 = t1 - t3;
354					t4 = t2 - t4;
355					t2 = x[i6] + t6;
356					x[i3] = t6 - x[i6];
357					x[i8] = t2;
358					t2 = x[i2] - t3;
359					x[i7] = -x[i2] - t3;
360					x[i4] = t2;
361					t1 = x[i1] + t5;
362					x[i6] = x[i1] - t5;
363					x[i1] = t1;
364					t1 = x[i5] + t4;
365					x[i5] -= t4;
366					x[i2] = t1;
367				}
368			        is = (iD<<1) - n2;
369				iD <<= 2;
370			} while(is<n);
371		}
372	}
373}
374
375static void fftinv_hermitian_to_real(double *zs, int n) {
376/* Input is {Re(z^[0]),...,Re(z^[n/2),Im(z^[n/2-1]),...,Im(z^[1]).
377   This is a decimation-in-frequency, split-radix algorithm.
378 */
379	register double cc1, ss1, cc3, ss3;
380	register int is, iD, i0, i1, i2, i3, i4, i5, i6, i7, i8,
381		 a, a3, b, b3, nminus = n-1, dil, expand;
382	register double *x, e;
383	int nn = n>>1;
384	double t1, t2, t3, t4, t5;
385	int n2, n4, n8, i, j;
386
387        init_sincos(n);
388	expand = CurrentRun/n;
389	x = zs-1;
390	n2 = n<<1;
391	while(nn >>= 1) {
392		is = 0;
393		iD = n2;
394		n2 >>= 1;
395		n4 = n2>>2;
396		n8 = n4>>1;
397		do {
398			for(i=is;i<n;i+=iD) {
399				i1 = i+1;
400				i2 = i1 + n4;
401				i3 = i2 + n4;
402				i4 = i3 + n4;
403				t1 = x[i1] - x[i3];
404				x[i1] += x[i3];
405				x[i2] += x[i2];
406				x[i3] = t1 - 2.0*x[i4];
407				x[i4] = t1 + 2.0*x[i4];
408				if(n4==1) continue;
409				i1 += n8;
410				i2 += n8;
411				i3 += n8;
412				i4 += n8;
413				t1 = (x[i2]-x[i1])*SQRTHALF;
414				t2 = (x[i4]+x[i3])*SQRTHALF;
415				x[i1] += x[i2];
416				x[i2] = x[i4]-x[i3];
417				x[i3] = -2.0*(t2+t1);
418				x[i4] = 2.0*(t1-t2);
419			}
420			is = (iD<<1) - n2;
421			iD <<= 2;
422		} while(is<n-1);
423		dil = n/n2;
424		a = dil;
425		for(j=2;j<=n8;j++) {
426		    	a3 = (a+(a<<1))&nminus;
427			b = a*expand;
428			b3 = a3*expand;
429			cc1 = s_cos(b);
430			ss1 = s_sin(b);
431			cc3 = s_cos(b3);
432			ss3 = s_sin(b3);
433			a = (a+dil)&nminus;
434			is = 0;
435			iD = n2<<1;
436			do {
437			   for(i=is;i<n;i+=iD) {
438				i1 = i+j;
439				i2 = i1+n4;
440				i3 = i2+n4;
441				i4 = i3+n4;
442				i5 = i+n4-j+2;
443				i6 = i5+n4;
444				i7 = i6+n4;
445				i8 = i7+n4;
446				t1 = x[i1] - x[i6];
447				x[i1] += x[i6];
448				t2 = x[i5] - x[i2];
449				x[i5] += x[i2];
450				t3 = x[i8] + x[i3];
451				x[i6] = x[i8] - x[i3];
452				t4 = x[i4] + x[i7];
453				x[i2] = x[i4] - x[i7];
454				t5 = t1 - t4;
455				t1 += t4;
456				t4 = t2 - t3;
457				t2 += t3;
458				x[i3] = t5*cc1 + t4*ss1;
459				x[i7] = -t4*cc1 + t5*ss1;
460				x[i4] = t1*cc3 - t2*ss3;
461				x[i8] = t2*cc3 + t1*ss3;
462			   }
463			   is = (iD<<1) - n2;
464			   iD <<= 2;
465			} while(is<n-1);
466		}
467	}
468	is = 1;
469	iD = 4;
470	do {
471	  for(i0=is;i0<=n;i0+=iD){
472		i1 = i0+1;
473		e = x[i0];
474		x[i0] = e + x[i1];
475		x[i1] = e - x[i1];
476	  }
477	  is = (iD<<1) - 1;
478	  iD <<= 2;
479	} while(is<n);
480	scramble_real(zs, n);
481	e = 1/(double)n;
482	for(i=0;i<n;i++) zs[i] *= e;
483}
484
485
486static void mul_hermitian(double *a, double *b, int n) {
487	register int k, half = n>>1;
488	register double aa, bb, am, bm;
489
490	b[0] *= a[0];
491	b[half] *= a[half];
492	for(k=1;k<half;k++) {
493	        aa = a[k]; bb = b[k];
494		am = a[n-k]; bm = b[n-k];
495		b[k] = aa*bb - am*bm;
496		b[n-k] = aa*bm + am*bb;
497	}
498}
499
500static void square_hermitian(double *b, int n) {
501	register int k, half = n>>1;
502	register double c, d;
503
504	b[0] *= b[0];
505	b[half] *= b[half];
506	for(k=1;k<half;k++) {
507	        c = b[k]; d = b[n-k];
508		b[n-k] = 2.0*c*d;
509		b[k] = (c+d)*(c-d);
510	}
511}
512
513static void giant_to_double(giant x, int sizex, double *zs, int L) {
514	register int j;
515	for(j=sizex;j<L;j++) zs[j]=0.0;
516	for(j=0;j<sizex;j++) {
517		 zs[j] = x->n[j];
518	}
519}
520