1/*--------------------------------------------------------------------
2
3  NAS Parallel Benchmarks 2.3 OpenMP C versions - FT
4
5  This benchmark is an OpenMP C version of the NPB FT code.
6
7  The OpenMP C versions are developed by RWCP and derived from the serial
8  Fortran versions in "NPB 2.3-serial" developed by NAS.
9
10  Permission to use, copy, distribute and modify this software for any
11  purpose with or without fee is hereby granted.
12  This software is provided "as is" without express or implied warranty.
13
14  Send comments on the OpenMP C versions to pdp-openmp@rwcp.or.jp
15
16  Information on OpenMP activities at RWCP is available at:
17
18           http://pdplab.trc.rwcp.or.jp/pdperf/Omni/
19
20  Information on NAS Parallel Benchmarks 2.3 is available at:
21
22           http://www.nas.nasa.gov/NAS/NPB/
23
24--------------------------------------------------------------------*/
25/*--------------------------------------------------------------------
26
27  Authors: D. Bailey
28           W. Saphir
29
30  OpenMP C version: S. Satoh
31
32--------------------------------------------------------------------*/
33
34#include <stdint.h>
35
36#include "npb-C.h"
37
38/* global variables */
39#include "global.h"
40
41/* function declarations */
42static void evolve(dcomplex u0[NZ][NY][NX], dcomplex u1[NZ][NY][NX],
43		   int t, int indexmap[NZ][NY][NX], int d[3]);
44static void compute_initial_conditions(dcomplex u0[NZ][NY][NX], int d[3]);
45static void ipow46(double a, int exponent, double *result);
46static void setup(void);
47static void compute_indexmap(int indexmap[NZ][NY][NX], int d[3]);
48static void print_timers(void);
49static void fft(int dir, dcomplex x1[NZ][NY][NX], dcomplex x2[NZ][NY][NX]);
50static void cffts1(int is, int d[3], dcomplex x[NZ][NY][NX],
51		   dcomplex xout[NZ][NY][NX],
52		   dcomplex y0[NX][FFTBLOCKPAD],
53		   dcomplex y1[NX][FFTBLOCKPAD]);
54static void cffts2(int is, int d[3], dcomplex x[NZ][NY][NX],
55		   dcomplex xout[NZ][NY][NX],
56		   dcomplex y0[NX][FFTBLOCKPAD],
57		   dcomplex y1[NX][FFTBLOCKPAD]);
58static void cffts3(int is, int d[3], dcomplex x[NZ][NY][NX],
59		   dcomplex xout[NZ][NY][NX],
60		   dcomplex y0[NX][FFTBLOCKPAD],
61		   dcomplex y1[NX][FFTBLOCKPAD]);
62static void fft_init (int n);
63static void cfftz (int is, int m, int n, dcomplex x[NX][FFTBLOCKPAD],
64		   dcomplex y[NX][FFTBLOCKPAD]);
65static void fftz2 (int is, int l, int m, int n, int ny, int ny1,
66		   dcomplex u[NX], dcomplex x[NX][FFTBLOCKPAD],
67		   dcomplex y[NX][FFTBLOCKPAD]);
68static int ilog2(int n);
69static void checksum(int i, dcomplex u1[NZ][NY][NX], int d[3]);
70static void verify (int d1, int d2, int d3, int nt,
71		    boolean *verified, char *class);
72
73/*--------------------------------------------------------------------
74c FT benchmark
75c-------------------------------------------------------------------*/
76
77static int realmain(void *carg)
78{
79    unsigned arg = (uintptr_t)carg;
80
81/*c-------------------------------------------------------------------
82c-------------------------------------------------------------------*/
83
84    int i, ierr;
85
86/*------------------------------------------------------------------
87c u0, u1, u2 are the main arrays in the problem.
88c Depending on the decomposition, these arrays will have different
89c dimensions. To accomodate all possibilities, we allocate them as
90c one-dimensional arrays and pass them to subroutines for different
91c views
92c  - u0 contains the initial (transformed) initial condition
93c  - u1 and u2 are working arrays
94c  - indexmap maps i,j,k of u0 to the correct i^2+j^2+k^2 for the
95c    time evolution operator.
96c-----------------------------------------------------------------*/
97
98/*--------------------------------------------------------------------
99c Large arrays are in common so that they are allocated on the
100c heap rather than the stack. This common block is not
101c referenced directly anywhere else. Padding is to avoid accidental
102c cache problems, since all array sizes are powers of two.
103c-------------------------------------------------------------------*/
104    static dcomplex u0[NZ][NY][NX];
105    static dcomplex pad1[3];
106    static dcomplex u1[NZ][NY][NX];
107    static dcomplex pad2[3];
108    static dcomplex u2[NZ][NY][NX];
109    static dcomplex pad3[3];
110    static int indexmap[NZ][NY][NX];
111
112    int iter;
113    int nthreads = 1;
114    double total_time, mflops;
115    boolean verified;
116    char class;
117
118    omp_set_num_threads(arg);
119/*--------------------------------------------------------------------
120c Run the entire problem once to make sure all data is touched.
121c This reduces variable startup costs, which is important for such a
122c short benchmark. The other NPB 2 implementations are similar.
123c-------------------------------------------------------------------*/
124    for (i = 0; i < T_MAX; i++) {
125	timer_clear(i);
126    }
127    setup();
128#pragma omp parallel
129 {
130    compute_indexmap(indexmap, dims[2]);
131#pragma omp single
132   {
133    compute_initial_conditions(u1, dims[0]);
134    fft_init (dims[0][0]);
135   }
136    fft(1, u1, u0);
137 } /* end parallel */
138
139/*--------------------------------------------------------------------
140c Start over from the beginning. Note that all operations must
141c be timed, in contrast to other benchmarks.
142c-------------------------------------------------------------------*/
143    for (i = 0; i < T_MAX; i++) {
144	timer_clear(i);
145    }
146
147    timer_start(T_TOTAL);
148    if (TIMERS_ENABLED == TRUE) timer_start(T_SETUP);
149
150#pragma omp parallel private(iter) firstprivate(niter)
151  {
152    compute_indexmap(indexmap, dims[2]);
153
154#pragma omp single
155   {
156    compute_initial_conditions(u1, dims[0]);
157
158    fft_init (dims[0][0]);
159   }
160
161    if (TIMERS_ENABLED == TRUE) {
162#pragma omp master
163      timer_stop(T_SETUP);
164    }
165    if (TIMERS_ENABLED == TRUE) {
166#pragma omp master
167      timer_start(T_FFT);
168    }
169    fft(1, u1, u0);
170    if (TIMERS_ENABLED == TRUE) {
171#pragma omp master
172      timer_stop(T_FFT);
173    }
174
175    for (iter = 1; iter <= niter; iter++) {
176	if (TIMERS_ENABLED == TRUE) {
177#pragma omp master
178	  timer_start(T_EVOLVE);
179	}
180
181	evolve(u0, u1, iter, indexmap, dims[0]);
182
183        if (TIMERS_ENABLED == TRUE) {
184#pragma omp master
185	  timer_stop(T_EVOLVE);
186	}
187        if (TIMERS_ENABLED == TRUE) {
188#pragma omp master
189	  timer_start(T_FFT);
190	}
191
192        fft(-1, u1, u2);
193
194        if (TIMERS_ENABLED == TRUE) {
195#pragma omp master
196	  timer_stop(T_FFT);
197	}
198        if (TIMERS_ENABLED == TRUE) {
199#pragma omp master
200	  timer_start(T_CHECKSUM);
201	}
202
203        checksum(iter, u2, dims[0]);
204
205        if (TIMERS_ENABLED == TRUE) {
206#pragma omp master
207	  timer_stop(T_CHECKSUM);
208	}
209    }
210
211#pragma omp single
212    verify(NX, NY, NZ, niter, &verified, &class);
213
214#if defined(_OPENMP)
215#pragma omp master
216    nthreads = omp_get_num_threads();
217#endif /* _OPENMP */
218  } /* end parallel */
219
220    timer_stop(T_TOTAL);
221    total_time = timer_read(T_TOTAL);
222
223    if( total_time != 0.0) {
224	mflops = 1.0e-6*(double)(NTOTAL) *
225	    (14.8157+7.19641*log((double)(NTOTAL))
226	     +  (5.23518+7.21113*log((double)(NTOTAL)))*niter)
227	    /total_time;
228    } else {
229	mflops = 0.0;
230    }
231#ifdef BOMP
232//backend_create_time(arg);
233#endif
234printf("Computetime %d %f\n", arg, total_time);
235printf("client done\n");
236/*     c_print_results("FT", class, NX, NY, NZ, niter, nthreads, */
237/* 		    total_time, mflops, "          floating point", verified,  */
238/* 		    NPBVERSION, COMPILETIME, */
239/* 		    CS1, CS2, CS3, CS4, CS5, CS6, CS7); */
240    if (TIMERS_ENABLED == TRUE) print_timers();
241}
242
243/*--------------------------------------------------------------------
244c-------------------------------------------------------------------*/
245
246static void evolve(dcomplex u0[NZ][NY][NX], dcomplex u1[NZ][NY][NX],
247		   int t, int indexmap[NZ][NY][NX], int d[3]) {
248
249/*--------------------------------------------------------------------
250c-------------------------------------------------------------------*/
251
252/*--------------------------------------------------------------------
253c evolve u0 -> u1 (t time steps) in fourier space
254c-------------------------------------------------------------------*/
255
256    int i, j, k;
257
258#pragma omp for
259    for (k = 0; k < d[2]; k++) {
260	for (j = 0; j < d[1]; j++) {
261            for (i = 0; i < d[0]; i++) {
262	      crmul(u1[k][j][i], u0[k][j][i], ex[t*indexmap[k][j][i]]);
263	    }
264	}
265    }
266}
267
268/*--------------------------------------------------------------------
269c-------------------------------------------------------------------*/
270
271static void compute_initial_conditions(dcomplex u0[NZ][NY][NX], int d[3]) {
272
273/*--------------------------------------------------------------------
274c-------------------------------------------------------------------*/
275
276/*--------------------------------------------------------------------
277c Fill in array u0 with initial conditions from
278c random number generator
279c-------------------------------------------------------------------*/
280
281    int k;
282    double x0, start, an, dummy;
283    static double tmp[NX*2*MAXDIM+1];
284    int i,j,t;
285
286    start = SEED;
287/*--------------------------------------------------------------------
288c Jump to the starting element for our first plane.
289c-------------------------------------------------------------------*/
290    ipow46(A, (zstart[0]-1)*2*NX*NY + (ystart[0]-1)*2*NX, &an);
291    dummy = randlc(&start, an);
292    ipow46(A, 2*NX*NY, &an);
293
294/*--------------------------------------------------------------------
295c Go through by z planes filling in one square at a time.
296c-------------------------------------------------------------------*/
297    for (k = 0; k < dims[0][2]; k++) {
298	x0 = start;
299        vranlc(2*NX*dims[0][1], &x0, A, tmp);
300
301	t = 1;
302	for (j = 0; j < dims[0][1]; j++)
303	  for (i = 0; i < NX; i++) {
304	    u0[k][j][i].real = tmp[t++];
305	    u0[k][j][i].imag = tmp[t++];
306	  }
307
308        if (k != dims[0][2]) dummy = randlc(&start, an);
309    }
310}
311
312/*--------------------------------------------------------------------
313c-------------------------------------------------------------------*/
314
315static void ipow46(double a, int exponent, double *result) {
316
317/*--------------------------------------------------------------------
318c-------------------------------------------------------------------*/
319
320/*--------------------------------------------------------------------
321c compute a^exponent mod 2^46
322c-------------------------------------------------------------------*/
323
324    double dummy, q, r;
325    int n, n2;
326
327/*--------------------------------------------------------------------
328c Use
329c   a^n = a^(n/2)*a^(n/2) if n even else
330c   a^n = a*a^(n-1)       if n odd
331c-------------------------------------------------------------------*/
332    *result = 1;
333    if (exponent == 0) return;
334    q = a;
335    r = 1;
336    n = exponent;
337
338    while (n > 1) {
339	n2 = n/2;
340	if (n2 * 2 == n) {
341            dummy = randlc(&q, q);
342            n = n2;
343	} else {
344            dummy = randlc(&r, q);
345            n = n-1;
346	}
347    }
348    dummy = randlc(&r, q);
349    *result = r;
350}
351
352/*--------------------------------------------------------------------
353c-------------------------------------------------------------------*/
354
355static void setup(void) {
356
357/*--------------------------------------------------------------------
358c-------------------------------------------------------------------*/
359
360    int ierr, i, j, fstatus;
361
362    printf("\n\n NAS Parallel Benchmarks 2.3 OpenMP C version"
363	   " - FT Benchmark\n\n");
364
365    niter = NITER_DEFAULT;
366
367    printf(" Size                : %3dx%3dx%3d\n", NX, NY, NZ);
368    printf(" Iterations          :     %7d\n", niter);
369
370/* 1004 format(' Number of processes :     ', i7)
371 1005 format(' Processor array     :     ', i3, 'x', i3)
372 1006 format(' WARNING: compiled for ', i5, ' processes. ',
373     >       ' Will not verify. ')*/
374
375    for (i = 0;i < 3 ; i++) {
376	dims[i][0] = NX;
377	dims[i][1] = NY;
378	dims[i][2] = NZ;
379    }
380
381
382    for (i = 0; i < 3; i++) {
383	xstart[i] = 1;
384	xend[i]   = NX;
385	ystart[i] = 1;
386        yend[i]   = NY;
387        zstart[i] = 1;
388        zend[i]   = NZ;
389    }
390
391/*--------------------------------------------------------------------
392c Set up info for blocking of ffts and transposes.  This improves
393c performance on cache-based systems. Blocking involves
394c working on a chunk of the problem at a time, taking chunks
395c along the first, second, or third dimension.
396c
397c - In cffts1 blocking is on 2nd dimension (with fft on 1st dim)
398c - In cffts2/3 blocking is on 1st dimension (with fft on 2nd and 3rd dims)
399
400c Since 1st dim is always in processor, we'll assume it's long enough
401c (default blocking factor is 16 so min size for 1st dim is 16)
402c The only case we have to worry about is cffts1 in a 2d decomposition.
403c so the blocking factor should not be larger than the 2nd dimension.
404c-------------------------------------------------------------------*/
405
406    fftblock = FFTBLOCK_DEFAULT;
407    fftblockpad = FFTBLOCKPAD_DEFAULT;
408
409    if (fftblock != FFTBLOCK_DEFAULT) fftblockpad = fftblock+3;
410}
411
412/*--------------------------------------------------------------------
413c-------------------------------------------------------------------*/
414
415static void compute_indexmap(int indexmap[NZ][NY][NX], int d[3]) {
416
417/*--------------------------------------------------------------------
418c-------------------------------------------------------------------*/
419
420/*--------------------------------------------------------------------
421c compute function from local (i,j,k) to ibar^2+jbar^2+kbar^2
422c for time evolution exponent.
423c-------------------------------------------------------------------*/
424
425    int i, j, k, ii, ii2, jj, ij2, kk;
426    double ap;
427
428/*--------------------------------------------------------------------
429c basically we want to convert the fortran indices
430c   1 2 3 4 5 6 7 8
431c to
432c   0 1 2 3 -4 -3 -2 -1
433c The following magic formula does the trick:
434c mod(i-1+n/2, n) - n/2
435c-------------------------------------------------------------------*/
436
437#pragma omp for
438    for (i = 0; i < dims[2][0]; i++) {
439	ii =  (i+1+xstart[2]-2+NX/2)%NX - NX/2;
440	ii2 = ii*ii;
441	for (j = 0; j < dims[2][1]; j++) {
442            jj = (j+1+ystart[2]-2+NY/2)%NY - NY/2;
443            ij2 = jj*jj+ii2;
444            for (k = 0; k < dims[2][2]; k++) {
445		kk = (k+1+zstart[2]-2+NZ/2)%NZ - NZ/2;
446		indexmap[k][j][i] = kk*kk+ij2;
447	    }
448	}
449    }
450
451/*--------------------------------------------------------------------
452c compute array of exponentials for time evolution.
453c-------------------------------------------------------------------*/
454#pragma omp single
455  {
456    ap = - 4.0 * ALPHA * PI * PI;
457
458    ex[0] = 1.0;
459    ex[1] = exp(ap);
460    for (i = 2; i <= EXPMAX; i++) {
461	ex[i] = ex[i-1]*ex[1];
462    }
463  } /* end single */
464}
465
466/*--------------------------------------------------------------------
467c-------------------------------------------------------------------*/
468
469static void print_timers(void) {
470
471/*--------------------------------------------------------------------
472c-------------------------------------------------------------------*/
473
474    int i;
475    char *tstrings[] = { "          total ",
476			 "          setup ",
477			 "            fft ",
478			 "         evolve ",
479			 "       checksum ",
480			 "         fftlow ",
481			 "        fftcopy " };
482
483    for (i = 0; i < T_MAX; i++) {
484	if (timer_read(i) != 0.0) {
485            printf("timer %2d(%16s( :%10.6f\n", i, tstrings[i], timer_read(i));
486	}
487    }
488}
489
490
491/*--------------------------------------------------------------------
492c-------------------------------------------------------------------*/
493
494static void fft(int dir, dcomplex x1[NZ][NY][NX], dcomplex x2[NZ][NY][NX]) {
495
496/*--------------------------------------------------------------------
497c-------------------------------------------------------------------*/
498
499    dcomplex y0[NX][FFTBLOCKPAD];
500    dcomplex y1[NX][FFTBLOCKPAD];
501
502/*--------------------------------------------------------------------
503c note: args x1, x2 must be different arrays
504c note: args for cfftsx are (direction, layout, xin, xout, scratch)
505c       xin/xout may be the same and it can be somewhat faster
506c       if they are
507c-------------------------------------------------------------------*/
508
509    if (dir == 1) {
510        cffts1(1, dims[0], x1, x1, y0, y1);	/* x1 -> x1 */
511        cffts2(1, dims[1], x1, x1, y0, y1);	/* x1 -> x1 */
512        cffts3(1, dims[2], x1, x2, y0, y1);	/* x1 -> x2 */
513    } else {
514	cffts3(-1, dims[2], x1, x1, y0, y1);	/* x1 -> x1 */
515        cffts2(-1, dims[1], x1, x1, y0, y1);	/* x1 -> x1 */
516        cffts1(-1, dims[0], x1, x2, y0, y1);	/* x1 -> x2 */
517    }
518}
519
520
521/*--------------------------------------------------------------------
522c-------------------------------------------------------------------*/
523
524static void cffts1(int is, int d[3], dcomplex x[NZ][NY][NX],
525		   dcomplex xout[NZ][NY][NX],
526		   dcomplex y0[NX][FFTBLOCKPAD],
527		   dcomplex y1[NX][FFTBLOCKPAD]) {
528
529/*--------------------------------------------------------------------
530c-------------------------------------------------------------------*/
531
532    int logd[3];
533    int i, j, k, jj;
534
535    for (i = 0; i < 3; i++) {
536	logd[i] = ilog2(d[i]);
537    }
538
539#pragma omp for
540    for (k = 0; k < d[2]; k++) {
541	for (jj = 0; jj <= d[1] - fftblock; jj+=fftblock) {
542/*          if (TIMERS_ENABLED == TRUE) timer_start(T_FFTCOPY); */
543            for (j = 0; j < fftblock; j++) {
544		for (i = 0; i < d[0]; i++) {
545		    y0[i][j].real = x[k][j+jj][i].real;
546		    y0[i][j].imag = x[k][j+jj][i].imag;
547		}
548	    }
549/*          if (TIMERS_ENABLED == TRUE) timer_stop(T_FFTCOPY); */
550
551/*          if (TIMERS_ENABLED == TRUE) timer_start(T_FFTLOW); */
552            cfftz (is, logd[0],
553		   d[0], y0, y1);
554
555/*          if (TIMERS_ENABLED == TRUE) timer_stop(T_FFTLOW); */
556/*          if (TIMERS_ENABLED == TRUE) timer_start(T_FFTCOPY); */
557            for (j = 0; j < fftblock; j++) {
558		for (i = 0; i < d[0]; i++) {
559		  xout[k][j+jj][i].real = y0[i][j].real;
560		  xout[k][j+jj][i].imag = y0[i][j].imag;
561		}
562	    }
563/*          if (TIMERS_ENABLED == TRUE) timer_stop(T_FFTCOPY); */
564	}
565    }
566}
567
568
569/*--------------------------------------------------------------------
570c-------------------------------------------------------------------*/
571
572static void cffts2(int is, int d[3], dcomplex x[NZ][NY][NX],
573		   dcomplex xout[NZ][NY][NX],
574		   dcomplex y0[NX][FFTBLOCKPAD],
575		   dcomplex y1[NX][FFTBLOCKPAD]) {
576
577/*--------------------------------------------------------------------
578c-------------------------------------------------------------------*/
579
580    int logd[3];
581    int i, j, k, ii;
582
583    for (i = 0; i < 3; i++) {
584	logd[i] = ilog2(d[i]);
585    }
586#pragma omp for
587    for (k = 0; k < d[2]; k++) {
588        for (ii = 0; ii <= d[0] - fftblock; ii+=fftblock) {
589/*	    if (TIMERS_ENABLED == TRUE) timer_start(T_FFTCOPY); */
590	    for (j = 0; j < d[1]; j++) {
591		for (i = 0; i < fftblock; i++) {
592		    y0[j][i].real = x[k][j][i+ii].real;
593		    y0[j][i].imag = x[k][j][i+ii].imag;
594		}
595	    }
596/*	    if (TIMERS_ENABLED == TRUE) timer_stop(T_FFTCOPY); */
597/*	    if (TIMERS_ENABLED == TRUE) timer_start(T_FFTLOW); */
598	    cfftz (is, logd[1],
599		   d[1], y0, y1);
600
601/*          if (TIMERS_ENABLED == TRUE) timer_stop(T_FFTLOW); */
602/*          if (TIMERS_ENABLED == TRUE) timer_start(T_FFTCOPY); */
603           for (j = 0; j < d[1]; j++) {
604	       for (i = 0; i < fftblock; i++) {
605		   xout[k][j][i+ii].real = y0[j][i].real;
606		   xout[k][j][i+ii].imag = y0[j][i].imag;
607	       }
608	   }
609/*           if (TIMERS_ENABLED == TRUE) timer_stop(T_FFTCOPY); */
610	}
611    }
612}
613
614/*--------------------------------------------------------------------
615c-------------------------------------------------------------------*/
616
617static void cffts3(int is, int d[3], dcomplex x[NZ][NY][NX],
618		   dcomplex xout[NZ][NY][NX],
619		   dcomplex y0[NX][FFTBLOCKPAD],
620		   dcomplex y1[NX][FFTBLOCKPAD]) {
621
622/*--------------------------------------------------------------------
623c-------------------------------------------------------------------*/
624
625    int logd[3];
626    int i, j, k, ii;
627
628    for (i = 0;i < 3; i++) {
629	logd[i] = ilog2(d[i]);
630    }
631#pragma omp for
632    for (j = 0; j < d[1]; j++) {
633        for (ii = 0; ii <= d[0] - fftblock; ii+=fftblock) {
634/*	    if (TIMERS_ENABLED == TRUE) timer_start(T_FFTCOPY); */
635	    for (k = 0; k < d[2]; k++) {
636		for (i = 0; i < fftblock; i++) {
637		    y0[k][i].real = x[k][j][i+ii].real;
638		    y0[k][i].imag = x[k][j][i+ii].imag;
639		}
640	    }
641
642/*           if (TIMERS_ENABLED == TRUE) timer_stop(T_FFTCOPY); */
643/*           if (TIMERS_ENABLED == TRUE) timer_start(T_FFTLOW); */
644           cfftz (is, logd[2],
645		  d[2], y0, y1);
646/*           if (TIMERS_ENABLED == TRUE) timer_stop(T_FFTLOW); */
647/*           if (TIMERS_ENABLED == TRUE) timer_start(T_FFTCOPY); */
648           for (k = 0; k < d[2]; k++) {
649	       for (i = 0; i < fftblock; i++) {
650		   xout[k][j][i+ii].real = y0[k][i].real;
651		   xout[k][j][i+ii].imag = y0[k][i].imag;
652	       }
653	   }
654/*           if (TIMERS_ENABLED == TRUE) timer_stop(T_FFTCOPY); */
655	}
656    }
657}
658
659
660/*--------------------------------------------------------------------
661c-------------------------------------------------------------------*/
662
663static void fft_init (int n) {
664
665/*--------------------------------------------------------------------
666c-------------------------------------------------------------------*/
667
668/*--------------------------------------------------------------------
669c compute the roots-of-unity array that will be used for subsequent FFTs.
670c-------------------------------------------------------------------*/
671
672    int m,nu,ku,i,j,ln;
673    double t, ti;
674
675
676/*--------------------------------------------------------------------
677c   Initialize the U array with sines and cosines in a manner that permits
678c   stride one access at each FFT iteration.
679c-------------------------------------------------------------------*/
680    nu = n;
681    m = ilog2(n);
682    u[0].real = (double)m;
683    u[0].imag = 0.0;
684    ku = 1;
685    ln = 1;
686
687    for (j = 1; j <= m; j++) {
688	t = PI / ln;
689
690	for (i = 0; i <= ln - 1; i++) {
691            ti = i * t;
692            u[i+ku].real = cos(ti);
693	    u[i+ku].imag = sin(ti);
694	}
695
696	ku = ku + ln;
697	ln = 2 * ln;
698    }
699}
700
701
702/*--------------------------------------------------------------------
703c-------------------------------------------------------------------*/
704
705static void cfftz (int is, int m, int n, dcomplex x[NX][FFTBLOCKPAD],
706		   dcomplex y[NX][FFTBLOCKPAD]) {
707
708/*--------------------------------------------------------------------
709c-------------------------------------------------------------------*/
710
711/*--------------------------------------------------------------------
712c   Computes NY N-point complex-to-complex FFTs of X using an algorithm due
713c   to Swarztrauber.  X is both the input and the output array, while Y is a
714c   scratch array.  It is assumed that N = 2^M.  Before calling CFFTZ to
715c   perform FFTs, the array U must be initialized by calling CFFTZ with IS
716c   set to 0 and M set to MX, where MX is the maximum value of M for any
717c   subsequent call.
718c-------------------------------------------------------------------*/
719
720    int i,j,l,mx;
721
722/*--------------------------------------------------------------------
723c   Check if input parameters are invalid.
724c-------------------------------------------------------------------*/
725    mx = (int)(u[0].real);
726    if ((is != 1 && is != -1) || m < 1 || m > mx) {
727	printf("CFFTZ: Either U has not been initialized, or else\n"
728	       "one of the input parameters is invalid%5d%5d%5d\n",
729	       is, m, mx);
730	exit(1);
731    }
732
733/*--------------------------------------------------------------------
734c   Perform one variant of the Stockham FFT.
735c-------------------------------------------------------------------*/
736    for (l = 1; l <= m; l+=2) {
737        fftz2 (is, l, m, n, fftblock, fftblockpad, u, x, y);
738        if (l == m) break;
739	fftz2 (is, l + 1, m, n, fftblock, fftblockpad, u, y, x);
740    }
741
742/*--------------------------------------------------------------------
743c   Copy Y to X.
744c-------------------------------------------------------------------*/
745    if (m % 2 == 1) {
746	for (j = 0; j < n; j++) {
747	    for (i = 0; i < fftblock; i++) {
748		x[j][i].real = y[j][i].real;
749		x[j][i].imag = y[j][i].imag;
750	    }
751	}
752    }
753}
754
755
756/*--------------------------------------------------------------------
757c-------------------------------------------------------------------*/
758
759static void fftz2 (int is, int l, int m, int n, int ny, int ny1,
760		   dcomplex u[NX], dcomplex x[NX][FFTBLOCKPAD],
761		   dcomplex y[NX][FFTBLOCKPAD]) {
762
763/*--------------------------------------------------------------------
764c-------------------------------------------------------------------*/
765
766/*--------------------------------------------------------------------
767c   Performs the L-th iteration of the second variant of the Stockham FFT.
768c-------------------------------------------------------------------*/
769
770    int k,n1,li,lj,lk,ku,i,j,i11,i12,i21,i22;
771    dcomplex u1,x11,x21;
772
773/*--------------------------------------------------------------------
774c   Set initial parameters.
775c-------------------------------------------------------------------*/
776
777    n1 = n / 2;
778    if (l-1 == 0) {
779	lk = 1;
780    } else {
781	lk = 2 << ((l - 1)-1);
782    }
783    if (m-l == 0) {
784	li = 1;
785    } else {
786	li = 2 << ((m - l)-1);
787    }
788    lj = 2 * lk;
789    ku = li;
790
791    for (i = 0; i < li; i++) {
792
793        i11 = i * lk;
794        i12 = i11 + n1;
795        i21 = i * lj;
796        i22 = i21 + lk;
797        if (is >= 1) {
798          u1.real = u[ku+i].real;
799          u1.imag = u[ku+i].imag;
800        } else {
801          u1.real = u[ku+i].real;
802          u1.imag = -u[ku+i].imag;
803        }
804
805/*--------------------------------------------------------------------
806c   This loop is vectorizable.
807c-------------------------------------------------------------------*/
808        for (k = 0; k < lk; k++) {
809	    for (j = 0; j < ny; j++) {
810		double x11real, x11imag;
811		double x21real, x21imag;
812		x11real = x[i11+k][j].real;
813		x11imag = x[i11+k][j].imag;
814		x21real = x[i12+k][j].real;
815		x21imag = x[i12+k][j].imag;
816		y[i21+k][j].real = x11real + x21real;
817		y[i21+k][j].imag = x11imag + x21imag;
818		y[i22+k][j].real = u1.real * (x11real - x21real)
819		    - u1.imag * (x11imag - x21imag);
820		y[i22+k][j].imag = u1.real * (x11imag - x21imag)
821		    + u1.imag * (x11real - x21real);
822	    }
823	}
824    }
825}
826
827
828/*--------------------------------------------------------------------
829c-------------------------------------------------------------------*/
830
831static int ilog2(int n) {
832
833/*--------------------------------------------------------------------
834c-------------------------------------------------------------------*/
835
836    int nn, lg;
837
838    if (n == 1) {
839	return 0;
840    }
841    lg = 1;
842    nn = 2;
843    while (nn < n) {
844	nn = nn << 1;
845	lg++;
846    }
847
848    return lg;
849}
850
851
852/*--------------------------------------------------------------------
853c-------------------------------------------------------------------*/
854
855static void checksum(int i, dcomplex u1[NZ][NY][NX], int d[3]) {
856
857/*--------------------------------------------------------------------
858c-------------------------------------------------------------------*/
859
860    int j, q,r,s, ierr;
861    dcomplex chk,allchk;
862
863    chk.real = 0.0;
864    chk.imag = 0.0;
865
866#pragma omp for nowait
867    for (j = 1; j <= 1024; j++) {
868	q = j%NX+1;
869	if (q >= xstart[0] && q <= xend[0]) {
870            r = (3*j)%NY+1;
871            if (r >= ystart[0] && r <= yend[0]) {
872		s = (5*j)%NZ+1;
873		if (s >= zstart[0] && s <= zend[0]) {
874		  cadd(chk,chk,u1[s-zstart[0]][r-ystart[0]][q-xstart[0]]);
875		}
876	    }
877	}
878    }
879#pragma omp critical
880    {
881	sums[i].real += chk.real;
882	sums[i].imag += chk.imag;
883    }
884#pragma omp barrier
885#pragma omp single
886  {
887    /* complex % real */
888    sums[i].real = sums[i].real/(double)(NTOTAL);
889    sums[i].imag = sums[i].imag/(double)(NTOTAL);
890
891/*     printf("T = %5d     Checksum = %22.12e %22.12e\n", */
892/* 	   i, sums[i].real, sums[i].imag); */
893  }
894}
895
896
897/*--------------------------------------------------------------------
898c-------------------------------------------------------------------*/
899
900static void verify (int d1, int d2, int d3, int nt,
901		    boolean *verified, char *class) {
902
903/*--------------------------------------------------------------------
904c-------------------------------------------------------------------*/
905
906    int ierr, size, i;
907    double err, epsilon;
908
909/*--------------------------------------------------------------------
910c   Sample size reference checksums
911c-------------------------------------------------------------------*/
912
913/*--------------------------------------------------------------------
914c   Class S size reference checksums
915c-------------------------------------------------------------------*/
916    double vdata_real_s[6+1] = { 0.0,
917				 5.546087004964e+02,
918				 5.546385409189e+02,
919				 5.546148406171e+02,
920				 5.545423607415e+02,
921				 5.544255039624e+02,
922				 5.542683411902e+02 };
923    double vdata_imag_s[6+1] = { 0.0,
924				 4.845363331978e+02,
925				 4.865304269511e+02,
926				 4.883910722336e+02,
927				 4.901273169046e+02,
928				 4.917475857993e+02,
929				 4.932597244941e+02 };
930/*--------------------------------------------------------------------
931c   Class W size reference checksums
932c-------------------------------------------------------------------*/
933    double vdata_real_w[6+1] = { 0.0,
934				 5.673612178944e+02,
935				 5.631436885271e+02,
936				 5.594024089970e+02,
937				 5.560698047020e+02,
938				 5.530898991250e+02,
939				 5.504159734538e+02 };
940    double vdata_imag_w[6+1] = { 0.0,
941				 5.293246849175e+02,
942				 5.282149986629e+02,
943				 5.270996558037e+02,
944				 5.260027904925e+02,
945				 5.249400845633e+02,
946				 5.239212247086e+02 };
947/*--------------------------------------------------------------------
948c   Class A size reference checksums
949c-------------------------------------------------------------------*/
950    double vdata_real_a[6+1] = { 0.0,
951				 5.046735008193e+02,
952				 5.059412319734e+02,
953				 5.069376896287e+02,
954				 5.077892868474e+02,
955				 5.085233095391e+02,
956				 5.091487099959e+02 };
957    double vdata_imag_a[6+1] = { 0.0,
958				 5.114047905510e+02,
959				 5.098809666433e+02,
960				 5.098144042213e+02,
961				 5.101336130759e+02,
962				 5.104914655194e+02,
963				 5.107917842803e+02 };
964/*--------------------------------------------------------------------
965c   Class B size reference checksums
966c-------------------------------------------------------------------*/
967    double vdata_real_b[20+1] = { 0.0,
968				  5.177643571579e+02,
969				  5.154521291263e+02,
970				  5.146409228649e+02,
971				  5.142378756213e+02,
972				  5.139626667737e+02,
973				  5.137423460082e+02,
974				  5.135547056878e+02,
975				  5.133910925466e+02,
976				  5.132470705390e+02,
977				  5.131197729984e+02,
978				  5.130070319283e+02,
979				  5.129070537032e+02,
980				  5.128182883502e+02,
981				  5.127393733383e+02,
982				  5.126691062020e+02,
983				  5.126064276004e+02,
984				  5.125504076570e+02,
985				  5.125002331720e+02,
986				  5.124551951846e+02,
987				  5.124146770029e+02 };
988    double vdata_imag_b[20+1] = { 0.0,
989				  5.077803458597e+02,
990				  5.088249431599e+02,
991				  5.096208912659e+02,
992				  5.101023387619e+02,
993				  5.103976610617e+02,
994				  5.105948019802e+02,
995				  5.107404165783e+02,
996				  5.108576573661e+02,
997				  5.109577278523e+02,
998				  5.110460304483e+02,
999				  5.111252433800e+02,
1000				  5.111968077718e+02,
1001				  5.112616233064e+02,
1002				  5.113203605551e+02,
1003				  5.113735928093e+02,
1004				  5.114218460548e+02,
1005				  5.114656139760e+02,
1006				  5.115053595966e+02,
1007				  5.115415130407e+02,
1008				  5.115744692211e+02 };
1009/*--------------------------------------------------------------------
1010c   Class C size reference checksums
1011c-------------------------------------------------------------------*/
1012    double vdata_real_c[20+1] = { 0.0,
1013				  5.195078707457e+02,
1014				  5.155422171134e+02,
1015				  5.144678022222e+02,
1016				  5.140150594328e+02,
1017				  5.137550426810e+02,
1018				  5.135811056728e+02,
1019				  5.134569343165e+02,
1020				  5.133651975661e+02,
1021				  5.132955192805e+02,
1022				  5.132410471738e+02,
1023				  5.131971141679e+02,
1024				  5.131605205716e+02,
1025				  5.131290734194e+02,
1026				  5.131012720314e+02,
1027				  5.130760908195e+02,
1028				  5.130528295923e+02,
1029				  5.130310107773e+02,
1030				  5.130103090133e+02,
1031				  5.129905029333e+02,
1032				  5.129714421109e+02 };
1033    double vdata_imag_c[20+1] = { 0.0,
1034				  5.149019699238e+02,
1035				  5.127578201997e+02,
1036				  5.122251847514e+02,
1037				  5.121090289018e+02,
1038				  5.121143685824e+02,
1039				  5.121496764568e+02,
1040				  5.121870921893e+02,
1041				  5.122193250322e+02,
1042				  5.122454735794e+02,
1043				  5.122663649603e+02,
1044				  5.122830879827e+02,
1045				  5.122965869718e+02,
1046				  5.123075927445e+02,
1047				  5.123166486553e+02,
1048				  5.123241541685e+02,
1049				  5.123304037599e+02,
1050				  5.123356167976e+02,
1051				  5.123399592211e+02,
1052				  5.123435588985e+02,
1053				  5.123465164008e+02 };
1054
1055    epsilon = 1.0e-12;
1056    *verified = TRUE;
1057    *class = 'U';
1058
1059    if (d1 == 64 &&
1060	d2 == 64 &&
1061	d3 == 64 &&
1062	nt == 6) {
1063	*class = 'S';
1064	for (i = 1; i <= nt; i++) {
1065            err = (get_real(sums[i]) - vdata_real_s[i]) / vdata_real_s[i];
1066            if (fabs(err) > epsilon) {
1067	      *verified = FALSE;
1068	      break;
1069	    }
1070            err = (get_imag(sums[i]) - vdata_imag_s[i]) / vdata_imag_s[i];
1071            if (fabs(err) > epsilon) {
1072	      *verified = FALSE;
1073	      break;
1074	    }
1075	}
1076    } else if (d1 == 128 &&
1077	       d2 == 128 &&
1078	       d3 == 32 &&
1079	       nt == 6) {
1080	*class = 'W';
1081	for (i = 1; i <= nt; i++) {
1082            err = (get_real(sums[i]) - vdata_real_w[i]) / vdata_real_w[i];
1083            if (fabs(err) > epsilon) {
1084	      *verified = FALSE;
1085	      break;
1086	    }
1087            err = (get_imag(sums[i]) - vdata_imag_w[i]) / vdata_imag_w[i];
1088            if (fabs(err) > epsilon) {
1089	      *verified = FALSE;
1090	      break;
1091	    }
1092	}
1093    } else if (d1 == 256 &&
1094	       d2 == 256 &&
1095	       d3 == 128 &&
1096	       nt == 6) {
1097	*class = 'A';
1098	for (i = 1; i <= nt; i++) {
1099            err = (get_real(sums[i]) - vdata_real_a[i]) / vdata_real_a[i];
1100            if (fabs(err) > epsilon) {
1101	      *verified = FALSE;
1102	      break;
1103	    }
1104            err = (get_imag(sums[i]) - vdata_imag_a[i]) / vdata_imag_a[i];
1105            if (fabs(err) > epsilon) {
1106	      *verified = FALSE;
1107	      break;
1108	    }
1109	}
1110    } else if (d1 == 512 &&
1111	       d2 == 256 &&
1112	       d3 == 256 &&
1113	       nt == 20) {
1114	*class = 'B';
1115	for (i = 1; i <= nt; i++) {
1116            err = (get_real(sums[i]) - vdata_real_b[i]) / vdata_real_b[i];
1117            if (fabs(err) > epsilon) {
1118	      *verified = FALSE;
1119	      break;
1120	    }
1121            err = (get_imag(sums[i]) - vdata_imag_b[i]) / vdata_imag_b[i];
1122            if (fabs(err) > epsilon) {
1123	      *verified = FALSE;
1124	      break;
1125	    }
1126	}
1127    } else if (d1 == 512 &&
1128	       d2 == 512 &&
1129	       d3 == 512 &&
1130	       nt == 20) {
1131	*class = 'C';
1132	for (i = 1; i <= nt; i++) {
1133            err = (get_real(sums[i]) - vdata_real_c[i]) / vdata_real_c[i];
1134            if (fabs(err) > epsilon) {
1135	      *verified = FALSE;
1136	      break;
1137	    }
1138            err = (get_imag(sums[i]) - vdata_imag_c[i]) / vdata_imag_c[i];
1139            if (fabs(err) > epsilon) {
1140	      *verified = FALSE;
1141	      break;
1142	    }
1143	}
1144    }
1145
1146    if (*class != 'U') {
1147	printf("Result verification successful\n");
1148    } else {
1149	printf("Result verification failed\n");
1150    }
1151    printf("class = %1c\n", *class);
1152}
1153
1154#define STACK_SIZE (8 * 1024 * 1024)
1155
1156int main(int argc, char** argv)
1157{
1158    if (argc != 2) { /* Print usage */
1159        printf("Usage: %s <Number of threads>\n", argv[0]);
1160        exit(-1);
1161    }
1162
1163    uint64_t arg1 = (uint64_t)atoi(argv[1]);
1164
1165#ifdef BOMP
1166    bomp_bomp_init_varstack(arg1, STACK_SIZE);
1167    // need to create thread with large enough stack on Barrelfish, as BF
1168    // thread stacks are not dynamically resized.
1169    bomp_run_main(realmain, (void*)arg1, STACK_SIZE);
1170#else
1171    return realmain((void*)arg1);
1172#endif
1173}
1174