1/*--------------------------------------------------------------------
2
3  NAS Parallel Benchmarks 2.3 OpenMP C versions - EP
4
5  This benchmark is an OpenMP C version of the NPB EP code.
6
7  The OpenMP C versions are developed by RWCP and derived from the serial
8  Fortran versions in "NPB 2.3-serial" developed by NAS.
9
10  Permission to use, copy, distribute and modify this software for any
11  purpose with or without fee is hereby granted.
12  This software is provided "as is" without express or implied warranty.
13
14  Send comments on the OpenMP C versions to pdp-openmp@rwcp.or.jp
15
16  Information on OpenMP activities at RWCP is available at:
17
18           http://pdplab.trc.rwcp.or.jp/pdperf/Omni/
19
20  Information on NAS Parallel Benchmarks 2.3 is available at:
21
22           http://www.nas.nasa.gov/NAS/NPB/
23
24--------------------------------------------------------------------*/
25/*--------------------------------------------------------------------
26
27  Author: P. O. Frederickson
28          D. H. Bailey
29          A. C. Woo
30
31  OpenMP C version: S. Satoh
32
33--------------------------------------------------------------------*/
34
35#include "npb-C.h"
36#include "npbparams.h"
37
38/* parameters */
39#define	MK	        16
40#define	MM		(M - MK)
41#define	NN		(1 << MM)
42#define	NK		(1 << MK)
43#define	NQ		10
44#define EPSILON		1.0e-8
45#define	A		1220703125.0
46#define	S		271828183.0
47#define	TIMERS_ENABLED	FALSE
48
49/* global variables */
50/* common /storage/ */
51static double x[2*NK];
52#pragma omp threadprivate(x)
53static double q[NQ];
54
55/*--------------------------------------------------------------------
56      program EMBAR
57c-------------------------------------------------------------------*/
58/*
59c   This is the serial version of the APP Benchmark 1,
60c   the "embarassingly parallel" benchmark.
61c
62c   M is the Log_2 of the number of complex pairs of uniform (0, 1) random
63c   numbers.  MK is the Log_2 of the size of each batch of uniform random
64c   numbers.  MK can be set for convenience on a given system, since it does
65c   not affect the results.
66*/
67int main(int argc, char **argv) {
68
69    double Mops, t1, t2, t3, t4, x1, x2, sx, sy, tm, an, tt, gc;
70    double dum[3] = { 1.0, 1.0, 1.0 };
71    int np, ierr, node, no_nodes, i, ik, kk, l, k, nit, ierrcode,
72	no_large_nodes, np_add, k_offset, j;
73    int nthreads = 1;
74    boolean verified;
75    char size[13+1];	/* character*13 */
76
77/*
78c   Because the size of the problem is too large to store in a 32-bit
79c   integer for some classes, we put it into a string (for printing).
80c   Have to strip off the decimal point put in there by the floating
81c   point print statement (internal file)
82*/
83#ifndef POSIX
84#ifndef NOBOMP
85    bomp_bomp_init(1);
86#endif
87#endif
88    omp_set_num_threads(1);
89    printf("\n\n NAS Parallel Benchmarks 2.3 OpenMP C version"
90	   " - EP Benchmark\n");
91    sprintf(size, "%12.0f", pow(2.0, M+1));
92    for (j = 13; j >= 1; j--) {
93	if (size[j] == '.') size[j] = ' ';
94    }
95    printf(" Number of random numbers generated: %13s\n", size);
96
97    verified = FALSE;
98
99/*
100c   Compute the number of "batches" of random number pairs generated
101c   per processor. Adjust if the number of processors does not evenly
102c   divide the total number
103*/
104    np = NN;
105
106/*
107c   Call the random number generator functions and initialize
108c   the x-array to reduce the effects of paging on the timings.
109c   Also, call all mathematical functions that are used. Make
110c   sure these initializations cannot be eliminated as dead code.
111*/
112    vranlc(0, &(dum[0]), dum[1], &(dum[2]));
113    dum[0] = randlc(&(dum[1]), dum[2]);
114
115    for (i = 0; i < 2*NK; i++)
116    {
117	x[i] = -1.0e99;
118    }
119
120    printf("Reached here ");
121    Mops = log(sqrt(fabs(max(1.0, 1.0))));
122
123    timer_clear(1);
124    timer_clear(2);
125    timer_clear(3);
126    timer_start(1);
127
128    vranlc(0, &t1, A, x);
129
130/*   Compute AN = A ^ (2 * NK) (mod 2^46). */
131
132    t1 = A;
133
134    for ( i = 1; i <= MK+1; i++) {
135	t2 = randlc(&t1, t1);
136    }
137
138    an = t1;
139    tt = S;
140    gc = 0.0;
141    sx = 0.0;
142    sy = 0.0;
143
144    for ( i = 0; i <= NQ - 1; i++) {
145	q[i] = 0.0;
146    }
147
148/*
149c   Each instance of this loop may be performed independently. We compute
150c   the k offsets separately to take into account the fact that some nodes
151c   have more numbers to generate than others
152*/
153    k_offset = -1;
154
155#pragma omp parallel copyin(x)
156{
157    double t1, t2, t3, t4, x1, x2;
158    int kk, i, ik, l;
159    double qq[NQ];		/* private copy of q[0:NQ-1] */
160
161    for (i = 0; i < NQ; i++) qq[i] = 0.0;
162
163#pragma omp for reduction(+:sx,sy) schedule(static)
164    for (k = 1; k <= np; k++) {
165	kk = k_offset + k;
166	t1 = S;
167	t2 = an;
168
169/*      Find starting seed t1 for this kk. */
170
171	for (i = 1; i <= 100; i++) {
172            ik = kk / 2;
173            if (2 * ik != kk) t3 = randlc(&t1, t2);
174            if (ik == 0) break;
175            t3 = randlc(&t2, t2);
176            kk = ik;
177	}
178
179/*      Compute uniform pseudorandom numbers. */
180
181	if (TIMERS_ENABLED == TRUE) timer_start(3);
182	vranlc(2*NK, &t1, A, x-1);
183	if (TIMERS_ENABLED == TRUE) timer_stop(3);
184
185/*
186c       Compute Gaussian deviates by acceptance-rejection method and
187c       tally counts in concentric square annuli.  This loop is not
188c       vectorizable.
189*/
190	if (TIMERS_ENABLED == TRUE) timer_start(2);
191
192	for ( i = 0; i < NK; i++) {
193            x1 = 2.0 * x[2*i] - 1.0;
194            x2 = 2.0 * x[2*i+1] - 1.0;
195            t1 = pow2(x1) + pow2(x2);
196            if (t1 <= 1.0) {
197		t2 = sqrt(-2.0 * log(t1) / t1);
198		t3 = (x1 * t2);				/* Xi */
199		t4 = (x2 * t2);				/* Yi */
200		l = max(fabs(t3), fabs(t4));
201		qq[l] += 1.0;				/* counts */
202		sx = sx + t3;				/* sum of Xi */
203		sy = sy + t4;				/* sum of Yi */
204            }
205	}
206	if (TIMERS_ENABLED == TRUE) timer_stop(2);
207    }
208#pragma omp critical
209    {
210      for (i = 0; i <= NQ - 1; i++) q[i] += qq[i];
211    }
212#if defined(_OPENMP)
213#pragma omp master
214    nthreads = omp_get_num_threads();
215#endif /* _OPENMP */
216} /* end of parallel region */
217
218    for (i = 0; i <= NQ-1; i++) {
219        gc = gc + q[i];
220    }
221
222    timer_stop(1);
223    tm = timer_read(1);
224
225    nit = 0;
226    if (M == 24) {
227	if((fabs((sx- (-3.247834652034740e3))/sx) <= EPSILON) &&
228	   (fabs((sy- (-6.958407078382297e3))/sy) <= EPSILON)) {
229	    verified = TRUE;
230	}
231    } else if (M == 25) {
232	if ((fabs((sx- (-2.863319731645753e3))/sx) <= EPSILON) &&
233	    (fabs((sy- (-6.320053679109499e3))/sy) <= EPSILON)) {
234	    verified = TRUE;
235	}
236    } else if (M == 28) {
237	if ((fabs((sx- (-4.295875165629892e3))/sx) <= EPSILON) &&
238	    (fabs((sy- (-1.580732573678431e4))/sy) <= EPSILON)) {
239	    verified = TRUE;
240	}
241    } else if (M == 30) {
242	if ((fabs((sx- (4.033815542441498e4))/sx) <= EPSILON) &&
243	    (fabs((sy- (-2.660669192809235e4))/sy) <= EPSILON)) {
244	    verified = TRUE;
245	}
246    } else if (M == 32) {
247	if ((fabs((sx- (4.764367927995374e4))/sx) <= EPSILON) &&
248	    (fabs((sy- (-8.084072988043731e4))/sy) <= EPSILON)) {
249	    verified = TRUE;
250	}
251    }
252
253    Mops = pow(2.0, M+1)/tm/1000000.0;
254
255    printf("EP Benchmark Results: \n"
256	   "CPU Time = %10.4f\n"
257	   "N = 2^%5d\n"
258	   "No. Gaussian Pairs = %15.0f\n"
259	   "Sums = %25.15e %25.15e\n"
260	   "Counts:\n",
261	   tm, M, gc, sx, sy);
262    for (i = 0; i  <= NQ-1; i++) {
263	printf("%3d %15.0f\n", i, q[i]);
264    }
265
266    c_print_results("EP", CLASS, M+1, 0, 0, nit, nthreads,
267		  tm, Mops,
268		  "Random numbers generated",
269		  verified, NPBVERSION, COMPILETIME,
270		  CS1, CS2, CS3, CS4, CS5, CS6, CS7);
271
272    if (TIMERS_ENABLED == TRUE) {
273	printf("Total time:     %f", timer_read(1));
274	printf("Gaussian pairs: %f", timer_read(2));
275	printf("Random numbers: %f", timer_read(3));
276    }
277}
278