1/*-------------------------------------------------------------------- 2 3 NAS Parallel Benchmarks 2.3 OpenMP C versions - EP 4 5 This benchmark is an OpenMP C version of the NPB EP code. 6 7 The OpenMP C versions are developed by RWCP and derived from the serial 8 Fortran versions in "NPB 2.3-serial" developed by NAS. 9 10 Permission to use, copy, distribute and modify this software for any 11 purpose with or without fee is hereby granted. 12 This software is provided "as is" without express or implied warranty. 13 14 Send comments on the OpenMP C versions to pdp-openmp@rwcp.or.jp 15 16 Information on OpenMP activities at RWCP is available at: 17 18 http://pdplab.trc.rwcp.or.jp/pdperf/Omni/ 19 20 Information on NAS Parallel Benchmarks 2.3 is available at: 21 22 http://www.nas.nasa.gov/NAS/NPB/ 23 24--------------------------------------------------------------------*/ 25/*-------------------------------------------------------------------- 26 27 Author: P. O. Frederickson 28 D. H. Bailey 29 A. C. Woo 30 31 OpenMP C version: S. Satoh 32 33--------------------------------------------------------------------*/ 34 35#include "npb-C.h" 36#include "npbparams.h" 37 38/* parameters */ 39#define MK 16 40#define MM (M - MK) 41#define NN (1 << MM) 42#define NK (1 << MK) 43#define NQ 10 44#define EPSILON 1.0e-8 45#define A 1220703125.0 46#define S 271828183.0 47#define TIMERS_ENABLED FALSE 48 49/* global variables */ 50/* common /storage/ */ 51static double x[2*NK]; 52#pragma omp threadprivate(x) 53static double q[NQ]; 54 55/*-------------------------------------------------------------------- 56 program EMBAR 57c-------------------------------------------------------------------*/ 58/* 59c This is the serial version of the APP Benchmark 1, 60c the "embarassingly parallel" benchmark. 61c 62c M is the Log_2 of the number of complex pairs of uniform (0, 1) random 63c numbers. MK is the Log_2 of the size of each batch of uniform random 64c numbers. MK can be set for convenience on a given system, since it does 65c not affect the results. 66*/ 67int main(int argc, char **argv) { 68 69 double Mops, t1, t2, t3, t4, x1, x2, sx, sy, tm, an, tt, gc; 70 double dum[3] = { 1.0, 1.0, 1.0 }; 71 int np, ierr, node, no_nodes, i, ik, kk, l, k, nit, ierrcode, 72 no_large_nodes, np_add, k_offset, j; 73 int nthreads = 1; 74 boolean verified; 75 char size[13+1]; /* character*13 */ 76 77/* 78c Because the size of the problem is too large to store in a 32-bit 79c integer for some classes, we put it into a string (for printing). 80c Have to strip off the decimal point put in there by the floating 81c point print statement (internal file) 82*/ 83#ifndef POSIX 84#ifndef NOBOMP 85 bomp_bomp_init(1); 86#endif 87#endif 88 omp_set_num_threads(1); 89 printf("\n\n NAS Parallel Benchmarks 2.3 OpenMP C version" 90 " - EP Benchmark\n"); 91 sprintf(size, "%12.0f", pow(2.0, M+1)); 92 for (j = 13; j >= 1; j--) { 93 if (size[j] == '.') size[j] = ' '; 94 } 95 printf(" Number of random numbers generated: %13s\n", size); 96 97 verified = FALSE; 98 99/* 100c Compute the number of "batches" of random number pairs generated 101c per processor. Adjust if the number of processors does not evenly 102c divide the total number 103*/ 104 np = NN; 105 106/* 107c Call the random number generator functions and initialize 108c the x-array to reduce the effects of paging on the timings. 109c Also, call all mathematical functions that are used. Make 110c sure these initializations cannot be eliminated as dead code. 111*/ 112 vranlc(0, &(dum[0]), dum[1], &(dum[2])); 113 dum[0] = randlc(&(dum[1]), dum[2]); 114 115 for (i = 0; i < 2*NK; i++) 116 { 117 x[i] = -1.0e99; 118 } 119 120 printf("Reached here "); 121 Mops = log(sqrt(fabs(max(1.0, 1.0)))); 122 123 timer_clear(1); 124 timer_clear(2); 125 timer_clear(3); 126 timer_start(1); 127 128 vranlc(0, &t1, A, x); 129 130/* Compute AN = A ^ (2 * NK) (mod 2^46). */ 131 132 t1 = A; 133 134 for ( i = 1; i <= MK+1; i++) { 135 t2 = randlc(&t1, t1); 136 } 137 138 an = t1; 139 tt = S; 140 gc = 0.0; 141 sx = 0.0; 142 sy = 0.0; 143 144 for ( i = 0; i <= NQ - 1; i++) { 145 q[i] = 0.0; 146 } 147 148/* 149c Each instance of this loop may be performed independently. We compute 150c the k offsets separately to take into account the fact that some nodes 151c have more numbers to generate than others 152*/ 153 k_offset = -1; 154 155#pragma omp parallel copyin(x) 156{ 157 double t1, t2, t3, t4, x1, x2; 158 int kk, i, ik, l; 159 double qq[NQ]; /* private copy of q[0:NQ-1] */ 160 161 for (i = 0; i < NQ; i++) qq[i] = 0.0; 162 163#pragma omp for reduction(+:sx,sy) schedule(static) 164 for (k = 1; k <= np; k++) { 165 kk = k_offset + k; 166 t1 = S; 167 t2 = an; 168 169/* Find starting seed t1 for this kk. */ 170 171 for (i = 1; i <= 100; i++) { 172 ik = kk / 2; 173 if (2 * ik != kk) t3 = randlc(&t1, t2); 174 if (ik == 0) break; 175 t3 = randlc(&t2, t2); 176 kk = ik; 177 } 178 179/* Compute uniform pseudorandom numbers. */ 180 181 if (TIMERS_ENABLED == TRUE) timer_start(3); 182 vranlc(2*NK, &t1, A, x-1); 183 if (TIMERS_ENABLED == TRUE) timer_stop(3); 184 185/* 186c Compute Gaussian deviates by acceptance-rejection method and 187c tally counts in concentric square annuli. This loop is not 188c vectorizable. 189*/ 190 if (TIMERS_ENABLED == TRUE) timer_start(2); 191 192 for ( i = 0; i < NK; i++) { 193 x1 = 2.0 * x[2*i] - 1.0; 194 x2 = 2.0 * x[2*i+1] - 1.0; 195 t1 = pow2(x1) + pow2(x2); 196 if (t1 <= 1.0) { 197 t2 = sqrt(-2.0 * log(t1) / t1); 198 t3 = (x1 * t2); /* Xi */ 199 t4 = (x2 * t2); /* Yi */ 200 l = max(fabs(t3), fabs(t4)); 201 qq[l] += 1.0; /* counts */ 202 sx = sx + t3; /* sum of Xi */ 203 sy = sy + t4; /* sum of Yi */ 204 } 205 } 206 if (TIMERS_ENABLED == TRUE) timer_stop(2); 207 } 208#pragma omp critical 209 { 210 for (i = 0; i <= NQ - 1; i++) q[i] += qq[i]; 211 } 212#if defined(_OPENMP) 213#pragma omp master 214 nthreads = omp_get_num_threads(); 215#endif /* _OPENMP */ 216} /* end of parallel region */ 217 218 for (i = 0; i <= NQ-1; i++) { 219 gc = gc + q[i]; 220 } 221 222 timer_stop(1); 223 tm = timer_read(1); 224 225 nit = 0; 226 if (M == 24) { 227 if((fabs((sx- (-3.247834652034740e3))/sx) <= EPSILON) && 228 (fabs((sy- (-6.958407078382297e3))/sy) <= EPSILON)) { 229 verified = TRUE; 230 } 231 } else if (M == 25) { 232 if ((fabs((sx- (-2.863319731645753e3))/sx) <= EPSILON) && 233 (fabs((sy- (-6.320053679109499e3))/sy) <= EPSILON)) { 234 verified = TRUE; 235 } 236 } else if (M == 28) { 237 if ((fabs((sx- (-4.295875165629892e3))/sx) <= EPSILON) && 238 (fabs((sy- (-1.580732573678431e4))/sy) <= EPSILON)) { 239 verified = TRUE; 240 } 241 } else if (M == 30) { 242 if ((fabs((sx- (4.033815542441498e4))/sx) <= EPSILON) && 243 (fabs((sy- (-2.660669192809235e4))/sy) <= EPSILON)) { 244 verified = TRUE; 245 } 246 } else if (M == 32) { 247 if ((fabs((sx- (4.764367927995374e4))/sx) <= EPSILON) && 248 (fabs((sy- (-8.084072988043731e4))/sy) <= EPSILON)) { 249 verified = TRUE; 250 } 251 } 252 253 Mops = pow(2.0, M+1)/tm/1000000.0; 254 255 printf("EP Benchmark Results: \n" 256 "CPU Time = %10.4f\n" 257 "N = 2^%5d\n" 258 "No. Gaussian Pairs = %15.0f\n" 259 "Sums = %25.15e %25.15e\n" 260 "Counts:\n", 261 tm, M, gc, sx, sy); 262 for (i = 0; i <= NQ-1; i++) { 263 printf("%3d %15.0f\n", i, q[i]); 264 } 265 266 c_print_results("EP", CLASS, M+1, 0, 0, nit, nthreads, 267 tm, Mops, 268 "Random numbers generated", 269 verified, NPBVERSION, COMPILETIME, 270 CS1, CS2, CS3, CS4, CS5, CS6, CS7); 271 272 if (TIMERS_ENABLED == TRUE) { 273 printf("Total time: %f", timer_read(1)); 274 printf("Gaussian pairs: %f", timer_read(2)); 275 printf("Random numbers: %f", timer_read(3)); 276 } 277} 278