1/*-------------------------------------------------------------------- 2 3 NAS Parallel Benchmarks 2.3 OpenMP C versions - FT 4 5 This benchmark is an OpenMP C version of the NPB FT code. 6 7 The OpenMP C versions are developed by RWCP and derived from the serial 8 Fortran versions in "NPB 2.3-serial" developed by NAS. 9 10 Permission to use, copy, distribute and modify this software for any 11 purpose with or without fee is hereby granted. 12 This software is provided "as is" without express or implied warranty. 13 14 Send comments on the OpenMP C versions to pdp-openmp@rwcp.or.jp 15 16 Information on OpenMP activities at RWCP is available at: 17 18 http://pdplab.trc.rwcp.or.jp/pdperf/Omni/ 19 20 Information on NAS Parallel Benchmarks 2.3 is available at: 21 22 http://www.nas.nasa.gov/NAS/NPB/ 23 24--------------------------------------------------------------------*/ 25/*-------------------------------------------------------------------- 26 27 Authors: D. Bailey 28 W. Saphir 29 30 OpenMP C version: S. Satoh 31 32--------------------------------------------------------------------*/ 33 34#include <stdint.h> 35 36#include "npb-C.h" 37 38/* global variables */ 39#include "global.h" 40 41/* function declarations */ 42static void evolve(dcomplex u0[NZ][NY][NX], dcomplex u1[NZ][NY][NX], 43 int t, int indexmap[NZ][NY][NX], int d[3]); 44static void compute_initial_conditions(dcomplex u0[NZ][NY][NX], int d[3]); 45static void ipow46(double a, int exponent, double *result); 46static void setup(void); 47static void compute_indexmap(int indexmap[NZ][NY][NX], int d[3]); 48static void print_timers(void); 49static void fft(int dir, dcomplex x1[NZ][NY][NX], dcomplex x2[NZ][NY][NX]); 50static void cffts1(int is, int d[3], dcomplex x[NZ][NY][NX], 51 dcomplex xout[NZ][NY][NX], 52 dcomplex y0[NX][FFTBLOCKPAD], 53 dcomplex y1[NX][FFTBLOCKPAD]); 54static void cffts2(int is, int d[3], dcomplex x[NZ][NY][NX], 55 dcomplex xout[NZ][NY][NX], 56 dcomplex y0[NX][FFTBLOCKPAD], 57 dcomplex y1[NX][FFTBLOCKPAD]); 58static void cffts3(int is, int d[3], dcomplex x[NZ][NY][NX], 59 dcomplex xout[NZ][NY][NX], 60 dcomplex y0[NX][FFTBLOCKPAD], 61 dcomplex y1[NX][FFTBLOCKPAD]); 62static void fft_init (int n); 63static void cfftz (int is, int m, int n, dcomplex x[NX][FFTBLOCKPAD], 64 dcomplex y[NX][FFTBLOCKPAD]); 65static void fftz2 (int is, int l, int m, int n, int ny, int ny1, 66 dcomplex u[NX], dcomplex x[NX][FFTBLOCKPAD], 67 dcomplex y[NX][FFTBLOCKPAD]); 68static int ilog2(int n); 69static void checksum(int i, dcomplex u1[NZ][NY][NX], int d[3]); 70static void verify (int d1, int d2, int d3, int nt, 71 boolean *verified, char *class); 72 73/*-------------------------------------------------------------------- 74c FT benchmark 75c-------------------------------------------------------------------*/ 76 77static int realmain(void *carg) 78{ 79 unsigned arg = (uintptr_t)carg; 80 81/*c------------------------------------------------------------------- 82c-------------------------------------------------------------------*/ 83 84 int i, ierr; 85 86/*------------------------------------------------------------------ 87c u0, u1, u2 are the main arrays in the problem. 88c Depending on the decomposition, these arrays will have different 89c dimensions. To accomodate all possibilities, we allocate them as 90c one-dimensional arrays and pass them to subroutines for different 91c views 92c - u0 contains the initial (transformed) initial condition 93c - u1 and u2 are working arrays 94c - indexmap maps i,j,k of u0 to the correct i^2+j^2+k^2 for the 95c time evolution operator. 96c-----------------------------------------------------------------*/ 97 98/*-------------------------------------------------------------------- 99c Large arrays are in common so that they are allocated on the 100c heap rather than the stack. This common block is not 101c referenced directly anywhere else. Padding is to avoid accidental 102c cache problems, since all array sizes are powers of two. 103c-------------------------------------------------------------------*/ 104 static dcomplex u0[NZ][NY][NX]; 105 static dcomplex pad1[3]; 106 static dcomplex u1[NZ][NY][NX]; 107 static dcomplex pad2[3]; 108 static dcomplex u2[NZ][NY][NX]; 109 static dcomplex pad3[3]; 110 static int indexmap[NZ][NY][NX]; 111 112 int iter; 113 int nthreads = 1; 114 double total_time, mflops; 115 boolean verified; 116 char class; 117 118 omp_set_num_threads(arg); 119/*-------------------------------------------------------------------- 120c Run the entire problem once to make sure all data is touched. 121c This reduces variable startup costs, which is important for such a 122c short benchmark. The other NPB 2 implementations are similar. 123c-------------------------------------------------------------------*/ 124 for (i = 0; i < T_MAX; i++) { 125 timer_clear(i); 126 } 127 setup(); 128#pragma omp parallel 129 { 130 compute_indexmap(indexmap, dims[2]); 131#pragma omp single 132 { 133 compute_initial_conditions(u1, dims[0]); 134 fft_init (dims[0][0]); 135 } 136 fft(1, u1, u0); 137 } /* end parallel */ 138 139/*-------------------------------------------------------------------- 140c Start over from the beginning. Note that all operations must 141c be timed, in contrast to other benchmarks. 142c-------------------------------------------------------------------*/ 143 for (i = 0; i < T_MAX; i++) { 144 timer_clear(i); 145 } 146 147 timer_start(T_TOTAL); 148 if (TIMERS_ENABLED == TRUE) timer_start(T_SETUP); 149 150#pragma omp parallel private(iter) firstprivate(niter) 151 { 152 compute_indexmap(indexmap, dims[2]); 153 154#pragma omp single 155 { 156 compute_initial_conditions(u1, dims[0]); 157 158 fft_init (dims[0][0]); 159 } 160 161 if (TIMERS_ENABLED == TRUE) { 162#pragma omp master 163 timer_stop(T_SETUP); 164 } 165 if (TIMERS_ENABLED == TRUE) { 166#pragma omp master 167 timer_start(T_FFT); 168 } 169 fft(1, u1, u0); 170 if (TIMERS_ENABLED == TRUE) { 171#pragma omp master 172 timer_stop(T_FFT); 173 } 174 175 for (iter = 1; iter <= niter; iter++) { 176 if (TIMERS_ENABLED == TRUE) { 177#pragma omp master 178 timer_start(T_EVOLVE); 179 } 180 181 evolve(u0, u1, iter, indexmap, dims[0]); 182 183 if (TIMERS_ENABLED == TRUE) { 184#pragma omp master 185 timer_stop(T_EVOLVE); 186 } 187 if (TIMERS_ENABLED == TRUE) { 188#pragma omp master 189 timer_start(T_FFT); 190 } 191 192 fft(-1, u1, u2); 193 194 if (TIMERS_ENABLED == TRUE) { 195#pragma omp master 196 timer_stop(T_FFT); 197 } 198 if (TIMERS_ENABLED == TRUE) { 199#pragma omp master 200 timer_start(T_CHECKSUM); 201 } 202 203 checksum(iter, u2, dims[0]); 204 205 if (TIMERS_ENABLED == TRUE) { 206#pragma omp master 207 timer_stop(T_CHECKSUM); 208 } 209 } 210 211#pragma omp single 212 verify(NX, NY, NZ, niter, &verified, &class); 213 214#if defined(_OPENMP) 215#pragma omp master 216 nthreads = omp_get_num_threads(); 217#endif /* _OPENMP */ 218 } /* end parallel */ 219 220 timer_stop(T_TOTAL); 221 total_time = timer_read(T_TOTAL); 222 223 if( total_time != 0.0) { 224 mflops = 1.0e-6*(double)(NTOTAL) * 225 (14.8157+7.19641*log((double)(NTOTAL)) 226 + (5.23518+7.21113*log((double)(NTOTAL)))*niter) 227 /total_time; 228 } else { 229 mflops = 0.0; 230 } 231#ifdef BOMP 232//backend_create_time(arg); 233#endif 234printf("Computetime %d %f\n", arg, total_time); 235printf("client done\n"); 236/* c_print_results("FT", class, NX, NY, NZ, niter, nthreads, */ 237/* total_time, mflops, " floating point", verified, */ 238/* NPBVERSION, COMPILETIME, */ 239/* CS1, CS2, CS3, CS4, CS5, CS6, CS7); */ 240 if (TIMERS_ENABLED == TRUE) print_timers(); 241} 242 243/*-------------------------------------------------------------------- 244c-------------------------------------------------------------------*/ 245 246static void evolve(dcomplex u0[NZ][NY][NX], dcomplex u1[NZ][NY][NX], 247 int t, int indexmap[NZ][NY][NX], int d[3]) { 248 249/*-------------------------------------------------------------------- 250c-------------------------------------------------------------------*/ 251 252/*-------------------------------------------------------------------- 253c evolve u0 -> u1 (t time steps) in fourier space 254c-------------------------------------------------------------------*/ 255 256 int i, j, k; 257 258#pragma omp for 259 for (k = 0; k < d[2]; k++) { 260 for (j = 0; j < d[1]; j++) { 261 for (i = 0; i < d[0]; i++) { 262 crmul(u1[k][j][i], u0[k][j][i], ex[t*indexmap[k][j][i]]); 263 } 264 } 265 } 266} 267 268/*-------------------------------------------------------------------- 269c-------------------------------------------------------------------*/ 270 271static void compute_initial_conditions(dcomplex u0[NZ][NY][NX], int d[3]) { 272 273/*-------------------------------------------------------------------- 274c-------------------------------------------------------------------*/ 275 276/*-------------------------------------------------------------------- 277c Fill in array u0 with initial conditions from 278c random number generator 279c-------------------------------------------------------------------*/ 280 281 int k; 282 double x0, start, an, dummy; 283 static double tmp[NX*2*MAXDIM+1]; 284 int i,j,t; 285 286 start = SEED; 287/*-------------------------------------------------------------------- 288c Jump to the starting element for our first plane. 289c-------------------------------------------------------------------*/ 290 ipow46(A, (zstart[0]-1)*2*NX*NY + (ystart[0]-1)*2*NX, &an); 291 dummy = randlc(&start, an); 292 ipow46(A, 2*NX*NY, &an); 293 294/*-------------------------------------------------------------------- 295c Go through by z planes filling in one square at a time. 296c-------------------------------------------------------------------*/ 297 for (k = 0; k < dims[0][2]; k++) { 298 x0 = start; 299 vranlc(2*NX*dims[0][1], &x0, A, tmp); 300 301 t = 1; 302 for (j = 0; j < dims[0][1]; j++) 303 for (i = 0; i < NX; i++) { 304 u0[k][j][i].real = tmp[t++]; 305 u0[k][j][i].imag = tmp[t++]; 306 } 307 308 if (k != dims[0][2]) dummy = randlc(&start, an); 309 } 310} 311 312/*-------------------------------------------------------------------- 313c-------------------------------------------------------------------*/ 314 315static void ipow46(double a, int exponent, double *result) { 316 317/*-------------------------------------------------------------------- 318c-------------------------------------------------------------------*/ 319 320/*-------------------------------------------------------------------- 321c compute a^exponent mod 2^46 322c-------------------------------------------------------------------*/ 323 324 double dummy, q, r; 325 int n, n2; 326 327/*-------------------------------------------------------------------- 328c Use 329c a^n = a^(n/2)*a^(n/2) if n even else 330c a^n = a*a^(n-1) if n odd 331c-------------------------------------------------------------------*/ 332 *result = 1; 333 if (exponent == 0) return; 334 q = a; 335 r = 1; 336 n = exponent; 337 338 while (n > 1) { 339 n2 = n/2; 340 if (n2 * 2 == n) { 341 dummy = randlc(&q, q); 342 n = n2; 343 } else { 344 dummy = randlc(&r, q); 345 n = n-1; 346 } 347 } 348 dummy = randlc(&r, q); 349 *result = r; 350} 351 352/*-------------------------------------------------------------------- 353c-------------------------------------------------------------------*/ 354 355static void setup(void) { 356 357/*-------------------------------------------------------------------- 358c-------------------------------------------------------------------*/ 359 360 int ierr, i, j, fstatus; 361 362 printf("\n\n NAS Parallel Benchmarks 2.3 OpenMP C version" 363 " - FT Benchmark\n\n"); 364 365 niter = NITER_DEFAULT; 366 367 printf(" Size : %3dx%3dx%3d\n", NX, NY, NZ); 368 printf(" Iterations : %7d\n", niter); 369 370/* 1004 format(' Number of processes : ', i7) 371 1005 format(' Processor array : ', i3, 'x', i3) 372 1006 format(' WARNING: compiled for ', i5, ' processes. ', 373 > ' Will not verify. ')*/ 374 375 for (i = 0;i < 3 ; i++) { 376 dims[i][0] = NX; 377 dims[i][1] = NY; 378 dims[i][2] = NZ; 379 } 380 381 382 for (i = 0; i < 3; i++) { 383 xstart[i] = 1; 384 xend[i] = NX; 385 ystart[i] = 1; 386 yend[i] = NY; 387 zstart[i] = 1; 388 zend[i] = NZ; 389 } 390 391/*-------------------------------------------------------------------- 392c Set up info for blocking of ffts and transposes. This improves 393c performance on cache-based systems. Blocking involves 394c working on a chunk of the problem at a time, taking chunks 395c along the first, second, or third dimension. 396c 397c - In cffts1 blocking is on 2nd dimension (with fft on 1st dim) 398c - In cffts2/3 blocking is on 1st dimension (with fft on 2nd and 3rd dims) 399 400c Since 1st dim is always in processor, we'll assume it's long enough 401c (default blocking factor is 16 so min size for 1st dim is 16) 402c The only case we have to worry about is cffts1 in a 2d decomposition. 403c so the blocking factor should not be larger than the 2nd dimension. 404c-------------------------------------------------------------------*/ 405 406 fftblock = FFTBLOCK_DEFAULT; 407 fftblockpad = FFTBLOCKPAD_DEFAULT; 408 409 if (fftblock != FFTBLOCK_DEFAULT) fftblockpad = fftblock+3; 410} 411 412/*-------------------------------------------------------------------- 413c-------------------------------------------------------------------*/ 414 415static void compute_indexmap(int indexmap[NZ][NY][NX], int d[3]) { 416 417/*-------------------------------------------------------------------- 418c-------------------------------------------------------------------*/ 419 420/*-------------------------------------------------------------------- 421c compute function from local (i,j,k) to ibar^2+jbar^2+kbar^2 422c for time evolution exponent. 423c-------------------------------------------------------------------*/ 424 425 int i, j, k, ii, ii2, jj, ij2, kk; 426 double ap; 427 428/*-------------------------------------------------------------------- 429c basically we want to convert the fortran indices 430c 1 2 3 4 5 6 7 8 431c to 432c 0 1 2 3 -4 -3 -2 -1 433c The following magic formula does the trick: 434c mod(i-1+n/2, n) - n/2 435c-------------------------------------------------------------------*/ 436 437#pragma omp for 438 for (i = 0; i < dims[2][0]; i++) { 439 ii = (i+1+xstart[2]-2+NX/2)%NX - NX/2; 440 ii2 = ii*ii; 441 for (j = 0; j < dims[2][1]; j++) { 442 jj = (j+1+ystart[2]-2+NY/2)%NY - NY/2; 443 ij2 = jj*jj+ii2; 444 for (k = 0; k < dims[2][2]; k++) { 445 kk = (k+1+zstart[2]-2+NZ/2)%NZ - NZ/2; 446 indexmap[k][j][i] = kk*kk+ij2; 447 } 448 } 449 } 450 451/*-------------------------------------------------------------------- 452c compute array of exponentials for time evolution. 453c-------------------------------------------------------------------*/ 454#pragma omp single 455 { 456 ap = - 4.0 * ALPHA * PI * PI; 457 458 ex[0] = 1.0; 459 ex[1] = exp(ap); 460 for (i = 2; i <= EXPMAX; i++) { 461 ex[i] = ex[i-1]*ex[1]; 462 } 463 } /* end single */ 464} 465 466/*-------------------------------------------------------------------- 467c-------------------------------------------------------------------*/ 468 469static void print_timers(void) { 470 471/*-------------------------------------------------------------------- 472c-------------------------------------------------------------------*/ 473 474 int i; 475 char *tstrings[] = { " total ", 476 " setup ", 477 " fft ", 478 " evolve ", 479 " checksum ", 480 " fftlow ", 481 " fftcopy " }; 482 483 for (i = 0; i < T_MAX; i++) { 484 if (timer_read(i) != 0.0) { 485 printf("timer %2d(%16s( :%10.6f\n", i, tstrings[i], timer_read(i)); 486 } 487 } 488} 489 490 491/*-------------------------------------------------------------------- 492c-------------------------------------------------------------------*/ 493 494static void fft(int dir, dcomplex x1[NZ][NY][NX], dcomplex x2[NZ][NY][NX]) { 495 496/*-------------------------------------------------------------------- 497c-------------------------------------------------------------------*/ 498 499 dcomplex y0[NX][FFTBLOCKPAD]; 500 dcomplex y1[NX][FFTBLOCKPAD]; 501 502/*-------------------------------------------------------------------- 503c note: args x1, x2 must be different arrays 504c note: args for cfftsx are (direction, layout, xin, xout, scratch) 505c xin/xout may be the same and it can be somewhat faster 506c if they are 507c-------------------------------------------------------------------*/ 508 509 if (dir == 1) { 510 cffts1(1, dims[0], x1, x1, y0, y1); /* x1 -> x1 */ 511 cffts2(1, dims[1], x1, x1, y0, y1); /* x1 -> x1 */ 512 cffts3(1, dims[2], x1, x2, y0, y1); /* x1 -> x2 */ 513 } else { 514 cffts3(-1, dims[2], x1, x1, y0, y1); /* x1 -> x1 */ 515 cffts2(-1, dims[1], x1, x1, y0, y1); /* x1 -> x1 */ 516 cffts1(-1, dims[0], x1, x2, y0, y1); /* x1 -> x2 */ 517 } 518} 519 520 521/*-------------------------------------------------------------------- 522c-------------------------------------------------------------------*/ 523 524static void cffts1(int is, int d[3], dcomplex x[NZ][NY][NX], 525 dcomplex xout[NZ][NY][NX], 526 dcomplex y0[NX][FFTBLOCKPAD], 527 dcomplex y1[NX][FFTBLOCKPAD]) { 528 529/*-------------------------------------------------------------------- 530c-------------------------------------------------------------------*/ 531 532 int logd[3]; 533 int i, j, k, jj; 534 535 for (i = 0; i < 3; i++) { 536 logd[i] = ilog2(d[i]); 537 } 538 539#pragma omp for 540 for (k = 0; k < d[2]; k++) { 541 for (jj = 0; jj <= d[1] - fftblock; jj+=fftblock) { 542/* if (TIMERS_ENABLED == TRUE) timer_start(T_FFTCOPY); */ 543 for (j = 0; j < fftblock; j++) { 544 for (i = 0; i < d[0]; i++) { 545 y0[i][j].real = x[k][j+jj][i].real; 546 y0[i][j].imag = x[k][j+jj][i].imag; 547 } 548 } 549/* if (TIMERS_ENABLED == TRUE) timer_stop(T_FFTCOPY); */ 550 551/* if (TIMERS_ENABLED == TRUE) timer_start(T_FFTLOW); */ 552 cfftz (is, logd[0], 553 d[0], y0, y1); 554 555/* if (TIMERS_ENABLED == TRUE) timer_stop(T_FFTLOW); */ 556/* if (TIMERS_ENABLED == TRUE) timer_start(T_FFTCOPY); */ 557 for (j = 0; j < fftblock; j++) { 558 for (i = 0; i < d[0]; i++) { 559 xout[k][j+jj][i].real = y0[i][j].real; 560 xout[k][j+jj][i].imag = y0[i][j].imag; 561 } 562 } 563/* if (TIMERS_ENABLED == TRUE) timer_stop(T_FFTCOPY); */ 564 } 565 } 566} 567 568 569/*-------------------------------------------------------------------- 570c-------------------------------------------------------------------*/ 571 572static void cffts2(int is, int d[3], dcomplex x[NZ][NY][NX], 573 dcomplex xout[NZ][NY][NX], 574 dcomplex y0[NX][FFTBLOCKPAD], 575 dcomplex y1[NX][FFTBLOCKPAD]) { 576 577/*-------------------------------------------------------------------- 578c-------------------------------------------------------------------*/ 579 580 int logd[3]; 581 int i, j, k, ii; 582 583 for (i = 0; i < 3; i++) { 584 logd[i] = ilog2(d[i]); 585 } 586#pragma omp for 587 for (k = 0; k < d[2]; k++) { 588 for (ii = 0; ii <= d[0] - fftblock; ii+=fftblock) { 589/* if (TIMERS_ENABLED == TRUE) timer_start(T_FFTCOPY); */ 590 for (j = 0; j < d[1]; j++) { 591 for (i = 0; i < fftblock; i++) { 592 y0[j][i].real = x[k][j][i+ii].real; 593 y0[j][i].imag = x[k][j][i+ii].imag; 594 } 595 } 596/* if (TIMERS_ENABLED == TRUE) timer_stop(T_FFTCOPY); */ 597/* if (TIMERS_ENABLED == TRUE) timer_start(T_FFTLOW); */ 598 cfftz (is, logd[1], 599 d[1], y0, y1); 600 601/* if (TIMERS_ENABLED == TRUE) timer_stop(T_FFTLOW); */ 602/* if (TIMERS_ENABLED == TRUE) timer_start(T_FFTCOPY); */ 603 for (j = 0; j < d[1]; j++) { 604 for (i = 0; i < fftblock; i++) { 605 xout[k][j][i+ii].real = y0[j][i].real; 606 xout[k][j][i+ii].imag = y0[j][i].imag; 607 } 608 } 609/* if (TIMERS_ENABLED == TRUE) timer_stop(T_FFTCOPY); */ 610 } 611 } 612} 613 614/*-------------------------------------------------------------------- 615c-------------------------------------------------------------------*/ 616 617static void cffts3(int is, int d[3], dcomplex x[NZ][NY][NX], 618 dcomplex xout[NZ][NY][NX], 619 dcomplex y0[NX][FFTBLOCKPAD], 620 dcomplex y1[NX][FFTBLOCKPAD]) { 621 622/*-------------------------------------------------------------------- 623c-------------------------------------------------------------------*/ 624 625 int logd[3]; 626 int i, j, k, ii; 627 628 for (i = 0;i < 3; i++) { 629 logd[i] = ilog2(d[i]); 630 } 631#pragma omp for 632 for (j = 0; j < d[1]; j++) { 633 for (ii = 0; ii <= d[0] - fftblock; ii+=fftblock) { 634/* if (TIMERS_ENABLED == TRUE) timer_start(T_FFTCOPY); */ 635 for (k = 0; k < d[2]; k++) { 636 for (i = 0; i < fftblock; i++) { 637 y0[k][i].real = x[k][j][i+ii].real; 638 y0[k][i].imag = x[k][j][i+ii].imag; 639 } 640 } 641 642/* if (TIMERS_ENABLED == TRUE) timer_stop(T_FFTCOPY); */ 643/* if (TIMERS_ENABLED == TRUE) timer_start(T_FFTLOW); */ 644 cfftz (is, logd[2], 645 d[2], y0, y1); 646/* if (TIMERS_ENABLED == TRUE) timer_stop(T_FFTLOW); */ 647/* if (TIMERS_ENABLED == TRUE) timer_start(T_FFTCOPY); */ 648 for (k = 0; k < d[2]; k++) { 649 for (i = 0; i < fftblock; i++) { 650 xout[k][j][i+ii].real = y0[k][i].real; 651 xout[k][j][i+ii].imag = y0[k][i].imag; 652 } 653 } 654/* if (TIMERS_ENABLED == TRUE) timer_stop(T_FFTCOPY); */ 655 } 656 } 657} 658 659 660/*-------------------------------------------------------------------- 661c-------------------------------------------------------------------*/ 662 663static void fft_init (int n) { 664 665/*-------------------------------------------------------------------- 666c-------------------------------------------------------------------*/ 667 668/*-------------------------------------------------------------------- 669c compute the roots-of-unity array that will be used for subsequent FFTs. 670c-------------------------------------------------------------------*/ 671 672 int m,nu,ku,i,j,ln; 673 double t, ti; 674 675 676/*-------------------------------------------------------------------- 677c Initialize the U array with sines and cosines in a manner that permits 678c stride one access at each FFT iteration. 679c-------------------------------------------------------------------*/ 680 nu = n; 681 m = ilog2(n); 682 u[0].real = (double)m; 683 u[0].imag = 0.0; 684 ku = 1; 685 ln = 1; 686 687 for (j = 1; j <= m; j++) { 688 t = PI / ln; 689 690 for (i = 0; i <= ln - 1; i++) { 691 ti = i * t; 692 u[i+ku].real = cos(ti); 693 u[i+ku].imag = sin(ti); 694 } 695 696 ku = ku + ln; 697 ln = 2 * ln; 698 } 699} 700 701 702/*-------------------------------------------------------------------- 703c-------------------------------------------------------------------*/ 704 705static void cfftz (int is, int m, int n, dcomplex x[NX][FFTBLOCKPAD], 706 dcomplex y[NX][FFTBLOCKPAD]) { 707 708/*-------------------------------------------------------------------- 709c-------------------------------------------------------------------*/ 710 711/*-------------------------------------------------------------------- 712c Computes NY N-point complex-to-complex FFTs of X using an algorithm due 713c to Swarztrauber. X is both the input and the output array, while Y is a 714c scratch array. It is assumed that N = 2^M. Before calling CFFTZ to 715c perform FFTs, the array U must be initialized by calling CFFTZ with IS 716c set to 0 and M set to MX, where MX is the maximum value of M for any 717c subsequent call. 718c-------------------------------------------------------------------*/ 719 720 int i,j,l,mx; 721 722/*-------------------------------------------------------------------- 723c Check if input parameters are invalid. 724c-------------------------------------------------------------------*/ 725 mx = (int)(u[0].real); 726 if ((is != 1 && is != -1) || m < 1 || m > mx) { 727 printf("CFFTZ: Either U has not been initialized, or else\n" 728 "one of the input parameters is invalid%5d%5d%5d\n", 729 is, m, mx); 730 exit(1); 731 } 732 733/*-------------------------------------------------------------------- 734c Perform one variant of the Stockham FFT. 735c-------------------------------------------------------------------*/ 736 for (l = 1; l <= m; l+=2) { 737 fftz2 (is, l, m, n, fftblock, fftblockpad, u, x, y); 738 if (l == m) break; 739 fftz2 (is, l + 1, m, n, fftblock, fftblockpad, u, y, x); 740 } 741 742/*-------------------------------------------------------------------- 743c Copy Y to X. 744c-------------------------------------------------------------------*/ 745 if (m % 2 == 1) { 746 for (j = 0; j < n; j++) { 747 for (i = 0; i < fftblock; i++) { 748 x[j][i].real = y[j][i].real; 749 x[j][i].imag = y[j][i].imag; 750 } 751 } 752 } 753} 754 755 756/*-------------------------------------------------------------------- 757c-------------------------------------------------------------------*/ 758 759static void fftz2 (int is, int l, int m, int n, int ny, int ny1, 760 dcomplex u[NX], dcomplex x[NX][FFTBLOCKPAD], 761 dcomplex y[NX][FFTBLOCKPAD]) { 762 763/*-------------------------------------------------------------------- 764c-------------------------------------------------------------------*/ 765 766/*-------------------------------------------------------------------- 767c Performs the L-th iteration of the second variant of the Stockham FFT. 768c-------------------------------------------------------------------*/ 769 770 int k,n1,li,lj,lk,ku,i,j,i11,i12,i21,i22; 771 dcomplex u1,x11,x21; 772 773/*-------------------------------------------------------------------- 774c Set initial parameters. 775c-------------------------------------------------------------------*/ 776 777 n1 = n / 2; 778 if (l-1 == 0) { 779 lk = 1; 780 } else { 781 lk = 2 << ((l - 1)-1); 782 } 783 if (m-l == 0) { 784 li = 1; 785 } else { 786 li = 2 << ((m - l)-1); 787 } 788 lj = 2 * lk; 789 ku = li; 790 791 for (i = 0; i < li; i++) { 792 793 i11 = i * lk; 794 i12 = i11 + n1; 795 i21 = i * lj; 796 i22 = i21 + lk; 797 if (is >= 1) { 798 u1.real = u[ku+i].real; 799 u1.imag = u[ku+i].imag; 800 } else { 801 u1.real = u[ku+i].real; 802 u1.imag = -u[ku+i].imag; 803 } 804 805/*-------------------------------------------------------------------- 806c This loop is vectorizable. 807c-------------------------------------------------------------------*/ 808 for (k = 0; k < lk; k++) { 809 for (j = 0; j < ny; j++) { 810 double x11real, x11imag; 811 double x21real, x21imag; 812 x11real = x[i11+k][j].real; 813 x11imag = x[i11+k][j].imag; 814 x21real = x[i12+k][j].real; 815 x21imag = x[i12+k][j].imag; 816 y[i21+k][j].real = x11real + x21real; 817 y[i21+k][j].imag = x11imag + x21imag; 818 y[i22+k][j].real = u1.real * (x11real - x21real) 819 - u1.imag * (x11imag - x21imag); 820 y[i22+k][j].imag = u1.real * (x11imag - x21imag) 821 + u1.imag * (x11real - x21real); 822 } 823 } 824 } 825} 826 827 828/*-------------------------------------------------------------------- 829c-------------------------------------------------------------------*/ 830 831static int ilog2(int n) { 832 833/*-------------------------------------------------------------------- 834c-------------------------------------------------------------------*/ 835 836 int nn, lg; 837 838 if (n == 1) { 839 return 0; 840 } 841 lg = 1; 842 nn = 2; 843 while (nn < n) { 844 nn = nn << 1; 845 lg++; 846 } 847 848 return lg; 849} 850 851 852/*-------------------------------------------------------------------- 853c-------------------------------------------------------------------*/ 854 855static void checksum(int i, dcomplex u1[NZ][NY][NX], int d[3]) { 856 857/*-------------------------------------------------------------------- 858c-------------------------------------------------------------------*/ 859 860 int j, q,r,s, ierr; 861 dcomplex chk,allchk; 862 863 chk.real = 0.0; 864 chk.imag = 0.0; 865 866#pragma omp for nowait 867 for (j = 1; j <= 1024; j++) { 868 q = j%NX+1; 869 if (q >= xstart[0] && q <= xend[0]) { 870 r = (3*j)%NY+1; 871 if (r >= ystart[0] && r <= yend[0]) { 872 s = (5*j)%NZ+1; 873 if (s >= zstart[0] && s <= zend[0]) { 874 cadd(chk,chk,u1[s-zstart[0]][r-ystart[0]][q-xstart[0]]); 875 } 876 } 877 } 878 } 879#pragma omp critical 880 { 881 sums[i].real += chk.real; 882 sums[i].imag += chk.imag; 883 } 884#pragma omp barrier 885#pragma omp single 886 { 887 /* complex % real */ 888 sums[i].real = sums[i].real/(double)(NTOTAL); 889 sums[i].imag = sums[i].imag/(double)(NTOTAL); 890 891/* printf("T = %5d Checksum = %22.12e %22.12e\n", */ 892/* i, sums[i].real, sums[i].imag); */ 893 } 894} 895 896 897/*-------------------------------------------------------------------- 898c-------------------------------------------------------------------*/ 899 900static void verify (int d1, int d2, int d3, int nt, 901 boolean *verified, char *class) { 902 903/*-------------------------------------------------------------------- 904c-------------------------------------------------------------------*/ 905 906 int ierr, size, i; 907 double err, epsilon; 908 909/*-------------------------------------------------------------------- 910c Sample size reference checksums 911c-------------------------------------------------------------------*/ 912 913/*-------------------------------------------------------------------- 914c Class S size reference checksums 915c-------------------------------------------------------------------*/ 916 double vdata_real_s[6+1] = { 0.0, 917 5.546087004964e+02, 918 5.546385409189e+02, 919 5.546148406171e+02, 920 5.545423607415e+02, 921 5.544255039624e+02, 922 5.542683411902e+02 }; 923 double vdata_imag_s[6+1] = { 0.0, 924 4.845363331978e+02, 925 4.865304269511e+02, 926 4.883910722336e+02, 927 4.901273169046e+02, 928 4.917475857993e+02, 929 4.932597244941e+02 }; 930/*-------------------------------------------------------------------- 931c Class W size reference checksums 932c-------------------------------------------------------------------*/ 933 double vdata_real_w[6+1] = { 0.0, 934 5.673612178944e+02, 935 5.631436885271e+02, 936 5.594024089970e+02, 937 5.560698047020e+02, 938 5.530898991250e+02, 939 5.504159734538e+02 }; 940 double vdata_imag_w[6+1] = { 0.0, 941 5.293246849175e+02, 942 5.282149986629e+02, 943 5.270996558037e+02, 944 5.260027904925e+02, 945 5.249400845633e+02, 946 5.239212247086e+02 }; 947/*-------------------------------------------------------------------- 948c Class A size reference checksums 949c-------------------------------------------------------------------*/ 950 double vdata_real_a[6+1] = { 0.0, 951 5.046735008193e+02, 952 5.059412319734e+02, 953 5.069376896287e+02, 954 5.077892868474e+02, 955 5.085233095391e+02, 956 5.091487099959e+02 }; 957 double vdata_imag_a[6+1] = { 0.0, 958 5.114047905510e+02, 959 5.098809666433e+02, 960 5.098144042213e+02, 961 5.101336130759e+02, 962 5.104914655194e+02, 963 5.107917842803e+02 }; 964/*-------------------------------------------------------------------- 965c Class B size reference checksums 966c-------------------------------------------------------------------*/ 967 double vdata_real_b[20+1] = { 0.0, 968 5.177643571579e+02, 969 5.154521291263e+02, 970 5.146409228649e+02, 971 5.142378756213e+02, 972 5.139626667737e+02, 973 5.137423460082e+02, 974 5.135547056878e+02, 975 5.133910925466e+02, 976 5.132470705390e+02, 977 5.131197729984e+02, 978 5.130070319283e+02, 979 5.129070537032e+02, 980 5.128182883502e+02, 981 5.127393733383e+02, 982 5.126691062020e+02, 983 5.126064276004e+02, 984 5.125504076570e+02, 985 5.125002331720e+02, 986 5.124551951846e+02, 987 5.124146770029e+02 }; 988 double vdata_imag_b[20+1] = { 0.0, 989 5.077803458597e+02, 990 5.088249431599e+02, 991 5.096208912659e+02, 992 5.101023387619e+02, 993 5.103976610617e+02, 994 5.105948019802e+02, 995 5.107404165783e+02, 996 5.108576573661e+02, 997 5.109577278523e+02, 998 5.110460304483e+02, 999 5.111252433800e+02, 1000 5.111968077718e+02, 1001 5.112616233064e+02, 1002 5.113203605551e+02, 1003 5.113735928093e+02, 1004 5.114218460548e+02, 1005 5.114656139760e+02, 1006 5.115053595966e+02, 1007 5.115415130407e+02, 1008 5.115744692211e+02 }; 1009/*-------------------------------------------------------------------- 1010c Class C size reference checksums 1011c-------------------------------------------------------------------*/ 1012 double vdata_real_c[20+1] = { 0.0, 1013 5.195078707457e+02, 1014 5.155422171134e+02, 1015 5.144678022222e+02, 1016 5.140150594328e+02, 1017 5.137550426810e+02, 1018 5.135811056728e+02, 1019 5.134569343165e+02, 1020 5.133651975661e+02, 1021 5.132955192805e+02, 1022 5.132410471738e+02, 1023 5.131971141679e+02, 1024 5.131605205716e+02, 1025 5.131290734194e+02, 1026 5.131012720314e+02, 1027 5.130760908195e+02, 1028 5.130528295923e+02, 1029 5.130310107773e+02, 1030 5.130103090133e+02, 1031 5.129905029333e+02, 1032 5.129714421109e+02 }; 1033 double vdata_imag_c[20+1] = { 0.0, 1034 5.149019699238e+02, 1035 5.127578201997e+02, 1036 5.122251847514e+02, 1037 5.121090289018e+02, 1038 5.121143685824e+02, 1039 5.121496764568e+02, 1040 5.121870921893e+02, 1041 5.122193250322e+02, 1042 5.122454735794e+02, 1043 5.122663649603e+02, 1044 5.122830879827e+02, 1045 5.122965869718e+02, 1046 5.123075927445e+02, 1047 5.123166486553e+02, 1048 5.123241541685e+02, 1049 5.123304037599e+02, 1050 5.123356167976e+02, 1051 5.123399592211e+02, 1052 5.123435588985e+02, 1053 5.123465164008e+02 }; 1054 1055 epsilon = 1.0e-12; 1056 *verified = TRUE; 1057 *class = 'U'; 1058 1059 if (d1 == 64 && 1060 d2 == 64 && 1061 d3 == 64 && 1062 nt == 6) { 1063 *class = 'S'; 1064 for (i = 1; i <= nt; i++) { 1065 err = (get_real(sums[i]) - vdata_real_s[i]) / vdata_real_s[i]; 1066 if (fabs(err) > epsilon) { 1067 *verified = FALSE; 1068 break; 1069 } 1070 err = (get_imag(sums[i]) - vdata_imag_s[i]) / vdata_imag_s[i]; 1071 if (fabs(err) > epsilon) { 1072 *verified = FALSE; 1073 break; 1074 } 1075 } 1076 } else if (d1 == 128 && 1077 d2 == 128 && 1078 d3 == 32 && 1079 nt == 6) { 1080 *class = 'W'; 1081 for (i = 1; i <= nt; i++) { 1082 err = (get_real(sums[i]) - vdata_real_w[i]) / vdata_real_w[i]; 1083 if (fabs(err) > epsilon) { 1084 *verified = FALSE; 1085 break; 1086 } 1087 err = (get_imag(sums[i]) - vdata_imag_w[i]) / vdata_imag_w[i]; 1088 if (fabs(err) > epsilon) { 1089 *verified = FALSE; 1090 break; 1091 } 1092 } 1093 } else if (d1 == 256 && 1094 d2 == 256 && 1095 d3 == 128 && 1096 nt == 6) { 1097 *class = 'A'; 1098 for (i = 1; i <= nt; i++) { 1099 err = (get_real(sums[i]) - vdata_real_a[i]) / vdata_real_a[i]; 1100 if (fabs(err) > epsilon) { 1101 *verified = FALSE; 1102 break; 1103 } 1104 err = (get_imag(sums[i]) - vdata_imag_a[i]) / vdata_imag_a[i]; 1105 if (fabs(err) > epsilon) { 1106 *verified = FALSE; 1107 break; 1108 } 1109 } 1110 } else if (d1 == 512 && 1111 d2 == 256 && 1112 d3 == 256 && 1113 nt == 20) { 1114 *class = 'B'; 1115 for (i = 1; i <= nt; i++) { 1116 err = (get_real(sums[i]) - vdata_real_b[i]) / vdata_real_b[i]; 1117 if (fabs(err) > epsilon) { 1118 *verified = FALSE; 1119 break; 1120 } 1121 err = (get_imag(sums[i]) - vdata_imag_b[i]) / vdata_imag_b[i]; 1122 if (fabs(err) > epsilon) { 1123 *verified = FALSE; 1124 break; 1125 } 1126 } 1127 } else if (d1 == 512 && 1128 d2 == 512 && 1129 d3 == 512 && 1130 nt == 20) { 1131 *class = 'C'; 1132 for (i = 1; i <= nt; i++) { 1133 err = (get_real(sums[i]) - vdata_real_c[i]) / vdata_real_c[i]; 1134 if (fabs(err) > epsilon) { 1135 *verified = FALSE; 1136 break; 1137 } 1138 err = (get_imag(sums[i]) - vdata_imag_c[i]) / vdata_imag_c[i]; 1139 if (fabs(err) > epsilon) { 1140 *verified = FALSE; 1141 break; 1142 } 1143 } 1144 } 1145 1146 if (*class != 'U') { 1147 printf("Result verification successful\n"); 1148 } else { 1149 printf("Result verification failed\n"); 1150 } 1151 printf("class = %1c\n", *class); 1152} 1153 1154#define STACK_SIZE (8 * 1024 * 1024) 1155 1156int main(int argc, char** argv) 1157{ 1158 if (argc != 2) { /* Print usage */ 1159 printf("Usage: %s <Number of threads>\n", argv[0]); 1160 exit(-1); 1161 } 1162 1163 uint64_t arg1 = (uint64_t)atoi(argv[1]); 1164 1165#ifdef BOMP 1166 bomp_bomp_init_varstack(arg1, STACK_SIZE); 1167 // need to create thread with large enough stack on Barrelfish, as BF 1168 // thread stacks are not dynamically resized. 1169 bomp_run_main(realmain, (void*)arg1, STACK_SIZE); 1170#else 1171 return realmain((void*)arg1); 1172#endif 1173} 1174