1/*-------------------------------------------------------------------- 2 3 NAS Parallel Benchmarks 2.3 OpenMP C versions - IS 4 5 This benchmark is an OpenMP C version of the NPB IS code. 6 7 The OpenMP C versions are developed by RWCP and derived from the serial 8 Fortran versions in "NPB 2.3-serial" developed by NAS. 9 10 Permission to use, copy, distribute and modify this software for any 11 purpose with or without fee is hereby granted. 12 This software is provided "as is" without express or implied warranty. 13 14 Send comments on the OpenMP C versions to pdp-openmp@rwcp.or.jp 15 16 Information on OpenMP activities at RWCP is available at: 17 18 http://pdplab.trc.rwcp.or.jp/pdperf/Omni/ 19 20 Information on NAS Parallel Benchmarks 2.3 is available at: 21 22 http://www.nas.nasa.gov/NAS/NPB/ 23 24--------------------------------------------------------------------*/ 25/*-------------------------------------------------------------------- 26 27 Author: M. Yarrow 28 29 OpenMP C version: S. Satoh 30 31--------------------------------------------------------------------*/ 32 33#include "npb-C.h" 34#include "npbparams.h" 35#include <stdlib.h> 36#include <stdio.h> 37#include <stdint.h> 38#if defined(_OPENMP) 39#include <omp.h> 40#endif /* _OPENMP */ 41 42#define STACK_SIZE (8 * 1024 * 1024) 43 44/*****************************************************************/ 45/* For serial IS, buckets are not really req'd to solve NPB1 IS */ 46/* spec, but their use on some machines improves performance, on */ 47/* other machines the use of buckets compromises performance, */ 48/* probably because it is extra computation which is not req'd. */ 49/* (Note: Mechanism not understood, probably cache related) */ 50/* Example: SP2-66MhzWN: 50% speedup with buckets */ 51/* Example: SGI Indy5000: 50% slowdown with buckets */ 52/* Example: SGI O2000: 400% slowdown with buckets (Wow!) */ 53/*****************************************************************/ 54/* #define USE_BUCKETS */ 55/* buckets are not used in the OpenMP C version */ 56 57 58/******************/ 59/* default values */ 60/******************/ 61#ifndef CLASS 62#define CLASS 'S' 63#endif 64 65 66/*************/ 67/* CLASS S */ 68/*************/ 69#if CLASS == 'S' 70#define TOTAL_KEYS_LOG_2 16 71#define MAX_KEY_LOG_2 11 72#define NUM_BUCKETS_LOG_2 9 73#endif 74 75 76/*************/ 77/* CLASS W */ 78/*************/ 79#if CLASS == 'W' 80#define TOTAL_KEYS_LOG_2 20 81#define MAX_KEY_LOG_2 16 82#define NUM_BUCKETS_LOG_2 10 83#endif 84 85/*************/ 86/* CLASS A */ 87/*************/ 88#if CLASS == 'A' 89#define TOTAL_KEYS_LOG_2 23 90#define MAX_KEY_LOG_2 19 91#define NUM_BUCKETS_LOG_2 10 92#endif 93 94 95/*************/ 96/* CLASS B */ 97/*************/ 98#if CLASS == 'B' 99#define TOTAL_KEYS_LOG_2 25 100#define MAX_KEY_LOG_2 21 101#define NUM_BUCKETS_LOG_2 10 102#endif 103 104 105/*************/ 106/* CLASS C */ 107/*************/ 108#if CLASS == 'C' 109#define TOTAL_KEYS_LOG_2 27 110#define MAX_KEY_LOG_2 23 111#define NUM_BUCKETS_LOG_2 10 112#endif 113 114 115#define TOTAL_KEYS (1 << TOTAL_KEYS_LOG_2) 116#define MAX_KEY (1 << MAX_KEY_LOG_2) 117#define NUM_BUCKETS (1 << NUM_BUCKETS_LOG_2) 118#define NUM_KEYS TOTAL_KEYS 119#define SIZE_OF_BUFFERS NUM_KEYS 120 121 122#define MAX_ITERATIONS 10 123#define TEST_ARRAY_SIZE 5 124 125 126/*************************************/ 127/* Typedef: if necessary, change the */ 128/* size of int here by changing the */ 129/* int type to, say, long */ 130/*************************************/ 131typedef int INT_TYPE; 132 133 134/********************/ 135/* Some global info */ 136/********************/ 137INT_TYPE *key_buff_ptr_global; /* used by full_verify to get */ 138 /* copies of rank info */ 139 140int passed_verification; 141 142 143/************************************/ 144/* These are the three main arrays. */ 145/* See SIZE_OF_BUFFERS def above */ 146/************************************/ 147INT_TYPE key_array[SIZE_OF_BUFFERS], 148 key_buff1[SIZE_OF_BUFFERS], 149 key_buff2[SIZE_OF_BUFFERS], 150 partial_verify_vals[TEST_ARRAY_SIZE]; 151 152#ifdef USE_BUCKETS 153INT_TYPE bucket_size[NUM_BUCKETS], 154 bucket_ptrs[NUM_BUCKETS]; 155#endif 156 157 158/**********************/ 159/* Partial verif info */ 160/**********************/ 161INT_TYPE test_index_array[TEST_ARRAY_SIZE], 162 test_rank_array[TEST_ARRAY_SIZE], 163 164 S_test_index_array[TEST_ARRAY_SIZE] = 165 {48427,17148,23627,62548,4431}, 166 S_test_rank_array[TEST_ARRAY_SIZE] = 167 {0,18,346,64917,65463}, 168 169 W_test_index_array[TEST_ARRAY_SIZE] = 170 {357773,934767,875723,898999,404505}, 171 W_test_rank_array[TEST_ARRAY_SIZE] = 172 {1249,11698,1039987,1043896,1048018}, 173 174 A_test_index_array[TEST_ARRAY_SIZE] = 175 {2112377,662041,5336171,3642833,4250760}, 176 A_test_rank_array[TEST_ARRAY_SIZE] = 177 {104,17523,123928,8288932,8388264}, 178 179 B_test_index_array[TEST_ARRAY_SIZE] = 180 {41869,812306,5102857,18232239,26860214}, 181 B_test_rank_array[TEST_ARRAY_SIZE] = 182 {33422937,10244,59149,33135281,99}, 183 184 C_test_index_array[TEST_ARRAY_SIZE] = 185 {44172927,72999161,74326391,129606274,21736814}, 186 C_test_rank_array[TEST_ARRAY_SIZE] = 187 {61147,882988,266290,133997595,133525895}; 188 189 190 191/***********************/ 192/* function prototypes */ 193/***********************/ 194double is_randlc( double *X, double *A ); 195 196void full_verify( void ); 197 198/* 199 * FUNCTION RANDLC (X, A) 200 * 201 * This routine returns a uniform pseudorandom double precision number in the 202 * range (0, 1) by using the linear congruential generator 203 * 204 * x_{k+1} = a x_k (mod 2^46) 205 * 206 * where 0 < x_k < 2^46 and 0 < a < 2^46. This scheme generates 2^44 numbers 207 * before repeating. The argument A is the same as 'a' in the above formula, 208 * and X is the same as x_0. A and X must be odd double precision integers 209 * in the range (1, 2^46). The returned value RANDLC is normalized to be 210 * between 0 and 1, i.e. RANDLC = 2^(-46) * x_1. X is updated to contain 211 * the new seed x_1, so that subsequent calls to RANDLC using the same 212 * arguments will generate a continuous sequence. 213 * 214 * This routine should produce the same results on any computer with at least 215 * 48 mantissa bits in double precision floating point data. On Cray systems, 216 * double precision should be disabled. 217 * 218 * David H. Bailey October 26, 1990 219 * 220 * IMPLICIT DOUBLE PRECISION (A-H, O-Z) 221 * SAVE KS, R23, R46, T23, T46 222 * DATA KS/0/ 223 * 224 * If this is the first call to RANDLC, compute R23 = 2 ^ -23, R46 = 2 ^ -46, 225 * T23 = 2 ^ 23, and T46 = 2 ^ 46. These are computed in loops, rather than 226 * by merely using the ** operator, in order to insure that the results are 227 * exact on all systems. This code assumes that 0.5D0 is represented exactly. 228 */ 229 230 231/*****************************************************************/ 232/************* R A N D L C ************/ 233/************* ************/ 234/************* portable random number generator ************/ 235/*****************************************************************/ 236 237double is_randlc(X, A) 238double *X; 239double *A; 240{ 241 static int KS=0; 242 static double R23, R46, T23, T46; 243 double T1, T2, T3, T4; 244 double A1; 245 double A2; 246 double X1; 247 double X2; 248 double Z; 249 int i, j; 250 251 if (KS == 0) 252 { 253 R23 = 1.0; 254 R46 = 1.0; 255 T23 = 1.0; 256 T46 = 1.0; 257 258 for (i=1; i<=23; i++) 259 { 260 R23 = 0.50 * R23; 261 T23 = 2.0 * T23; 262 } 263 for (i=1; i<=46; i++) 264 { 265 R46 = 0.50 * R46; 266 T46 = 2.0 * T46; 267 } 268 KS = 1; 269 } 270 271/* Break A into two parts such that A = 2^23 * A1 + A2 and set X = N. */ 272 273 T1 = R23 * *A; 274 j = T1; 275 A1 = j; 276 A2 = *A - T23 * A1; 277 278/* Break X into two parts such that X = 2^23 * X1 + X2, compute 279 Z = A1 * X2 + A2 * X1 (mod 2^23), and then 280 X = 2^23 * Z + A2 * X2 (mod 2^46). */ 281 282 T1 = R23 * *X; 283 j = T1; 284 X1 = j; 285 X2 = *X - T23 * X1; 286 T1 = A1 * X2 + A2 * X1; 287 288 j = R23 * T1; 289 T2 = j; 290 Z = T1 - T23 * T2; 291 T3 = T23 * Z + A2 * X2; 292 j = R46 * T3; 293 T4 = j; 294 *X = T3 - T46 * T4; 295 return(R46 * *X); 296} 297 298 299 300 301/*****************************************************************/ 302/************* C R E A T E _ S E Q ************/ 303/*****************************************************************/ 304 305void create_seq( double seed, double a ) 306{ 307 double x; 308 int i, j, k; 309 310 k = MAX_KEY/4; 311 312 for (i=0; i<NUM_KEYS; i++) 313 { 314 x = is_randlc(&seed, &a); 315 x += is_randlc(&seed, &a); 316 x += is_randlc(&seed, &a); 317 x += is_randlc(&seed, &a); 318 319 key_array[i] = k*x; 320 } 321} 322 323 324 325 326/*****************************************************************/ 327/************* F U L L _ V E R I F Y ************/ 328/*****************************************************************/ 329 330 331void full_verify() 332{ 333 INT_TYPE i, j; 334 INT_TYPE k; 335 INT_TYPE m, unique_keys; 336 337 338 339/* Now, finally, sort the keys: */ 340 for( i=0; i<NUM_KEYS; i++ ) 341 key_array[--key_buff_ptr_global[key_buff2[i]]] = key_buff2[i]; 342 343 344/* Confirm keys correctly sorted: count incorrectly sorted keys, if any */ 345 346 j = 0; 347 for( i=1; i<NUM_KEYS; i++ ) 348 if( key_array[i-1] > key_array[i] ) 349 j++; 350 351 352 if( j != 0 ) 353 { 354 printf( "Full_verify: number of keys out of sort: %d\n", 355 j ); 356 } 357 else 358 passed_verification++; 359 360 361} 362 363 364 365 366/*****************************************************************/ 367/************* R A N K ****************/ 368/*****************************************************************/ 369 370void rank( int iteration ) 371{ 372 373 INT_TYPE i, j, k; 374 INT_TYPE l, m; 375 376 INT_TYPE shift = MAX_KEY_LOG_2 - NUM_BUCKETS_LOG_2; 377 INT_TYPE key; 378 INT_TYPE min_key_val, max_key_val; 379 380 INT_TYPE prv_buff1[MAX_KEY]; 381 382#pragma omp master 383 { 384 key_array[iteration] = iteration; 385 key_array[iteration+MAX_ITERATIONS] = MAX_KEY - iteration; 386 387/* Determine where the partial verify test keys are, load into */ 388/* top of array bucket_size */ 389 for( i=0; i<TEST_ARRAY_SIZE; i++ ) 390 partial_verify_vals[i] = key_array[test_index_array[i]]; 391 392/* Clear the work array */ 393 for( i=0; i<MAX_KEY; i++ ) 394 key_buff1[i] = 0; 395 } 396#pragma omp barrier 397 398 for (i=0; i<MAX_KEY; i++) 399 prv_buff1[i] = 0; 400 401/* Copy keys into work array; keys in key_array will be reused each iter. */ 402#pragma omp for nowait 403 for( i=0; i<NUM_KEYS; i++ ) { 404 key_buff2[i] = key_array[i]; 405 406/* Ranking of all keys occurs in this section: */ 407 408/* In this section, the keys themselves are used as their 409 own indexes to determine how many of each there are: their 410 individual population */ 411 412 prv_buff1[key_buff2[i]]++; /* Now they have individual key */ 413 } 414 /* population */ 415 for( i=0; i<MAX_KEY-1; i++ ) 416 prv_buff1[i+1] += prv_buff1[i]; 417 418 419#pragma omp critical 420 { 421 for( i=0; i<MAX_KEY; i++ ) 422 key_buff1[i] += prv_buff1[i]; 423 } 424 425/* To obtain ranks of each key, successively add the individual key 426 population, not forgetting to add m, the total of lesser keys, 427 to the first key population */ 428 429#pragma omp barrier 430#pragma omp master 431 { 432 433/* This is the partial verify test section */ 434/* Observe that test_rank_array vals are */ 435/* shifted differently for different cases */ 436 for( i=0; i<TEST_ARRAY_SIZE; i++ ) 437 { 438 k = partial_verify_vals[i]; /* test vals were put here */ 439 if( 0 <= k && k <= NUM_KEYS-1 ) 440 switch( CLASS ) 441 { 442 case 'S': 443 if( i <= 2 ) 444 { 445 if( key_buff1[k-1] != test_rank_array[i]+iteration ) 446 { 447 printf( "Failed partial verification: " 448 "iteration %d, test key %d\n", 449 iteration, i ); 450 } 451 else 452 passed_verification++; 453 } 454 else 455 { 456 if( key_buff1[k-1] != test_rank_array[i]-iteration ) 457 { 458 printf( "Failed partial verification: " 459 "iteration %d, test key %d\n", 460 iteration, i ); 461 } 462 else 463 passed_verification++; 464 } 465 break; 466 case 'W': 467 if( i < 2 ) 468 { 469 if( key_buff1[k-1] != 470 test_rank_array[i]+(iteration-2) ) 471 { 472 printf( "Failed partial verification: " 473 "iteration %d, test key %d\n", 474 iteration, i ); 475 } 476 else 477 passed_verification++; 478 } 479 else 480 { 481 if( key_buff1[k-1] != test_rank_array[i]-iteration ) 482 { 483 printf( "Failed partial verification: " 484 "iteration %d, test key %d\n", 485 iteration, i ); 486 } 487 else 488 passed_verification++; 489 } 490 break; 491 case 'A': 492 if( i <= 2 ) 493 { 494 if( key_buff1[k-1] != 495 test_rank_array[i]+(iteration-1) ) 496 { 497 printf( "Failed partial verification: " 498 "iteration %d, test key %d\n", 499 iteration, i ); 500 } 501 else 502 passed_verification++; 503 } 504 else 505 { 506 if( key_buff1[k-1] != 507 test_rank_array[i]-(iteration-1) ) 508 { 509 printf( "Failed partial verification: " 510 "iteration %d, test key %d\n", 511 iteration, i ); 512 } 513 else 514 passed_verification++; 515 } 516 break; 517 case 'B': 518 if( i == 1 || i == 2 || i == 4 ) 519 { 520 if( key_buff1[k-1] != test_rank_array[i]+iteration ) 521 { 522 printf( "Failed partial verification: " 523 "iteration %d, test key %d\n", 524 iteration, i ); 525 } 526 else 527 passed_verification++; 528 } 529 else 530 { 531 if( key_buff1[k-1] != test_rank_array[i]-iteration ) 532 { 533 printf( "Failed partial verification: " 534 "iteration %d, test key %d\n", 535 iteration, i ); 536 } 537 else 538 passed_verification++; 539 } 540 break; 541 case 'C': 542 if( i <= 2 ) 543 { 544 if( key_buff1[k-1] != test_rank_array[i]+iteration ) 545 { 546 printf( "Failed partial verification: " 547 "iteration %d, test key %d\n", 548 iteration, i ); 549 } 550 else 551 passed_verification++; 552 } 553 else 554 { 555 if( key_buff1[k-1] != test_rank_array[i]-iteration ) 556 { 557 printf( "Failed partial verification: " 558 "iteration %d, test key %d\n", 559 iteration, i ); 560 } 561 else 562 passed_verification++; 563 } 564 break; 565 } 566 } 567 568 569 570 571/* Make copies of rank info for use by full_verify: these variables 572 in rank are local; making them global slows down the code, probably 573 since they cannot be made register by compiler */ 574 575 if( iteration == MAX_ITERATIONS ) 576 key_buff_ptr_global = key_buff1; 577 578 } /* end master */ 579} 580 581 582/*****************************************************************/ 583/************* M A I N ****************/ 584/*****************************************************************/ 585 586static int realmain(void *cargv) 587{ 588 unsigned argv = (unsigned)((long)cargv); 589 int i, iteration, itemp; 590 int nthreads = 1; 591 double timecounter, maxtime; 592 593 omp_set_num_threads(argv); 594 595/* Initialize the verification arrays if a valid class */ 596 for( i=0; i<TEST_ARRAY_SIZE; i++ ) 597 switch( CLASS ) 598 { 599 case 'S': 600 test_index_array[i] = S_test_index_array[i]; 601 test_rank_array[i] = S_test_rank_array[i]; 602 break; 603 case 'A': 604 test_index_array[i] = A_test_index_array[i]; 605 test_rank_array[i] = A_test_rank_array[i]; 606 break; 607 case 'W': 608 test_index_array[i] = W_test_index_array[i]; 609 test_rank_array[i] = W_test_rank_array[i]; 610 break; 611 case 'B': 612 test_index_array[i] = B_test_index_array[i]; 613 test_rank_array[i] = B_test_rank_array[i]; 614 break; 615 case 'C': 616 test_index_array[i] = C_test_index_array[i]; 617 test_rank_array[i] = C_test_rank_array[i]; 618 break; 619 }; 620 621 622 623/* Printout initial NPB info */ 624 printf( "\n\n NAS Parallel Benchmarks 2.3 OpenMP C version" 625 " - IS Benchmark\n\n" ); 626 printf( " Size: %d (class %c)\n", TOTAL_KEYS, CLASS ); 627 printf( " Iterations: %d\n", MAX_ITERATIONS ); 628 629/* Initialize timer */ 630 timer_clear( 0 ); 631 632/* Generate random number sequence and subsequent keys on all procs */ 633 create_seq( 314159265.00, /* Random number gen seed */ 634 1220703125.00 ); /* Random number gen mult */ 635 636 637/* Do one interation for free (i.e., untimed) to guarantee initialization of 638 all data and code pages and respective tables */ 639#pragma omp parallel 640 rank( 1 ); 641 642/* Start verification counter */ 643 passed_verification = 0; 644 645 if( CLASS != 'S' ) printf( "\n iteration\n" ); 646 647/* Start timer */ 648 timer_start( 0 ); 649 650 651/* This is the main iteration */ 652 653#pragma omp parallel private(iteration) 654 for( iteration=1; iteration<=MAX_ITERATIONS; iteration++ ) 655 { 656 657 //#pragma omp master 658 //if( CLASS != 'S' ) printf( " %d\n", iteration ); 659 660 rank( iteration ); 661 662#if defined(_OPENMP) 663#pragma omp master 664 nthreads = omp_get_num_threads(); 665#endif /* _OPENMP */ 666 } 667 668/* End of timing, obtain maximum time of all processors */ 669 timer_stop( 0 ); 670 timecounter = timer_read( 0 ); 671 672 673/* This tests that keys are in sequence: sorting of last ranked key seq 674 occurs here, but is an untimed operation */ 675 full_verify(); 676 677 678 679/* The final printout */ 680 if( passed_verification != 5*MAX_ITERATIONS + 1 ) { 681 passed_verification = 0; 682 } 683 684#ifdef BOMP 685//backend_create_time(argv); 686#endif 687printf("Computetime %d %f\n", argv, timecounter); 688printf("client done\n"); 689/* c_print_results( "IS", */ 690/* CLASS, */ 691/* TOTAL_KEYS, */ 692/* 0, */ 693/* 0, */ 694/* MAX_ITERATIONS, */ 695/* nthreads, */ 696/* timecounter, */ 697/* ((double) (MAX_ITERATIONS*TOTAL_KEYS)) */ 698/* /timecounter/1000000., */ 699/* "keys ranked", */ 700/* passed_verification, */ 701/* NPBVERSION, */ 702/* COMPILETIME, */ 703/* CC, */ 704/* CLINK, */ 705/* C_LIB, */ 706/* C_INC, */ 707/* CFLAGS, */ 708/* CLINKFLAGS, */ 709/* "randlc"); */ 710 711 712 713 /**************************/ 714} /* E N D P R O G R A M */ 715 /**************************/ 716 717 718#define STACK_SIZE (8 * 1024 * 1024) 719 720int main(int argc, char** argv) 721{ 722 if (argc != 2) { /* Print usage */ 723 printf("Usage: %s <Number of threads>\n", argv[0]); 724 exit(-1); 725 } 726 727#ifdef BOMP 728 bomp_bomp_init(atoi(argv[1])); 729#endif /* BOMP */ 730 realmain((void*)((long)atoi(argv[1]))); 731} 732