1/*-------------------------------------------------------------------- 2 3 NAS Parallel Benchmarks 2.3 OpenMP C versions - CG 4 5 This benchmark is an OpenMP C version of the NPB CG code. 6 7 The OpenMP C versions are developed by RWCP and derived from the serial 8 Fortran versions in "NPB 2.3-serial" developed by NAS. 9 10 Permission to use, copy, distribute and modify this software for any 11 purpose with or without fee is hereby granted. 12 This software is provided "as is" without express or implied warranty. 13 14 Send comments on the OpenMP C versions to pdp-openmp@rwcp.or.jp 15 16 Information on OpenMP activities at RWCP is available at: 17 18 http://pdplab.trc.rwcp.or.jp/pdperf/Omni/ 19 20 Information on NAS Parallel Benchmarks 2.3 is available at: 21 22 http://www.nas.nasa.gov/NAS/NPB/ 23 24--------------------------------------------------------------------*/ 25/*-------------------------------------------------------------------- 26 27 Authors: M. Yarrow 28 C. Kuszmaul 29 30 OpenMP C version: S. Satoh 31 32--------------------------------------------------------------------*/ 33 34/* 35c--------------------------------------------------------------------- 36c Note: please observe that in the routine conj_grad three 37c implementations of the sparse matrix-vector multiply have 38c been supplied. The default matrix-vector multiply is not 39c loop unrolled. The alternate implementations are unrolled 40c to a depth of 2 and unrolled to a depth of 8. Please 41c experiment with these to find the fastest for your particular 42c architecture. If reporting timing results, any of these three may 43c be used without penalty. 44c--------------------------------------------------------------------- 45*/ 46 47#include "npb-C.h" 48#include "npbparams.h" 49 50#undef NZ 51#define NZ NA*(NONZER+1)*(NONZER+1)+NA*(NONZER+2) 52 53/* global variables */ 54 55/* common /partit_size/ */ 56static int naa; 57static int nzz; 58static int firstrow; 59static int lastrow; 60static int firstcol; 61static int lastcol; 62 63/* common /main_int_mem/ */ 64static int colidx[NZ+1]; /* colidx[1:NZ] */ 65static int rowstr[NA+1+1]; /* rowstr[1:NA+1] */ 66static int iv[2*NA+1+1]; /* iv[1:2*NA+1] */ 67static int arow[NZ+1]; /* arow[1:NZ] */ 68static int acol[NZ+1]; /* acol[1:NZ] */ 69 70/* common /main_flt_mem/ */ 71static double v[NA+1+1]; /* v[1:NA+1] */ 72static double aelt[NZ+1]; /* aelt[1:NZ] */ 73static double a[NZ+1]; /* a[1:NZ] */ 74static double x[NA+2+1]; /* x[1:NA+2] */ 75static double z[NA+2+1]; /* z[1:NA+2] */ 76static double p[NA+2+1]; /* p[1:NA+2] */ 77static double q[NA+2+1]; /* q[1:NA+2] */ 78static double r[NA+2+1]; /* r[1:NA+2] */ 79static double w[NA+2+1]; /* w[1:NA+2] */ 80 81/* common /urando/ */ 82static double amult; 83static double tran; 84 85/* function declarations */ 86static void conj_grad (int colidx[], int rowstr[], double x[], double z[], 87 double a[], double p[], double q[], double r[], 88 double w[], double *rnorm); 89static void makea(int n, int nz, double a[], int colidx[], int rowstr[], 90 int nonzer, int firstrow, int lastrow, int firstcol, 91 int lastcol, double rcond, int arow[], int acol[], 92 double aelt[], double v[], int iv[], double shift ); 93static void sparse(double a[], int colidx[], int rowstr[], int n, 94 int arow[], int acol[], double aelt[], 95 int firstrow, int lastrow, 96 double x[], boolean mark[], int nzloc[], int nnza); 97static void sprnvc(int n, int nz, double v[], int iv[], int nzloc[], 98 int mark[]); 99static int icnvrt(double x, int ipwr2); 100static void vecset(int n, double v[], int iv[], int *nzv, int i, double val); 101 102/*-------------------------------------------------------------------- 103 program cg 104--------------------------------------------------------------------*/ 105 106int main(int argc, char **argv) { 107 int i, j, k, it; 108 int nthreads = 1; 109 double zeta; 110 double rnorm; 111 double norm_temp11; 112 double norm_temp12; 113 double t, mflops; 114 char class; 115 boolean verified; 116 double zeta_verify_value, epsilon; 117 118 if (argc != 2) { /* Print usage */ 119 printf("Usage: %s <Number of threads>\n", argv[0]); 120 abort(); 121 } 122 123#ifdef BOMP 124 bomp_bomp_init(atoi(argv[1])); 125#endif 126 127 printf("Benchmark start"); 128 129 omp_set_num_threads(atoi(argv[1])); 130 firstrow = 1; 131 lastrow = NA; 132 firstcol = 1; 133 lastcol = NA; 134 135 if (NA == 1400 && NONZER == 7 && NITER == 15 && SHIFT == 10.0) { 136 class = 'S'; 137 zeta_verify_value = 8.5971775078648; 138 } else if (NA == 7000 && NONZER == 8 && NITER == 15 && SHIFT == 12.0) { 139 class = 'W'; 140 zeta_verify_value = 10.362595087124; 141 } else if (NA == 14000 && NONZER == 11 && NITER == 15 && SHIFT == 20.0) { 142 class = 'A'; 143 zeta_verify_value = 17.130235054029; 144 } else if (NA == 75000 && NONZER == 13 && NITER == 75 && SHIFT == 60.0) { 145 class = 'B'; 146 zeta_verify_value = 22.712745482631; 147 } else if (NA == 150000 && NONZER == 15 && NITER == 75 && SHIFT == 110.0) { 148 class = 'C'; 149 zeta_verify_value = 28.973605592845; 150 } else { 151 class = 'U'; 152 } 153 154 printf("\n\n NAS Parallel Benchmarks 2.3 OpenMP C version" 155 " - CG Benchmark\n"); 156 printf(" Size: %10d\n", NA); 157 printf(" Iterations: %5d\n", NITER); 158 159 naa = NA; 160 nzz = NZ; 161 162/*-------------------------------------------------------------------- 163c Initialize random number generator 164c-------------------------------------------------------------------*/ 165 tran = 314159265.0; 166 amult = 1220703125.0; 167 zeta = randlc( &tran, amult ); 168 169/*-------------------------------------------------------------------- 170c 171c-------------------------------------------------------------------*/ 172 makea(naa, nzz, a, colidx, rowstr, NONZER, 173 firstrow, lastrow, firstcol, lastcol, 174 RCOND, arow, acol, aelt, v, iv, SHIFT); 175 176/*--------------------------------------------------------------------- 177c Note: as a result of the above call to makea: 178c values of j used in indexing rowstr go from 1 --> lastrow-firstrow+1 179c values of colidx which are col indexes go from firstcol --> lastcol 180c So: 181c Shift the col index vals from actual (firstcol --> lastcol ) 182c to local, i.e., (1 --> lastcol-firstcol+1) 183c---------------------------------------------------------------------*/ 184#pragma omp parallel private(it,i,j,k) 185{ 186#pragma omp for nowait 187 for (j = 1; j <= lastrow - firstrow + 1; j++) { 188 for (k = rowstr[j]; k < rowstr[j+1]; k++) { 189 colidx[k] = colidx[k] - firstcol + 1; 190 } 191 } 192 193/*-------------------------------------------------------------------- 194c set starting vector to (1, 1, .... 1) 195c-------------------------------------------------------------------*/ 196#pragma omp for nowait 197 for (i = 1; i <= NA+1; i++) { 198 x[i] = 1.0; 199 } 200#pragma omp single 201 zeta = 0.0; 202 203/*------------------------------------------------------------------- 204c----> 205c Do one iteration untimed to init all code and data page tables 206c----> (then reinit, start timing, to niter its) 207c-------------------------------------------------------------------*/ 208 209 for (it = 1; it <= 1; it++) { 210 211/*-------------------------------------------------------------------- 212c The call to the conjugate gradient routine: 213c-------------------------------------------------------------------*/ 214 conj_grad (colidx, rowstr, x, z, a, p, q, r, w, &rnorm); 215 216/*-------------------------------------------------------------------- 217c zeta = shift + 1/(x.z) 218c So, first: (x.z) 219c Also, find norm of z 220c So, first: (z.z) 221c-------------------------------------------------------------------*/ 222#pragma omp single 223{ 224 norm_temp11 = 0.0; 225 norm_temp12 = 0.0; 226} /* end single */ 227 228#pragma omp for reduction(+:norm_temp11,norm_temp12) 229 for (j = 1; j <= lastcol-firstcol+1; j++) { 230 norm_temp11 = norm_temp11 + x[j]*z[j]; 231 norm_temp12 = norm_temp12 + z[j]*z[j]; 232 } 233#pragma omp single 234 norm_temp12 = 1.0 / sqrt( norm_temp12 ); 235 236/*-------------------------------------------------------------------- 237c Normalize z to obtain x 238c-------------------------------------------------------------------*/ 239#pragma omp for 240 for (j = 1; j <= lastcol-firstcol+1; j++) { 241 x[j] = norm_temp12*z[j]; 242 } 243 244 } /* end of do one iteration untimed */ 245 246/*-------------------------------------------------------------------- 247c set starting vector to (1, 1, .... 1) 248c-------------------------------------------------------------------*/ 249#pragma omp for nowait 250 for (i = 1; i <= NA+1; i++) { 251 x[i] = 1.0; 252 } 253#pragma omp single 254 zeta = 0.0; 255 256} /* end parallel */ 257 258 timer_clear( 1 ); 259 timer_start( 1 ); 260 261/*-------------------------------------------------------------------- 262c----> 263c Main Iteration for inverse power method 264c----> 265c-------------------------------------------------------------------*/ 266 267#pragma omp parallel private(it,i,j,k) 268{ 269 for (it = 1; it <= NITER; it++) { 270 271/*-------------------------------------------------------------------- 272c The call to the conjugate gradient routine: 273c-------------------------------------------------------------------*/ 274 conj_grad(colidx, rowstr, x, z, a, p, q, r, w, &rnorm); 275 276/*-------------------------------------------------------------------- 277c zeta = shift + 1/(x.z) 278c So, first: (x.z) 279c Also, find norm of z 280c So, first: (z.z) 281c-------------------------------------------------------------------*/ 282#pragma omp single 283{ 284 norm_temp11 = 0.0; 285 norm_temp12 = 0.0; 286} /* end single */ 287 288#pragma omp for reduction(+:norm_temp11,norm_temp12) 289 for (j = 1; j <= lastcol-firstcol+1; j++) { 290 norm_temp11 = norm_temp11 + x[j]*z[j]; 291 norm_temp12 = norm_temp12 + z[j]*z[j]; 292 } 293 294#pragma omp single 295{ 296 norm_temp12 = 1.0 / sqrt( norm_temp12 ); 297 298 zeta = SHIFT + 1.0 / norm_temp11; 299} /* end single */ 300 301#pragma omp master 302{ 303/* if( it == 1 ) { */ 304/* printf(" iteration ||r|| zeta\n"); */ 305/* } */ 306/* printf(" %5d %20.14e%20.13e\n", it, rnorm, zeta); */ 307} /* end master */ 308 309/*-------------------------------------------------------------------- 310c Normalize z to obtain x 311c-------------------------------------------------------------------*/ 312#pragma omp for 313 for (j = 1; j <= lastcol-firstcol+1; j++) { 314 x[j] = norm_temp12*z[j]; 315 } 316 } /* end of main iter inv pow meth */ 317 318#if defined(_OPENMP) 319#pragma omp master 320 nthreads = omp_get_num_threads(); 321#endif /* _OPENMP */ 322} /* end parallel */ 323 324 timer_stop( 1 ); 325 326/*-------------------------------------------------------------------- 327c End of timed section 328c-------------------------------------------------------------------*/ 329 330 t = timer_read( 1 ); 331 332 printf(" Benchmark completed\n"); 333 334 epsilon = 1.0e-10; 335 if (class != 'U') { 336 if (fabs(zeta - zeta_verify_value) <= epsilon) { 337 verified = TRUE; 338 printf(" VERIFICATION SUCCESSFUL\n"); 339 printf(" Zeta is %20.12e\n", zeta); 340 printf(" Error is %20.12e\n", zeta - zeta_verify_value); 341 } else { 342 verified = FALSE; 343 printf(" VERIFICATION FAILED\n"); 344 printf(" Zeta %20.12e\n", zeta); 345 printf(" The correct zeta is %20.12e\n", zeta_verify_value); 346 } 347 } else { 348 verified = FALSE; 349 printf(" Problem size unknown\n"); 350 printf(" NO VERIFICATION PERFORMED\n"); 351 } 352 353 if ( t != 0.0 ) { 354 mflops = (2.0*NITER*NA) 355 * (3.0+(NONZER*(NONZER+1)) + 25.0*(5.0+(NONZER*(NONZER+1))) + 3.0 ) 356 / t / 1000000.0; 357 } else { 358 mflops = 0.0; 359 } 360 361#ifdef BOMP 362//backend_create_time(atoi(argv[1])); 363#endif 364printf("Computetime %d %f\n", atoi(argv[1]), t); 365printf("client done\n"); 366 367/* c_print_results("CG", class, NA, 0, 0, NITER, nthreads, t, */ 368/* mflops, " floating point", */ 369/* verified, NPBVERSION, COMPILETIME, */ 370/* CS1, CS2, CS3, CS4, CS5, CS6, CS7); */ 371} 372 373/*-------------------------------------------------------------------- 374c-------------------------------------------------------------------*/ 375static void conj_grad ( 376 int colidx[], /* colidx[1:nzz] */ 377 int rowstr[], /* rowstr[1:naa+1] */ 378 double x[], /* x[*] */ 379 double z[], /* z[*] */ 380 double a[], /* a[1:nzz] */ 381 double p[], /* p[*] */ 382 double q[], /* q[*] */ 383 double r[], /* r[*] */ 384 double w[], /* w[*] */ 385 double *rnorm ) 386/*-------------------------------------------------------------------- 387c-------------------------------------------------------------------*/ 388 389/*--------------------------------------------------------------------- 390c Floaging point arrays here are named as in NPB1 spec discussion of 391c CG algorithm 392c---------------------------------------------------------------------*/ 393{ 394 static double d, sum, rho, rho0, alpha, beta; 395 int i, j, k; 396 int cgit, cgitmax = 25; 397 398#pragma omp single nowait 399 rho = 0.0; 400 401/*-------------------------------------------------------------------- 402c Initialize the CG algorithm: 403c-------------------------------------------------------------------*/ 404#pragma omp for nowait 405 for (j = 1; j <= naa+1; j++) { 406 q[j] = 0.0; 407 z[j] = 0.0; 408 r[j] = x[j]; 409 p[j] = r[j]; 410 w[j] = 0.0; 411 } 412 413/*-------------------------------------------------------------------- 414c rho = r.r 415c Now, obtain the norm of r: First, sum squares of r elements locally... 416c-------------------------------------------------------------------*/ 417#pragma omp for reduction(+:rho) 418 for (j = 1; j <= lastcol-firstcol+1; j++) { 419 rho = rho + x[j]*x[j]; 420 } 421 422/*-------------------------------------------------------------------- 423c----> 424c The conj grad iteration loop 425c----> 426c-------------------------------------------------------------------*/ 427 for (cgit = 1; cgit <= cgitmax; cgit++) { 428#pragma omp single nowait 429{ 430 rho0 = rho; 431 d = 0.0; 432 rho = 0.0; 433} /* end single */ 434 435/*-------------------------------------------------------------------- 436c q = A.p 437c The partition submatrix-vector multiply: use workspace w 438c--------------------------------------------------------------------- 439C 440C NOTE: this version of the multiply is actually (slightly: maybe %5) 441C faster on the sp2 on 16 nodes than is the unrolled-by-2 version 442C below. On the Cray t3d, the reverse is true, i.e., the 443C unrolled-by-two version is some 10% faster. 444C The unrolled-by-8 version below is significantly faster 445C on the Cray t3d - overall speed of code is 1.5 times faster. 446*/ 447 448/* rolled version */ 449#pragma omp for private(sum,k) 450 for (j = 1; j <= lastrow-firstrow+1; j++) { 451 sum = 0.0; 452 for (k = rowstr[j]; k < rowstr[j+1]; k++) { 453 sum = sum + a[k]*p[colidx[k]]; 454 } 455 w[j] = sum; 456 } 457 458/* unrolled-by-two version 459#pragma omp for private(i,k) 460 for (j = 1; j <= lastrow-firstrow+1; j++) { 461 int iresidue; 462 double sum1, sum2; 463 i = rowstr[j]; 464 iresidue = (rowstr[j+1]-i) % 2; 465 sum1 = 0.0; 466 sum2 = 0.0; 467 if (iresidue == 1) sum1 = sum1 + a[i]*p[colidx[i]]; 468 for (k = i+iresidue; k <= rowstr[j+1]-2; k += 2) { 469 sum1 = sum1 + a[k] * p[colidx[k]]; 470 sum2 = sum2 + a[k+1] * p[colidx[k+1]]; 471 } 472 w[j] = sum1 + sum2; 473 } 474*/ 475/* unrolled-by-8 version 476#pragma omp for private(i,k,sum) 477 for (j = 1; j <= lastrow-firstrow+1; j++) { 478 int iresidue; 479 i = rowstr[j]; 480 iresidue = (rowstr[j+1]-i) % 8; 481 sum = 0.0; 482 for (k = i; k <= i+iresidue-1; k++) { 483 sum = sum + a[k] * p[colidx[k]]; 484 } 485 for (k = i+iresidue; k <= rowstr[j+1]-8; k += 8) { 486 sum = sum + a[k ] * p[colidx[k ]] 487 + a[k+1] * p[colidx[k+1]] 488 + a[k+2] * p[colidx[k+2]] 489 + a[k+3] * p[colidx[k+3]] 490 + a[k+4] * p[colidx[k+4]] 491 + a[k+5] * p[colidx[k+5]] 492 + a[k+6] * p[colidx[k+6]] 493 + a[k+7] * p[colidx[k+7]]; 494 } 495 w[j] = sum; 496 } 497*/ 498 499#pragma omp for 500 for (j = 1; j <= lastcol-firstcol+1; j++) { 501 q[j] = w[j]; 502 } 503 504/*-------------------------------------------------------------------- 505c Clear w for reuse... 506c-------------------------------------------------------------------*/ 507#pragma omp for nowait 508 for (j = 1; j <= lastcol-firstcol+1; j++) { 509 w[j] = 0.0; 510 } 511 512/*-------------------------------------------------------------------- 513c Obtain p.q 514c-------------------------------------------------------------------*/ 515#pragma omp for reduction(+:d) 516 for (j = 1; j <= lastcol-firstcol+1; j++) { 517 d = d + p[j]*q[j]; 518 } 519 520/*-------------------------------------------------------------------- 521c Obtain alpha = rho / (p.q) 522c-------------------------------------------------------------------*/ 523#pragma omp single 524 alpha = rho0 / d; 525 526/*-------------------------------------------------------------------- 527c Save a temporary of rho 528c-------------------------------------------------------------------*/ 529 /* rho0 = rho;*/ 530 531/*--------------------------------------------------------------------- 532c Obtain z = z + alpha*p 533c and r = r - alpha*q 534c---------------------------------------------------------------------*/ 535#pragma omp for 536 for (j = 1; j <= lastcol-firstcol+1; j++) { 537 z[j] = z[j] + alpha*p[j]; 538 r[j] = r[j] - alpha*q[j]; 539 } 540 541/*--------------------------------------------------------------------- 542c rho = r.r 543c Now, obtain the norm of r: First, sum squares of r elements locally... 544c---------------------------------------------------------------------*/ 545#pragma omp for reduction(+:rho) 546 for (j = 1; j <= lastcol-firstcol+1; j++) { 547 rho = rho + r[j]*r[j]; 548 } 549 550/*-------------------------------------------------------------------- 551c Obtain beta: 552c-------------------------------------------------------------------*/ 553#pragma omp single 554 beta = rho / rho0; 555 556/*-------------------------------------------------------------------- 557c p = r + beta*p 558c-------------------------------------------------------------------*/ 559#pragma omp for 560 for (j = 1; j <= lastcol-firstcol+1; j++) { 561 p[j] = r[j] + beta*p[j]; 562 } 563 } /* end of do cgit=1,cgitmax */ 564 565/*--------------------------------------------------------------------- 566c Compute residual norm explicitly: ||r|| = ||x - A.z|| 567c First, form A.z 568c The partition submatrix-vector multiply 569c---------------------------------------------------------------------*/ 570#pragma omp single nowait 571 sum = 0.0; 572 573#pragma omp for private(d, k) 574 for (j = 1; j <= lastrow-firstrow+1; j++) { 575 d = 0.0; 576 for (k = rowstr[j]; k <= rowstr[j+1]-1; k++) { 577 d = d + a[k]*z[colidx[k]]; 578 } 579 w[j] = d; 580 } 581 582#pragma omp for 583 for (j = 1; j <= lastcol-firstcol+1; j++) { 584 r[j] = w[j]; 585 } 586 587/*-------------------------------------------------------------------- 588c At this point, r contains A.z 589c-------------------------------------------------------------------*/ 590#pragma omp for reduction(+:sum) private(d) 591 for (j = 1; j <= lastcol-firstcol+1; j++) { 592 d = x[j] - r[j]; 593 sum = sum + d*d; 594 } 595 596#pragma omp single 597 { 598 (*rnorm) = sqrt(sum); 599 } /* end single */ 600} 601 602/*--------------------------------------------------------------------- 603c generate the test problem for benchmark 6 604c makea generates a sparse matrix with a 605c prescribed sparsity distribution 606c 607c parameter type usage 608c 609c input 610c 611c n i number of cols/rows of matrix 612c nz i nonzeros as declared array size 613c rcond r*8 condition number 614c shift r*8 main diagonal shift 615c 616c output 617c 618c a r*8 array for nonzeros 619c colidx i col indices 620c rowstr i row pointers 621c 622c workspace 623c 624c iv, arow, acol i 625c v, aelt r*8 626c---------------------------------------------------------------------*/ 627static void makea( 628 int n, 629 int nz, 630 double a[], /* a[1:nz] */ 631 int colidx[], /* colidx[1:nz] */ 632 int rowstr[], /* rowstr[1:n+1] */ 633 int nonzer, 634 int firstrow, 635 int lastrow, 636 int firstcol, 637 int lastcol, 638 double rcond, 639 int arow[], /* arow[1:nz] */ 640 int acol[], /* acol[1:nz] */ 641 double aelt[], /* aelt[1:nz] */ 642 double v[], /* v[1:n+1] */ 643 int iv[], /* iv[1:2*n+1] */ 644 double shift ) 645{ 646 int i, nnza, iouter, ivelt, ivelt1, irow, nzv; 647 648/*-------------------------------------------------------------------- 649c nonzer is approximately (int(sqrt(nnza /n))); 650c-------------------------------------------------------------------*/ 651 652 double size, ratio, scale; 653 int jcol; 654 655 size = 1.0; 656 ratio = pow(rcond, (1.0 / (double)n)); 657 nnza = 0; 658 659/*--------------------------------------------------------------------- 660c Initialize colidx(n+1 .. 2n) to zero. 661c Used by sprnvc to mark nonzero positions 662c---------------------------------------------------------------------*/ 663#pragma omp parallel for 664 for (i = 1; i <= n; i++) { 665 colidx[n+i] = 0; 666 } 667 for (iouter = 1; iouter <= n; iouter++) { 668 nzv = nonzer; 669 sprnvc(n, nzv, v, iv, &(colidx[0]), &(colidx[n])); 670 vecset(n, v, iv, &nzv, iouter, 0.5); 671 for (ivelt = 1; ivelt <= nzv; ivelt++) { 672 jcol = iv[ivelt]; 673 if (jcol >= firstcol && jcol <= lastcol) { 674 scale = size * v[ivelt]; 675 for (ivelt1 = 1; ivelt1 <= nzv; ivelt1++) { 676 irow = iv[ivelt1]; 677 if (irow >= firstrow && irow <= lastrow) { 678 nnza = nnza + 1; 679 if (nnza > nz) { 680 printf("Space for matrix elements exceeded in" 681 " makea\n"); 682 printf("nnza, nzmax = %d, %d\n", nnza, nz); 683 printf("iouter = %d\n", iouter); 684 exit(1); 685 } 686 acol[nnza] = jcol; 687 arow[nnza] = irow; 688 aelt[nnza] = v[ivelt1] * scale; 689 } 690 } 691 } 692 } 693 size = size * ratio; 694 } 695 696/*--------------------------------------------------------------------- 697c ... add the identity * rcond to the generated matrix to bound 698c the smallest eigenvalue from below by rcond 699c---------------------------------------------------------------------*/ 700 for (i = firstrow; i <= lastrow; i++) { 701 if (i >= firstcol && i <= lastcol) { 702 iouter = n + i; 703 nnza = nnza + 1; 704 if (nnza > nz) { 705 printf("Space for matrix elements exceeded in makea\n"); 706 printf("nnza, nzmax = %d, %d\n", nnza, nz); 707 printf("iouter = %d\n", iouter); 708 exit(1); 709 } 710 acol[nnza] = i; 711 arow[nnza] = i; 712 aelt[nnza] = rcond - shift; 713 } 714 } 715 716/*--------------------------------------------------------------------- 717c ... make the sparse matrix from list of elements with duplicates 718c (v and iv are used as workspace) 719c---------------------------------------------------------------------*/ 720 sparse(a, colidx, rowstr, n, arow, acol, aelt, 721 firstrow, lastrow, v, &(iv[0]), &(iv[n]), nnza); 722} 723 724/*--------------------------------------------------- 725c generate a sparse matrix from a list of 726c [col, row, element] tri 727c---------------------------------------------------*/ 728static void sparse( 729 double a[], /* a[1:*] */ 730 int colidx[], /* colidx[1:*] */ 731 int rowstr[], /* rowstr[1:*] */ 732 int n, 733 int arow[], /* arow[1:*] */ 734 int acol[], /* acol[1:*] */ 735 double aelt[], /* aelt[1:*] */ 736 int firstrow, 737 int lastrow, 738 double x[], /* x[1:n] */ 739 boolean mark[], /* mark[1:n] */ 740 int nzloc[], /* nzloc[1:n] */ 741 int nnza) 742/*--------------------------------------------------------------------- 743c rows range from firstrow to lastrow 744c the rowstr pointers are defined for nrows = lastrow-firstrow+1 values 745c---------------------------------------------------------------------*/ 746{ 747 int nrows; 748 int i, j, jajp1, nza, k, nzrow; 749 double xi; 750 751/*-------------------------------------------------------------------- 752c how many rows of result 753c-------------------------------------------------------------------*/ 754 nrows = lastrow - firstrow + 1; 755 756/*-------------------------------------------------------------------- 757c ...count the number of triples in each row 758c-------------------------------------------------------------------*/ 759#pragma omp parallel for 760 for (j = 1; j <= n; j++) { 761 rowstr[j] = 0; 762 mark[j] = FALSE; 763 } 764 rowstr[n+1] = 0; 765 766 for (nza = 1; nza <= nnza; nza++) { 767 j = (arow[nza] - firstrow + 1) + 1; 768 rowstr[j] = rowstr[j] + 1; 769 } 770 771 rowstr[1] = 1; 772 for (j = 2; j <= nrows+1; j++) { 773 rowstr[j] = rowstr[j] + rowstr[j-1]; 774 } 775 776/*--------------------------------------------------------------------- 777c ... rowstr(j) now is the location of the first nonzero 778c of row j of a 779c---------------------------------------------------------------------*/ 780 781/*-------------------------------------------------------------------- 782c ... do a bucket sort of the triples on the row index 783c-------------------------------------------------------------------*/ 784 for (nza = 1; nza <= nnza; nza++) { 785 j = arow[nza] - firstrow + 1; 786 k = rowstr[j]; 787 a[k] = aelt[nza]; 788 colidx[k] = acol[nza]; 789 rowstr[j] = rowstr[j] + 1; 790 } 791 792/*-------------------------------------------------------------------- 793c ... rowstr(j) now points to the first element of row j+1 794c-------------------------------------------------------------------*/ 795 for (j = nrows; j >= 1; j--) { 796 rowstr[j+1] = rowstr[j]; 797 } 798 rowstr[1] = 1; 799 800/*-------------------------------------------------------------------- 801c ... generate the actual output rows by adding elements 802c-------------------------------------------------------------------*/ 803 nza = 0; 804#pragma omp parallel for 805 for (i = 1; i <= n; i++) { 806 x[i] = 0.0; 807 mark[i] = FALSE; 808 } 809 810 jajp1 = rowstr[1]; 811 for (j = 1; j <= nrows; j++) { 812 nzrow = 0; 813 814/*-------------------------------------------------------------------- 815c ...loop over the jth row of a 816c-------------------------------------------------------------------*/ 817 for (k = jajp1; k < rowstr[j+1]; k++) { 818 i = colidx[k]; 819 x[i] = x[i] + a[k]; 820 if ( mark[i] == FALSE && x[i] != 0.0) { 821 mark[i] = TRUE; 822 nzrow = nzrow + 1; 823 nzloc[nzrow] = i; 824 } 825 } 826 827/*-------------------------------------------------------------------- 828c ... extract the nonzeros of this row 829c-------------------------------------------------------------------*/ 830 for (k = 1; k <= nzrow; k++) { 831 i = nzloc[k]; 832 mark[i] = FALSE; 833 xi = x[i]; 834 x[i] = 0.0; 835 if (xi != 0.0) { 836 nza = nza + 1; 837 a[nza] = xi; 838 colidx[nza] = i; 839 } 840 } 841 jajp1 = rowstr[j+1]; 842 rowstr[j+1] = nza + rowstr[1]; 843 } 844} 845 846/*--------------------------------------------------------------------- 847c generate a sparse n-vector (v, iv) 848c having nzv nonzeros 849c 850c mark(i) is set to 1 if position i is nonzero. 851c mark is all zero on entry and is reset to all zero before exit 852c this corrects a performance bug found by John G. Lewis, caused by 853c reinitialization of mark on every one of the n calls to sprnvc 854---------------------------------------------------------------------*/ 855static void sprnvc( 856 int n, 857 int nz, 858 double v[], /* v[1:*] */ 859 int iv[], /* iv[1:*] */ 860 int nzloc[], /* nzloc[1:n] */ 861 int mark[] ) /* mark[1:n] */ 862{ 863 int nn1; 864 int nzrow, nzv, ii, i; 865 double vecelt, vecloc; 866 867 nzv = 0; 868 nzrow = 0; 869 nn1 = 1; 870 do { 871 nn1 = 2 * nn1; 872 } while (nn1 < n); 873 874/*-------------------------------------------------------------------- 875c nn1 is the smallest power of two not less than n 876c-------------------------------------------------------------------*/ 877 878 while (nzv < nz) { 879 vecelt = randlc(&tran, amult); 880 881/*-------------------------------------------------------------------- 882c generate an integer between 1 and n in a portable manner 883c-------------------------------------------------------------------*/ 884 vecloc = randlc(&tran, amult); 885 i = icnvrt(vecloc, nn1) + 1; 886 if (i > n) continue; 887 888/*-------------------------------------------------------------------- 889c was this integer generated already? 890c-------------------------------------------------------------------*/ 891 if (mark[i] == 0) { 892 mark[i] = 1; 893 nzrow = nzrow + 1; 894 nzloc[nzrow] = i; 895 nzv = nzv + 1; 896 v[nzv] = vecelt; 897 iv[nzv] = i; 898 } 899 } 900 901 for (ii = 1; ii <= nzrow; ii++) { 902 i = nzloc[ii]; 903 mark[i] = 0; 904 } 905} 906 907/*--------------------------------------------------------------------- 908* scale a double precision number x in (0,1) by a power of 2 and chop it 909*---------------------------------------------------------------------*/ 910static int icnvrt(double x, int ipwr2) { 911 return ((int)(ipwr2 * x)); 912} 913 914/*-------------------------------------------------------------------- 915c set ith element of sparse vector (v, iv) with 916c nzv nonzeros to val 917c-------------------------------------------------------------------*/ 918static void vecset( 919 int n, 920 double v[], /* v[1:*] */ 921 int iv[], /* iv[1:*] */ 922 int *nzv, 923 int i, 924 double val) 925{ 926 int k; 927 boolean set; 928 929 set = FALSE; 930 for (k = 1; k <= *nzv; k++) { 931 if (iv[k] == i) { 932 v[k] = val; 933 set = TRUE; 934 } 935 } 936 if (set == FALSE) { 937 *nzv = *nzv + 1; 938 v[*nzv] = val; 939 iv[*nzv] = i; 940 } 941} 942