1/*--------------------------------------------------------------------
2
3  NAS Parallel Benchmarks 2.3 OpenMP C versions - CG
4
5  This benchmark is an OpenMP C version of the NPB CG code.
6
7  The OpenMP C versions are developed by RWCP and derived from the serial
8  Fortran versions in "NPB 2.3-serial" developed by NAS.
9
10  Permission to use, copy, distribute and modify this software for any
11  purpose with or without fee is hereby granted.
12  This software is provided "as is" without express or implied warranty.
13
14  Send comments on the OpenMP C versions to pdp-openmp@rwcp.or.jp
15
16  Information on OpenMP activities at RWCP is available at:
17
18           http://pdplab.trc.rwcp.or.jp/pdperf/Omni/
19
20  Information on NAS Parallel Benchmarks 2.3 is available at:
21
22           http://www.nas.nasa.gov/NAS/NPB/
23
24--------------------------------------------------------------------*/
25/*--------------------------------------------------------------------
26
27  Authors: M. Yarrow
28           C. Kuszmaul
29
30  OpenMP C version: S. Satoh
31
32--------------------------------------------------------------------*/
33
34/*
35c---------------------------------------------------------------------
36c  Note: please observe that in the routine conj_grad three
37c  implementations of the sparse matrix-vector multiply have
38c  been supplied.  The default matrix-vector multiply is not
39c  loop unrolled.  The alternate implementations are unrolled
40c  to a depth of 2 and unrolled to a depth of 8.  Please
41c  experiment with these to find the fastest for your particular
42c  architecture.  If reporting timing results, any of these three may
43c  be used without penalty.
44c---------------------------------------------------------------------
45*/
46
47#include "npb-C.h"
48#include "npbparams.h"
49
50#undef NZ
51#define	NZ	NA*(NONZER+1)*(NONZER+1)+NA*(NONZER+2)
52
53/* global variables */
54
55/* common /partit_size/ */
56static int naa;
57static int nzz;
58static int firstrow;
59static int lastrow;
60static int firstcol;
61static int lastcol;
62
63/* common /main_int_mem/ */
64static int colidx[NZ+1];	/* colidx[1:NZ] */
65static int rowstr[NA+1+1];	/* rowstr[1:NA+1] */
66static int iv[2*NA+1+1];	/* iv[1:2*NA+1] */
67static int arow[NZ+1];		/* arow[1:NZ] */
68static int acol[NZ+1];		/* acol[1:NZ] */
69
70/* common /main_flt_mem/ */
71static double v[NA+1+1];	/* v[1:NA+1] */
72static double aelt[NZ+1];	/* aelt[1:NZ] */
73static double a[NZ+1];		/* a[1:NZ] */
74static double x[NA+2+1];	/* x[1:NA+2] */
75static double z[NA+2+1];	/* z[1:NA+2] */
76static double p[NA+2+1];	/* p[1:NA+2] */
77static double q[NA+2+1];	/* q[1:NA+2] */
78static double r[NA+2+1];	/* r[1:NA+2] */
79static double w[NA+2+1];	/* w[1:NA+2] */
80
81/* common /urando/ */
82static double amult;
83static double tran;
84
85/* function declarations */
86static void conj_grad (int colidx[], int rowstr[], double x[], double z[],
87		       double a[], double p[], double q[], double r[],
88		       double w[], double *rnorm);
89static void makea(int n, int nz, double a[], int colidx[], int rowstr[],
90		  int nonzer, int firstrow, int lastrow, int firstcol,
91		  int lastcol, double rcond, int arow[], int acol[],
92		  double aelt[], double v[], int iv[], double shift );
93static void sparse(double a[], int colidx[], int rowstr[], int n,
94		   int arow[], int acol[], double aelt[],
95		   int firstrow, int lastrow,
96		   double x[], boolean mark[], int nzloc[], int nnza);
97static void sprnvc(int n, int nz, double v[], int iv[], int nzloc[],
98		   int mark[]);
99static int icnvrt(double x, int ipwr2);
100static void vecset(int n, double v[], int iv[], int *nzv, int i, double val);
101
102/*--------------------------------------------------------------------
103      program cg
104--------------------------------------------------------------------*/
105
106int main(int argc, char **argv) {
107    int	i, j, k, it;
108    int nthreads = 1;
109    double zeta;
110    double rnorm;
111    double norm_temp11;
112    double norm_temp12;
113    double t, mflops;
114    char class;
115    boolean verified;
116    double zeta_verify_value, epsilon;
117
118    if (argc != 2) { /* Print usage */
119        printf("Usage: %s <Number of threads>\n", argv[0]);
120        abort();
121    }
122
123#ifdef BOMP
124    bomp_bomp_init(atoi(argv[1]));
125#endif
126
127    printf("Benchmark start");
128
129    omp_set_num_threads(atoi(argv[1]));
130    firstrow = 1;
131    lastrow  = NA;
132    firstcol = 1;
133    lastcol  = NA;
134
135    if (NA == 1400 && NONZER == 7 && NITER == 15 && SHIFT == 10.0) {
136	class = 'S';
137	zeta_verify_value = 8.5971775078648;
138    } else if (NA == 7000 && NONZER == 8 && NITER == 15 && SHIFT == 12.0) {
139	class = 'W';
140	zeta_verify_value = 10.362595087124;
141    } else if (NA == 14000 && NONZER == 11 && NITER == 15 && SHIFT == 20.0) {
142	class = 'A';
143	zeta_verify_value = 17.130235054029;
144    } else if (NA == 75000 && NONZER == 13 && NITER == 75 && SHIFT == 60.0) {
145	class = 'B';
146	zeta_verify_value = 22.712745482631;
147    } else if (NA == 150000 && NONZER == 15 && NITER == 75 && SHIFT == 110.0) {
148	class = 'C';
149	zeta_verify_value = 28.973605592845;
150    } else {
151	class = 'U';
152    }
153
154    printf("\n\n NAS Parallel Benchmarks 2.3 OpenMP C version"
155	   " - CG Benchmark\n");
156    printf(" Size: %10d\n", NA);
157    printf(" Iterations: %5d\n", NITER);
158
159    naa = NA;
160    nzz = NZ;
161
162/*--------------------------------------------------------------------
163c  Initialize random number generator
164c-------------------------------------------------------------------*/
165    tran    = 314159265.0;
166    amult   = 1220703125.0;
167    zeta    = randlc( &tran, amult );
168
169/*--------------------------------------------------------------------
170c
171c-------------------------------------------------------------------*/
172    makea(naa, nzz, a, colidx, rowstr, NONZER,
173	  firstrow, lastrow, firstcol, lastcol,
174	  RCOND, arow, acol, aelt, v, iv, SHIFT);
175
176/*---------------------------------------------------------------------
177c  Note: as a result of the above call to makea:
178c        values of j used in indexing rowstr go from 1 --> lastrow-firstrow+1
179c        values of colidx which are col indexes go from firstcol --> lastcol
180c        So:
181c        Shift the col index vals from actual (firstcol --> lastcol )
182c        to local, i.e., (1 --> lastcol-firstcol+1)
183c---------------------------------------------------------------------*/
184#pragma omp parallel private(it,i,j,k)
185{
186#pragma omp for nowait
187    for (j = 1; j <= lastrow - firstrow + 1; j++) {
188	for (k = rowstr[j]; k < rowstr[j+1]; k++) {
189            colidx[k] = colidx[k] - firstcol + 1;
190	}
191    }
192
193/*--------------------------------------------------------------------
194c  set starting vector to (1, 1, .... 1)
195c-------------------------------------------------------------------*/
196#pragma omp for nowait
197    for (i = 1; i <= NA+1; i++) {
198	x[i] = 1.0;
199    }
200#pragma omp single
201    zeta  = 0.0;
202
203/*-------------------------------------------------------------------
204c---->
205c  Do one iteration untimed to init all code and data page tables
206c---->                    (then reinit, start timing, to niter its)
207c-------------------------------------------------------------------*/
208
209    for (it = 1; it <= 1; it++) {
210
211/*--------------------------------------------------------------------
212c  The call to the conjugate gradient routine:
213c-------------------------------------------------------------------*/
214	conj_grad (colidx, rowstr, x, z, a, p, q, r, w, &rnorm);
215
216/*--------------------------------------------------------------------
217c  zeta = shift + 1/(x.z)
218c  So, first: (x.z)
219c  Also, find norm of z
220c  So, first: (z.z)
221c-------------------------------------------------------------------*/
222#pragma omp single
223{
224	norm_temp11 = 0.0;
225	norm_temp12 = 0.0;
226} /* end single */
227
228#pragma omp for reduction(+:norm_temp11,norm_temp12)
229	for (j = 1; j <= lastcol-firstcol+1; j++) {
230            norm_temp11 = norm_temp11 + x[j]*z[j];
231            norm_temp12 = norm_temp12 + z[j]*z[j];
232	}
233#pragma omp single
234	norm_temp12 = 1.0 / sqrt( norm_temp12 );
235
236/*--------------------------------------------------------------------
237c  Normalize z to obtain x
238c-------------------------------------------------------------------*/
239#pragma omp for
240	for (j = 1; j <= lastcol-firstcol+1; j++) {
241            x[j] = norm_temp12*z[j];
242	}
243
244    } /* end of do one iteration untimed */
245
246/*--------------------------------------------------------------------
247c  set starting vector to (1, 1, .... 1)
248c-------------------------------------------------------------------*/
249#pragma omp for nowait
250    for (i = 1; i <= NA+1; i++) {
251         x[i] = 1.0;
252    }
253#pragma omp single
254    zeta  = 0.0;
255
256} /* end parallel */
257
258    timer_clear( 1 );
259    timer_start( 1 );
260
261/*--------------------------------------------------------------------
262c---->
263c  Main Iteration for inverse power method
264c---->
265c-------------------------------------------------------------------*/
266
267#pragma omp parallel private(it,i,j,k)
268{
269    for (it = 1; it <= NITER; it++) {
270
271/*--------------------------------------------------------------------
272c  The call to the conjugate gradient routine:
273c-------------------------------------------------------------------*/
274	conj_grad(colidx, rowstr, x, z, a, p, q, r, w, &rnorm);
275
276/*--------------------------------------------------------------------
277c  zeta = shift + 1/(x.z)
278c  So, first: (x.z)
279c  Also, find norm of z
280c  So, first: (z.z)
281c-------------------------------------------------------------------*/
282#pragma omp single
283{
284	norm_temp11 = 0.0;
285	norm_temp12 = 0.0;
286} /* end single */
287
288#pragma omp for reduction(+:norm_temp11,norm_temp12)
289	for (j = 1; j <= lastcol-firstcol+1; j++) {
290            norm_temp11 = norm_temp11 + x[j]*z[j];
291            norm_temp12 = norm_temp12 + z[j]*z[j];
292	}
293
294#pragma omp single
295{
296	norm_temp12 = 1.0 / sqrt( norm_temp12 );
297
298	zeta = SHIFT + 1.0 / norm_temp11;
299} /* end single */
300
301#pragma omp master
302{
303/* 	if( it == 1 ) { */
304/*             printf("   iteration           ||r||                 zeta\n"); */
305/* 	} */
306/* 	printf("    %5d       %20.14e%20.13e\n", it, rnorm, zeta); */
307} /* end master */
308
309/*--------------------------------------------------------------------
310c  Normalize z to obtain x
311c-------------------------------------------------------------------*/
312#pragma omp for
313	for (j = 1; j <= lastcol-firstcol+1; j++) {
314            x[j] = norm_temp12*z[j];
315	}
316    } /* end of main iter inv pow meth */
317
318#if defined(_OPENMP)
319#pragma omp master
320    nthreads = omp_get_num_threads();
321#endif /* _OPENMP */
322} /* end parallel */
323
324    timer_stop( 1 );
325
326/*--------------------------------------------------------------------
327c  End of timed section
328c-------------------------------------------------------------------*/
329
330    t = timer_read( 1 );
331
332    printf(" Benchmark completed\n");
333
334    epsilon = 1.0e-10;
335    if (class != 'U') {
336	if (fabs(zeta - zeta_verify_value) <= epsilon) {
337            verified = TRUE;
338	    printf(" VERIFICATION SUCCESSFUL\n");
339	    printf(" Zeta is    %20.12e\n", zeta);
340	    printf(" Error is   %20.12e\n", zeta - zeta_verify_value);
341	} else {
342            verified = FALSE;
343	    printf(" VERIFICATION FAILED\n");
344	    printf(" Zeta                %20.12e\n", zeta);
345	    printf(" The correct zeta is %20.12e\n", zeta_verify_value);
346	}
347    } else {
348	verified = FALSE;
349	printf(" Problem size unknown\n");
350	printf(" NO VERIFICATION PERFORMED\n");
351    }
352
353    if ( t != 0.0 ) {
354	mflops = (2.0*NITER*NA)
355	    * (3.0+(NONZER*(NONZER+1)) + 25.0*(5.0+(NONZER*(NONZER+1))) + 3.0 )
356	    / t / 1000000.0;
357    } else {
358	mflops = 0.0;
359    }
360
361#ifdef BOMP
362//backend_create_time(atoi(argv[1]));
363#endif
364printf("Computetime %d %f\n", atoi(argv[1]), t);
365printf("client done\n");
366
367/*     c_print_results("CG", class, NA, 0, 0, NITER, nthreads, t, */
368/* 		    mflops, "          floating point", */
369/* 		    verified, NPBVERSION, COMPILETIME, */
370/* 		    CS1, CS2, CS3, CS4, CS5, CS6, CS7); */
371}
372
373/*--------------------------------------------------------------------
374c-------------------------------------------------------------------*/
375static void conj_grad (
376    int colidx[],	/* colidx[1:nzz] */
377    int rowstr[],	/* rowstr[1:naa+1] */
378    double x[],		/* x[*] */
379    double z[],		/* z[*] */
380    double a[],		/* a[1:nzz] */
381    double p[],		/* p[*] */
382    double q[],		/* q[*] */
383    double r[],		/* r[*] */
384    double w[],		/* w[*] */
385    double *rnorm )
386/*--------------------------------------------------------------------
387c-------------------------------------------------------------------*/
388
389/*---------------------------------------------------------------------
390c  Floaging point arrays here are named as in NPB1 spec discussion of
391c  CG algorithm
392c---------------------------------------------------------------------*/
393{
394    static double d, sum, rho, rho0, alpha, beta;
395    int i, j, k;
396    int cgit, cgitmax = 25;
397
398#pragma omp single nowait
399    rho = 0.0;
400
401/*--------------------------------------------------------------------
402c  Initialize the CG algorithm:
403c-------------------------------------------------------------------*/
404#pragma omp for nowait
405    for (j = 1; j <= naa+1; j++) {
406	q[j] = 0.0;
407	z[j] = 0.0;
408	r[j] = x[j];
409	p[j] = r[j];
410	w[j] = 0.0;
411    }
412
413/*--------------------------------------------------------------------
414c  rho = r.r
415c  Now, obtain the norm of r: First, sum squares of r elements locally...
416c-------------------------------------------------------------------*/
417#pragma omp for reduction(+:rho)
418    for (j = 1; j <= lastcol-firstcol+1; j++) {
419	rho = rho + x[j]*x[j];
420    }
421
422/*--------------------------------------------------------------------
423c---->
424c  The conj grad iteration loop
425c---->
426c-------------------------------------------------------------------*/
427    for (cgit = 1; cgit <= cgitmax; cgit++) {
428#pragma omp single nowait
429{
430      rho0 = rho;
431      d = 0.0;
432      rho = 0.0;
433} /* end single */
434
435/*--------------------------------------------------------------------
436c  q = A.p
437c  The partition submatrix-vector multiply: use workspace w
438c---------------------------------------------------------------------
439C
440C  NOTE: this version of the multiply is actually (slightly: maybe %5)
441C        faster on the sp2 on 16 nodes than is the unrolled-by-2 version
442C        below.   On the Cray t3d, the reverse is true, i.e., the
443C        unrolled-by-two version is some 10% faster.
444C        The unrolled-by-8 version below is significantly faster
445C        on the Cray t3d - overall speed of code is 1.5 times faster.
446*/
447
448/* rolled version */
449#pragma omp for private(sum,k)
450	for (j = 1; j <= lastrow-firstrow+1; j++) {
451            sum = 0.0;
452	    for (k = rowstr[j]; k < rowstr[j+1]; k++) {
453		sum = sum + a[k]*p[colidx[k]];
454	    }
455            w[j] = sum;
456	}
457
458/* unrolled-by-two version
459#pragma omp for private(i,k)
460        for (j = 1; j <= lastrow-firstrow+1; j++) {
461	    int iresidue;
462	    double sum1, sum2;
463	    i = rowstr[j];
464            iresidue = (rowstr[j+1]-i) % 2;
465            sum1 = 0.0;
466            sum2 = 0.0;
467            if (iresidue == 1) sum1 = sum1 + a[i]*p[colidx[i]];
468	    for (k = i+iresidue; k <= rowstr[j+1]-2; k += 2) {
469		sum1 = sum1 + a[k]   * p[colidx[k]];
470		sum2 = sum2 + a[k+1] * p[colidx[k+1]];
471	    }
472            w[j] = sum1 + sum2;
473        }
474*/
475/* unrolled-by-8 version
476#pragma omp for private(i,k,sum)
477        for (j = 1; j <= lastrow-firstrow+1; j++) {
478	    int iresidue;
479            i = rowstr[j];
480            iresidue = (rowstr[j+1]-i) % 8;
481            sum = 0.0;
482            for (k = i; k <= i+iresidue-1; k++) {
483                sum = sum +  a[k] * p[colidx[k]];
484            }
485            for (k = i+iresidue; k <= rowstr[j+1]-8; k += 8) {
486                sum = sum + a[k  ] * p[colidx[k  ]]
487                          + a[k+1] * p[colidx[k+1]]
488                          + a[k+2] * p[colidx[k+2]]
489                          + a[k+3] * p[colidx[k+3]]
490                          + a[k+4] * p[colidx[k+4]]
491                          + a[k+5] * p[colidx[k+5]]
492                          + a[k+6] * p[colidx[k+6]]
493                          + a[k+7] * p[colidx[k+7]];
494            }
495            w[j] = sum;
496        }
497*/
498
499#pragma omp for
500	for (j = 1; j <= lastcol-firstcol+1; j++) {
501            q[j] = w[j];
502	}
503
504/*--------------------------------------------------------------------
505c  Clear w for reuse...
506c-------------------------------------------------------------------*/
507#pragma omp for	nowait
508	for (j = 1; j <= lastcol-firstcol+1; j++) {
509            w[j] = 0.0;
510	}
511
512/*--------------------------------------------------------------------
513c  Obtain p.q
514c-------------------------------------------------------------------*/
515#pragma omp for reduction(+:d)
516	for (j = 1; j <= lastcol-firstcol+1; j++) {
517            d = d + p[j]*q[j];
518	}
519
520/*--------------------------------------------------------------------
521c  Obtain alpha = rho / (p.q)
522c-------------------------------------------------------------------*/
523#pragma omp single
524	alpha = rho0 / d;
525
526/*--------------------------------------------------------------------
527c  Save a temporary of rho
528c-------------------------------------------------------------------*/
529	/*	rho0 = rho;*/
530
531/*---------------------------------------------------------------------
532c  Obtain z = z + alpha*p
533c  and    r = r - alpha*q
534c---------------------------------------------------------------------*/
535#pragma omp for
536	for (j = 1; j <= lastcol-firstcol+1; j++) {
537            z[j] = z[j] + alpha*p[j];
538            r[j] = r[j] - alpha*q[j];
539	}
540
541/*---------------------------------------------------------------------
542c  rho = r.r
543c  Now, obtain the norm of r: First, sum squares of r elements locally...
544c---------------------------------------------------------------------*/
545#pragma omp for reduction(+:rho)
546	for (j = 1; j <= lastcol-firstcol+1; j++) {
547            rho = rho + r[j]*r[j];
548	}
549
550/*--------------------------------------------------------------------
551c  Obtain beta:
552c-------------------------------------------------------------------*/
553#pragma omp single
554	beta = rho / rho0;
555
556/*--------------------------------------------------------------------
557c  p = r + beta*p
558c-------------------------------------------------------------------*/
559#pragma omp for
560	for (j = 1; j <= lastcol-firstcol+1; j++) {
561            p[j] = r[j] + beta*p[j];
562	}
563    } /* end of do cgit=1,cgitmax */
564
565/*---------------------------------------------------------------------
566c  Compute residual norm explicitly:  ||r|| = ||x - A.z||
567c  First, form A.z
568c  The partition submatrix-vector multiply
569c---------------------------------------------------------------------*/
570#pragma omp single nowait
571    sum = 0.0;
572
573#pragma omp for private(d, k)
574    for (j = 1; j <= lastrow-firstrow+1; j++) {
575	d = 0.0;
576	for (k = rowstr[j]; k <= rowstr[j+1]-1; k++) {
577            d = d + a[k]*z[colidx[k]];
578	}
579	w[j] = d;
580    }
581
582#pragma omp for
583    for (j = 1; j <= lastcol-firstcol+1; j++) {
584	r[j] = w[j];
585    }
586
587/*--------------------------------------------------------------------
588c  At this point, r contains A.z
589c-------------------------------------------------------------------*/
590#pragma omp for reduction(+:sum) private(d)
591    for (j = 1; j <= lastcol-firstcol+1; j++) {
592	d = x[j] - r[j];
593	sum = sum + d*d;
594    }
595
596#pragma omp single
597  {
598    (*rnorm) = sqrt(sum);
599  } /* end single */
600}
601
602/*---------------------------------------------------------------------
603c       generate the test problem for benchmark 6
604c       makea generates a sparse matrix with a
605c       prescribed sparsity distribution
606c
607c       parameter    type        usage
608c
609c       input
610c
611c       n            i           number of cols/rows of matrix
612c       nz           i           nonzeros as declared array size
613c       rcond        r*8         condition number
614c       shift        r*8         main diagonal shift
615c
616c       output
617c
618c       a            r*8         array for nonzeros
619c       colidx       i           col indices
620c       rowstr       i           row pointers
621c
622c       workspace
623c
624c       iv, arow, acol i
625c       v, aelt        r*8
626c---------------------------------------------------------------------*/
627static void makea(
628    int n,
629    int nz,
630    double a[],		/* a[1:nz] */
631    int colidx[],	/* colidx[1:nz] */
632    int rowstr[],	/* rowstr[1:n+1] */
633    int nonzer,
634    int firstrow,
635    int lastrow,
636    int firstcol,
637    int lastcol,
638    double rcond,
639    int arow[],		/* arow[1:nz] */
640    int acol[],		/* acol[1:nz] */
641    double aelt[],	/* aelt[1:nz] */
642    double v[],		/* v[1:n+1] */
643    int iv[],		/* iv[1:2*n+1] */
644    double shift )
645{
646    int i, nnza, iouter, ivelt, ivelt1, irow, nzv;
647
648/*--------------------------------------------------------------------
649c      nonzer is approximately  (int(sqrt(nnza /n)));
650c-------------------------------------------------------------------*/
651
652    double size, ratio, scale;
653    int jcol;
654
655    size = 1.0;
656    ratio = pow(rcond, (1.0 / (double)n));
657    nnza = 0;
658
659/*---------------------------------------------------------------------
660c  Initialize colidx(n+1 .. 2n) to zero.
661c  Used by sprnvc to mark nonzero positions
662c---------------------------------------------------------------------*/
663#pragma omp parallel for
664    for (i = 1; i <= n; i++) {
665	colidx[n+i] = 0;
666    }
667    for (iouter = 1; iouter <= n; iouter++) {
668	nzv = nonzer;
669	sprnvc(n, nzv, v, iv, &(colidx[0]), &(colidx[n]));
670	vecset(n, v, iv, &nzv, iouter, 0.5);
671	for (ivelt = 1; ivelt <= nzv; ivelt++) {
672	    jcol = iv[ivelt];
673	    if (jcol >= firstcol && jcol <= lastcol) {
674		scale = size * v[ivelt];
675		for (ivelt1 = 1; ivelt1 <= nzv; ivelt1++) {
676	            irow = iv[ivelt1];
677                    if (irow >= firstrow && irow <= lastrow) {
678			nnza = nnza + 1;
679			if (nnza > nz) {
680			    printf("Space for matrix elements exceeded in"
681				   " makea\n");
682			    printf("nnza, nzmax = %d, %d\n", nnza, nz);
683			    printf("iouter = %d\n", iouter);
684			    exit(1);
685			}
686			acol[nnza] = jcol;
687			arow[nnza] = irow;
688			aelt[nnza] = v[ivelt1] * scale;
689		    }
690		}
691	    }
692	}
693	size = size * ratio;
694    }
695
696/*---------------------------------------------------------------------
697c       ... add the identity * rcond to the generated matrix to bound
698c           the smallest eigenvalue from below by rcond
699c---------------------------------------------------------------------*/
700    for (i = firstrow; i <= lastrow; i++) {
701	if (i >= firstcol && i <= lastcol) {
702	    iouter = n + i;
703	    nnza = nnza + 1;
704	    if (nnza > nz) {
705		printf("Space for matrix elements exceeded in makea\n");
706		printf("nnza, nzmax = %d, %d\n", nnza, nz);
707		printf("iouter = %d\n", iouter);
708		exit(1);
709	    }
710	    acol[nnza] = i;
711	    arow[nnza] = i;
712	    aelt[nnza] = rcond - shift;
713	}
714    }
715
716/*---------------------------------------------------------------------
717c       ... make the sparse matrix from list of elements with duplicates
718c           (v and iv are used as  workspace)
719c---------------------------------------------------------------------*/
720    sparse(a, colidx, rowstr, n, arow, acol, aelt,
721	   firstrow, lastrow, v, &(iv[0]), &(iv[n]), nnza);
722}
723
724/*---------------------------------------------------
725c       generate a sparse matrix from a list of
726c       [col, row, element] tri
727c---------------------------------------------------*/
728static void sparse(
729    double a[],		/* a[1:*] */
730    int colidx[],	/* colidx[1:*] */
731    int rowstr[],	/* rowstr[1:*] */
732    int n,
733    int arow[],		/* arow[1:*] */
734    int acol[],		/* acol[1:*] */
735    double aelt[],	/* aelt[1:*] */
736    int firstrow,
737    int lastrow,
738    double x[],		/* x[1:n] */
739    boolean mark[],	/* mark[1:n] */
740    int nzloc[],	/* nzloc[1:n] */
741    int nnza)
742/*---------------------------------------------------------------------
743c       rows range from firstrow to lastrow
744c       the rowstr pointers are defined for nrows = lastrow-firstrow+1 values
745c---------------------------------------------------------------------*/
746{
747    int nrows;
748    int i, j, jajp1, nza, k, nzrow;
749    double xi;
750
751/*--------------------------------------------------------------------
752c    how many rows of result
753c-------------------------------------------------------------------*/
754    nrows = lastrow - firstrow + 1;
755
756/*--------------------------------------------------------------------
757c     ...count the number of triples in each row
758c-------------------------------------------------------------------*/
759#pragma omp parallel for
760    for (j = 1; j <= n; j++) {
761	rowstr[j] = 0;
762	mark[j] = FALSE;
763    }
764    rowstr[n+1] = 0;
765
766    for (nza = 1; nza <= nnza; nza++) {
767	j = (arow[nza] - firstrow + 1) + 1;
768	rowstr[j] = rowstr[j] + 1;
769    }
770
771    rowstr[1] = 1;
772    for (j = 2; j <= nrows+1; j++) {
773	rowstr[j] = rowstr[j] + rowstr[j-1];
774    }
775
776/*---------------------------------------------------------------------
777c     ... rowstr(j) now is the location of the first nonzero
778c           of row j of a
779c---------------------------------------------------------------------*/
780
781/*--------------------------------------------------------------------
782c     ... do a bucket sort of the triples on the row index
783c-------------------------------------------------------------------*/
784    for (nza = 1; nza <= nnza; nza++) {
785	j = arow[nza] - firstrow + 1;
786	k = rowstr[j];
787	a[k] = aelt[nza];
788	colidx[k] = acol[nza];
789	rowstr[j] = rowstr[j] + 1;
790    }
791
792/*--------------------------------------------------------------------
793c       ... rowstr(j) now points to the first element of row j+1
794c-------------------------------------------------------------------*/
795    for (j = nrows; j >= 1; j--) {
796	rowstr[j+1] = rowstr[j];
797    }
798    rowstr[1] = 1;
799
800/*--------------------------------------------------------------------
801c       ... generate the actual output rows by adding elements
802c-------------------------------------------------------------------*/
803    nza = 0;
804#pragma omp parallel for
805    for (i = 1; i <= n; i++) {
806	x[i] = 0.0;
807	mark[i] = FALSE;
808    }
809
810    jajp1 = rowstr[1];
811    for (j = 1; j <= nrows; j++) {
812	nzrow = 0;
813
814/*--------------------------------------------------------------------
815c          ...loop over the jth row of a
816c-------------------------------------------------------------------*/
817	for (k = jajp1; k < rowstr[j+1]; k++) {
818            i = colidx[k];
819            x[i] = x[i] + a[k];
820            if ( mark[i] == FALSE && x[i] != 0.0) {
821		mark[i] = TRUE;
822		nzrow = nzrow + 1;
823		nzloc[nzrow] = i;
824	    }
825	}
826
827/*--------------------------------------------------------------------
828c          ... extract the nonzeros of this row
829c-------------------------------------------------------------------*/
830	for (k = 1; k <= nzrow; k++) {
831            i = nzloc[k];
832            mark[i] = FALSE;
833            xi = x[i];
834            x[i] = 0.0;
835            if (xi != 0.0) {
836		nza = nza + 1;
837		a[nza] = xi;
838		colidx[nza] = i;
839	    }
840	}
841	jajp1 = rowstr[j+1];
842	rowstr[j+1] = nza + rowstr[1];
843    }
844}
845
846/*---------------------------------------------------------------------
847c       generate a sparse n-vector (v, iv)
848c       having nzv nonzeros
849c
850c       mark(i) is set to 1 if position i is nonzero.
851c       mark is all zero on entry and is reset to all zero before exit
852c       this corrects a performance bug found by John G. Lewis, caused by
853c       reinitialization of mark on every one of the n calls to sprnvc
854---------------------------------------------------------------------*/
855static void sprnvc(
856    int n,
857    int nz,
858    double v[],		/* v[1:*] */
859    int iv[],		/* iv[1:*] */
860    int nzloc[],	/* nzloc[1:n] */
861    int mark[] ) 	/* mark[1:n] */
862{
863    int nn1;
864    int nzrow, nzv, ii, i;
865    double vecelt, vecloc;
866
867    nzv = 0;
868    nzrow = 0;
869    nn1 = 1;
870    do {
871	nn1 = 2 * nn1;
872    } while (nn1 < n);
873
874/*--------------------------------------------------------------------
875c    nn1 is the smallest power of two not less than n
876c-------------------------------------------------------------------*/
877
878    while (nzv < nz) {
879	vecelt = randlc(&tran, amult);
880
881/*--------------------------------------------------------------------
882c   generate an integer between 1 and n in a portable manner
883c-------------------------------------------------------------------*/
884	vecloc = randlc(&tran, amult);
885	i = icnvrt(vecloc, nn1) + 1;
886	if (i > n) continue;
887
888/*--------------------------------------------------------------------
889c  was this integer generated already?
890c-------------------------------------------------------------------*/
891	if (mark[i] == 0) {
892	    mark[i] = 1;
893	    nzrow = nzrow + 1;
894	    nzloc[nzrow] = i;
895	    nzv = nzv + 1;
896	    v[nzv] = vecelt;
897	    iv[nzv] = i;
898	}
899    }
900
901    for (ii = 1; ii <= nzrow; ii++) {
902	i = nzloc[ii];
903	mark[i] = 0;
904    }
905}
906
907/*---------------------------------------------------------------------
908* scale a double precision number x in (0,1) by a power of 2 and chop it
909*---------------------------------------------------------------------*/
910static int icnvrt(double x, int ipwr2) {
911    return ((int)(ipwr2 * x));
912}
913
914/*--------------------------------------------------------------------
915c       set ith element of sparse vector (v, iv) with
916c       nzv nonzeros to val
917c-------------------------------------------------------------------*/
918static void vecset(
919    int n,
920    double v[],	/* v[1:*] */
921    int iv[],	/* iv[1:*] */
922    int *nzv,
923    int i,
924    double val)
925{
926    int k;
927    boolean set;
928
929    set = FALSE;
930    for (k = 1; k <= *nzv; k++) {
931	if (iv[k] == i) {
932            v[k] = val;
933            set  = TRUE;
934	}
935    }
936    if (set == FALSE) {
937	*nzv = *nzv + 1;
938	v[*nzv] = val;
939	iv[*nzv] = i;
940    }
941}
942