1/* -*- mode: C; tab-width: 2; indent-tabs-mode: nil; -*- */
2
3/*
4 * This code has been contributed by the DARPA HPCS program.  Contact
5 * David Koester <dkoester@mitre.org> or Bob Lucas <rflucas@isi.edu>
6 * if you have questions.
7 *
8 * GUPS (Giga UPdates per Second) is a measurement that profiles the memory
9 * architecture of a system and is a measure of performance similar to MFLOPS.
10 * The HPCS HPCchallenge RandomAccess benchmark is intended to exercise the
11 * GUPS capability of a system, much like the LINPACK benchmark is intended to
12 * exercise the MFLOPS capability of a computer.  In each case, we would
13 * expect these benchmarks to achieve close to the "peak" capability of the
14 * memory system. The extent of the similarities between RandomAccess and
15 * LINPACK are limited to both benchmarks attempting to calculate a peak system
16 * capability.
17 *
18 * GUPS is calculated by identifying the number of memory locations that can be
19 * randomly updated in one second, divided by 1 billion (1e9). The term "randomly"
20 * means that there is little relationship between one address to be updated and
21 * the next, except that they occur in the space of one half the total system
22 * memory.  An update is a read-modify-write operation on a table of 64-bit words.
23 * An address is generated, the value at that address read from memory, modified
24 * by an integer operation (add, and, or, xor) with a literal value, and that
25 * new value is written back to memory.
26 *
27 * We are interested in knowing the GUPS performance of both entire systems and
28 * system subcomponents --- e.g., the GUPS rating of a distributed memory
29 * multiprocessor the GUPS rating of an SMP node, and the GUPS rating of a
30 * single processor.  While there is typically a scaling of FLOPS with processor
31 * count, a similar phenomenon may not always occur for GUPS.
32 *
33 * For additional information on the GUPS metric, the HPCchallenge RandomAccess
34 * Benchmark,and the rules to run RandomAccess or modify it to optimize
35 * performance -- see http://icl.cs.utk.edu/hpcc/
36 *
37 */
38
39/*
40 * This file contains the computational core of the single cpu version
41 * of GUPS.  The inner loop should easily be vectorized by compilers
42 * with such support.
43 *
44 * This core is used by both the single_cpu and star_single_cpu tests.
45 */
46
47#include <stdio.h>
48#ifdef BARRELFISH
49#include <barrelfish/barrelfish.h>
50#endif
51#include "RandomAccess.h"
52
53/* Number of updates to table (suggested: 4x number of table entries) */
54#define NUPDATE (4 * TableSize)
55
56/* Utility routine to start random number generator at Nth step */
57static uint64_t HPCC_starts(int64_t n)
58{
59    int i, j;
60    uint64_t m2[64];
61    uint64_t temp, ran;
62
63    while (n < 0)
64        n += PERIOD;
65    while (n > PERIOD)
66        n -= PERIOD;
67    if (n == 0)
68        return 0x1;
69
70    temp = 0x1;
71    for (i = 0; i < 64; i++) {
72        m2[i] = temp;
73        temp = (temp << 1) ^ ((int64_t) temp < 0 ? POLY : 0);
74        temp = (temp << 1) ^ ((int64_t) temp < 0 ? POLY : 0);
75    }
76
77    for (i = 62; i >= 0; i--)
78        if ((n >> i) & 1)
79            break;
80
81    ran = 0x2;
82    while (i > 0) {
83        temp = 0;
84        for (j = 0; j < 64; j++)
85            if ((ran >> j) & 1)
86                temp ^= m2[j];
87        ran = temp;
88        i -= 1;
89        if ((n >> i) & 1)
90            ran = (ran << 1) ^ ((int64_t) ran < 0 ? POLY : 0);
91    }
92
93    return ran;
94}
95
96static void RandomAccessUpdate(uint64_t TableSize, uint64_t *Table)
97{
98    uint64_t i;
99    uint64_t ran[128]; /* Current random numbers */
100    int j;
101
102    /* Perform updates to main table.  The scalar equivalent is:
103     *
104     *     uint64_t ran;
105     *     ran = 1;
106     *     for (i=0; i<NUPDATE; i++) {
107     *       ran = (ran << 1) ^ (((int64_t) ran < 0) ? POLY : 0);
108     *       table[ran & (TableSize-1)] ^= ran;
109     *     }
110     */
111    for (j = 0; j < 128; j++)
112        ran[j] = HPCC_starts((NUPDATE / 128) * j);
113
114    for (i = 0; i < NUPDATE / 128; i++) {
115        /* #pragma ivdep */
116#ifdef _OPENMP
117#pragma omp parallel for
118#endif
119        for (j = 0; j < 128; j++) {
120            ran[j] = (ran[j] << 1) ^ ((int64_t) ran[j] < 0 ? POLY : 0);
121            Table[ran[j] & (TableSize - 1)] ^= ran[j];
122        }
123    }
124}
125
126
127
128int HPCC_RandomAccess(HPCC_Params *params, int doIO, double *GUPs, int *failure)
129{
130    uint64_t i;
131    uint64_t temp;
132    double totalMem;
133    uint64_t *Table;
134    uint64_t logTableSize, TableSize;
135
136    /* calculate local memory per node for the update table */
137    totalMem = params->HPLMaxProcMem;
138    totalMem /= sizeof(uint64_t);
139
140    /* calculate the size of update array (must be a power of 2) */
141    for (totalMem *= 0.5, logTableSize = 0, TableSize = 1; totalMem >= 1.0;
142                    totalMem *= 0.5, logTableSize++, TableSize <<= 1)
143        ; /* EMPTY */
144
145    Table = HPCC_malloc(sizeof(uint64_t) * TableSize, params->TableAlignment);
146    if (!Table) {
147        printf("could not allocate table");
148        return 1;
149    }
150
151    params->RandomAccess_N = (int64_t) TableSize;
152
153    /* Print parameters for run */
154
155    printf("# GUPS: Main table (@%p)size   = 2^%" PRIu64 " = %" PRIu64 " words\n",
156           Table, logTableSize, TableSize);
157    printf("# GUPS: Number of updates = %" PRIu64 "\n", NUPDATE);
158
159    printf("# GUPS: Starting GUPS benchmark with %" PRIu32 " runs\n", params->NumReps);
160    bench_ctl_t *bench = bench_ctl_init(BENCH_MODE_FIXEDRUNS, 1, params->NumReps);
161
162    cycles_t t_diff;
163
164    do {
165        /* Initialize main table for each run */
166        for (i = 0; i < TableSize; i++) {
167            Table[i] = i;
168        }
169
170        /* Begin timing here */
171#ifdef BARRELFISH
172        cycles_t t_start = bench_tsc();
173#else
174        cycles_t t_start = get_timems();
175#endif
176        RandomAccessUpdate(TableSize, Table);
177
178        /* End timed section */
179#ifdef BARRELFISH
180        cycles_t t_end = bench_tsc();
181        t_diff = bench_tsc_to_ms(bench_time_diff(t_start, t_end));
182#else
183        cycles_t t_end = get_timems();
184        t_diff = bench_time_diff(t_start, t_end);
185#endif
186        /* time elapsed in seconds */
187
188        printf("# GUPS: Round: %" PRIu64 "ms\n", t_diff);
189
190    } while(!bench_ctl_add_run(bench, &t_diff));
191
192    cycles_t *bench_data = bench->data;
193    cycles_t avg;
194    cycles_t stddev;
195    bench_stddev(bench_data,  bench->result_count, 0, &avg, &stddev);
196    double t_elapsed = ((double)avg) / 1000.0;
197    double t_err = ((double)stddev) / 1000000.0;
198
199    /* make sure no division by zero */
200    *GUPs = (t_elapsed > 0.0 ? 1.0 / t_elapsed : -1.0);
201    *GUPs *= 1e-9 * NUPDATE;
202    /* Print timing results */
203
204    printf("GUPS: CPU time used  = %.6f seconds\n", t_elapsed);
205    printf("GUPS: %.9f Billion(10^9) (s=%.9f) Updates per second [GUP/s] using %" PRIu64
206           " pages\n", *GUPs, t_err, params->TableAlignment);
207
208    /* Verification of results (in serial or "safe" mode; optional) */
209
210    temp = 0x1;
211    for (i = 0; i < NUPDATE; i++) {
212        temp = (temp << 1) ^ (((int64_t) temp < 0) ? POLY : 0);
213        Table[temp & (TableSize - 1)] ^= temp;
214    }
215
216    temp = 0;
217    for (i = 0; i < TableSize; i++) {
218        if (Table[i] != i) {
219            temp++;
220        }
221    }
222
223    printf("GUPS: Found %" PRIu64 " errors in %" PRIu64 " locations (%s).\n", temp,
224           TableSize, (temp <= 0.01 * TableSize) ? "passed" : "failed");
225
226    if (temp <= 0.01 * TableSize) {
227        *failure = 0;
228    } else {
229        *failure = 1;
230    }
231
232
233    bench_ctl_destroy(bench);
234    HPCC_free(Table);
235
236    return 0;
237}
238