1/* -*- mode: C; tab-width: 2; indent-tabs-mode: nil; -*- */ 2 3/* 4 * This code has been contributed by the DARPA HPCS program. Contact 5 * David Koester <dkoester@mitre.org> or Bob Lucas <rflucas@isi.edu> 6 * if you have questions. 7 * 8 * GUPS (Giga UPdates per Second) is a measurement that profiles the memory 9 * architecture of a system and is a measure of performance similar to MFLOPS. 10 * The HPCS HPCchallenge RandomAccess benchmark is intended to exercise the 11 * GUPS capability of a system, much like the LINPACK benchmark is intended to 12 * exercise the MFLOPS capability of a computer. In each case, we would 13 * expect these benchmarks to achieve close to the "peak" capability of the 14 * memory system. The extent of the similarities between RandomAccess and 15 * LINPACK are limited to both benchmarks attempting to calculate a peak system 16 * capability. 17 * 18 * GUPS is calculated by identifying the number of memory locations that can be 19 * randomly updated in one second, divided by 1 billion (1e9). The term "randomly" 20 * means that there is little relationship between one address to be updated and 21 * the next, except that they occur in the space of one half the total system 22 * memory. An update is a read-modify-write operation on a table of 64-bit words. 23 * An address is generated, the value at that address read from memory, modified 24 * by an integer operation (add, and, or, xor) with a literal value, and that 25 * new value is written back to memory. 26 * 27 * We are interested in knowing the GUPS performance of both entire systems and 28 * system subcomponents --- e.g., the GUPS rating of a distributed memory 29 * multiprocessor the GUPS rating of an SMP node, and the GUPS rating of a 30 * single processor. While there is typically a scaling of FLOPS with processor 31 * count, a similar phenomenon may not always occur for GUPS. 32 * 33 * For additional information on the GUPS metric, the HPCchallenge RandomAccess 34 * Benchmark,and the rules to run RandomAccess or modify it to optimize 35 * performance -- see http://icl.cs.utk.edu/hpcc/ 36 * 37 */ 38 39/* 40 * This file contains the computational core of the single cpu version 41 * of GUPS. The inner loop should easily be vectorized by compilers 42 * with such support. 43 * 44 * This core is used by both the single_cpu and star_single_cpu tests. 45 */ 46 47#include <stdio.h> 48#ifdef BARRELFISH 49#include <barrelfish/barrelfish.h> 50#endif 51#include "RandomAccess.h" 52 53/* Number of updates to table (suggested: 4x number of table entries) */ 54#define NUPDATE (4 * TableSize) 55 56/* Utility routine to start random number generator at Nth step */ 57static uint64_t HPCC_starts(int64_t n) 58{ 59 int i, j; 60 uint64_t m2[64]; 61 uint64_t temp, ran; 62 63 while (n < 0) 64 n += PERIOD; 65 while (n > PERIOD) 66 n -= PERIOD; 67 if (n == 0) 68 return 0x1; 69 70 temp = 0x1; 71 for (i = 0; i < 64; i++) { 72 m2[i] = temp; 73 temp = (temp << 1) ^ ((int64_t) temp < 0 ? POLY : 0); 74 temp = (temp << 1) ^ ((int64_t) temp < 0 ? POLY : 0); 75 } 76 77 for (i = 62; i >= 0; i--) 78 if ((n >> i) & 1) 79 break; 80 81 ran = 0x2; 82 while (i > 0) { 83 temp = 0; 84 for (j = 0; j < 64; j++) 85 if ((ran >> j) & 1) 86 temp ^= m2[j]; 87 ran = temp; 88 i -= 1; 89 if ((n >> i) & 1) 90 ran = (ran << 1) ^ ((int64_t) ran < 0 ? POLY : 0); 91 } 92 93 return ran; 94} 95 96static void RandomAccessUpdate(uint64_t TableSize, uint64_t *Table) 97{ 98 uint64_t i; 99 uint64_t ran[128]; /* Current random numbers */ 100 int j; 101 102 /* Perform updates to main table. The scalar equivalent is: 103 * 104 * uint64_t ran; 105 * ran = 1; 106 * for (i=0; i<NUPDATE; i++) { 107 * ran = (ran << 1) ^ (((int64_t) ran < 0) ? POLY : 0); 108 * table[ran & (TableSize-1)] ^= ran; 109 * } 110 */ 111 for (j = 0; j < 128; j++) 112 ran[j] = HPCC_starts((NUPDATE / 128) * j); 113 114 for (i = 0; i < NUPDATE / 128; i++) { 115 /* #pragma ivdep */ 116#ifdef _OPENMP 117#pragma omp parallel for 118#endif 119 for (j = 0; j < 128; j++) { 120 ran[j] = (ran[j] << 1) ^ ((int64_t) ran[j] < 0 ? POLY : 0); 121 Table[ran[j] & (TableSize - 1)] ^= ran[j]; 122 } 123 } 124} 125 126 127 128int HPCC_RandomAccess(HPCC_Params *params, int doIO, double *GUPs, int *failure) 129{ 130 uint64_t i; 131 uint64_t temp; 132 double totalMem; 133 uint64_t *Table; 134 uint64_t logTableSize, TableSize; 135 136 /* calculate local memory per node for the update table */ 137 totalMem = params->HPLMaxProcMem; 138 totalMem /= sizeof(uint64_t); 139 140 /* calculate the size of update array (must be a power of 2) */ 141 for (totalMem *= 0.5, logTableSize = 0, TableSize = 1; totalMem >= 1.0; 142 totalMem *= 0.5, logTableSize++, TableSize <<= 1) 143 ; /* EMPTY */ 144 145 Table = HPCC_malloc(sizeof(uint64_t) * TableSize, params->TableAlignment); 146 if (!Table) { 147 printf("could not allocate table"); 148 return 1; 149 } 150 151 params->RandomAccess_N = (int64_t) TableSize; 152 153 /* Print parameters for run */ 154 155 printf("# GUPS: Main table (@%p)size = 2^%" PRIu64 " = %" PRIu64 " words\n", 156 Table, logTableSize, TableSize); 157 printf("# GUPS: Number of updates = %" PRIu64 "\n", NUPDATE); 158 159 printf("# GUPS: Starting GUPS benchmark with %" PRIu32 " runs\n", params->NumReps); 160 bench_ctl_t *bench = bench_ctl_init(BENCH_MODE_FIXEDRUNS, 1, params->NumReps); 161 162 cycles_t t_diff; 163 164 do { 165 /* Initialize main table for each run */ 166 for (i = 0; i < TableSize; i++) { 167 Table[i] = i; 168 } 169 170 /* Begin timing here */ 171#ifdef BARRELFISH 172 cycles_t t_start = bench_tsc(); 173#else 174 cycles_t t_start = get_timems(); 175#endif 176 RandomAccessUpdate(TableSize, Table); 177 178 /* End timed section */ 179#ifdef BARRELFISH 180 cycles_t t_end = bench_tsc(); 181 t_diff = bench_tsc_to_ms(bench_time_diff(t_start, t_end)); 182#else 183 cycles_t t_end = get_timems(); 184 t_diff = bench_time_diff(t_start, t_end); 185#endif 186 /* time elapsed in seconds */ 187 188 printf("# GUPS: Round: %" PRIu64 "ms\n", t_diff); 189 190 } while(!bench_ctl_add_run(bench, &t_diff)); 191 192 cycles_t *bench_data = bench->data; 193 cycles_t avg; 194 cycles_t stddev; 195 bench_stddev(bench_data, bench->result_count, 0, &avg, &stddev); 196 double t_elapsed = ((double)avg) / 1000.0; 197 double t_err = ((double)stddev) / 1000000.0; 198 199 /* make sure no division by zero */ 200 *GUPs = (t_elapsed > 0.0 ? 1.0 / t_elapsed : -1.0); 201 *GUPs *= 1e-9 * NUPDATE; 202 /* Print timing results */ 203 204 printf("GUPS: CPU time used = %.6f seconds\n", t_elapsed); 205 printf("GUPS: %.9f Billion(10^9) (s=%.9f) Updates per second [GUP/s] using %" PRIu64 206 " pages\n", *GUPs, t_err, params->TableAlignment); 207 208 /* Verification of results (in serial or "safe" mode; optional) */ 209 210 temp = 0x1; 211 for (i = 0; i < NUPDATE; i++) { 212 temp = (temp << 1) ^ (((int64_t) temp < 0) ? POLY : 0); 213 Table[temp & (TableSize - 1)] ^= temp; 214 } 215 216 temp = 0; 217 for (i = 0; i < TableSize; i++) { 218 if (Table[i] != i) { 219 temp++; 220 } 221 } 222 223 printf("GUPS: Found %" PRIu64 " errors in %" PRIu64 " locations (%s).\n", temp, 224 TableSize, (temp <= 0.01 * TableSize) ? "passed" : "failed"); 225 226 if (temp <= 0.01 * TableSize) { 227 *failure = 0; 228 } else { 229 *failure = 1; 230 } 231 232 233 bench_ctl_destroy(bench); 234 HPCC_free(Table); 235 236 return 0; 237} 238