1/* 2 * Copyright (c) 2014 ETH Zurich. 3 * All rights reserved. 4 * 5 * This file is distributed under the terms in the attached LICENSE file. 6 * If you do not find this file, copies can be found by writing to: 7 * ETH Zurich D-INFK, Universitaetsstrasse 6, CH-8092 Zurich. Attn: Systems Group. 8 */ 9 10#include <string.h> 11#include <stdlib.h> 12#include <limits.h> 13#include <barrelfish/barrelfish.h> 14#include <bench/bench.h> 15 16/* 17 * =========================================================================== 18 * XXX: README 19 * 20 * The compilation of this file may take quite long due to heavily use of 21 * expansive makros in the loop (see next) 22 * =========================================================================== 23 */ 24 25 26/* 27 * ---------------------------------------------------------------------------- 28 * Benchmark Settings 29 * ---------------------------------------------------------------------------- 30 */ 31 32/// the number of benchmark rounds to execute 33#define RUN_COUNT 1000 34 35/// number of loop iterations of 10k operations 36#define LOOP_ITERATIONS 1000 37 38/// loop unrolling factor {10, 50, 100, 500, 1000, 5000} 39#define LOOP_UNROLLING 1000 40 41/// working set multiplier (workset = sizeof(LL_CACHE) * multiplier 42#define WORKSET_SIZE_MULT 8 43 44/* 45 * ---------------------------------------------------------------------------- 46 * Machine Settings 47 * ---------------------------------------------------------------------------- 48 */ 49 50#ifdef __k1om__ 51#define CHACHE_L1_SIZE (32UL * 1024) 52#define CHACHE_L2_SIZE (512UL * 1024) 53#define CHACHE_L3_SIZE (28UL*1024*1024 + 512UL * 1024) 54#define CHACHE_LL_SIZE (CHACHE_L3_SIZE) 55#define CHACHE_LINE_SIZE 64 56#define WORKSET_SIZE (WORKSET_SIZE_MULT * CHACHE_LL_SIZE) 57#define NUMA_NODE_MAX 0 58#define NUME_MEM_PER_NODE (6UL * 1024 * 1024 * 1024) 59#define DIMENSIONS 2 60#else 61#define CHACHE_L1_SIZE (32UL * 1024) 62#define CHACHE_L2_SIZE (256UL * 1024) 63#define CHACHE_L3_SIZE (25UL*1024*1024) 64#define CHACHE_LL_SIZE (CHACHE_L3_SIZE) 65#define CHACHE_LINE_SIZE 64 66#define WORKSET_SIZE (WORKSET_SIZE_MULT * CHACHE_LL_SIZE) 67#define NUMA_NODE_MAX 1 68#define NUME_MEM_PER_NODE (128UL * 1024 * 1024 * 1024) 69#define DIMENSIONS 3 70#endif 71 72 73#define EXPECT_SUCCESS(_err, msg...) \ 74 if (err_is_fail(_err)) {USER_PANIC_ERR(_err, msg);} 75 76 77#define NEXT(_e) (_e) = (_e)->next; 78#define NEXT_5(_e) NEXT(_e) NEXT(_e) NEXT(_e) NEXT(_e) NEXT(_e) 79#define NEXT_10(_e) NEXT_5(_e) NEXT_5(_e) 80#define NEXT_50(_e) NEXT_10(_e) NEXT_10(_e) NEXT_10(_e) NEXT_10(_e) NEXT_10(_e) 81#define NEXT_100(_e) NEXT_50(_e) NEXT_50(_e) 82#define NEXT_500(_e) NEXT_100(_e) NEXT_100(_e) NEXT_100(_e) NEXT_100(_e) NEXT_100(_e) 83#define NEXT_1000(_e) NEXT_500(_e) NEXT_500(_e) 84 85#if LOOP_UNROLLING == 10000 86#define UNROLL_NEXT(_e) NEXT_100(_e) 87#elif LOOP_UNROLLING == 5000 88#define UNROLL_NEXT(_e) NEXT_500(_e) 89#elif LOOP_UNROLLING == 1000 90#define UNROLL_NEXT(_e) NEXT_100(_e) 91#elif LOOP_UNROLLING == 500 92#define UNROLL_NEXT(_e) NEXT_50(_e) 93#elif LOOP_UNROLLING == 100 94#define UNROLL_NEXT(_e) NEXT_10(_e) 95#elif LOOP_UNROLLING == 50 96#define UNROLL_NEXT(_e) NEXT_5(_e) 97#elif LOOP_UNROLLING == 10 98#define UNROLL_NEXT(_e) NEXT(_e) 99#endif 100 101 102#ifndef UNROLL_NEXT 103#error "UNROLL_NEXT not defined" 104#endif 105 106 107struct elem { 108 struct elem *next; 109 uint32_t val; 110 uint8_t pad[CHACHE_LINE_SIZE - sizeof(void *) - sizeof(uint32_t)]; 111}; 112 113struct celem { 114 struct celem *next; 115}; 116 117 118uint32_t *elem_id = NULL; 119 120static inline cycles_t calculate_time(cycles_t tsc_start, 121 cycles_t tsc_end) 122{ 123 cycles_t result; 124 if (tsc_end < tsc_start) { 125 result = (LONG_MAX - tsc_start) + tsc_end - bench_tscoverhead(); 126 } else { 127 result = (tsc_end - tsc_start - bench_tscoverhead()); 128 } 129 130 if (result < (tsc_end - tsc_start)) { 131 return result; 132 } 133 134 debug_printf("result: %lu / %lu / overhead: %lu\n", result, 135 tsc_end - tsc_start, bench_tscoverhead()); 136 137 return result; 138} 139 140static uint8_t numa_node(void) 141{ 142#ifdef __k1om__ 143 return 0; 144#else 145 return (disp_get_core_id() > 19); 146#endif 147} 148 149static void generate_shuffle(size_t num) 150{ 151 if (elem_id) { 152 return; 153 } 154 155 debug_printf("generating shuffle\n"); 156 157 elem_id = malloc(sizeof(uint32_t) * num + 1); 158 assert(elem_id); 159 160 for (uint32_t i = 0; i < num; ++i) { 161 elem_id[i] = i; 162 } 163 164 /* 165 * shuffle the array using Knuth shuffle 166 */ 167 for (uint32_t i = 0; i < num; ++i) { 168 uint32_t idx = i + (rand() % (num - i)); 169 assert(idx < num + 1); 170 uint32_t tmp = elem_id[i]; 171 elem_id[i] = elem_id[idx]; 172 elem_id[idx] = tmp; 173 } 174} 175 176static void init_memory(struct elem *mem, 177 size_t num) 178{ 179 generate_shuffle(num); 180 181 for (uint32_t i = 0; i < num; ++i) { 182 mem[i].val = i + 1; 183 } 184 185 /* do the linkage */ 186 struct elem *head = &mem[elem_id[0]]; 187 for (uint32_t i = 1; i < num; ++i) { 188 head->next = &mem[elem_id[i]]; 189 head = head->next; 190 } 191 mem[elem_id[num-1]].next = &mem[elem_id[0]]; 192} 193 194static void alloc_memory(struct elem **mem, 195 uint8_t node) 196{ 197 assert(node <= NUMA_NODE_MAX); 198 199 errval_t err; 200 201#ifndef __k1om__ 202 uint64_t min_base, max_limit; 203 ram_get_affinity(&min_base, &max_limit); 204 205 uint64_t mem_min = NUME_MEM_PER_NODE * node; 206 uint64_t mem_max = NUME_MEM_PER_NODE * (node + 1); 207 208 debug_printf("alloc mem: numa=%u, range={%016lx, %016lx}\n", node, mem_min, 209 mem_max); 210 211 ram_set_affinity(mem_min, mem_max); 212#endif 213 214 struct capref frame; 215 err = frame_alloc(&frame, WORKSET_SIZE, NULL); 216 EXPECT_SUCCESS(err, "frame alloc failed\n"); 217 218 void *addr; 219 err = vspace_map_one_frame(&addr, WORKSET_SIZE, frame, NULL, NULL); 220 EXPECT_SUCCESS(err, "mapping of frame failed"); 221 222#ifndef __k1om__ 223 ram_set_affinity(min_base, max_limit); 224#endif 225 226 if (mem) { 227 *mem = addr; 228 } 229} 230 231static cycles_t run_benchmark(void *buffer, volatile void **ret_elem) 232{ 233 volatile struct elem *e = buffer; 234 235 cycles_t tsc_start = bench_tsc(); 236 237 for (uint32_t i = 0; i < LOOP_ITERATIONS; ++i) { 238 UNROLL_NEXT(e); 239 UNROLL_NEXT(e); 240 UNROLL_NEXT(e); 241 UNROLL_NEXT(e); 242 UNROLL_NEXT(e); 243 UNROLL_NEXT(e); 244 UNROLL_NEXT(e); 245 UNROLL_NEXT(e); 246 UNROLL_NEXT(e); 247 UNROLL_NEXT(e); 248 } 249 cycles_t tsc_end = bench_tsc(); 250 251 if (ret_elem) { 252 *ret_elem = e; 253 } 254 255 return calculate_time(tsc_start, tsc_end) / (LOOP_ITERATIONS * LOOP_UNROLLING); 256} 257 258int main(int argc, 259 char *argv[]) 260{ 261 262 bench_init(); 263 264 debug_printf("memlatency benchmark started...Workset size: %lu MBytes\n", 265 WORKSET_SIZE >> 20); 266 267 if (numa_node()) { 268 USER_PANIC("must be run on numa node 0\n"); 269 } 270 271 assert(sizeof(struct elem) == CACHE_LINE_SIZE); 272 273 cycles_t tscperus = bench_tsc_per_us(); 274 275 size_t num_elements = (WORKSET_SIZE) / sizeof(struct elem); 276 assert(num_elements < (0x7FFFFFFF)); 277 278 debug_printf("allocating elements list: %lu elements\n", num_elements); 279 280 struct elem *ll_elements; 281 alloc_memory(&ll_elements, 0); 282 init_memory(ll_elements, num_elements); 283 284#ifndef __k1om__ 285 struct elem *ll_elements_numa; 286 alloc_memory(&ll_elements_numa, 1); 287 init_memory(ll_elements_numa, num_elements); 288#endif 289 290 /* 291 * do cache allocations 292 */ 293 size_t cache_elements = (CHACHE_L1_SIZE / sizeof(struct celem)) >> 2; 294 debug_printf("initialize cached elements, num:%lu\n", cache_elements); 295 296 struct celem *l1_elements = malloc((cache_elements)* sizeof(struct celem)); 297 for (uint32_t i = 0; i < cache_elements - 1; ++i) { 298 l1_elements[i].next = &l1_elements[i+1]; 299 } 300 l1_elements[cache_elements-1].next = l1_elements; 301 302 /* 303 * BENCHMARK 304 */ 305 306 debug_printf("starting benchmark %u rounds\n", RUN_COUNT); 307 308 bench_ctl_t *ctl; 309 cycles_t result[DIMENSIONS]; 310 311 uint32_t round = 0; 312 313 ctl = bench_ctl_init(BENCH_MODE_FIXEDRUNS, DIMENSIONS, RUN_COUNT); 314 do { 315 volatile void *element; 316 result[0] = run_benchmark(&ll_elements[elem_id[0]], &element); 317 /* make sure the loop is not optimized away */ 318 319 /* just a access to the variable */ 320 if (!element) { 321 debug_printf("element %p was null.\n", element); 322 } 323 324 325#ifndef __k1om__ 326 result[2] = run_benchmark(&ll_elements_numa[elem_id[0]], &element); 327 328 /* just a access to the variable */ 329 if (!element) { 330 debug_printf("element %p was null.\n", element); 331 } 332#endif 333 334 volatile struct celem *e2 = l1_elements; 335 336 /* warmup cache */ 337 for (uint32_t i = 0; i < cache_elements; ++i) { 338 NEXT_1000(e2); 339 } 340 341 e2 = l1_elements; 342 result[1] = run_benchmark(l1_elements, &element); 343 344 /* just a access to the variable */ 345 if (!element) { 346 debug_printf("element %p was null.\n", element); 347 } 348 349 debug_printf("round: %u of %u\n", ++round, RUN_COUNT); 350 } while (!bench_ctl_add_run(ctl, result)); 351 352 debug_printf("---------------------------------------------------------\n"); 353 bench_ctl_dump_analysis(ctl, 0, "memlatency", tscperus); 354#ifndef __k1om__ 355 bench_ctl_dump_analysis(ctl, 2, "memlatency numa", tscperus); 356#endif 357 bench_ctl_dump_analysis(ctl, 1, "cachelatency", tscperus); 358 debug_printf("---------------------------------------------------------\n"); 359 return 0; 360} 361