1/* 2 * Copyright (c) 2014 ETH Zurich. 3 * All rights reserved. 4 * 5 * This file is distributed under the terms in the attached LICENSE file. 6 * If you do not find this file, copies can be found by writing to: 7 * ETH Zurich D-INFK, Universitaetsstrasse 6, CH-8092 Zurich. Attn: Systems Group. 8 */ 9#include <string.h> 10#include <stdlib.h> 11#include <omp.h> 12 13#include <barrelfish/barrelfish.h> 14#include <barrelfish/nameservice_client.h> 15 16#include <xeon_phi/xeon_phi.h> 17#include <xeon_phi/xeon_phi_domain.h> 18 19#include <bench/bench.h> 20#include <xomp/xomp.h> 21 22#define BENCH_MEASURE_LOCAL 0 23 24#define BENCH_RUN_COUNT 25 25#define BENCH_RUN_SINGLE 0 26 27#define DEBUG(x...) debug_printf(x) 28 29#define EXPECT_SUCCESS(errval, msg) \ 30 if (err_is_fail(err)) {USER_PANIC_ERR(err, msg);} 31 32static uint32_t nthreads; 33 34#define SCHEDULE static 35#define CHUNK 25 36 37#define MATRIX_TYPE uint64_t 38#define MATRIX_ROWS 1000 39#define MATRIX_COLS 1000 40#define MATRIX_ELEMENTS (MATRIX_ROWS * MATRIX_COLS) 41 42static MATRIX_TYPE *mA_repl; 43static MATRIX_TYPE *mB_repl; 44static MATRIX_TYPE *mC_repl; 45 46static MATRIX_TYPE *mA_shared; 47static MATRIX_TYPE *mB_shared; 48static MATRIX_TYPE *mC_shared; 49 50struct capref frame_mAB; 51struct capref frame_mC; 52 53struct mm_frame { 54 MATRIX_TYPE sum; 55 uint64_t rows; 56 uint64_t cols; 57}; 58 59static struct mm_frame *g_mm_frame; 60struct capref frame_mm_frame; 61 62MATRIX_TYPE g_sum; 63 64#define RAM_MIN_BASE (128ULL * 1024 * 1024 * 1024) 65#define RAM_MAX_LIMIT (512ULL * 1024 * 1024 * 1024) 66 67/* 68 * benchmark timers 69 */ 70cycles_t timer_xompinit = 0; 71cycles_t timer_share = 0; 72 73static lvaddr_t vbase_map = (128ULL * 1024 * 1024 * 1024); 74 75static void matrix_alloc(uint64_t rows, 76 uint64_t cols) 77{ 78 errval_t err; 79 80#ifndef __k1om__ 81 uint64_t min_base, max_limit; 82 83 ram_get_affinity(&min_base, &max_limit); 84 ram_set_affinity(RAM_MIN_BASE, RAM_MAX_LIMIT); 85 86#endif 87 size_t matrix_size = (rows * cols) * sizeof(MATRIX_TYPE); 88 89 err = frame_alloc(&frame_mAB, 2 * matrix_size, &matrix_size); 90 EXPECT_SUCCESS(err, "frame allocate\n"); 91 92 err = vspace_map_one_frame_fixed(vbase_map, matrix_size, frame_mAB, NULL, NULL); 93 EXPECT_SUCCESS(err, "vspace_map_one_frame\n"); 94 95 mA_repl = (void *)vbase_map; 96 mB_repl = (void *)(vbase_map + ((rows * cols) * sizeof(MATRIX_TYPE))); 97 98 debug_printf("frameAB: [%lx, %lx]\n", vbase_map, vbase_map + matrix_size); 99 100 vbase_map += matrix_size; 101 102 struct capref copy; 103 err = slot_alloc(©); 104 EXPECT_SUCCESS(err, "slot_alloc\n"); 105 106 err = cap_copy(copy, frame_mAB); 107 EXPECT_SUCCESS(err, "cap_copy\n"); 108 109 err = vspace_map_one_frame_fixed(vbase_map, matrix_size, copy, NULL, NULL); 110 EXPECT_SUCCESS(err, "vspace_map_one_frame\n"); 111 112 mA_shared = (void *)vbase_map; 113 mB_shared = (void *)(vbase_map + ((rows * cols) * sizeof(MATRIX_TYPE))); 114 115 debug_printf("sharedAB: [%lx, %lx]\n", vbase_map, vbase_map + matrix_size); 116 117 vbase_map += matrix_size; 118 119 matrix_size = (rows * cols) * sizeof(MATRIX_TYPE); 120 121 err = frame_alloc(&frame_mC, matrix_size, &matrix_size); 122 EXPECT_SUCCESS(err, "frame allocate\n"); 123 124 err = vspace_map_one_frame_fixed(vbase_map, matrix_size, frame_mC, NULL, NULL); 125 EXPECT_SUCCESS(err, "vspace_map_one_frame\n"); 126 127 mC_repl = (void *)vbase_map; 128 129 debug_printf("frameC: [%lx, %lx]\n", vbase_map, vbase_map + matrix_size); 130 131 vbase_map += matrix_size; 132 133 err = slot_alloc(©); 134 EXPECT_SUCCESS(err, "slot_alloc\n"); 135 136 err = cap_copy(copy, frame_mC); 137 EXPECT_SUCCESS(err, "cap_copy\n"); 138 139 err = vspace_map_one_frame_fixed(vbase_map, matrix_size, copy, NULL, NULL); 140 EXPECT_SUCCESS(err, "vspace_map_one_frame\n"); 141 142 mC_shared = (void *)vbase_map; 143 144 debug_printf("sharedC: [%lx, %lx]\n", vbase_map, vbase_map + matrix_size); 145 146 vbase_map += matrix_size; 147 148 size_t mm_frame_size = sizeof(struct mm_frame); 149 err = frame_alloc(&frame_mm_frame, mm_frame_size, &mm_frame_size); 150 EXPECT_SUCCESS(err, "frame allocate\n"); 151 152 err = vspace_map_one_frame_fixed(vbase_map, mm_frame_size, frame_mm_frame, NULL, NULL); 153 EXPECT_SUCCESS(err, "vspace_map_one_frame\n"); 154 155 g_mm_frame = (void *)vbase_map; 156 157 vbase_map += mm_frame_size; 158 159 g_mm_frame->cols = cols; 160 g_mm_frame->rows = rows; 161 g_mm_frame->sum = 0; 162 163#ifndef __k1om__ 164 ram_set_affinity(min_base, max_limit); 165#endif 166} 167 168static void matrix_init(void) 169{ 170 debug_printf("g_mm %p, %p, %p\n", g_mm_frame, mA_repl, mB_repl); 171 for (int i = 0; i < g_mm_frame->rows; i++) { 172 uint64_t row = i * g_mm_frame->rows; 173 MATRIX_TYPE *a_row = mA_repl + (row); 174 MATRIX_TYPE *b_row = mB_shared + (row); 175 for (int j = 0; j < g_mm_frame->cols; j++) { 176 if (j == i) { 177 a_row[j] = 1; 178 b_row[j] = 1; 179 } else { 180 a_row[j] = 0; 181 b_row[j] = 0; 182 } 183 } 184 } 185} 186 187static void matrix_share(xomp_wloc_t location) 188{ 189 errval_t err; 190 DEBUG("==========================================================\n"); 191 DEBUG("frame_mAB, (lvaddr_t) mA_shared, XOMP_FRAME_TYPE_SHARED_RW\n"); 192 err = xomp_master_add_memory(frame_mAB, (lvaddr_t) mA_shared, 193 XOMP_FRAME_TYPE_SHARED_RW); 194 EXPECT_SUCCESS(err, "xomp_master_add_memory mA_shared\n"); 195 196 DEBUG("==========================================================\n"); 197 DEBUG("frame_mC, (lvaddr_t) mC_shared, XOMP_FRAME_TYPE_SHARED_RW\n"); 198 err = xomp_master_add_memory(frame_mC, (lvaddr_t) mC_shared, 199 XOMP_FRAME_TYPE_SHARED_RW); 200 EXPECT_SUCCESS(err, "xomp_master_add_memory mC_shared\n"); 201 202 DEBUG("==========================================================\n"); 203 DEBUG("frame_mm_frame, (lvaddr_t) g_mm_frame, XOMP_FRAME_TYPE_SHARED_RW\n"); 204 err = xomp_master_add_memory(frame_mm_frame, (lvaddr_t) g_mm_frame, 205 XOMP_FRAME_TYPE_SHARED_RW); 206 EXPECT_SUCCESS(err, "xomp_master_add_memory g_mm_frame\n"); 207 208 if (location == XOMP_WORKER_LOC_MIXED) { 209 DEBUG("==========================================================\n"); 210 DEBUG("frame_mAB, (lvaddr_t) mA_repl, XOMP_FRAME_TYPE_REPL_RW\n"); 211 err = xomp_master_add_memory(frame_mAB, (lvaddr_t) mA_repl, 212 XOMP_FRAME_TYPE_REPL_RW); 213 EXPECT_SUCCESS(err, "xomp_master_add_memory mA_repl\n"); 214 215 DEBUG("==========================================================\n"); 216 DEBUG("frame_mC, (lvaddr_t) mC_repl, XOMP_FRAME_TYPE_REPL_RW\n"); 217 err = xomp_master_add_memory(frame_mC, (lvaddr_t) mC_repl, 218 XOMP_FRAME_TYPE_REPL_RW); 219 EXPECT_SUCCESS(err, "xomp_master_add_memory mC_repl\n"); 220 } 221} 222 223#if BENCH_RUN_SINGLE 224static void mm_single(MATRIX_TYPE *a, 225 MATRIX_TYPE *b, 226 MATRIX_TYPE *c) 227{ 228 g_sum = 0; 229 for (int i = 0; i < g_mm_frame->rows; i++) { 230 uint64_t row = i * g_mm_frame->rows; 231 MATRIX_TYPE *c_row_i = c + row; 232 MATRIX_TYPE *a_row_i = a + row; 233 for (int k = 0; k < g_mm_frame->rows; ++k) { 234 MATRIX_TYPE *b_row_k = b + k * g_mm_frame->rows; 235 MATRIX_TYPE a_elem = a_row_i[k]; 236 for (int j = 0; j < g_mm_frame->cols; ++j) { 237 c_row_i[j] += a_elem * b_row_k[j]; 238 } 239 } 240 241 for (int j = 0; j < g_mm_frame->cols; ++j) { 242 g_sum += c_row_i[j]; 243 } 244 } 245} 246#endif 247 248 249#if BENCH_MEASURE_LOCAL 250static bench_ctl_t *ctl_local; 251#endif 252 253static void mm_init(void) 254{ 255#pragma omp parallel 256 { 257 debug_printf("initializing omp library\n"); 258 bench_init(); 259 } 260} 261 262static void mm_omp(MATRIX_TYPE *a, 263 MATRIX_TYPE *b, 264 MATRIX_TYPE *c, 265 struct mm_frame *mm_frame) 266{ 267#pragma omp parallel 268 { 269 uint64_t nrows = mm_frame->rows; 270 uint64_t ncols = mm_frame->cols; 271 uint64_t counter = 0; 272 MATRIX_TYPE sum = 0; 273#if BENCH_MEASURE_LOCAL 274 if (ctl_local == NULL) { 275 ctl_local = bench_ctl_init(BENCH_MODE_FIXEDRUNS, 1, BENCH_RUN_COUNT); 276 } 277 cycles_t start = bench_tsc(); 278#endif 279#pragma omp for nowait schedule (static, 1) 280 for (int i = 0; i < nrows; i++) { 281 counter++; 282 uint64_t row = i * nrows; 283 MATRIX_TYPE *c_row_i = c + row; 284 MATRIX_TYPE *a_row_i = a + row; 285 for (int k = 0; k < nrows; ++k) { 286 MATRIX_TYPE *b_row_k = b + k * nrows; 287 MATRIX_TYPE a_elem = a_row_i[k]; 288 for (int j = 0; j < ncols; ++j) { 289 c_row_i[j] += a_elem * b_row_k[j]; 290 } 291 } 292 293 for (int j = 0; j < ncols; ++j) { 294 sum += c_row_i[j] > 0; 295 } 296 } 297 298 // TODO: atomic add; 299 __sync_fetch_and_add(&mm_frame->sum, sum); 300#if BENCH_MEASURE_LOCAL 301 cycles_t end = bench_tsc(); 302 cycles_t elapsed = bench_time_diff(start, end); 303 if (bench_ctl_add_run(ctl_local, &elapsed)) { 304 bench_ctl_dump_analysis(ctl_local, 0, "LOCAL", bench_tsc_per_us()); 305 } 306#endif 307 } 308} 309 310static void prepare_bomp(void) 311{ 312 cycles_t tsc_start = bench_tsc(); 313 bomp_bomp_init(nthreads); 314 cycles_t tsc_end = bench_tsc(); 315 timer_xompinit = bench_time_diff(tsc_start, tsc_end); 316} 317 318static int prepare_xomp(int argc, 319 char *argv[]) 320{ 321 errval_t err; 322 323 xomp_wloc_t location = XOMP_WORKER_LOC_MIXED; 324 for (int i = 3; i < argc; ++i) { 325 if (!strncmp(argv[i], "--location=", 11)) { 326 char *p = strchr(argv[i], '='); 327 p++; 328 if (!strcmp(p, "local")) { 329 location = XOMP_WORKER_LOC_LOCAL; 330 } 331 } 332 } 333 334 if (location == XOMP_WORKER_LOC_MIXED) { 335 debug_printf("waiting for xeon phi to be ready\n"); 336 err = xeon_phi_domain_blocking_lookup("xeon_phi.0.ready", NULL); 337 EXPECT_SUCCESS(err, "nameservice_blocking_lookup"); 338 err = xeon_phi_domain_blocking_lookup("xeon_phi.1.ready", NULL); 339 EXPECT_SUCCESS(err, "nameservice_blocking_lookup"); 340#if XOMP_BENCH_ENABLED 341 xomp_master_bench_enable(BENCH_RUN_COUNT, nthreads, XOMP_MASTER_BENCH_DO_WORK); 342#endif 343 } 344 345 struct xomp_spawn local_info = { 346 .argc = argc, 347 .argv = argv, 348#ifdef __k1om__ 349 .path = "/k1om/sbin/benchmarks/bomp_mm", 350#else 351 .path = "/x86_64/sbin/benchmarks/bomp_mm", 352#endif 353 }; 354 355 struct xomp_spawn remote_info = { 356 .argc = argc, 357 .argv = argv, 358 .path = "/k1om/sbin/benchmarks/bomp_mm", 359 }; 360 361 struct xomp_args xomp_arg = { 362 .type = XOMP_ARG_TYPE_DISTINCT, 363 .core_stride = 0, // use default 364 .args = { 365 .distinct = { 366 .nthreads = nthreads, 367 .worker_loc = location, 368 .nphi = 2, 369 .local = local_info, 370 .remote = remote_info 371 } 372 } 373 }; 374 375 cycles_t tsc_start = bench_tsc(); 376 if (bomp_xomp_init(&xomp_arg)) { 377 debug_printf("bomp init failed!\n"); 378 exit(1); 379 } 380 cycles_t tsc_end = bench_tsc(); 381 timer_xompinit = bench_time_diff(tsc_start, tsc_end); 382 383 tsc_start = bench_tsc(); 384 matrix_share(location); 385 tsc_end = bench_tsc(); 386 timer_share = bench_time_diff(tsc_start, tsc_end); 387 388 return (location == XOMP_WORKER_LOC_LOCAL); 389} 390 391int main(int argc, 392 char *argv[]) 393{ 394 errval_t err; 395 xomp_wid_t wid; 396 397 cycles_t tsc_start, tsc_end; 398 399 bench_init(); 400 401 err = xomp_worker_parse_cmdline(argc, argv, &wid); 402 if (err_is_ok(err)) { 403 struct xomp_args xw_arg = { 404 .type = XOMP_ARG_TYPE_WORKER, 405 .args = { 406 .worker = { 407 .id = wid 408 } 409 } 410 }; 411 bomp_xomp_init(&xw_arg); 412 } 413 414 if (argc < 4) { 415 debug_printf("Usage: %s <size> <numthreats>\n", argv[0]); 416 exit(1); 417 } 418 419 uint64_t rows, cols; 420 rows = strtoul(argv[1], NULL, 10); 421 cols = rows; 422 423 nthreads = strtoul(argv[2], NULL, 10); 424 if (nthreads == 0) { 425 debug_printf("num threads must be >0\n"); 426 exit(1); 427 } 428 429 DEBUG("\n"); 430 DEBUG("======================================================\n"); 431 debug_printf("Matrix Size: [%lu x %lu]\n", rows, cols); 432 debug_printf("Matrix Size: %lu kB\n", rows * cols * sizeof(MATRIX_TYPE) >> 10); 433 debug_printf("Num Threads: %u\n", nthreads); 434 435 DEBUG("\n"); 436 DEBUG("======================================================\n"); 437 DEBUG("matrix_alloc\n"); 438 tsc_start = bench_tsc(); 439 matrix_alloc(rows, cols); 440 tsc_end = bench_tsc(); 441 cycles_t timer_alloc = bench_time_diff(tsc_start, tsc_end); 442 443 DEBUG("\n"); 444 DEBUG("======================================================\n"); 445 DEBUG("matrix_init\n"); 446 tsc_start = bench_tsc(); 447 matrix_init(); 448 tsc_end = bench_tsc(); 449 cycles_t timer_init = bench_time_diff(tsc_start, tsc_end); 450 451 uint8_t is_shared = 0; 452 for (int i = 3; i < argc; ++i) { 453 if (!strcmp(argv[i], "bomp")) { 454 prepare_bomp(); 455 is_shared = 1; 456 } else if (!strcmp(argv[i], "xomp")) { 457 is_shared = prepare_xomp(argc, argv); 458 } else { 459 debug_printf("ignoring argument {%s}\n", argv[i]); 460 } 461 } 462 if (0) { 463 mm_init(); 464 } 465#if BENCH_RUN_SINGLE 466 DEBUG("\n"); 467 DEBUG("======================================================\n"); 468 DEBUG("mm_single\n"); 469 470 bench_ctl_t *ctl_single; 471 cycles_t timer_single; 472 473 ctl_single = bench_ctl_init(BENCH_MODE_FIXEDRUNS, 1, BENCH_RUN_COUNT); 474 do { 475 tsc_start = bench_tsc(); 476 //mm_single(mA_repl, mB_repl, mC_repl); 477 mm_single(mA_shared, mB_shared, mC_shared); 478 tsc_end = bench_tsc(); 479 timer_single = bench_time_diff(tsc_start, tsc_end); 480 memset(mC_shared, 0, rows * cols * sizeof(MATRIX_TYPE)); 481 } while (!bench_ctl_add_run(ctl_single, &timer_single)); 482#endif 483 DEBUG("\n"); 484 DEBUG("======================================================\n"); 485 DEBUG("mm_omp_repl\n"); 486 487 bench_ctl_t *ctl_omp_repl = NULL; 488 cycles_t timer_omp_repl = 0; 489 490 if (!is_shared) { 491 ctl_omp_repl = bench_ctl_init(BENCH_MODE_FIXEDRUNS, 1, BENCH_RUN_COUNT); 492 do { 493 tsc_start = bench_tsc(); 494 mm_omp(mA_repl, mB_repl, mC_repl, g_mm_frame); 495 tsc_end = bench_tsc(); 496 timer_omp_repl = bench_time_diff(tsc_start, tsc_end); 497 debug_printf("took: %lu cycles\n", timer_omp_repl); 498 if (g_mm_frame->sum != g_mm_frame->rows) { 499 debug_printf("ERROR: sum was not identical: %lu / %lu\n", g_mm_frame->sum, 500 g_mm_frame->rows); 501 } 502 g_mm_frame->sum = 0; 503 } while (!bench_ctl_add_run(ctl_omp_repl, &timer_omp_repl)); 504 } 505 DEBUG("\n"); 506 DEBUG("======================================================\n"); 507 DEBUG("mm_omp_shared\n"); 508 509 //memset(mC_shared, 0, rows * cols * sizeof(MATRIX_TYPE)); 510 bench_ctl_t *ctl_omp_shared; 511 cycles_t timer_omp_shared = 0; 512 513 ctl_omp_shared = bench_ctl_init(BENCH_MODE_FIXEDRUNS, 1, BENCH_RUN_COUNT); 514 do { 515 tsc_start = bench_tsc(); 516 mm_omp(mA_shared, mB_shared, mC_shared, g_mm_frame); 517 tsc_end = bench_tsc(); 518 timer_omp_shared = bench_time_diff(tsc_start, tsc_end); 519 if (g_mm_frame->sum != g_mm_frame->rows) { 520 debug_printf("ERROR: sum was not identical: %lu / %lu\n", g_mm_frame->sum, 521 g_mm_frame->rows); 522 } 523 524 g_mm_frame->sum = 0; 525 debug_printf("took: %lu cycles\n", timer_omp_shared); 526 memset(mC_shared, 0, rows * cols * sizeof(MATRIX_TYPE)); 527 } while (!bench_ctl_add_run(ctl_omp_shared, &timer_omp_shared)); 528 529 debug_printf("-------------------------------------\n"); 530 531 debug_printf("BOMP init: %lu (%lu ms)\n", timer_xompinit, 532 bench_tsc_to_ms(timer_xompinit)); 533 534 debug_printf("Alloc Time: %lu (%lu ms)\n", timer_alloc, 535 bench_tsc_to_ms(timer_alloc)); 536 537 debug_printf("Init Time: %lu (%lu ms)\n", timer_init, 538 bench_tsc_to_ms(timer_init)); 539 540 debug_printf("Share Time: %lu (%lu ms)\n", timer_share, 541 bench_tsc_to_ms(timer_share)); 542#if BENCH_RUN_SINGLE 543 debug_printf("Single Time: %lu (%lu ms)\n", timer_single, 544 bench_tsc_to_ms(timer_single)); 545#endif 546 if (!is_shared) { 547 debug_printf("OMP Repl: %lu (%lu ms)\n", timer_omp_repl, 548 bench_tsc_to_ms(timer_omp_repl)); 549 } 550 debug_printf("OMP Shared: %lu (%lu ms)\n", timer_omp_shared, 551 bench_tsc_to_ms(timer_omp_shared)); 552 553 cycles_t timer_common = timer_alloc + timer_init; 554 cycles_t timer_xomp = timer_common + timer_xompinit + timer_share; 555#if BENCH_RUN_SINGLE 556 debug_printf("Total (Single): %lu (%lu ms)\n", timer_single + timer_common, 557 bench_tsc_to_ms(timer_single + timer_common)); 558#endif 559 if (!is_shared) { 560 debug_printf("Total (Repl): %lu (%lu ms)\n", timer_omp_repl + timer_xomp, 561 bench_tsc_to_ms(timer_omp_repl + timer_xomp)); 562 } 563 564 debug_printf("Total (Shared): %lu (%lu ms)\n", timer_omp_shared + timer_xomp, 565 bench_tsc_to_ms(timer_omp_shared + timer_xomp)); 566 567 debug_printf("-------------------------------------\n"); 568 569 cycles_t tscperus = bench_tsc_per_us(); 570#if BENCH_RUN_SINGLE 571 bench_ctl_dump_analysis(ctl_single, 0, "Single", tscperus); 572#endif 573 if (!is_shared) { 574 bench_ctl_dump_analysis(ctl_omp_repl, 0, "OMP Replicated", tscperus); 575 } 576 bench_ctl_dump_analysis(ctl_omp_shared, 0, "OMP Shared", tscperus); 577 578 debug_printf("-------------------------------------\n"); 579#if XOMP_BENCH_ENABLED 580 xomp_master_bench_print_results(); 581#endif 582 debug_printf("-------------------------------------\n"); 583 while (1) 584 ; 585 586} 587 588