1/** 2 * \file 3 * \brief Driver for booting the Xeon Phi Coprocessor card on a Barrelfish Host 4 */ 5 6/* 7 * Copyright (c) 2014 ETH Zurich. 8 * All rights reserved. 9 * 10 * This file is distributed under the terms in the attached LICENSE file. 11 * If you do not find this file, copies can be found by writing to: 12 * ETH Zurich D-INFK, Universitaetsstrasse 6, CH-8092 Zurich. Attn: Systems Group. 13 */ 14 15#include <stdio.h> 16#include <stdlib.h> 17#include <barrelfish/barrelfish.h> 18#include <barrelfish/capabilities.h> 19 20#include <mm/mm.h> 21#include <xeon_phi/xeon_phi.h> 22 23#include "xeon_phi_internal.h" 24#include "sysmem_caps.h" 25 26/// the initial number of slots to allocate for the allocator 27#define NUM_SLOTS L2_CNODE_SLOTS 28 29#define NUM_CHILDREN 2 30 31/* 32 * XXX: This manager relies on the 1:1 mapping of the system memory 33 * in the system memory page tables! 34 */ 35 36/// the memory manager for the system memory 37static struct mm sysmem_manager; 38 39/// offset to the base address 40static lpaddr_t base_offset = 0; 41 42/// the slot allocator 43static struct range_slot_allocator sysmem_allocator; 44 45/* 46 * ---------------------------------------------------------------------------- 47 * System Memory Latency Benchmark 48 * ---------------------------------------------------------------------------- 49 */ 50#ifdef __k1om__ 51#define SYSMEM_BENCH_ENABLED 0 52#else 53#define SYSMEM_BENCH_ENABLED 0 54#endif 55 56#if SYSMEM_BENCH_ENABLED 57#include <barrelfish/nameservice_client.h> 58#include <bench/bench.h> 59#include <limits.h> 60#include <dma/dma_bench.h> 61 62#define EXPECT_SUCCESS(_err, msg...) \ 63 if (err_is_fail(_err)) {USER_PANIC_ERR(_err, msg);} 64 65#define CHACHE_L1_SIZE (32UL * 1024) 66#define CHACHE_LINE_SIZE 64 67#ifdef __k1om__ 68#define CHACHE_LL_SIZE (28UL*1024*1024 + 512UL * 1024) 69#define DIMENSIONS 4 70#else 71#define CHACHE_LL_SIZE (25UL*1024*1024) 72#define DIMENSIONS 2 73#endif 74#define WORKSET_SIZE_MULT 16 75#define WORKSET_SIZE (WORKSET_SIZE_MULT * CHACHE_LL_SIZE) 76 77/// the number of benchmark rounds to execute 78#define RUN_COUNT 1000 79 80/// number of loop iterations of 10k operations 81#define LOOP_ITERATIONS 1000 82 83/// loop unrolling factor {10, 50, 100, 500, 1000, 5000} 84#define LOOP_UNROLLING 1000 85 86#define NEXT(_e) (_e) = (_e)->next; 87#define NEXT_5(_e) NEXT(_e) NEXT(_e) NEXT(_e) NEXT(_e) NEXT(_e) 88#define NEXT_10(_e) NEXT_5(_e) NEXT_5(_e) 89#define NEXT_50(_e) NEXT_10(_e) NEXT_10(_e) NEXT_10(_e) NEXT_10(_e) NEXT_10(_e) 90#define NEXT_100(_e) NEXT_50(_e) NEXT_50(_e) 91#define NEXT_500(_e) NEXT_100(_e) NEXT_100(_e) NEXT_100(_e) NEXT_100(_e) NEXT_100(_e) 92#define NEXT_1000(_e) NEXT_500(_e) NEXT_500(_e) 93 94#if LOOP_UNROLLING == 10000 95#define UNROLL_NEXT(_e) NEXT_100(_e) 96#elif LOOP_UNROLLING == 5000 97#define UNROLL_NEXT(_e) NEXT_500(_e) 98#elif LOOP_UNROLLING == 1000 99#define UNROLL_NEXT(_e) NEXT_100(_e) 100#elif LOOP_UNROLLING == 500 101#define UNROLL_NEXT(_e) NEXT_50(_e) 102#elif LOOP_UNROLLING == 100 103#define UNROLL_NEXT(_e) NEXT_10(_e) 104#elif LOOP_UNROLLING == 50 105#define UNROLL_NEXT(_e) NEXT_5(_e) 106#elif LOOP_UNROLLING == 10 107#define UNROLL_NEXT(_e) NEXT(_e) 108#endif 109 110 111#ifndef UNROLL_NEXT 112#error "UNROLL_NEXT not defined" 113#endif 114 115struct elem { 116 struct elem *next; 117 uint8_t pad[CHACHE_LINE_SIZE - sizeof(void *)]; 118}; 119 120struct celem { 121 struct celem *next; 122}; 123 124static uint32_t *elem_id = NULL; 125 126/** 127 * \brief calculates the time difference between two time stamps with overhead 128 * 129 * \param tsc_start start time stamp 130 * \param tsc_end end time stamp 131 * 132 * \returns elapsed time in cycles 133 */ 134static inline cycles_t sysmem_bench_calculate_time(cycles_t tsc_start, 135 cycles_t tsc_end) 136{ 137 cycles_t result; 138 if (tsc_end < tsc_start) { 139 result = (LONG_MAX - tsc_start) + tsc_end - bench_tscoverhead(); 140 } else { 141 result = (tsc_end - tsc_start - bench_tscoverhead()); 142 } 143 144 return result; 145} 146 147/** 148 * \brief generates a shuffled index array for randomized access 149 * 150 * \param num number of elements in the array 151 */ 152static void sysmem_bench_generate_shuffle(size_t num) 153{ 154 if (elem_id) { 155 return; 156 } 157 158 elem_id = malloc(sizeof(uint32_t) * num + 1); 159 assert(elem_id); 160 161 for (uint32_t i = 0; i < num; ++i) { 162 elem_id[i] = i; 163 } 164 165 /* 166 * shuffle the array using Knuth shuffle 167 */ 168 for (uint32_t i = 0; i < num; ++i) { 169 uint32_t idx = i + (rand() % (num - i)); 170 assert(idx < num + 1); 171 uint32_t tmp = elem_id[i]; 172 elem_id[i] = elem_id[idx]; 173 elem_id[idx] = tmp; 174 } 175} 176 177static void sysmem_bench_init_memory(struct elem *mem, 178 size_t num) 179{ 180 sysmem_bench_generate_shuffle(num); 181 182 /* do the linkage */ 183 struct elem *head = &mem[elem_id[0]]; 184 for (uint32_t i = 1; i < num; ++i) { 185 head->next = &mem[elem_id[i]]; 186 head = head->next; 187 } 188 mem[elem_id[num-1]].next = &mem[elem_id[0]]; 189} 190 191#ifdef __k1om__ 192static lvaddr_t requested_size = 0; 193static lvaddr_t requested_size_other = (2UL * 1024 * 1024 * 1024); 194#else 195static lvaddr_t requested_size = 0; 196#endif 197static void sysmem_bench_alloc_memory(void **mem, 198 uint8_t other_phi, 199 size_t size) 200{ 201 202 errval_t err; 203 204 uint8_t bits = 0; 205 while(size > (1UL << bits)) { 206 bits++; 207 } 208 209#ifdef __k1om__ 210 lvaddr_t base = 0; 211 if (other_phi) { 212 base += 31 * XEON_PHI_SYSMEM_PAGE_SIZE; 213 base += requested_size_other; 214 requested_size_other += (1UL << (bits + 1)); 215 } else { 216 base += XEON_PHI_SYSMEM_PAGE_SIZE << 1; 217 base += requested_size; 218 requested_size += (1UL << (bits + 1)); 219 } 220#else 221 lvaddr_t base = (2UL * 1024 * 1024 * 1024); 222 base += requested_size; 223 requested_size += (1UL << (bits + 1)); 224#endif 225 226 227 228 debug_printf("requesting: %lx, %u bits\n", base, bits); 229 230 struct capref frame; 231 err = sysmem_cap_request(base, bits, &frame); 232 EXPECT_SUCCESS(err, "sysmem cap request"); 233 234 void *addr; 235 err = vspace_map_one_frame(&addr, size, frame, NULL, NULL); 236 EXPECT_SUCCESS(err, "mapping of frame failed"); 237 238 if (mem) { 239 *mem = addr; 240 } 241} 242 243static cycles_t sysmem_bench_run_round(void *buffer, volatile void **ret_elem) 244{ 245 volatile struct elem *e = buffer; 246 247 cycles_t tsc_start = bench_tsc(); 248 249 for (uint32_t i = 0; i < LOOP_ITERATIONS; ++i) { 250 UNROLL_NEXT(e); 251 UNROLL_NEXT(e); 252 UNROLL_NEXT(e); 253 UNROLL_NEXT(e); 254 UNROLL_NEXT(e); 255 UNROLL_NEXT(e); 256 UNROLL_NEXT(e); 257 UNROLL_NEXT(e); 258 UNROLL_NEXT(e); 259 UNROLL_NEXT(e); 260 } 261 cycles_t tsc_end = bench_tsc(); 262 263 if (ret_elem) { 264 *ret_elem = e; 265 } 266 267 return sysmem_bench_calculate_time(tsc_start, tsc_end) / (LOOP_ITERATIONS * LOOP_UNROLLING); 268} 269 270static void sysmem_bench_run(void) 271{ 272 273#ifdef __k1om__ 274 errval_t err = nameservice_blocking_lookup("all_spawnds_up", NULL); 275 EXPECT_SUCCESS(err, "all_spawnds_up"); 276#endif 277 278 debug_printf("==========================================================\n"); 279 debug_printf("Running sysmem bench\n"); 280 debug_printf("==========================================================\n"); 281 282 bench_init(); 283 284 cycles_t tscperus = bench_tsc_per_us(); 285 286 assert(sizeof(struct elem) == CACHE_LINE_SIZE); 287 288 size_t num_elements = (WORKSET_SIZE) / sizeof(struct elem); 289 290 void *sysmem; 291 sysmem_bench_alloc_memory(&sysmem, 0, 2*DMA_BENCH_BUFFER_SIZE); 292 293 void *local = malloc(DMA_BENCH_BUFFER_SIZE); 294 295 296 struct elem *ll_elements; 297 sysmem_bench_alloc_memory((void **)&ll_elements, 0, WORKSET_SIZE); 298 sysmem_bench_init_memory(ll_elements, num_elements); 299 300 301 struct celem *l1_elements; 302 sysmem_bench_alloc_memory((void **)&l1_elements, 0, CHACHE_L1_SIZE); 303 304 size_t cache_elements = (CHACHE_L1_SIZE / sizeof(struct celem)) >> 2; 305 for (uint32_t i = 0; i < cache_elements - 1; ++i) { 306 l1_elements[i].next = &l1_elements[i+1]; 307 } 308 l1_elements[cache_elements-1].next = l1_elements; 309 310#ifdef __k1om__ 311 void *otherphi; 312 sysmem_bench_alloc_memory(&otherphi, 1, 2*DMA_BENCH_BUFFER_SIZE); 313 314 struct elem *oll_elements; 315 sysmem_bench_alloc_memory((void **)&oll_elements, 1, WORKSET_SIZE); 316 sysmem_bench_init_memory(oll_elements, num_elements); 317 318 struct celem *l1o_elements; 319 sysmem_bench_alloc_memory((void **)&l1o_elements, 1, CHACHE_L1_SIZE); 320 321 for (uint32_t i = 0; i < cache_elements - 1; ++i) { 322 l1o_elements[i].next = &l1o_elements[i+1]; 323 } 324 l1o_elements[cache_elements-1].next = l1o_elements; 325#endif 326 327 debug_printf("starting benchmark %u rounds\n", RUN_COUNT); 328 329 debug_printf("memcpy: LOCAL -> REMOTE\n"); 330 dma_bench_run_memcpy(sysmem, local); 331 332 debug_printf("memcpy:REMOTE -> LOCAL\n"); 333 dma_bench_run_memcpy(local, sysmem); 334 335#ifdef __k1om__ 336 debug_printf("memcpy: LOCAL -> OTHERPHI\n"); 337 dma_bench_run_memcpy(otherphi, local); 338 339 debug_printf("memcpy: OTHERPHI -> LOCAL\n"); 340 dma_bench_run_memcpy(local, otherphi); 341#endif 342 343 bench_ctl_t *ctl = bench_ctl_init(BENCH_MODE_FIXEDRUNS, DIMENSIONS, RUN_COUNT); 344 cycles_t result[DIMENSIONS]; 345 uint32_t rounds_done = 0; 346 347 do { 348 volatile void *element; 349 result[0] = sysmem_bench_run_round(&ll_elements[elem_id[0]], &element); 350 351 /* just a access to the variable */ 352 if (!element) { 353 debug_printf("element %p was null.\n", element); 354 } 355 356#ifdef __k1om__ 357 debug_printf("sysmem_bench_run_round(&oll_elements[elem_id[0]], &element);\n"); 358 result[2] = sysmem_bench_run_round(&oll_elements[elem_id[0]], &element); 359 360 /* just a access to the variable */ 361 if (!element) { 362 debug_printf("element %p was null.\n", element); 363 } 364 365 debug_printf("sysmem_bench_run_round(&l1o_elements[0], &element);\n"); 366 result[3] = sysmem_bench_run_round(&l1o_elements[0], &element); 367 /* just a access to the variable */ 368 if (!element) { 369 debug_printf("element %p was null.\n", element); 370 } 371 372#endif 373 volatile struct celem *e2 = l1_elements; 374 for (uint32_t i = 0; i < cache_elements; ++i) { 375 NEXT_1000(e2); 376 } 377 378 result[1] = sysmem_bench_run_round(&l1_elements[0], &element); 379 380 /* just a access to the variable */ 381 if (!element) { 382 debug_printf("element %p was null.\n", element); 383 } 384 385 debug_printf("round: %u of %u\n", ++rounds_done, RUN_COUNT); 386 387 } while (!bench_ctl_add_run(ctl, result)); 388 389 debug_printf("---------------------------------------------------------\n"); 390 bench_ctl_dump_analysis(ctl, 0, "memlatency sysmem", tscperus); 391#ifdef __k1om__ 392 bench_ctl_dump_analysis(ctl, 2, "memlatency other", tscperus); 393 bench_ctl_dump_analysis(ctl, 3, "memlatency other cached", tscperus); 394#endif 395 bench_ctl_dump_analysis(ctl, 1, "cachelatency sysmem", tscperus); 396 debug_printf("---------------------------------------------------------\n"); 397 while(1); 398} 399 400#endif 401 402/* 403 * ---------------------------------------------------------------------------- 404 * Interface 405 * ---------------------------------------------------------------------------- 406 */ 407 408/** 409 * \brief Initializes the capability manager of the system memory range 410 * 411 * \return SYS_ERR_OK on success, 412 */ 413errval_t sysmem_cap_manager_init(struct capref sysmem_cap) 414{ 415 errval_t err; 416 417 // initialize the memory allcator 418 XSYSMEM_DEBUG("Initializing slot allocator of %" PRIu64 " slots\n", NUM_SLOTS); 419 err = range_slot_alloc_init(&sysmem_allocator, NUM_SLOTS, NULL); 420 if (err_is_fail(err)) { 421 return err_push(err, LIB_ERR_SLOT_ALLOC_INIT); 422 } 423 424 struct frame_identity ret; 425 err = invoke_frame_identify(sysmem_cap, &ret); 426 if (err_is_fail(err)) { 427 return err; 428 } 429 430 base_offset = ret.base; 431 432 XSYSMEM_DEBUG("Initializing memory manager\n"); 433 434 /* 435 * initialize the memory manager. 436 * 437 * Important: the type has to be DevFrame, we do not want to zero out the 438 * host memory! 439 */ 440 assert((1UL << log2ceil(ret.bytes)) == ret.bytes); 441 err = mm_init(&sysmem_manager, ObjType_DevFrame, ret.base, log2ceil(ret.bytes), 442 NUM_CHILDREN, slab_default_refill, slot_alloc_dynamic, 443 slot_refill_dynamic, &sysmem_allocator, false); 444 if (err_is_fail(err)) { 445 return err_push(err, MM_ERR_MM_INIT); 446 } 447 448 XSYSMEM_DEBUG("Adding cap: [0x%016lx, %i]\n", ret.base, log2ceil(ret.bytes)); 449 err = mm_add(&sysmem_manager, sysmem_cap, log2ceil(ret.bytes), ret.base); 450 if (err_is_fail(err)) { 451 return err; 452 } 453 454#if SYSMEM_BENCH_ENABLED 455#ifdef __k1om__ 456 if (disp_xeon_phi_id()==1) { 457 sysmem_bench_run(); 458 } 459#else 460 if (disp_get_core_id() >= 20) { 461 sysmem_bench_run(); 462 } else { 463 while(1) 464 ; 465 } 466#endif 467#endif 468 return SYS_ERR_OK; 469} 470 471/** 472 * \brief Returns a previously requested system memory capability to the 473 * cap manager 474 */ 475errval_t sysmem_cap_return(struct capref frame) 476{ 477 errval_t err; 478 struct frame_identity id; 479 err = invoke_frame_identify(frame, &id); 480 if (err_is_fail(err)) { 481 return err; 482 } 483 484 assert((1UL << log2ceil(id.bytes)) == id.bytes); 485 return mm_free(&sysmem_manager, frame, id.base, log2ceil(id.bytes)); 486} 487 488/** 489 * \brief Requests a certain system memory capability based on the base and 490 * length requirements 491 * 492 * \param base the base address of the system memory (host address) 493 * \param bits the size of the requested capability in bits 494 * \param frame capability representing the system memory frame 495 * 496 * \retval SYS_ERR_OK on success 497 * 498 * Note: the caller must check the size and base of the frame... 499 */ 500errval_t sysmem_cap_request(lpaddr_t base, 501 uint8_t bits, 502 struct capref *frame) 503{ 504 errval_t err; 505 506 XSYSMEM_DEBUG("Requesting cap for [0x%016lx, %i]\n", base, bits); 507 // the size and base must not exceed the maximum range (512G) 508 assert(bits < 40); 509 assert(!(base & (BASE_PAGE_SIZE-1))); 510 511 // align the base to the next 4k boundary 512 //size += (base & (BASE_PAGE_SIZE-1)); 513 // base -= (base & (BASE_PAGE_SIZE-1)); 514 515 // size = (size+BASE_PAGE_SIZE-1) & ~(BASE_PAGE_SIZE - 1); 516 517 // transform the address into the host memory range 518 base += base_offset; 519 520 err = mm_alloc_range(&sysmem_manager, bits, base, base + (1UL << bits), frame, 521 NULL); 522 523 if (err_is_fail(err)) { 524 XSYSMEM_DEBUG("Try reallocation for [0x%016lx, %i]\n", base, bits); 525 err = mm_realloc_range(&sysmem_manager, bits, base, frame); 526 if (err_is_fail(err)) { 527 return err; 528 } 529 } 530 return SYS_ERR_OK; 531} 532