1/** 2 * \file 3 * \brief Driver for booting the Xeon Phi Coprocessor card on a Barrelfish Host 4 */ 5 6/* 7 * Copyright (c) 2014 ETH Zurich. 8 * All rights reserved. 9 * 10 * This file is distributed under the terms in the attached LICENSE file. 11 * If you do not find this file, copies can be found by writing to: 12 * ETH Zurich D-INFK, Universitaetsstrasse 6, CH-8092 Zurich. Attn: Systems Group. 13 */ 14 15#include <stdio.h> 16#include <stdlib.h> 17#include <barrelfish/barrelfish.h> 18#include <barrelfish/capabilities.h> 19 20#include <mm/mm.h> 21#include <xeon_phi/xeon_phi.h> 22 23#include "xeon_phi_internal.h" 24#include "sysmem_caps.h" 25 26/// the initial number of slots to allocate for the allocator 27#define NUM_SLOTS L2_CNODE_SLOTS 28 29#define NUM_CHILDREN 2 30 31/* 32 * XXX: This manager relies on the 1:1 mapping of the system memory 33 * in the system memory page tables! 34 */ 35 36/// the memory manager for the system memory 37static struct mm sysmem_manager; 38 39/// offset to the base address 40static lpaddr_t base_offset = 0; 41 42/// the slot allocator 43static struct range_slot_allocator sysmem_allocator; 44 45/* 46 * ---------------------------------------------------------------------------- 47 * System Memory Latency Benchmark 48 * ---------------------------------------------------------------------------- 49 */ 50#ifdef __k1om__ 51#define SYSMEM_BENCH_ENABLED 0 52#else 53#define SYSMEM_BENCH_ENABLED 0 54#endif 55 56#if SYSMEM_BENCH_ENABLED 57#include <barrelfish/nameservice_client.h> 58#include <bench/bench.h> 59#include <limits.h> 60#include <dma/dma_bench.h> 61 62#define EXPECT_SUCCESS(_err, msg...) \ 63 if (err_is_fail(_err)) {USER_PANIC_ERR(_err, msg);} 64 65#define CHACHE_L1_SIZE (32UL * 1024) 66#define CHACHE_LINE_SIZE 64 67#ifdef __k1om__ 68#define CHACHE_LL_SIZE (28UL*1024*1024 + 512UL * 1024) 69#define DIMENSIONS 4 70#else 71#define CHACHE_LL_SIZE (25UL*1024*1024) 72#define DIMENSIONS 2 73#endif 74#define WORKSET_SIZE_MULT 16 75#define WORKSET_SIZE (WORKSET_SIZE_MULT * CHACHE_LL_SIZE) 76 77/// the number of benchmark rounds to execute 78#define RUN_COUNT 1000 79 80/// number of loop iterations of 10k operations 81#define LOOP_ITERATIONS 1000 82 83/// loop unrolling factor {10, 50, 100, 500, 1000, 5000} 84#define LOOP_UNROLLING 1000 85 86#define NEXT(_e) (_e) = (_e)->next; 87#define NEXT_5(_e) NEXT(_e) NEXT(_e) NEXT(_e) NEXT(_e) NEXT(_e) 88#define NEXT_10(_e) NEXT_5(_e) NEXT_5(_e) 89#define NEXT_50(_e) NEXT_10(_e) NEXT_10(_e) NEXT_10(_e) NEXT_10(_e) NEXT_10(_e) 90#define NEXT_100(_e) NEXT_50(_e) NEXT_50(_e) 91#define NEXT_500(_e) NEXT_100(_e) NEXT_100(_e) NEXT_100(_e) NEXT_100(_e) NEXT_100(_e) 92#define NEXT_1000(_e) NEXT_500(_e) NEXT_500(_e) 93 94#if LOOP_UNROLLING == 10000 95#define UNROLL_NEXT(_e) NEXT_100(_e) 96#elif LOOP_UNROLLING == 5000 97#define UNROLL_NEXT(_e) NEXT_500(_e) 98#elif LOOP_UNROLLING == 1000 99#define UNROLL_NEXT(_e) NEXT_100(_e) 100#elif LOOP_UNROLLING == 500 101#define UNROLL_NEXT(_e) NEXT_50(_e) 102#elif LOOP_UNROLLING == 100 103#define UNROLL_NEXT(_e) NEXT_10(_e) 104#elif LOOP_UNROLLING == 50 105#define UNROLL_NEXT(_e) NEXT_5(_e) 106#elif LOOP_UNROLLING == 10 107#define UNROLL_NEXT(_e) NEXT(_e) 108#endif 109 110 111#ifndef UNROLL_NEXT 112#error "UNROLL_NEXT not defined" 113#endif 114 115struct elem { 116 struct elem *next; 117 uint8_t pad[CHACHE_LINE_SIZE - sizeof(void *)]; 118}; 119 120struct celem { 121 struct celem *next; 122}; 123 124static uint32_t *elem_id = NULL; 125 126/** 127 * \brief calculates the time difference between two time stamps with overhead 128 * 129 * \param tsc_start start time stamp 130 * \param tsc_end end time stamp 131 * 132 * \returns elapsed time in cycles 133 */ 134static inline cycles_t sysmem_bench_calculate_time(cycles_t tsc_start, 135 cycles_t tsc_end) 136{ 137 cycles_t result; 138 if (tsc_end < tsc_start) { 139 result = (LONG_MAX - tsc_start) + tsc_end - bench_tscoverhead(); 140 } else { 141 result = (tsc_end - tsc_start - bench_tscoverhead()); 142 } 143 144 return result; 145} 146 147/** 148 * \brief generates a shuffled index array for randomized access 149 * 150 * \param num number of elements in the array 151 */ 152static void sysmem_bench_generate_shuffle(size_t num) 153{ 154 if (elem_id) { 155 return; 156 } 157 158 elem_id = malloc(sizeof(uint32_t) * num + 1); 159 assert(elem_id); 160 161 for (uint32_t i = 0; i < num; ++i) { 162 elem_id[i] = i; 163 } 164 165 /* 166 * shuffle the array using Knuth shuffle 167 */ 168 for (uint32_t i = 0; i < num; ++i) { 169 uint32_t idx = i + (rand() % (num - i)); 170 assert(idx < num + 1); 171 uint32_t tmp = elem_id[i]; 172 elem_id[i] = elem_id[idx]; 173 elem_id[idx] = tmp; 174 } 175} 176 177static void sysmem_bench_init_memory(struct elem *mem, 178 size_t num) 179{ 180 sysmem_bench_generate_shuffle(num); 181 182 /* do the linkage */ 183 struct elem *head = &mem[elem_id[0]]; 184 for (uint32_t i = 1; i < num; ++i) { 185 head->next = &mem[elem_id[i]]; 186 head = head->next; 187 } 188 mem[elem_id[num-1]].next = &mem[elem_id[0]]; 189} 190 191#ifdef __k1om__ 192static lvaddr_t requested_size = 0; 193static lvaddr_t requested_size_other = (2UL * 1024 * 1024 * 1024); 194#else 195static lvaddr_t requested_size = 0; 196#endif 197static void sysmem_bench_alloc_memory(void **mem, 198 uint8_t other_phi, 199 size_t size) 200{ 201 202 errval_t err; 203 204 uint8_t bits = 0; 205 while(size > (1UL << bits)) { 206 bits++; 207 } 208 209#ifdef __k1om__ 210 lvaddr_t base = 0; 211 if (other_phi) { 212 base += 31 * XEON_PHI_SYSMEM_PAGE_SIZE; 213 base += requested_size_other; 214 requested_size_other += (1UL << (bits + 1)); 215 } else { 216 base += XEON_PHI_SYSMEM_PAGE_SIZE << 1; 217 base += requested_size; 218 requested_size += (1UL << (bits + 1)); 219 } 220#else 221 lvaddr_t base = (2UL * 1024 * 1024 * 1024); 222 base += requested_size; 223 requested_size += (1UL << (bits + 1)); 224#endif 225 226 227 228 debug_printf("requesting: %lx, %u bits\n", base, bits); 229 230 struct capref frame; 231 err = sysmem_cap_request(base, bits, &frame); 232 EXPECT_SUCCESS(err, "sysmem cap request"); 233 234 void *addr; 235 err = vspace_map_one_frame(&addr, size, frame, NULL, NULL); 236 EXPECT_SUCCESS(err, "mapping of frame failed"); 237 238 if (mem) { 239 *mem = addr; 240 } 241} 242 243static cycles_t sysmem_bench_run_round(void *buffer, volatile void **ret_elem) 244{ 245 volatile struct elem *e = buffer; 246 247 cycles_t tsc_start = bench_tsc(); 248 249 for (uint32_t i = 0; i < LOOP_ITERATIONS; ++i) { 250 UNROLL_NEXT(e); 251 UNROLL_NEXT(e); 252 UNROLL_NEXT(e); 253 UNROLL_NEXT(e); 254 UNROLL_NEXT(e); 255 UNROLL_NEXT(e); 256 UNROLL_NEXT(e); 257 UNROLL_NEXT(e); 258 UNROLL_NEXT(e); 259 UNROLL_NEXT(e); 260 } 261 cycles_t tsc_end = bench_tsc(); 262 263 if (ret_elem) { 264 *ret_elem = e; 265 } 266 267 return sysmem_bench_calculate_time(tsc_start, tsc_end) / (LOOP_ITERATIONS * LOOP_UNROLLING); 268} 269 270static void sysmem_bench_run(void) 271{ 272 273#ifdef __k1om__ 274 errval_t err = nameservice_blocking_lookup("all_spawnds_up", NULL); 275 EXPECT_SUCCESS(err, "all_spawnds_up"); 276#endif 277 278 debug_printf("==========================================================\n"); 279 debug_printf("Running sysmem bench\n"); 280 debug_printf("==========================================================\n"); 281 282 bench_init(); 283 284 cycles_t tscperus = bench_tsc_per_us(); 285 286 assert(sizeof(struct elem) == CACHE_LINE_SIZE); 287 288 size_t num_elements = (WORKSET_SIZE) / sizeof(struct elem); 289 290 void *sysmem; 291 sysmem_bench_alloc_memory(&sysmem, 0, 2*DMA_BENCH_BUFFER_SIZE); 292 293 void *local = malloc(DMA_BENCH_BUFFER_SIZE); 294 295 296 struct elem *ll_elements; 297 sysmem_bench_alloc_memory((void **)&ll_elements, 0, WORKSET_SIZE); 298 sysmem_bench_init_memory(ll_elements, num_elements); 299 300 301 struct celem *l1_elements; 302 sysmem_bench_alloc_memory((void **)&l1_elements, 0, CHACHE_L1_SIZE); 303 304 size_t cache_elements = (CHACHE_L1_SIZE / sizeof(struct celem)) >> 2; 305 for (uint32_t i = 0; i < cache_elements - 1; ++i) { 306 l1_elements[i].next = &l1_elements[i+1]; 307 } 308 l1_elements[cache_elements-1].next = l1_elements; 309 310#ifdef __k1om__ 311 void *otherphi; 312 sysmem_bench_alloc_memory(&otherphi, 1, 2*DMA_BENCH_BUFFER_SIZE); 313 314 struct elem *oll_elements; 315 sysmem_bench_alloc_memory((void **)&oll_elements, 1, WORKSET_SIZE); 316 sysmem_bench_init_memory(oll_elements, num_elements); 317 318 struct celem *l1o_elements; 319 sysmem_bench_alloc_memory((void **)&l1o_elements, 1, CHACHE_L1_SIZE); 320 321 for (uint32_t i = 0; i < cache_elements - 1; ++i) { 322 l1o_elements[i].next = &l1o_elements[i+1]; 323 } 324 l1o_elements[cache_elements-1].next = l1o_elements; 325#endif 326 327 debug_printf("starting benchmark %u rounds\n", RUN_COUNT); 328 329 debug_printf("memcpy: LOCAL -> REMOTE\n"); 330 dma_bench_run_memcpy(sysmem, local); 331 332 debug_printf("memcpy:REMOTE -> LOCAL\n"); 333 dma_bench_run_memcpy(local, sysmem); 334 335#ifdef __k1om__ 336 debug_printf("memcpy: LOCAL -> OTHERPHI\n"); 337 dma_bench_run_memcpy(otherphi, local); 338 339 debug_printf("memcpy: OTHERPHI -> LOCAL\n"); 340 dma_bench_run_memcpy(local, otherphi); 341#endif 342 343 bench_ctl_t *ctl = bench_ctl_init(BENCH_MODE_FIXEDRUNS, DIMENSIONS, RUN_COUNT); 344 cycles_t result[DIMENSIONS]; 345 uint32_t rounds_done = 0; 346 347 do { 348 volatile void *element; 349 result[0] = sysmem_bench_run_round(&ll_elements[elem_id[0]], &element); 350 351 /* just a access to the variable */ 352 if (!element) { 353 debug_printf("element %p was null.\n", element); 354 } 355 356#ifdef __k1om__ 357 debug_printf("sysmem_bench_run_round(&oll_elements[elem_id[0]], &element);\n"); 358 result[2] = sysmem_bench_run_round(&oll_elements[elem_id[0]], &element); 359 360 /* just a access to the variable */ 361 if (!element) { 362 debug_printf("element %p was null.\n", element); 363 } 364 365 debug_printf("sysmem_bench_run_round(&l1o_elements[0], &element);\n"); 366 result[3] = sysmem_bench_run_round(&l1o_elements[0], &element); 367 /* just a access to the variable */ 368 if (!element) { 369 debug_printf("element %p was null.\n", element); 370 } 371 372#endif 373 volatile struct celem *e2 = l1_elements; 374 for (uint32_t i = 0; i < cache_elements; ++i) { 375 NEXT_1000(e2); 376 } 377 378 result[1] = sysmem_bench_run_round(&l1_elements[0], &element); 379 380 /* just a access to the variable */ 381 if (!element) { 382 debug_printf("element %p was null.\n", element); 383 } 384 385 debug_printf("round: %u of %u\n", ++rounds_done, RUN_COUNT); 386 387 } while (!bench_ctl_add_run(ctl, result)); 388 389 debug_printf("---------------------------------------------------------\n"); 390 bench_ctl_dump_analysis(ctl, 0, "memlatency sysmem", tscperus); 391#ifdef __k1om__ 392 bench_ctl_dump_analysis(ctl, 2, "memlatency other", tscperus); 393 bench_ctl_dump_analysis(ctl, 3, "memlatency other cached", tscperus); 394#endif 395 bench_ctl_dump_analysis(ctl, 1, "cachelatency sysmem", tscperus); 396 debug_printf("---------------------------------------------------------\n"); 397 while(1); 398} 399 400#endif 401 402/* 403 * ---------------------------------------------------------------------------- 404 * Interface 405 * ---------------------------------------------------------------------------- 406 */ 407 408/** 409 * \brief Initializes the capability manager of the system memory range 410 * 411 * \return SYS_ERR_OK on success, 412 */ 413errval_t sysmem_cap_manager_init(struct capref sysmem_cap) 414{ 415 errval_t err; 416 417 // initialize the memory allcator 418 XSYSMEM_DEBUG("Initializing slot allocator of %" PRIu64 " slots\n", NUM_SLOTS); 419 err = range_slot_alloc_init(&sysmem_allocator, NUM_SLOTS, NULL); 420 if (err_is_fail(err)) { 421 return err_push(err, LIB_ERR_SLOT_ALLOC_INIT); 422 } 423 424 struct frame_identity ret; 425 err = frame_identify(sysmem_cap, &ret); 426 if (err_is_fail(err)) { 427 return err; 428 } 429 430 base_offset = ret.base; 431 432 XSYSMEM_DEBUG("Initializing memory manager with base 0x%" PRIxGENPADDR 433 "..0x%" PRIxGENPADDR "\n", 434 ret.base, ret.base + ret.bytes - 1); 435 436 /* 437 * initialize the memory manager. 438 * 439 * Important: the type has to be DevFrame, we do not want to zero out the 440 * host memory! 441 */ 442 assert((1UL << log2ceil(ret.bytes)) == ret.bytes); 443 err = mm_init(&sysmem_manager, ObjType_DevFrame, ret.base, log2ceil(ret.bytes), 444 NUM_CHILDREN, slab_default_refill, slot_alloc_dynamic, 445 slot_refill_dynamic, &sysmem_allocator, false); 446 if (err_is_fail(err)) { 447 return err_push(err, MM_ERR_MM_INIT); 448 } 449 450 451 452 XSYSMEM_DEBUG("Adding cap: [0x%016lx, %i]\n", ret.base, log2ceil(ret.bytes)); 453 err = mm_add(&sysmem_manager, sysmem_cap, log2ceil(ret.bytes), ret.base); 454 if (err_is_fail(err)) { 455 return err; 456 } 457 458#if SYSMEM_BENCH_ENABLED 459#ifdef __k1om__ 460 if (disp_xeon_phi_id()==1) { 461 sysmem_bench_run(); 462 } 463#else 464 if (disp_get_core_id() >= 20) { 465 sysmem_bench_run(); 466 } else { 467 while(1) 468 ; 469 } 470#endif 471#endif 472 return SYS_ERR_OK; 473} 474 475/** 476 * \brief Returns a previously requested system memory capability to the 477 * cap manager 478 */ 479errval_t sysmem_cap_return(struct capref frame) 480{ 481 errval_t err; 482 struct frame_identity id; 483 err = frame_identify(frame, &id); 484 if (err_is_fail(err)) { 485 return err; 486 } 487 488 assert((1UL << log2ceil(id.bytes)) == id.bytes); 489 return mm_free(&sysmem_manager, frame, id.base, log2ceil(id.bytes)); 490} 491 492/** 493 * \brief Requests a certain system memory capability based on the base and 494 * length requirements 495 * 496 * \param base the base address of the system memory (host address) 497 * \param bits the size of the requested capability in bits 498 * \param frame capability representing the system memory frame 499 * 500 * \retval SYS_ERR_OK on success 501 * 502 * Note: the caller must check the size and base of the frame... 503 */ 504errval_t sysmem_cap_request(lpaddr_t base, 505 uint8_t bits, 506 struct capref *frame) 507{ 508 errval_t err; 509 510 debug_printf("XXX Requesting cap for [0x%" PRIxLPADDR "..0x%" PRIxLPADDR "]\n", 511 base, base + (1UL << bits) - 1); 512 // the size and base must not exceed the maximum range (512G) 513 assert(bits < 40); 514 assert(!(base & (BASE_PAGE_SIZE-1))); 515 516 // align the base to the next 4k boundary 517 //size += (base & (BASE_PAGE_SIZE-1)); 518 // base -= (base & (BASE_PAGE_SIZE-1)); 519 520 // size = (size+BASE_PAGE_SIZE-1) & ~(BASE_PAGE_SIZE - 1); 521 522 // transform the address into the host memory range 523 // XXX: we just hand in the correct base now.! 524#if !defined(XEON_PHI_USE_HW_MODEL) 525 base += base_offset; 526#endif 527 528 err = mm_alloc_range(&sysmem_manager, bits, base, base + (1UL << bits), frame, 529 NULL); 530 531 if (err_is_fail(err)) { 532 XSYSMEM_DEBUG("Try reallocation for [0x%016lx, %i]\n", base, bits); 533 err = mm_realloc_range(&sysmem_manager, bits, base, frame); 534 if (err_is_fail(err)) { 535 return err; 536 } 537 } 538 return SYS_ERR_OK; 539} 540