/**
 * \file
 * \brief System memory capability manager of the Xeon Phi driver
 */

/*
 * Copyright (c) 2014 ETH Zurich.
 * All rights reserved.
 *
 * This file is distributed under the terms in the attached LICENSE file.
 * If you do not find this file, copies can be found by writing to:
 * ETH Zurich D-INFK, Universitaetsstrasse 6, CH-8092 Zurich. Attn: Systems Group.
 */

#include <stdio.h>
#include <stdlib.h>
#include <barrelfish/barrelfish.h>
#include <barrelfish/capabilities.h>

#include <mm/mm.h>
#include <xeon_phi/xeon_phi.h>

#include "xeon_phi_internal.h"
#include "sysmem_caps.h"

/// the initial number of slots to allocate for the allocator
#define NUM_SLOTS L2_CNODE_SLOTS

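/// maximum number of children (bits) per node of the memory manager (see mm_init below)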
#define NUM_CHILDREN 2

/*
 * XXX: This manager relies on the 1:1 mapping of the system memory
 *      in the system memory page tables!
 */
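/*
 * Because of that 1:1 mapping, a requested system memory address can be
 * translated into the host address range simply by adding base_offset
 * (see sysmem_cap_request() below).
 */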

/// the memory manager for the system memory
static struct mm sysmem_manager;

/// offset to the base address
static lpaddr_t base_offset = 0;

/// the slot allocator
static struct range_slot_allocator sysmem_allocator;

/*
 * ----------------------------------------------------------------------------
 * System Memory Latency Benchmark
 * ----------------------------------------------------------------------------
 */
#ifdef __k1om__
#define SYSMEM_BENCH_ENABLED 0
#else
#define SYSMEM_BENCH_ENABLED 0
#endif

#if SYSMEM_BENCH_ENABLED
#include <barrelfish/nameservice_client.h>
#include <bench/bench.h>
#include <limits.h>
#include <dma/dma_bench.h>

#define EXPECT_SUCCESS(_err, msg...) \
    if (err_is_fail(_err)) {USER_PANIC_ERR(_err, msg);}

#define CACHE_L1_SIZE (32UL * 1024)
#define CACHE_LINE_SIZE 64
#ifdef __k1om__
#define CACHE_LL_SIZE (28UL*1024*1024 + 512UL * 1024)
#define DIMENSIONS 4
#else
#define CACHE_LL_SIZE (25UL*1024*1024)
#define DIMENSIONS 2
#endif
#define WORKSET_SIZE_MULT 16
#define WORKSET_SIZE (WORKSET_SIZE_MULT * CACHE_LL_SIZE)

/// the number of benchmark rounds to execute
#define RUN_COUNT 1000

/// number of loop iterations; each iteration performs LOOP_UNROLLING accesses
#define LOOP_ITERATIONS 1000

/// loop unrolling factor {10, 50, 100, 500, 1000, 5000, 10000}
#define LOOP_UNROLLING 1000

#define NEXT(_e) (_e) = (_e)->next;
#define NEXT_5(_e) NEXT(_e) NEXT(_e) NEXT(_e) NEXT(_e) NEXT(_e)
#define NEXT_10(_e) NEXT_5(_e) NEXT_5(_e)
#define NEXT_50(_e) NEXT_10(_e) NEXT_10(_e) NEXT_10(_e) NEXT_10(_e) NEXT_10(_e)
#define NEXT_100(_e) NEXT_50(_e) NEXT_50(_e)
#define NEXT_500(_e) NEXT_100(_e) NEXT_100(_e) NEXT_100(_e) NEXT_100(_e) NEXT_100(_e)
#define NEXT_1000(_e) NEXT_500(_e) NEXT_500(_e)

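/*
 * UNROLL_NEXT is invoked ten times in the inner loop of
 * sysmem_bench_run_round(), so an unrolling factor of N maps to the
 * NEXT_{N/10} macro below.
 */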
#if LOOP_UNROLLING == 10000
#define UNROLL_NEXT(_e) NEXT_1000(_e)
#elif LOOP_UNROLLING == 5000
#define UNROLL_NEXT(_e) NEXT_500(_e)
#elif LOOP_UNROLLING == 1000
#define UNROLL_NEXT(_e) NEXT_100(_e)
#elif LOOP_UNROLLING == 500
#define UNROLL_NEXT(_e) NEXT_50(_e)
#elif LOOP_UNROLLING == 100
#define UNROLL_NEXT(_e) NEXT_10(_e)
#elif LOOP_UNROLLING == 50
#define UNROLL_NEXT(_e) NEXT_5(_e)
#elif LOOP_UNROLLING == 10
#define UNROLL_NEXT(_e) NEXT(_e)
#endif

#ifndef UNROLL_NEXT
#error "UNROLL_NEXT not defined"
#endif

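/// linked-list element for the system memory pointer chase, padded to one cache line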
struct elem {
    struct elem *next;
    uint8_t pad[CACHE_LINE_SIZE - sizeof(void *)];
};

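/// compact list element used for the cache-resident pointer chase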
struct celem {
    struct celem *next;
};

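/// shuffled index array used to randomize the element order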
static uint32_t *elem_id = NULL;

/**
 * \brief calculates the time difference between two time stamps, compensating
 *        for the TSC read overhead
 *
 * \param tsc_start start time stamp
 * \param tsc_end   end time stamp
 *
 * \returns elapsed time in cycles
 */
static inline cycles_t sysmem_bench_calculate_time(cycles_t tsc_start,
                                                   cycles_t tsc_end)
{
    cycles_t result;
    if (tsc_end < tsc_start) {
        result = (LONG_MAX - tsc_start) + tsc_end - bench_tscoverhead();
    } else {
        result = (tsc_end - tsc_start - bench_tscoverhead());
    }

    return result;
}

/**
 * \brief generates a shuffled index array for randomized access
 *
 * \param num   number of elements in the array
 */
static void sysmem_bench_generate_shuffle(size_t num)
{
    if (elem_id) {
        return;
    }

    elem_id = malloc(sizeof(uint32_t) * num);
    assert(elem_id);

    for (uint32_t i = 0; i < num; ++i) {
        elem_id[i] = i;
    }

    /*
     * shuffle the array using Knuth shuffle
     */
    for (uint32_t i = 0; i < num; ++i) {
        uint32_t idx = i + (rand() % (num - i));
        assert(idx < num);
        uint32_t tmp = elem_id[i];
        elem_id[i] = elem_id[idx];
        elem_id[idx] = tmp;
    }
}

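/**
 * \brief links the buffer elements into a randomized, circular list using
 *        the shuffled index array
 */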
static void sysmem_bench_init_memory(struct elem *mem,
                                     size_t num)
{
    sysmem_bench_generate_shuffle(num);

    /* do the linkage */
    struct elem *head = &mem[elem_id[0]];
    for (uint32_t i = 1; i < num; ++i) {
        head->next = &mem[elem_id[i]];
        head = head->next;
    }
    mem[elem_id[num-1]].next = &mem[elem_id[0]];
}

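/* running offsets used to place successive benchmark allocations in the
 * system memory range */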
#ifdef __k1om__
static lvaddr_t requested_size = 0;
static lvaddr_t requested_size_other = (2UL * 1024 * 1024 * 1024);
#else
static lvaddr_t requested_size = 0;
#endif
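/**
 * \brief requests a system memory capability for a benchmark buffer and maps
 *        it into the local address space
 *
 * \param mem       returns the mapped virtual address of the buffer
 * \param other_phi place the request in the window of the other Xeon Phi
 *                  (card build only)
 * \param size      requested buffer size in bytes
 */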
static void sysmem_bench_alloc_memory(void **mem,
                                      uint8_t other_phi,
                                      size_t size)
{
    errval_t err;

    uint8_t bits = 0;
    while(size > (1UL << bits)) {
        bits++;
    }

#ifdef __k1om__
    lvaddr_t base = 0;
    if (other_phi) {
        base += 31 * XEON_PHI_SYSMEM_PAGE_SIZE;
        base += requested_size_other;
        requested_size_other += (1UL << (bits + 1));
    } else {
        base += XEON_PHI_SYSMEM_PAGE_SIZE << 1;
        base += requested_size;
        requested_size += (1UL << (bits + 1));
    }
#else
    lvaddr_t base = (2UL * 1024 * 1024 * 1024);
    base += requested_size;
    requested_size += (1UL << (bits + 1));
#endif

    debug_printf("requesting: %lx, %u bits\n", base, bits);

    struct capref frame;
    err = sysmem_cap_request(base, bits, &frame);
    EXPECT_SUCCESS(err, "sysmem cap request");

    void *addr;
    err = vspace_map_one_frame(&addr, size, frame, NULL, NULL);
    EXPECT_SUCCESS(err, "mapping of frame failed");

    if (mem) {
        *mem = addr;
    }
}

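/**
 * \brief performs one pointer-chasing round over the given buffer
 *
 * \returns average access latency per element in cycles
 */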
static cycles_t sysmem_bench_run_round(void *buffer, volatile void **ret_elem)
{
    volatile struct elem *e = buffer;

    cycles_t tsc_start = bench_tsc();

    for (uint32_t i = 0; i < LOOP_ITERATIONS; ++i) {
        UNROLL_NEXT(e);
        UNROLL_NEXT(e);
        UNROLL_NEXT(e);
        UNROLL_NEXT(e);
        UNROLL_NEXT(e);
        UNROLL_NEXT(e);
        UNROLL_NEXT(e);
        UNROLL_NEXT(e);
        UNROLL_NEXT(e);
        UNROLL_NEXT(e);
    }
    cycles_t tsc_end = bench_tsc();

    if (ret_elem) {
        *ret_elem = e;
    }

    return sysmem_bench_calculate_time(tsc_start, tsc_end) / (LOOP_ITERATIONS * LOOP_UNROLLING);
}

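/**
 * \brief executes the system memory latency benchmark and dumps the results
 */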
static void sysmem_bench_run(void)
{
#ifdef __k1om__
    errval_t err = nameservice_blocking_lookup("all_spawnds_up", NULL);
    EXPECT_SUCCESS(err, "all_spawnds_up");
#endif

    debug_printf("==========================================================\n");
    debug_printf("Running sysmem bench\n");
    debug_printf("==========================================================\n");

    bench_init();

    cycles_t tscperus = bench_tsc_per_us();

    assert(sizeof(struct elem) == CACHE_LINE_SIZE);

    size_t num_elements = (WORKSET_SIZE) / sizeof(struct elem);

    void *sysmem;
    sysmem_bench_alloc_memory(&sysmem, 0, 2*DMA_BENCH_BUFFER_SIZE);

    void *local = malloc(DMA_BENCH_BUFFER_SIZE);
    assert(local);

    struct elem *ll_elements;
    sysmem_bench_alloc_memory((void **)&ll_elements, 0, WORKSET_SIZE);
    sysmem_bench_init_memory(ll_elements, num_elements);

    struct celem *l1_elements;
    sysmem_bench_alloc_memory((void **)&l1_elements, 0, CACHE_L1_SIZE);

    size_t cache_elements = (CACHE_L1_SIZE / sizeof(struct celem)) >> 2;
    for (uint32_t i = 0; i < cache_elements - 1; ++i) {
        l1_elements[i].next = &l1_elements[i+1];
    }
    l1_elements[cache_elements-1].next = l1_elements;

#ifdef __k1om__
    void *otherphi;
    sysmem_bench_alloc_memory(&otherphi, 1, 2*DMA_BENCH_BUFFER_SIZE);

    struct elem *oll_elements;
    sysmem_bench_alloc_memory((void **)&oll_elements, 1, WORKSET_SIZE);
    sysmem_bench_init_memory(oll_elements, num_elements);

    struct celem *l1o_elements;
    sysmem_bench_alloc_memory((void **)&l1o_elements, 1, CACHE_L1_SIZE);

    for (uint32_t i = 0; i < cache_elements - 1; ++i) {
        l1o_elements[i].next = &l1o_elements[i+1];
    }
    l1o_elements[cache_elements-1].next = l1o_elements;
#endif

    debug_printf("starting benchmark %u rounds\n", RUN_COUNT);

    debug_printf("memcpy: LOCAL -> REMOTE\n");
    dma_bench_run_memcpy(sysmem, local);

    debug_printf("memcpy: REMOTE -> LOCAL\n");
    dma_bench_run_memcpy(local, sysmem);

#ifdef __k1om__
    debug_printf("memcpy: LOCAL -> OTHERPHI\n");
    dma_bench_run_memcpy(otherphi, local);

    debug_printf("memcpy: OTHERPHI -> LOCAL\n");
    dma_bench_run_memcpy(local, otherphi);
#endif

    bench_ctl_t *ctl = bench_ctl_init(BENCH_MODE_FIXEDRUNS, DIMENSIONS, RUN_COUNT);
    cycles_t result[DIMENSIONS];
    uint32_t rounds_done = 0;

    do {
        volatile void *element;
        result[0] = sysmem_bench_run_round(&ll_elements[elem_id[0]], &element);

        /* just an access to the variable */
        if (!element) {
            debug_printf("element %p was null.\n", element);
        }

#ifdef __k1om__
        debug_printf("sysmem_bench_run_round(&oll_elements[elem_id[0]], &element);\n");
        result[2] = sysmem_bench_run_round(&oll_elements[elem_id[0]], &element);

        /* just an access to the variable */
        if (!element) {
            debug_printf("element %p was null.\n", element);
        }

        debug_printf("sysmem_bench_run_round(&l1o_elements[0], &element);\n");
        result[3] = sysmem_bench_run_round(&l1o_elements[0], &element);
        /* just an access to the variable */
        if (!element) {
            debug_printf("element %p was null.\n", element);
        }
#endif
        volatile struct celem *e2 = l1_elements;
        for (uint32_t i = 0; i < cache_elements; ++i) {
            NEXT_1000(e2);
        }

        result[1] = sysmem_bench_run_round(&l1_elements[0], &element);

        /* just an access to the variable */
        if (!element) {
            debug_printf("element %p was null.\n", element);
        }

        debug_printf("round: %u of %u\n", ++rounds_done, RUN_COUNT);

    } while (!bench_ctl_add_run(ctl, result));

    debug_printf("---------------------------------------------------------\n");
    bench_ctl_dump_analysis(ctl, 0, "memlatency sysmem", tscperus);
#ifdef __k1om__
    bench_ctl_dump_analysis(ctl, 2, "memlatency other", tscperus);
    bench_ctl_dump_analysis(ctl, 3, "memlatency other cached", tscperus);
#endif
    bench_ctl_dump_analysis(ctl, 1, "cachelatency sysmem", tscperus);
    debug_printf("---------------------------------------------------------\n");
    while(1);
}

#endif

/*
 * ----------------------------------------------------------------------------
 * Interface
 * ----------------------------------------------------------------------------
 */

/**
 * \brief Initializes the capability manager of the system memory range
 *
 * \return SYS_ERR_OK on success, error value on failure
 */
errval_t sysmem_cap_manager_init(struct capref sysmem_cap)
{
    errval_t err;

    // initialize the memory allocator
    XSYSMEM_DEBUG("Initializing slot allocator of %" PRIu64 " slots\n", NUM_SLOTS);
    err = range_slot_alloc_init(&sysmem_allocator, NUM_SLOTS, NULL);
    if (err_is_fail(err)) {
        return err_push(err, LIB_ERR_SLOT_ALLOC_INIT);
    }

    struct frame_identity ret;
    err = frame_identify(sysmem_cap, &ret);
    if (err_is_fail(err)) {
        return err;
    }

    base_offset = ret.base;

    XSYSMEM_DEBUG("Initializing memory manager with base 0x%" PRIxGENPADDR
                          "..0x%" PRIxGENPADDR "\n",
                  ret.base, ret.base + ret.bytes - 1);

    /*
     * initialize the memory manager.
     *
     * Important: the type has to be DevFrame, we do not want to zero out the
     *            host memory!
     */
    assert((1UL << log2ceil(ret.bytes)) == ret.bytes);
    err = mm_init(&sysmem_manager, ObjType_DevFrame, ret.base, log2ceil(ret.bytes),
                  NUM_CHILDREN, slab_default_refill, slot_alloc_dynamic,
                  slot_refill_dynamic, &sysmem_allocator, false);
    if (err_is_fail(err)) {
        return err_push(err, MM_ERR_MM_INIT);
    }

    XSYSMEM_DEBUG("Adding cap: [0x%016lx, %i]\n", ret.base, log2ceil(ret.bytes));
    err = mm_add(&sysmem_manager, sysmem_cap, log2ceil(ret.bytes), ret.base);
    if (err_is_fail(err)) {
        return err;
    }

#if SYSMEM_BENCH_ENABLED
#ifdef __k1om__
    if (disp_xeon_phi_id() == 1) {
        sysmem_bench_run();
    }
#else
    if (disp_get_core_id() >= 20) {
        sysmem_bench_run();
    } else {
        while(1)
            ;
    }
#endif
#endif
    return SYS_ERR_OK;
}

/**
 * \brief Returns a previously requested system memory capability to the
 *        cap manager
 */
errval_t sysmem_cap_return(struct capref frame)
{
    errval_t err;
    struct frame_identity id;
    err = frame_identify(frame, &id);
    if (err_is_fail(err)) {
        return err;
    }

    assert((1UL << log2ceil(id.bytes)) == id.bytes);
    return mm_free(&sysmem_manager, frame, id.base, log2ceil(id.bytes));
}

/**
 * \brief Requests a certain system memory capability based on the base and
 *        length requirements
 *
 * \param base  the base address of the system memory (host address)
 * \param bits  the size of the requested capability in bits
 * \param frame capability representing the system memory frame
 *
 * \retval SYS_ERR_OK on success
 *
 * Note: the caller must check the size and base of the frame...
 */
errval_t sysmem_cap_request(lpaddr_t base,
                            uint8_t bits,
                            struct capref *frame)
{
    errval_t err;

    debug_printf("XXX Requesting cap for [0x%" PRIxLPADDR "..0x%" PRIxLPADDR "]\n",
                  base, base + (1UL << bits) - 1);
    // the size and base must not exceed the maximum range (512G)
    assert(bits < 40);
    assert(!(base & (BASE_PAGE_SIZE-1)));

    // align the base to the next 4k boundary
    //size += (base & (BASE_PAGE_SIZE-1));
    // base -= (base & (BASE_PAGE_SIZE-1));

    // size = (size+BASE_PAGE_SIZE-1) & ~(BASE_PAGE_SIZE - 1);

    // transform the address into the host memory range
    // XXX: we just hand in the correct base now.
#if !defined(XEON_PHI_USE_HW_MODEL)
    base += base_offset;
#endif

    err = mm_alloc_range(&sysmem_manager, bits, base, base + (1UL << bits), frame,
                         NULL);

    if (err_is_fail(err)) {
        XSYSMEM_DEBUG("Try reallocation for [0x%016lx, %i]\n", base, bits);
        err = mm_realloc_range(&sysmem_manager, bits, base, frame);
        if (err_is_fail(err)) {
            return err;
        }
    }
    return SYS_ERR_OK;
}
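
/*
 * Illustrative usage sketch (not invoked anywhere in this driver): a client
 * that wants to access a region of host memory would request the capability
 * and map it much like sysmem_bench_alloc_memory() above does. The base
 * address and the 2 MB size (bits = 21) below are hypothetical.
 *
 *   struct capref frame;
 *   errval_t err;
 *
 *   // request a capability for a 2 MB region starting at host address `base`
 *   err = sysmem_cap_request(base, 21, &frame);
 *   if (err_is_fail(err)) {
 *       return err;
 *   }
 *
 *   // map the frame into the local virtual address space
 *   void *addr;
 *   err = vspace_map_one_frame(&addr, (1UL << 21), frame, NULL, NULL);
 *   if (err_is_fail(err)) {
 *       // hand the capability back on failure
 *       sysmem_cap_return(frame);
 *       return err;
 *   }
 */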