/*
 * Copyright (c) 2014 ETH Zurich.
 * All rights reserved.
 *
 * This file is distributed under the terms in the attached LICENSE file.
 * If you do not find this file, copies can be found by writing to:
 * ETH Zurich D-INFK, Universitaetsstrasse 6, CH-8092 Zurich. Attn: Systems Group.
 */

#include <string.h>
#include <stdlib.h>
#include <limits.h>
#include <barrelfish/barrelfish.h>
#include <bench/bench.h>
/*
 * ===========================================================================
 * XXX: README
 *
 * Compiling this file may take quite a long time due to the heavy use of
 * expanding macros in the benchmark loop (see UNROLL_NEXT below).
 * ===========================================================================
 */


/*
 * ----------------------------------------------------------------------------
 * Benchmark Settings
 * ----------------------------------------------------------------------------
 */

/// the number of benchmark rounds to execute
#define RUN_COUNT 1000

/// number of loop iterations, each performing LOOP_UNROLLING chase operations
#define LOOP_ITERATIONS 1000

/// loop unrolling factor {10, 50, 100, 500, 1000, 5000}
#define LOOP_UNROLLING 1000

/// working set multiplier (workset = CACHE_LL_SIZE * multiplier)
#define WORKSET_SIZE_MULT 8
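
/* Note: the working set is sized to be several times larger than the
 * last-level cache, so the randomized pointer chase through it misses in
 * all cache levels and measures main-memory latency. */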

/*
 * ----------------------------------------------------------------------------
 * Machine Settings
 * ----------------------------------------------------------------------------
 */

#ifdef __k1om__ /* Intel Xeon Phi (Knights Corner) */
#define CACHE_L1_SIZE (32UL * 1024)
#define CACHE_L2_SIZE (512UL * 1024)
#define CACHE_L3_SIZE (28UL * 1024 * 1024 + 512UL * 1024)
#define CACHE_LL_SIZE (CACHE_L3_SIZE)
#define CACHE_LINE_SIZE 64
#define WORKSET_SIZE (WORKSET_SIZE_MULT * CACHE_LL_SIZE)
#define NUMA_NODE_MAX 0
#define NUMA_MEM_PER_NODE (6UL * 1024 * 1024 * 1024)
#define DIMENSIONS 2
#else
#define CACHE_L1_SIZE (32UL * 1024)
#define CACHE_L2_SIZE (256UL * 1024)
#define CACHE_L3_SIZE (25UL * 1024 * 1024)
#define CACHE_LL_SIZE (CACHE_L3_SIZE)
#define CACHE_LINE_SIZE 64
#define WORKSET_SIZE (WORKSET_SIZE_MULT * CACHE_LL_SIZE)
#define NUMA_NODE_MAX 1
#define NUMA_MEM_PER_NODE (128UL * 1024 * 1024 * 1024)
#define DIMENSIONS 3
#endif


#define EXPECT_SUCCESS(_err, msg...) \
    do { if (err_is_fail(_err)) { USER_PANIC_ERR(_err, msg); } } while (0)


#define NEXT(_e) (_e) = (_e)->next;
#define NEXT_5(_e) NEXT(_e) NEXT(_e) NEXT(_e) NEXT(_e) NEXT(_e)
#define NEXT_10(_e) NEXT_5(_e) NEXT_5(_e)
#define NEXT_50(_e) NEXT_10(_e) NEXT_10(_e) NEXT_10(_e) NEXT_10(_e) NEXT_10(_e)
#define NEXT_100(_e) NEXT_50(_e) NEXT_50(_e)
#define NEXT_500(_e) NEXT_100(_e) NEXT_100(_e) NEXT_100(_e) NEXT_100(_e) NEXT_100(_e)
#define NEXT_1000(_e) NEXT_500(_e) NEXT_500(_e)

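/*
 * UNROLL_NEXT is invoked ten times per loop iteration in run_benchmark(),
 * so each variant below performs LOOP_UNROLLING / 10 pointer-chase steps.
 */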
#if LOOP_UNROLLING == 10000
#define UNROLL_NEXT(_e) NEXT_1000(_e)
#elif LOOP_UNROLLING == 5000
#define UNROLL_NEXT(_e) NEXT_500(_e)
#elif LOOP_UNROLLING == 1000
#define UNROLL_NEXT(_e) NEXT_100(_e)
#elif LOOP_UNROLLING == 500
#define UNROLL_NEXT(_e) NEXT_50(_e)
#elif LOOP_UNROLLING == 100
#define UNROLL_NEXT(_e) NEXT_10(_e)
#elif LOOP_UNROLLING == 50
#define UNROLL_NEXT(_e) NEXT_5(_e)
#elif LOOP_UNROLLING == 10
#define UNROLL_NEXT(_e) NEXT(_e)
#endif


#ifndef UNROLL_NEXT
#error "UNROLL_NEXT not defined"
#endif


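/// list element of the main working set; padded so that each element
/// occupies exactly one cache line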
struct elem {
    struct elem *next;
    uint32_t val;
    uint8_t pad[CACHE_LINE_SIZE - sizeof(void *) - sizeof(uint32_t)];
};

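/// compact list element for the cache-resident list; elements are laid out
/// contiguously so the whole chain fits comfortably in the L1 cache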
struct celem {
    struct celem *next;
};

/// random permutation of the element indices, shared by all lists
uint32_t *elem_id = NULL;

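/**
 * \brief computes the elapsed cycles between two timestamp readings,
 *        compensating for counter wraparound and measurement overhead
 */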
static inline cycles_t calculate_time(cycles_t tsc_start,
                                      cycles_t tsc_end)
{
    cycles_t result;
    if (tsc_end < tsc_start) {
        /* the timestamp counter wrapped around */
        result = (ULONG_MAX - tsc_start) + tsc_end - bench_tscoverhead();
    } else {
        result = (tsc_end - tsc_start) - bench_tscoverhead();
    }

    if (result < (tsc_end - tsc_start)) {
        return result;
    }

    /* subtracting the overhead underflowed: log the suspicious measurement */
    debug_printf("result: %lu / %lu / overhead: %lu\n", result,
                 tsc_end - tsc_start, bench_tscoverhead());

    return result;
}

static uint8_t numa_node(void)
{
#ifdef __k1om__
    return 0;
#else
    /* on the evaluation machine, cores 20 and above belong to NUMA node 1 */
    return (disp_get_core_id() > 19);
#endif
}

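/**
 * \brief generates a random permutation of the indices [0, num) using the
 *        Fisher-Yates (Knuth) shuffle; the result is cached in elem_id
 */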
static void generate_shuffle(size_t num)
{
    if (elem_id) {
        return;
    }

    debug_printf("generating shuffle\n");

    elem_id = malloc(sizeof(uint32_t) * num);
    assert(elem_id);

    for (uint32_t i = 0; i < num; ++i) {
        elem_id[i] = i;
    }

    /* shuffle the array */
    for (uint32_t i = 0; i < num; ++i) {
        uint32_t idx = i + (rand() % (num - i));
        assert(idx < num);
        uint32_t tmp = elem_id[i];
        elem_id[i] = elem_id[idx];
        elem_id[idx] = tmp;
    }
}

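/**
 * \brief links the elements into a circular list ordered by the random
 *        permutation so that hardware prefetchers cannot follow the chain
 */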
static void init_memory(struct elem *mem,
                        size_t num)
{
    generate_shuffle(num);

    for (uint32_t i = 0; i < num; ++i) {
        mem[i].val = i + 1;
    }

    /* do the linkage */
    struct elem *head = &mem[elem_id[0]];
    for (uint32_t i = 1; i < num; ++i) {
        head->next = &mem[elem_id[i]];
        head = head->next;
    }
    mem[elem_id[num - 1]].next = &mem[elem_id[0]];
}

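/**
 * \brief allocates and maps the working set; on the host, RAM affinity is
 *        used to place the frame in memory of the requested NUMA node
 */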
static void alloc_memory(struct elem **mem,
                         uint8_t node)
{
    assert(node <= NUMA_NODE_MAX);

    errval_t err;

#ifndef __k1om__
    /* restrict RAM allocations to the physical range of the requested node */
    uint64_t min_base, max_limit;
    ram_get_affinity(&min_base, &max_limit);

    uint64_t mem_min = NUMA_MEM_PER_NODE * node;
    uint64_t mem_max = NUMA_MEM_PER_NODE * (node + 1);

    debug_printf("alloc mem: numa=%u, range={%016lx, %016lx}\n", node, mem_min,
                 mem_max);

    ram_set_affinity(mem_min, mem_max);
#endif

    struct capref frame;
    err = frame_alloc(&frame, WORKSET_SIZE, NULL);
    EXPECT_SUCCESS(err, "frame alloc failed\n");

    void *addr;
    err = vspace_map_one_frame(&addr, WORKSET_SIZE, frame, NULL, NULL);
    EXPECT_SUCCESS(err, "mapping of frame failed");

#ifndef __k1om__
    /* restore the previous allocation affinity */
    ram_set_affinity(min_base, max_limit);
#endif

    if (mem) {
        *mem = addr;
    }
}

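/**
 * \brief walks the pointer chain for LOOP_ITERATIONS * LOOP_UNROLLING steps
 *        and returns the average number of cycles per element access
 */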
static cycles_t run_benchmark(void *buffer, volatile void **ret_elem)
{
    volatile struct elem *e = buffer;

    cycles_t tsc_start = bench_tsc();

    /* ten invocations of UNROLL_NEXT = LOOP_UNROLLING steps per iteration */
    for (uint32_t i = 0; i < LOOP_ITERATIONS; ++i) {
        UNROLL_NEXT(e);
        UNROLL_NEXT(e);
        UNROLL_NEXT(e);
        UNROLL_NEXT(e);
        UNROLL_NEXT(e);
        UNROLL_NEXT(e);
        UNROLL_NEXT(e);
        UNROLL_NEXT(e);
        UNROLL_NEXT(e);
        UNROLL_NEXT(e);
    }
    cycles_t tsc_end = bench_tsc();

    /* hand the final element back so the chase cannot be optimized away */
    if (ret_elem) {
        *ret_elem = e;
    }

    return calculate_time(tsc_start, tsc_end) / (LOOP_ITERATIONS * LOOP_UNROLLING);
}

int main(int argc,
         char *argv[])
{
    bench_init();

    debug_printf("memlatency benchmark started... Workset size: %lu MBytes\n",
                 WORKSET_SIZE >> 20);

    if (numa_node()) {
        USER_PANIC("must be run on numa node 0\n");
    }

    /* struct elem must be padded to exactly one cache line */
    assert(sizeof(struct elem) == CACHE_LINE_SIZE);

    cycles_t tscperus = bench_tsc_per_us();

    size_t num_elements = WORKSET_SIZE / sizeof(struct elem);
    assert(num_elements < 0x7FFFFFFF);

    debug_printf("allocating elements list: %lu elements\n", num_elements);

    struct elem *ll_elements;
    alloc_memory(&ll_elements, 0);
    init_memory(ll_elements, num_elements);

#ifndef __k1om__
    /* second working set, backed by memory of the remote NUMA node */
    struct elem *ll_elements_numa;
    alloc_memory(&ll_elements_numa, 1);
    init_memory(ll_elements_numa, num_elements);
#endif

    /*
     * allocate the cache-resident list: use only a quarter of the L1
     * capacity so the chain comfortably stays in the L1 cache
     */
    size_t cache_elements = (CACHE_L1_SIZE / sizeof(struct celem)) >> 2;
    debug_printf("initialize cached elements, num:%lu\n", cache_elements);

    struct celem *l1_elements = malloc(cache_elements * sizeof(struct celem));
    assert(l1_elements);
    for (uint32_t i = 0; i < cache_elements - 1; ++i) {
        l1_elements[i].next = &l1_elements[i + 1];
    }
    l1_elements[cache_elements - 1].next = l1_elements;

    /*
     * BENCHMARK
     */

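    /*
     * result dimensions: 0 = local main memory, 1 = L1 cache,
     * 2 = remote main memory (host only)
     */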
    debug_printf("starting benchmark %u rounds\n", RUN_COUNT);

    bench_ctl_t *ctl;
    cycles_t result[DIMENSIONS];

    uint32_t round = 0;

    ctl = bench_ctl_init(BENCH_MODE_FIXEDRUNS, DIMENSIONS, RUN_COUNT);
    do {
        volatile void *element;

        /* main memory latency, local node */
        result[0] = run_benchmark(&ll_elements[elem_id[0]], &element);

        /* touch the returned element so the chase is not optimized away */
        if (!element) {
            debug_printf("element %p was null.\n", (void *)element);
        }

#ifndef __k1om__
        /* main memory latency, remote node */
        result[2] = run_benchmark(&ll_elements_numa[elem_id[0]], &element);

        if (!element) {
            debug_printf("element %p was null.\n", (void *)element);
        }
#endif

        /* warm up the cache by walking the small list repeatedly */
        volatile struct celem *e2 = l1_elements;
        for (uint32_t i = 0; i < cache_elements; ++i) {
            NEXT_1000(e2);
        }

        /* L1 cache latency */
        result[1] = run_benchmark(l1_elements, &element);

        if (!element) {
            debug_printf("element %p was null.\n", (void *)element);
        }

        debug_printf("round: %u of %u\n", ++round, RUN_COUNT);
    } while (!bench_ctl_add_run(ctl, result));

    debug_printf("---------------------------------------------------------\n");
    bench_ctl_dump_analysis(ctl, 0, "memlatency", tscperus);
#ifndef __k1om__
    bench_ctl_dump_analysis(ctl, 2, "memlatency numa", tscperus);
#endif
    bench_ctl_dump_analysis(ctl, 1, "cachelatency", tscperus);
    debug_printf("---------------------------------------------------------\n");
    return 0;
}