1/*
2 * Copyright (c) 2014 ETH Zurich.
3 * All rights reserved.
4 *
5 * This file is distributed under the terms in the attached LICENSE file.
6 * If you do not find this file, copies can be found by writing to:
7 * ETH Zurich D-INFK, Universitaetsstrasse 6, CH-8092 Zurich. Attn: Systems Group.
8 */
9#include <string.h>
10#include <stdlib.h>
11#include <omp.h>
12
13#include <barrelfish/barrelfish.h>
14#include <barrelfish/nameservice_client.h>
15
16#include <xeon_phi/xeon_phi.h>
17#include <xeon_phi/xeon_phi_domain.h>
18
19#include <bench/bench.h>
20#include <xomp/xomp.h>
21
/*
 * Compile-time configuration of the benchmark.
 */

/* When non-zero, the file-scope benchmark control `ctl_local` is declared. */
#define BENCH_MEASURE_LOCAL 0

/*
 * When non-zero, measure_mapping() is compiled in and main() calls it
 * instead of timing xomp_master_add_memory().
 */
#define BENCH_MEASURE_MAP_ONLY 0

/*
 * Run count handed to every bench_ctl_init() call (and to
 * xomp_master_bench_enable() when XOMP_BENCH_ENABLED is set).
 */
#define BENCH_RUN_COUNT 250

/* NOTE(review): not referenced in the visible part of this file. */
#define BENCH_RUN_SINGLE 0

/* Debug output; forwards all arguments unchanged to debug_printf(). */
#define DEBUG(...) debug_printf(__VA_ARGS__)

/*
 * Panics with `msg` if `errval` is a failure value.
 *
 * The argument is evaluated exactly once, and the do/while(0) wrapper makes
 * the macro behave as a single statement (safe in unbraced if/else).
 */
#define EXPECT_SUCCESS(errval, msg)                         \
    do {                                                    \
        errval_t expect_success_err_ = (errval);            \
        if (err_is_fail(expect_success_err_)) {             \
            USER_PANIC_ERR(expect_success_err_, msg);       \
        }                                                   \
    } while (0)
32
/* Thread count parsed from argv[1] in main(); passed on to the BOMP/XOMP initialization. */
static uint32_t nthreads;

/*
 * bench_time_diff() of the time stamps taken around bomp_bomp_init() /
 * bomp_xomp_init(); written by prepare_bomp() and prepare_xomp().
 */
cycles_t timer_xompinit = 0;

#if BENCH_MEASURE_LOCAL
/* Benchmark control that only exists when BENCH_MEASURE_LOCAL is non-zero. */
static bench_ctl_t *ctl_local;
#endif
40
41#if BENCH_MEASURE_MAP_ONLY
42static void measure_mapping(struct capref frame)
43{
44    errval_t err;
45
46    debug_printf("\n");
47    debug_printf("==========================================\n");
48    debug_printf("Mapping of frame\n");
49
50    struct frame_identity id;
51    err = invoke_frame_identify(frame, &id);
52    EXPECT_SUCCESS(err, "invoke_frame_identify");
53
54    size_t frame_size = (1UL << id.bits);
55
56    bench_ctl_t *b_ctl = NULL;
57    cycles_t tsc_start, tsc_end, elapsed;
58
59    b_ctl = bench_ctl_init(BENCH_MODE_FIXEDRUNS, 1, BENCH_RUN_COUNT);
60
61    struct capref copy;
62    err = slot_alloc(&copy);
63    EXPECT_SUCCESS(err, "slot alloc");
64
65    err = cap_copy(copy, frame);
66    EXPECT_SUCCESS(err, "cap copy");
67
68    do {
69
70        void *addr;
71        tsc_start = bench_tsc();
72        err = vspace_map_one_frame(&addr, frame_size, copy, NULL, NULL);
73        tsc_end = bench_tsc();
74        EXPECT_SUCCESS(err, "vspace map one frame");
75
76        err = vspace_unmap(addr);
77        EXPECT_SUCCESS(err, "vspace unmap");
78        elapsed = bench_time_diff(tsc_start, tsc_end);
79
80    }while (!bench_ctl_add_run(b_ctl, &elapsed));
81
82    debug_printf("\n");
83    debug_printf("%% %lu bytes\n", frame_size);
84    bench_ctl_dump_analysis(b_ctl, 0, "map local", bench_tsc_per_us());
85    debug_printf("==========================================\n");
86}
87#endif
88
89static void prepare_bomp(void)
90{
91    cycles_t tsc_start = bench_tsc();
92    bomp_bomp_init(nthreads);
93    cycles_t tsc_end = bench_tsc();
94    timer_xompinit = bench_time_diff(tsc_start, tsc_end);
95}
96
97static int prepare_xomp(int argc,
98                        char *argv[])
99{
100    errval_t err;
101
102    xomp_wloc_t location = XOMP_WORKER_LOC_MIXED;
103    for (int i = 3; i < argc; ++i) {
104        if (!strncmp(argv[i], "--location=", 11)) {
105            char *p = strchr(argv[i], '=');
106            p++;
107            if (!strcmp(p, "local")) {
108                location = XOMP_WORKER_LOC_LOCAL;
109            }
110        }
111    }
112
113    if (location == XOMP_WORKER_LOC_MIXED) {
114        debug_printf("waiting for xeon phi to be ready\n");
115        err = xeon_phi_domain_blocking_lookup("xeon_phi.0.ready", NULL);
116        EXPECT_SUCCESS(err, "nameservice_blocking_lookup");
117        err = xeon_phi_domain_blocking_lookup("xeon_phi.1.ready", NULL);
118        EXPECT_SUCCESS(err, "nameservice_blocking_lookup");
119
120#if XOMP_BENCH_ENABLED
121        xomp_master_bench_enable(BENCH_RUN_COUNT, nthreads,
122                        		 XOMP_MASTER_BENCH_MEM_ADD);
123#endif
124
125    }
126
127    struct xomp_spawn local_info = {
128        .argc = argc,
129        .argv = argv,
130#ifdef __k1om__
131        .path = "/k1om/sbin/benchmarks/xomp_share",
132#else
133        .path = "/x86_64/sbin/benchmarks/xomp_share",
134#endif
135	};
136
137    struct xomp_spawn remote_info = {
138        .argc = argc,
139        .argv = argv,
140        .path = "/k1om/sbin/benchmarks/xomp_share",
141    };
142
143    struct xomp_args xomp_arg = {
144        .type = XOMP_ARG_TYPE_DISTINCT,
145        .core_stride = 1,  // use default
146        .args = {
147            .distinct = {
148                .nthreads = nthreads,
149                .worker_loc = location,
150                .nphi = 2,
151                .local = local_info,
152                .remote = remote_info
153            }
154        }
155    };
156
157    cycles_t tsc_start = bench_tsc();
158    if (bomp_xomp_init(&xomp_arg)) {
159        debug_printf("bomp init failed!\n");
160        exit(1);
161    }
162    cycles_t tsc_end = bench_tsc();
163    timer_xompinit = bench_time_diff(tsc_start, tsc_end);
164
165    return (location == XOMP_WORKER_LOC_LOCAL);
166}
167
168int main(int argc,
169         char *argv[])
170{
171    errval_t err, repl_err = SYS_ERR_OK;
172    xomp_wid_t wid;
173
174    cycles_t t_share, t_repl;
175
176    bench_init();
177
178    err = xomp_worker_parse_cmdline(argc, argv, &wid);
179    if (err_is_ok(err)) {
180        struct xomp_args xw_arg = {
181            .type = XOMP_ARG_TYPE_WORKER,
182            .args = {
183                .worker = {
184                    .id = wid
185                }
186            }
187        };
188        bomp_xomp_init(&xw_arg);
189    }
190
191    if (argc < 3) {
192        debug_printf("Usage: %s  <numthreats>\n", argv[0]);
193        exit(1);
194    }
195
196    nthreads = strtoul(argv[1], NULL, 10);
197    if (nthreads == 0) {
198        debug_printf("num threads must be >0\n");
199        exit(1);
200    }
201
202    DEBUG("\n");
203    DEBUG("======================================================\n");
204    debug_printf("Num Threads: %u\n", nthreads);
205
206    uint8_t is_shared = 0;
207    for (int i = 2; i < argc; ++i) {
208        if (!strcmp(argv[i], "bomp")) {
209            prepare_bomp();
210            is_shared = 1;
211        } else if (!strcmp(argv[i], "xomp")) {
212            is_shared = prepare_xomp(argc, argv);
213        } else {
214            debug_printf("ignoring argument {%s}\n", argv[i]);
215        }
216    }
217
218    lvaddr_t vbase = (10UL * 1024 * 1024 * 1024);
219
220    DEBUG("\n");
221    DEBUG("======================================================\n");
222    DEBUG("sharing of 4k\n");
223
224#define FRAME_SIZE_0 4096UL
225#define FRAME_SIZE_1 (1024UL * 1024)
226#define FRAME_SIZE_2 (32UL * 1024 * 1024)
227
228    size_t frame_size;
229    struct capref frame;
230
231
232#if BENCH_MEASURE_MAP_ONLY
233    measure_mapping(frame);
234#else
235    cycles_t tsc_start, tsc_end;
236
237    cycles_t tscperus = bench_tsc_per_us();
238#if 1
239    bench_ctl_t *ctl_share_4k = NULL, *ctl_repl_4k = NULL;
240
241    ctl_share_4k = bench_ctl_init(BENCH_MODE_FIXEDRUNS,1, BENCH_RUN_COUNT);
242    ctl_repl_4k = bench_ctl_init(BENCH_MODE_FIXEDRUNS, 1, BENCH_RUN_COUNT);
243    do {
244        err = frame_alloc(&frame, FRAME_SIZE_0, &frame_size);
245        EXPECT_SUCCESS(err, "frame alloc");
246        assert(frame_size == FRAME_SIZE_0);
247
248        tsc_start = bench_tsc();
249        err = xomp_master_add_memory(frame, vbase, XOMP_FRAME_TYPE_SHARED_RW);
250        tsc_end = bench_tsc();
251        EXPECT_SUCCESS(err, "xomp_master_add_memory mA_shared\n");
252        t_share = bench_time_diff(tsc_start, tsc_end);
253
254        vbase += FRAME_SIZE_0;
255
256        if (err_is_ok(err)) {
257            tsc_start = bench_tsc();
258            repl_err = xomp_master_add_memory(frame, vbase, XOMP_FRAME_TYPE_REPL_RW);
259            tsc_end = bench_tsc();
260            if (err_is_ok(err)) {
261                t_repl = bench_time_diff(tsc_start, tsc_end);
262                vbase += FRAME_SIZE_0;
263                bench_ctl_add_run(ctl_repl_4k, &t_repl);
264            }
265        }
266    } while (!bench_ctl_add_run(ctl_share_4k, &t_share));
267
268
269    bench_ctl_dump_analysis(ctl_share_4k, 0, "4k shared", tscperus);
270    bench_ctl_dump_analysis(ctl_repl_4k, 0, "4k repl", tscperus);
271#endif
272#endif
273    DEBUG("\n");
274    DEBUG("======================================================\n");
275    DEBUG("sharing of 1M\n");
276
277    err = frame_alloc(&frame, FRAME_SIZE_1, &frame_size);
278    EXPECT_SUCCESS(err, "frame alloc");
279    assert(frame_size == FRAME_SIZE_1);
280
281    vbase = (vbase + FRAME_SIZE_1) & ~(FRAME_SIZE_1 - 1);
282
283#if BENCH_MEASURE_MAP_ONLY
284    measure_mapping(frame);
285#else
286#if 1
287    bench_ctl_t *ctl_share_1M, *ctl_repl_1M;
288    ctl_share_1M = bench_ctl_init(BENCH_MODE_FIXEDRUNS, 1, BENCH_RUN_COUNT);
289    ctl_repl_1M = bench_ctl_init(BENCH_MODE_FIXEDRUNS, 1, BENCH_RUN_COUNT);
290    do {
291        err = frame_alloc(&frame, FRAME_SIZE_1, &frame_size);
292        EXPECT_SUCCESS(err, "frame alloc");
293        assert(frame_size == FRAME_SIZE_1);
294
295        tsc_start = bench_tsc();
296        err = xomp_master_add_memory(frame, vbase, XOMP_FRAME_TYPE_SHARED_RW);
297        tsc_end = bench_tsc();
298        EXPECT_SUCCESS(err, "xomp_master_add_memory mA_shared\n");
299        t_share = bench_time_diff(tsc_start, tsc_end);
300        vbase += FRAME_SIZE_1;
301
302        if (err_is_ok(repl_err)) {
303            tsc_start = bench_tsc();
304            repl_err = xomp_master_add_memory(frame, vbase, XOMP_FRAME_TYPE_REPL_RW);
305            tsc_end = bench_tsc();
306            if(err_is_ok(err)) {
307                EXPECT_SUCCESS(err, "xomp_master_add_memory mA_shared\n");
308                t_repl = bench_time_diff(tsc_start, tsc_end);
309                bench_ctl_add_run(ctl_repl_1M, &t_repl);
310            }
311            vbase += FRAME_SIZE_1;
312        }
313    } while (!bench_ctl_add_run(ctl_share_1M, &t_share));
314    bench_ctl_dump_analysis(ctl_share_1M, 0, "1M shared", tscperus);
315    bench_ctl_dump_analysis(ctl_repl_1M, 0, "1M repl", tscperus);
316#endif
317#endif
318    DEBUG("\n");
319    DEBUG("======================================================\n");
320    DEBUG("sharing of 256M\n");
321
322    err = frame_alloc(&frame, FRAME_SIZE_2, &frame_size);
323    EXPECT_SUCCESS(err, "frame alloc");
324    assert(frame_size == FRAME_SIZE_2);
325
326#if BENCH_MEASURE_MAP_ONLY
327    measure_mapping(frame);
328#else
329
330    vbase = (vbase + FRAME_SIZE_2) & ~(FRAME_SIZE_2 - 1);
331
332    bench_ctl_t *ctl_share_256M = NULL;
333    bench_ctl_t *ctl_repl_256M = NULL;
334    uint32_t counter = 0;
335
336    ctl_share_256M = bench_ctl_init(BENCH_MODE_FIXEDRUNS, 1, BENCH_RUN_COUNT);
337    ctl_repl_256M = bench_ctl_init(BENCH_MODE_FIXEDRUNS, 1, BENCH_RUN_COUNT);
338    do {
339        tsc_start = bench_tsc();
340        err = xomp_master_add_memory(frame, vbase, XOMP_FRAME_TYPE_SHARED_RW);
341        tsc_end = bench_tsc();
342        EXPECT_SUCCESS(err, "xomp_master_add_memory mA_shared\n");
343        t_share = bench_time_diff(tsc_start, tsc_end);
344
345        vbase += FRAME_SIZE_2;
346
347
348        if (err_is_ok(repl_err)) {
349            tsc_start = bench_tsc();
350            repl_err = xomp_master_add_memory(frame, vbase, XOMP_FRAME_TYPE_REPL_RW);
351            if (err_is_ok(repl_err)) {
352                tsc_end = bench_tsc();
353                t_repl = bench_time_diff(tsc_start, tsc_end);
354                bench_ctl_add_run(ctl_repl_256M, &t_repl);
355            }
356            vbase += FRAME_SIZE_2;
357        }
358        counter ++;
359    } while (!bench_ctl_add_run(ctl_share_256M, &t_share));
360    bench_ctl_dump_analysis(ctl_share_256M, 0, "256M shared", tscperus);
361    bench_ctl_dump_analysis(ctl_repl_256M, 0, "256M repl", tscperus);
362#endif
363    debug_printf("-------------------------------------\n");
364
365#if XOMP_BENCH_ENABLED
366    xomp_master_bench_print_results();
367#endif
368
369    while (1)
370        ;
371
372}
373
374