1/*
2 * Copyright (c) 2014 ETH Zurich.
3 * All rights reserved.
4 *
5 * This file is distributed under the terms in the attached LICENSE file.
6 * If you do not find this file, copies can be found by writing to:
7 * ETH Zurich D-INFK, Universitaetsstrasse 6, CH-8092 Zurich. Attn: Systems Group.
8 */
9#include <string.h>
10#include <stdlib.h>
11#include <omp.h>
12
13#include <barrelfish/barrelfish.h>
14#include <barrelfish/nameservice_client.h>
15
16#include <xeon_phi/xeon_phi.h>
17#include <xeon_phi/xeon_phi_domain.h>
18
19#include <bench/bench.h>
20#include <xomp/xomp.h>
21
/* Set to 1 to time each thread's share of mm_omp() with its own bench control. */
#define BENCH_MEASURE_LOCAL 0

/* Number of measured runs passed to bench_ctl_init() for every benchmark. */
#define BENCH_RUN_COUNT 25
/* Set to 1 to also compile and run the single-threaded mm_single() variant. */
#define BENCH_RUN_SINGLE 0

/* Debug output helper: forwards all of its arguments to debug_printf(). */
#define DEBUG(...) debug_printf(__VA_ARGS__)

/*
 * Panic with `msg` if `errval` is a failure code.
 *
 * The macro checks the expression passed as `errval` (evaluated exactly once)
 * rather than whatever variable named `err` happens to be in the caller's
 * scope, and it expands to a single statement so that it can be used safely
 * in an unbraced if/else.
 */
#define EXPECT_SUCCESS(errval, msg)                 \
    do {                                            \
        errval_t expect_success_err_ = (errval);    \
        if (err_is_fail(expect_success_err_)) {     \
            USER_PANIC_ERR(expect_success_err_, msg); \
        }                                           \
    } while (0)
31
/* Number of threads to use; parsed from argv[2] in main(). */
static uint32_t nthreads;

/* NOTE(review): SCHEDULE and CHUNK are not referenced anywhere in this file. */
#define SCHEDULE static
#define CHUNK    25

/* Element type of the A, B and C matrices. */
#define MATRIX_TYPE uint64_t
/*
 * NOTE(review): the three MATRIX_ROWS/COLS/ELEMENTS constants are not used by
 * the code in this file; the dimensions actually used come from argv[1].
 */
#define MATRIX_ROWS 1000
#define MATRIX_COLS 1000
#define MATRIX_ELEMENTS (MATRIX_ROWS * MATRIX_COLS)

/*
 * "repl" views: A, B and C as seen through the first mapping that
 * matrix_alloc() creates for frame_mAB / frame_mC.
 */
static MATRIX_TYPE *mA_repl;
static MATRIX_TYPE *mB_repl;
static MATRIX_TYPE *mC_repl;

/*
 * "shared" views: the same frames mapped a second time (through a copy of
 * the capability) at a different virtual address.
 */
static MATRIX_TYPE *mA_shared;
static MATRIX_TYPE *mB_shared;
static MATRIX_TYPE *mC_shared;

/* Frame holding matrix A immediately followed by matrix B. */
struct capref frame_mAB;
/* Frame holding the result matrix C. */
struct capref frame_mC;
52
/*
 * Small control block that lives in its own frame (frame_mm_frame) and is
 * registered with XOMP in matrix_share() alongside the matrices.
 */
struct mm_frame {
    MATRIX_TYPE sum;    /* result check: mm_omp() atomically adds each thread's count here */
    uint64_t rows;      /* number of matrix rows, set by matrix_alloc() */
    uint64_t cols;      /* number of matrix columns, set by matrix_alloc() */
};
58
/* Mapped control block and the frame backing it; both set up by matrix_alloc(). */
static struct mm_frame *g_mm_frame;
struct capref frame_mm_frame;

/* Running total written by mm_single() (only built when BENCH_RUN_SINGLE is 1). */
MATRIX_TYPE g_sum;

/*
 * Range (128 GiB .. 512 GiB) handed to ram_set_affinity() while matrix_alloc()
 * allocates its frames on non-k1om builds.
 */
#define RAM_MIN_BASE (128ULL * 1024 * 1024 * 1024)
#define RAM_MAX_LIMIT (512ULL * 1024 * 1024 * 1024)

/*
 * Benchmark timers (in cycles):
 *   timer_xompinit - time spent in bomp_bomp_init() / bomp_xomp_init()
 *   timer_share    - time spent in matrix_share()
 */
cycles_t timer_xompinit = 0;
cycles_t timer_share = 0;

/*
 * Next virtual address used for a fixed mapping. Starts at 128 GiB and is
 * advanced by matrix_alloc() after every mapping it creates.
 */
static lvaddr_t vbase_map = (128ULL * 1024 * 1024 * 1024);
74
75static void matrix_alloc(uint64_t rows,
76                         uint64_t cols)
77{
78    errval_t err;
79
80#ifndef __k1om__
81    uint64_t min_base, max_limit;
82
83    ram_get_affinity(&min_base, &max_limit);
84    ram_set_affinity(RAM_MIN_BASE, RAM_MAX_LIMIT);
85
86#endif
87    size_t matrix_size = (rows * cols) * sizeof(MATRIX_TYPE);
88
89    err = frame_alloc(&frame_mAB, 2 * matrix_size, &matrix_size);
90    EXPECT_SUCCESS(err, "frame allocate\n");
91
92    err = vspace_map_one_frame_fixed(vbase_map, matrix_size, frame_mAB, NULL, NULL);
93    EXPECT_SUCCESS(err, "vspace_map_one_frame\n");
94
95    mA_repl = (void *)vbase_map;
96    mB_repl = (void *)(vbase_map + ((rows * cols) * sizeof(MATRIX_TYPE)));
97
98    debug_printf("frameAB: [%lx, %lx]\n", vbase_map, vbase_map + matrix_size);
99
100    vbase_map += matrix_size;
101
102    struct capref copy;
103    err = slot_alloc(&copy);
104    EXPECT_SUCCESS(err, "slot_alloc\n");
105
106    err = cap_copy(copy, frame_mAB);
107    EXPECT_SUCCESS(err, "cap_copy\n");
108
109    err = vspace_map_one_frame_fixed(vbase_map, matrix_size, copy, NULL, NULL);
110    EXPECT_SUCCESS(err, "vspace_map_one_frame\n");
111
112    mA_shared = (void *)vbase_map;
113    mB_shared = (void *)(vbase_map + ((rows * cols) * sizeof(MATRIX_TYPE)));
114
115    debug_printf("sharedAB: [%lx, %lx]\n", vbase_map, vbase_map + matrix_size);
116
117    vbase_map += matrix_size;
118
119    matrix_size = (rows * cols) * sizeof(MATRIX_TYPE);
120
121    err = frame_alloc(&frame_mC, matrix_size, &matrix_size);
122    EXPECT_SUCCESS(err, "frame allocate\n");
123
124    err = vspace_map_one_frame_fixed(vbase_map, matrix_size, frame_mC, NULL, NULL);
125    EXPECT_SUCCESS(err, "vspace_map_one_frame\n");
126
127    mC_repl = (void *)vbase_map;
128
129    debug_printf("frameC: [%lx, %lx]\n", vbase_map, vbase_map + matrix_size);
130
131    vbase_map += matrix_size;
132
133    err = slot_alloc(&copy);
134    EXPECT_SUCCESS(err, "slot_alloc\n");
135
136    err = cap_copy(copy, frame_mC);
137    EXPECT_SUCCESS(err, "cap_copy\n");
138
139    err = vspace_map_one_frame_fixed(vbase_map, matrix_size, copy, NULL, NULL);
140    EXPECT_SUCCESS(err, "vspace_map_one_frame\n");
141
142    mC_shared = (void *)vbase_map;
143
144    debug_printf("sharedC: [%lx, %lx]\n", vbase_map, vbase_map + matrix_size);
145
146    vbase_map += matrix_size;
147
148    size_t mm_frame_size = sizeof(struct mm_frame);
149    err = frame_alloc(&frame_mm_frame, mm_frame_size, &mm_frame_size);
150    EXPECT_SUCCESS(err, "frame allocate\n");
151
152    err = vspace_map_one_frame_fixed(vbase_map, mm_frame_size, frame_mm_frame, NULL, NULL);
153    EXPECT_SUCCESS(err, "vspace_map_one_frame\n");
154
155    g_mm_frame = (void *)vbase_map;
156
157    vbase_map += mm_frame_size;
158
159    g_mm_frame->cols = cols;
160    g_mm_frame->rows = rows;
161    g_mm_frame->sum = 0;
162
163#ifndef __k1om__
164    ram_set_affinity(min_base, max_limit);
165#endif
166}
167
168static void matrix_init(void)
169{
170    debug_printf("g_mm %p, %p, %p\n", g_mm_frame, mA_repl, mB_repl);
171    for (int i = 0; i < g_mm_frame->rows; i++) {
172        uint64_t row = i * g_mm_frame->rows;
173        MATRIX_TYPE *a_row = mA_repl + (row);
174        MATRIX_TYPE *b_row = mB_shared + (row);
175        for (int j = 0; j < g_mm_frame->cols; j++) {
176            if (j == i) {
177                a_row[j] = 1;
178                b_row[j] = 1;
179            } else {
180                a_row[j] = 0;
181                b_row[j] = 0;
182            }
183        }
184    }
185}
186
187static void matrix_share(xomp_wloc_t location)
188{
189    errval_t err;
190    DEBUG("==========================================================\n");
191    DEBUG("frame_mAB, (lvaddr_t) mA_shared, XOMP_FRAME_TYPE_SHARED_RW\n");
192    err = xomp_master_add_memory(frame_mAB, (lvaddr_t) mA_shared,
193                                 XOMP_FRAME_TYPE_SHARED_RW);
194    EXPECT_SUCCESS(err, "xomp_master_add_memory mA_shared\n");
195
196    DEBUG("==========================================================\n");
197    DEBUG("frame_mC, (lvaddr_t) mC_shared, XOMP_FRAME_TYPE_SHARED_RW\n");
198    err = xomp_master_add_memory(frame_mC, (lvaddr_t) mC_shared,
199                                 XOMP_FRAME_TYPE_SHARED_RW);
200    EXPECT_SUCCESS(err, "xomp_master_add_memory mC_shared\n");
201
202    DEBUG("==========================================================\n");
203    DEBUG("frame_mm_frame, (lvaddr_t) g_mm_frame, XOMP_FRAME_TYPE_SHARED_RW\n");
204    err = xomp_master_add_memory(frame_mm_frame, (lvaddr_t) g_mm_frame,
205                                 XOMP_FRAME_TYPE_SHARED_RW);
206    EXPECT_SUCCESS(err, "xomp_master_add_memory g_mm_frame\n");
207
208    if (location == XOMP_WORKER_LOC_MIXED) {
209        DEBUG("==========================================================\n");
210        DEBUG("frame_mAB, (lvaddr_t) mA_repl, XOMP_FRAME_TYPE_REPL_RW\n");
211        err = xomp_master_add_memory(frame_mAB, (lvaddr_t) mA_repl,
212                                     XOMP_FRAME_TYPE_REPL_RW);
213        EXPECT_SUCCESS(err, "xomp_master_add_memory mA_repl\n");
214
215        DEBUG("==========================================================\n");
216        DEBUG("frame_mC, (lvaddr_t) mC_repl, XOMP_FRAME_TYPE_REPL_RW\n");
217        err = xomp_master_add_memory(frame_mC, (lvaddr_t) mC_repl,
218                                     XOMP_FRAME_TYPE_REPL_RW);
219        EXPECT_SUCCESS(err, "xomp_master_add_memory mC_repl\n");
220    }
221}
222
#if BENCH_RUN_SINGLE
/*
 * Single-threaded reference: accumulates A * B into C (C is added to, not
 * overwritten) and stores the sum of all resulting C elements in g_sum.
 * Dimensions are read from g_mm_frame; every row offset is a multiple of
 * g_mm_frame->rows.
 */
static void mm_single(MATRIX_TYPE *a,
                      MATRIX_TYPE *b,
                      MATRIX_TYPE *c)
{
    g_sum = 0;

    for (int i = 0; i < g_mm_frame->rows; i++) {
        /* offset of row i in both A and C */
        const uint64_t offset = i * g_mm_frame->rows;

        for (int k = 0; k < g_mm_frame->rows; ++k) {
            MATRIX_TYPE *b_line = b + k * g_mm_frame->rows;
            const MATRIX_TYPE factor = a[offset + k];

            for (int j = 0; j < g_mm_frame->cols; ++j) {
                c[offset + j] += factor * b_line[j];
            }
        }

        /* fold the finished row into the global total */
        for (int j = 0; j < g_mm_frame->cols; ++j) {
            g_sum += c[offset + j];
        }
    }
}
#endif
247
248
#if BENCH_MEASURE_LOCAL
/* Bench control for the per-thread timing in mm_omp(); created there on first use. */
static bench_ctl_t *ctl_local;
#endif
252
/*
 * Runs an OpenMP parallel region in which every thread prints a message and
 * calls bench_init(). The only call site in this file is inside a disabled
 * `if (0)` in main().
 */
static void mm_init(void)
{
#pragma omp parallel
    {
        debug_printf("initializing omp library\n");
        bench_init();
    }
}
261
262static void mm_omp(MATRIX_TYPE *a,
263                   MATRIX_TYPE *b,
264                   MATRIX_TYPE *c,
265                   struct mm_frame *mm_frame)
266{
267#pragma omp parallel
268    {
269        uint64_t nrows = mm_frame->rows;
270        uint64_t ncols = mm_frame->cols;
271        uint64_t counter = 0;
272        MATRIX_TYPE sum = 0;
273#if BENCH_MEASURE_LOCAL
274        if (ctl_local == NULL) {
275            ctl_local = bench_ctl_init(BENCH_MODE_FIXEDRUNS, 1, BENCH_RUN_COUNT);
276        }
277        cycles_t start = bench_tsc();
278#endif
279#pragma omp for nowait schedule (static, 1)
280        for (int i = 0; i < nrows; i++) {
281            counter++;
282            uint64_t row = i * nrows;
283            MATRIX_TYPE *c_row_i = c + row;
284            MATRIX_TYPE *a_row_i = a + row;
285            for (int k = 0; k < nrows; ++k) {
286                MATRIX_TYPE *b_row_k = b + k * nrows;
287                MATRIX_TYPE a_elem = a_row_i[k];
288                for (int j = 0; j < ncols; ++j) {
289                    c_row_i[j] += a_elem * b_row_k[j];
290                }
291            }
292
293            for (int j = 0; j < ncols; ++j) {
294                sum += c_row_i[j] > 0;
295            }
296        }
297
298        // TODO: atomic add;
299        __sync_fetch_and_add(&mm_frame->sum, sum);
300#if BENCH_MEASURE_LOCAL
301        cycles_t end = bench_tsc();
302        cycles_t elapsed = bench_time_diff(start, end);
303        if (bench_ctl_add_run(ctl_local, &elapsed)) {
304            bench_ctl_dump_analysis(ctl_local, 0, "LOCAL", bench_tsc_per_us());
305        }
306#endif
307    }
308}
309
310static void prepare_bomp(void)
311{
312    cycles_t tsc_start = bench_tsc();
313    bomp_bomp_init(nthreads);
314    cycles_t tsc_end = bench_tsc();
315    timer_xompinit = bench_time_diff(tsc_start, tsc_end);
316}
317
318static int prepare_xomp(int argc,
319                         char *argv[])
320{
321    errval_t err;
322
323    xomp_wloc_t location = XOMP_WORKER_LOC_MIXED;
324    for (int i = 3; i < argc; ++i) {
325        if (!strncmp(argv[i], "--location=", 11)) {
326            char *p = strchr(argv[i], '=');
327            p++;
328            if (!strcmp(p, "local")) {
329                location = XOMP_WORKER_LOC_LOCAL;
330            }
331        }
332    }
333
334    if (location == XOMP_WORKER_LOC_MIXED) {
335        debug_printf("waiting for xeon phi to be ready\n");
336        err = xeon_phi_domain_blocking_lookup("xeon_phi.0.ready", NULL);
337        EXPECT_SUCCESS(err, "nameservice_blocking_lookup");
338        err = xeon_phi_domain_blocking_lookup("xeon_phi.1.ready", NULL);
339        EXPECT_SUCCESS(err, "nameservice_blocking_lookup");
340#if XOMP_BENCH_ENABLED
341        xomp_master_bench_enable(BENCH_RUN_COUNT, nthreads, XOMP_MASTER_BENCH_DO_WORK);
342#endif
343    }
344
345    struct xomp_spawn local_info =  {
346        .argc = argc,
347        .argv = argv,
348#ifdef __k1om__
349        .path = "/k1om/sbin/benchmarks/bomp_mm",
350#else
351        .path = "/x86_64/sbin/benchmarks/bomp_mm",
352#endif
353    };
354
355    struct xomp_spawn remote_info =  {
356            .argc = argc,
357            .argv = argv,
358            .path = "/k1om/sbin/benchmarks/bomp_mm",
359        };
360
361    struct xomp_args xomp_arg = {
362        .type = XOMP_ARG_TYPE_DISTINCT,
363        .core_stride = 0, // use default
364        .args = {
365            .distinct = {
366                .nthreads = nthreads,
367                .worker_loc = location,
368                .nphi = 2,
369                .local = local_info,
370                .remote = remote_info
371            }
372        }
373    };
374
375    cycles_t tsc_start = bench_tsc();
376    if (bomp_xomp_init(&xomp_arg)) {
377        debug_printf("bomp init failed!\n");
378        exit(1);
379    }
380    cycles_t tsc_end = bench_tsc();
381    timer_xompinit = bench_time_diff(tsc_start, tsc_end);
382
383    tsc_start = bench_tsc();
384    matrix_share(location);
385    tsc_end = bench_tsc();
386    timer_share = bench_time_diff(tsc_start, tsc_end);
387
388    return (location == XOMP_WORKER_LOC_LOCAL);
389}
390
391int main(int argc,
392         char *argv[])
393{
394    errval_t err;
395    xomp_wid_t wid;
396
397    cycles_t tsc_start, tsc_end;
398
399    bench_init();
400
401    err = xomp_worker_parse_cmdline(argc, argv, &wid);
402    if (err_is_ok(err)) {
403        struct xomp_args xw_arg = {
404            .type = XOMP_ARG_TYPE_WORKER,
405            .args = {
406                .worker = {
407                    .id = wid
408                }
409            }
410        };
411        bomp_xomp_init(&xw_arg);
412    }
413
414    if (argc < 4) {
415        debug_printf("Usage: %s <size> <numthreats>\n", argv[0]);
416        exit(1);
417    }
418
419    uint64_t rows, cols;
420    rows = strtoul(argv[1], NULL, 10);
421    cols = rows;
422
423    nthreads = strtoul(argv[2], NULL, 10);
424    if (nthreads == 0) {
425        debug_printf("num threads must be >0\n");
426        exit(1);
427    }
428
429    DEBUG("\n");
430    DEBUG("======================================================\n");
431    debug_printf("Matrix Size: [%lu x %lu]\n", rows, cols);
432    debug_printf("Matrix Size: %lu kB\n", rows * cols * sizeof(MATRIX_TYPE) >> 10);
433    debug_printf("Num Threads: %u\n", nthreads);
434
435    DEBUG("\n");
436    DEBUG("======================================================\n");
437    DEBUG("matrix_alloc\n");
438    tsc_start = bench_tsc();
439    matrix_alloc(rows, cols);
440    tsc_end = bench_tsc();
441    cycles_t timer_alloc = bench_time_diff(tsc_start, tsc_end);
442
443    DEBUG("\n");
444    DEBUG("======================================================\n");
445    DEBUG("matrix_init\n");
446    tsc_start = bench_tsc();
447    matrix_init();
448    tsc_end = bench_tsc();
449    cycles_t timer_init = bench_time_diff(tsc_start, tsc_end);
450
451    uint8_t is_shared = 0;
452    for (int i = 3; i < argc; ++i) {
453        if (!strcmp(argv[i], "bomp")) {
454            prepare_bomp();
455            is_shared = 1;
456        } else if (!strcmp(argv[i], "xomp")) {
457            is_shared = prepare_xomp(argc, argv);
458        } else {
459            debug_printf("ignoring argument {%s}\n", argv[i]);
460        }
461    }
462    if (0) {
463        mm_init();
464    }
465#if BENCH_RUN_SINGLE
466    DEBUG("\n");
467    DEBUG("======================================================\n");
468    DEBUG("mm_single\n");
469
470    bench_ctl_t *ctl_single;
471    cycles_t timer_single;
472
473    ctl_single = bench_ctl_init(BENCH_MODE_FIXEDRUNS, 1, BENCH_RUN_COUNT);
474    do {
475        tsc_start = bench_tsc();
476        //mm_single(mA_repl, mB_repl, mC_repl);
477        mm_single(mA_shared, mB_shared, mC_shared);
478        tsc_end = bench_tsc();
479        timer_single = bench_time_diff(tsc_start, tsc_end);
480        memset(mC_shared, 0, rows * cols * sizeof(MATRIX_TYPE));
481    } while (!bench_ctl_add_run(ctl_single, &timer_single));
482#endif
483    DEBUG("\n");
484    DEBUG("======================================================\n");
485    DEBUG("mm_omp_repl\n");
486
487    bench_ctl_t *ctl_omp_repl = NULL;
488    cycles_t timer_omp_repl = 0;
489
490    if (!is_shared) {
491        ctl_omp_repl = bench_ctl_init(BENCH_MODE_FIXEDRUNS, 1, BENCH_RUN_COUNT);
492        do {
493            tsc_start = bench_tsc();
494            mm_omp(mA_repl, mB_repl, mC_repl, g_mm_frame);
495            tsc_end = bench_tsc();
496            timer_omp_repl = bench_time_diff(tsc_start, tsc_end);
497            debug_printf("took: %lu cycles\n", timer_omp_repl);
498            if (g_mm_frame->sum != g_mm_frame->rows) {
499                debug_printf("ERROR: sum was not identical: %lu / %lu\n", g_mm_frame->sum,
500                             g_mm_frame->rows);
501            }
502            g_mm_frame->sum = 0;
503        } while (!bench_ctl_add_run(ctl_omp_repl, &timer_omp_repl));
504    }
505    DEBUG("\n");
506    DEBUG("======================================================\n");
507    DEBUG("mm_omp_shared\n");
508
509    //memset(mC_shared, 0, rows * cols * sizeof(MATRIX_TYPE));
510    bench_ctl_t *ctl_omp_shared;
511    cycles_t timer_omp_shared = 0;
512
513    ctl_omp_shared = bench_ctl_init(BENCH_MODE_FIXEDRUNS, 1, BENCH_RUN_COUNT);
514    do {
515        tsc_start = bench_tsc();
516        mm_omp(mA_shared, mB_shared, mC_shared, g_mm_frame);
517        tsc_end = bench_tsc();
518        timer_omp_shared = bench_time_diff(tsc_start, tsc_end);
519        if (g_mm_frame->sum != g_mm_frame->rows) {
520            debug_printf("ERROR: sum was not identical: %lu / %lu\n", g_mm_frame->sum,
521                         g_mm_frame->rows);
522        }
523
524        g_mm_frame->sum = 0;
525        debug_printf("took: %lu cycles\n", timer_omp_shared);
526        memset(mC_shared, 0, rows * cols * sizeof(MATRIX_TYPE));
527    } while (!bench_ctl_add_run(ctl_omp_shared, &timer_omp_shared));
528
529    debug_printf("-------------------------------------\n");
530
531    debug_printf("BOMP init:      %lu (%lu ms)\n", timer_xompinit,
532                 bench_tsc_to_ms(timer_xompinit));
533
534    debug_printf("Alloc Time:     %lu (%lu ms)\n", timer_alloc,
535                 bench_tsc_to_ms(timer_alloc));
536
537    debug_printf("Init Time:      %lu (%lu ms)\n", timer_init,
538                 bench_tsc_to_ms(timer_init));
539
540    debug_printf("Share Time:     %lu (%lu ms)\n", timer_share,
541                 bench_tsc_to_ms(timer_share));
542#if BENCH_RUN_SINGLE
543    debug_printf("Single Time:    %lu (%lu ms)\n", timer_single,
544                 bench_tsc_to_ms(timer_single));
545#endif
546    if (!is_shared) {
547        debug_printf("OMP Repl:       %lu (%lu ms)\n", timer_omp_repl,
548                     bench_tsc_to_ms(timer_omp_repl));
549    }
550    debug_printf("OMP Shared:     %lu (%lu ms)\n", timer_omp_shared,
551                 bench_tsc_to_ms(timer_omp_shared));
552
553    cycles_t timer_common = timer_alloc + timer_init;
554    cycles_t timer_xomp = timer_common + timer_xompinit + timer_share;
555#if BENCH_RUN_SINGLE
556    debug_printf("Total (Single): %lu (%lu ms)\n", timer_single + timer_common,
557                 bench_tsc_to_ms(timer_single + timer_common));
558#endif
559    if (!is_shared) {
560        debug_printf("Total (Repl):   %lu (%lu ms)\n", timer_omp_repl + timer_xomp,
561                     bench_tsc_to_ms(timer_omp_repl + timer_xomp));
562    }
563
564    debug_printf("Total (Shared): %lu (%lu ms)\n", timer_omp_shared + timer_xomp,
565                 bench_tsc_to_ms(timer_omp_shared + timer_xomp));
566
567    debug_printf("-------------------------------------\n");
568
569    cycles_t tscperus = bench_tsc_per_us();
570#if BENCH_RUN_SINGLE
571    bench_ctl_dump_analysis(ctl_single, 0, "Single", tscperus);
572#endif
573    if (!is_shared) {
574        bench_ctl_dump_analysis(ctl_omp_repl, 0, "OMP Replicated", tscperus);
575    }
576    bench_ctl_dump_analysis(ctl_omp_shared, 0, "OMP Shared", tscperus);
577
578    debug_printf("-------------------------------------\n");
579#if XOMP_BENCH_ENABLED
580    xomp_master_bench_print_results();
581#endif
582    debug_printf("-------------------------------------\n");
583    while (1)
584        ;
585
586}
587
588