1/*
2 * Copyright (c) 2014 ETH Zurich.
3 * All rights reserved.
4 *
5 * This file is distributed under the terms in the attached LICENSE file.
6 * If you do not find this file, copies can be found by writing to:
7 * ETH Zurich D-INFK, Universitaetsstrasse 6, CH-8092 Zurich. Attn: Systems Group.
8 */
9#include <stdio.h>
10#include <string.h>
11#include <stdlib.h>
12#include <limits.h>
13
14#include <barrelfish/barrelfish.h>
15#include <barrelfish/ump_chan.h>
16#include <bench/bench.h>
17#include <barrelfish/sys_debug.h>
18#include <dma/dma.h>
19#include <dma/dma_request.h>
20#include <dma/client/dma_client_device.h>
21#include <dma/dma_manager_client.h>
22
23#include "benchmark.h"
24
25static void xphi_bench_print_settings(void)
26{
27    printf("Core host: %u, Core card: %u\n",
28    XPHI_BENCH_CORE_HOST,
29           XPHI_BENCH_CORE_CARD);
30    printf("Buffer size = %lu bytes, processing runs %u\n",
31    XPHI_BENCH_BUF_SIZE,
32           XPHI_BENCH_PROCESS_RUNS);
33    printf("Bytes per run: %lu kB\n",
34           (XPHI_BENCH_NUM_RUNS * XPHI_BENCH_BUF_SIZE) / 1024);
35
36#ifdef XPHI_BENCH_PROCESS_CARD
37    printf("Processing Side:  Card\n");
38#else
39    printf("Processing Side:  Host\n");
40#endif
41
42#ifdef XPHI_BENCH_CHAN_CARD
43#ifdef XPHI_BENCH_BUFFER_CARD
44    printf("Memory Setup (Normal):     Host [  ]                      Card [ UMP | UMP | BUFFERS ] \n");
45    printf("Memory Setup (Reversed):   Host [ UMP | UMP | BUFFERS ]   Card [ ] \n");
46#else
47    printf("Memory Setup (Normal):     Host [ BUFFERS ]               Card [ UMP | UMP ] \n");
48    printf("Memory Setup (Reversed):   Host [ UMP | UMP | BUFFERS ]   Card [  ] \n");
49#endif
50#endif
51
52#ifdef XPHI_BENCH_CHAN_HOST
53#ifdef XPHI_BENCH_BUFFER_CARD
54    printf("Memory Setup (Normal):     Host [ UMP | UMP ]             Card [ BUFFERS ] \n");
55    printf("Memory Setup (Reversed):   Host [ UMP | UMP | BUFFERS ]   Card [ ] \n");
56#else
57    printf("Memory Setup (Normal):     Host [ BUFFERS ]   Card [ UMP | UMP ] \n");
58    printf("Memory Setup (Reversed):   Host [ ]           Card [ UMP | UMP | BUFFERS ] \n");
59#endif
60#endif
61
62#ifdef XPHI_BENCH_CHAN_DEFAULT
63#ifdef XPHI_BENCH_BUFFER_CARD
64    printf("Memory Setup (Normal):     Host [ UMP ]             Card [ UMP | BUFFERS ] \n");
65    printf("Memory Setup (Reversed):   Host [ UMP | BUFFERS ]   Card [ UMP ] \n");
66#else
67    printf("Memory Setup (Normal):     Host [ UMP | BUFFERS ]   Card [ UMP ] \n");
68    printf("Memory Setup (Reversed):   Host [ UMP ]             Card [ UMP | BUFFERS ] \n");
69#endif
70    printf("UMP Channel Setup (Normal):   Recv Remote, Send Local\n");
71    printf("UMP Channel Setup (Reversed): Recv Local, Send Remote\n");
72#endif
73}
74
75errval_t xphi_bench_memwrite(void *target)
76{
77    return SYS_ERR_OK;
78
79    debug_printf("Executing local measurements\n");
80
81    errval_t err;
82
83    bench_init();
84
85    cycles_t tsc_start, tsc_end;
86    cycles_t result[4];
87    uint64_t tscperus;
88    bench_ctl_t *ctl;
89
90    err = sys_debug_get_tsc_per_ms(&tscperus);
91    assert(err_is_ok(err));
92    tscperus /= 1000;
93
94    debug_printf("tscperus = %lu\n", tscperus);
95
96    ctl = bench_ctl_init(BENCH_MODE_FIXEDRUNS, 3, XPHI_BENCH_NUM_REPS);
97
98    debug_printf("starting benchmark...\n");
99    uint32_t rep_counter = 0;
100    do {
101        debug_printf("  > run %u of %u memwrite of %lu bytes..\n", rep_counter++,
102        XPHI_BENCH_NUM_REPS,
103                     XPHI_BENCH_BUF_FRAME_SIZE);
104
105        /* using memset */
106        tsc_start = bench_tsc();
107        memset(target, 0, XPHI_BENCH_BUF_FRAME_SIZE);
108        tsc_end = bench_tsc();
109        if (tsc_end < tsc_start) {
110            result[0] = (LONG_MAX - tsc_start) + tsc_end - bench_tscoverhead();
111        } else {
112            result[0] = (tsc_end - tsc_start - bench_tscoverhead());
113        }
114
115        /* writing in a loop*/
116        volatile uint8_t *buf = target;
117        tsc_start = bench_tsc();
118        for (uint32_t i = 0; i < XPHI_BENCH_BUF_FRAME_SIZE; ++i) {
119            buf[i] = (uint8_t) 1;
120        }
121        tsc_end = bench_tsc();
122        if (tsc_end < tsc_start) {
123            result[1] = (LONG_MAX - tsc_start) + tsc_end - bench_tscoverhead();
124        } else {
125            result[1] = (tsc_end - tsc_start - bench_tscoverhead());
126        }
127
128        /* reading in a while loop */
129        buf = target;
130        buf[XPHI_BENCH_BUF_FRAME_SIZE - 1] = 0;
131        tsc_start = bench_tsc();
132        while (*(buf++))
133            ;
134
135        tsc_end = bench_tsc();
136        if (tsc_end < tsc_start) {
137            result[2] = (LONG_MAX - tsc_start) + tsc_end - bench_tscoverhead();
138        } else {
139            result[2] = (tsc_end - tsc_start - bench_tscoverhead());
140        }
141
142    } while (!bench_ctl_add_run(ctl, result));
143
144    // bench_ctl_dump_csv(ctl, "", tscperus);
145    bench_ctl_dump_analysis(ctl, 0, "memset()", tscperus);
146    bench_ctl_dump_analysis(ctl, 1, "forloop write", tscperus);
147    bench_ctl_dump_analysis(ctl, 2, "forloop read", tscperus);
148    return SYS_ERR_OK;
149
150    return SYS_ERR_OK;
151}
152
153static volatile uint8_t dma_done;
154
155static void dma_done_cb(errval_t err,
156                        dma_req_id_t id,
157                        void *st)
158{
159    dma_req_id_t *id2 = st;
160    if (id != *id2) {
161        debug_printf("id %016lx, %016lx\n", id, *id2);
162    }
163    assert(id == *id2); XPHI_BENCH_DBG("DMA request executed...\n");
164    dma_done = 0x1;
165}
166
167static inline cycles_t calculate_time(cycles_t tsc_start,
168                                      cycles_t tsc_end)
169{
170    cycles_t result;
171    if (tsc_end < tsc_start) {
172        result = (LONG_MAX - tsc_start) + tsc_end - bench_tscoverhead();
173    } else {
174        result = (tsc_end - tsc_start - bench_tscoverhead());
175    }
176    return result;
177}
178
179static errval_t measure_memcpy(void *dst,
180                               void *src)
181{
182    errval_t err;
183    cycles_t tsc_start, tsc_end;
184    uint64_t tscperus;
185    bench_ctl_t *ctl;
186
187    cycles_t result;
188
189    debug_printf("--------------------------------\n");
190    debug_printf("Measuring memcpy...\n");
191    debug_printf("--------------------------------\n");
192
193    bench_init();
194
195    err = sys_debug_get_tsc_per_ms(&tscperus);
196    assert(err_is_ok(err));
197    tscperus /= 1000;
198
199    for (int i = XPHI_BENCH_SIZE_MIN_BITS; i <= XPHI_BENCH_SIZE_MAX_BITS - 2;
200                    ++i) {
201        size_t size = (1UL << i);
202
203        ctl = bench_ctl_init(BENCH_MODE_FIXEDRUNS, 1, XPHI_BENCH_NUM_REPS);
204
205        uint8_t idx = 0;
206        //debug_printf("Benchmark: Run %u, size = %lu bytes, [%016lx] -> [%016lx]\n", idx, size, src, dst);
207        do {
208            tsc_start = bench_tsc();
209            memcpy(dst, src, size);
210            tsc_end = bench_tsc();
211            result = calculate_time(tsc_start, tsc_end);
212            idx++;
213        } while (!bench_ctl_add_run(ctl, &result));
214        char buf[50];
215
216        snprintf(buf, sizeof(buf), "%u", i);
217        bench_ctl_dump_analysis(ctl, 0, buf, tscperus);
218
219        bench_ctl_destroy(ctl);
220    }
221    debug_printf("--------------------------------\n");
222    return SYS_ERR_OK;
223}
224
225static errval_t measure_forloop(void *dst,
226                                void *src)
227{
228    errval_t err;
229    cycles_t tsc_start, tsc_end;
230    uint64_t tscperus;
231    bench_ctl_t *ctl;
232
233    cycles_t result;
234
235    debug_printf("--------------------------------\n");
236    debug_printf("Measuring Forloop...\n");
237    debug_printf("--------------------------------\n");
238
239    bench_init();
240
241    err = sys_debug_get_tsc_per_ms(&tscperus);
242    assert(err_is_ok(err));
243    tscperus /= 1000;
244
245    for (int i = XPHI_BENCH_SIZE_MIN_BITS; i <= XPHI_BENCH_SIZE_MAX_BITS - 2;
246                    ++i) {
247        size_t size = (1UL << i);
248
249        ctl = bench_ctl_init(BENCH_MODE_FIXEDRUNS, 1, XPHI_BENCH_NUM_REPS);
250
251        uint8_t idx = 0;
252        //debug_printf("Benchmark: Run %u, size = %lu bytes, [%016lx] -> [%016lx]\n", idx, size, src, dst);
253        do {
254            volatile uint64_t *bsrc = src;
255            volatile uint64_t *bdst = dst;
256            tsc_start = bench_tsc();
257            for (uint32_t j = 0; j < size / sizeof(uint64_t); ++j) {
258                bdst[j] = bsrc[j];
259            }
260            tsc_end = bench_tsc();
261            result = calculate_time(tsc_start, tsc_end);
262            idx++;
263        } while (!bench_ctl_add_run(ctl, &result));
264        char buf[50];
265
266        snprintf(buf, sizeof(buf), "%u", i);
267        bench_ctl_dump_analysis(ctl, 0, buf, tscperus);
268
269        bench_ctl_destroy(ctl);
270    }
271    debug_printf("--------------------------------\n");
272    return SYS_ERR_OK;
273}
274
275static errval_t measure_dma(struct dma_device *dev,
276                            lpaddr_t pdst,
277                            lpaddr_t psrc)
278{
279    errval_t err;
280    cycles_t tsc_start, tsc_end;
281    uint64_t tscperus;
282    bench_ctl_t *ctl;
283
284    cycles_t result;
285    debug_printf("--------------------------------\n");
286    debug_printf("Measuring DMA...\n");
287    debug_printf("--------------------------------\n");
288    // avoid host-host DMA.
289    if (psrc == 0) {
290        debug_printf("skipping host-host transfer\n");
291        return SYS_ERR_OK;
292    }
293
294    bench_init();
295
296    err = sys_debug_get_tsc_per_ms(&tscperus);
297    assert(err_is_ok(err));
298    tscperus /= 1000;
299
300    for (int i = XPHI_BENCH_SIZE_MIN_BITS; i <= XPHI_BENCH_SIZE_MAX_BITS; ++i) {
301        size_t size = (1UL << i);
302
303        ctl = bench_ctl_init(BENCH_MODE_FIXEDRUNS, 1, XPHI_BENCH_NUM_REPS);
304
305        uint8_t idx = 0;
306        //debug_printf("Benchmark: Run %u, size = %lu bytes, [%016lx] -> [%016lx]\n", idx, size, src, dst);
307        do {
308
309            dma_req_id_t id;
310
311            struct dma_req_setup setup = {
312                .done_cb = dma_done_cb,
313                .cb_arg = &id,
314                .args = {
315                    .memcpy = {
316                        .src = psrc,
317                        .dst = pdst,
318                        .bytes = size
319                    }
320                }
321            };
322
323            dma_done = 0x0;
324
325            tsc_start = bench_tsc();
326            err = dma_request_memcpy(dev, &setup, &id);
327            if (err_is_fail(err)) {
328                USER_PANIC_ERR(err, "could not exec the transfer");
329            }
330            while (!dma_done) {
331                messages_wait_and_handle_next();
332            }
333            tsc_end = bench_tsc();
334            result = calculate_time(tsc_start, tsc_end);
335            idx++;
336        } while (!bench_ctl_add_run(ctl, &result));
337        char buf[50];
338
339        snprintf(buf, sizeof(buf), "%u", i);
340        bench_ctl_dump_analysis(ctl, 0, buf, tscperus);
341
342        bench_ctl_destroy(ctl);
343    }
344
345    debug_printf("--------------------------------\n");
346
347    return SYS_ERR_OK;
348}
349
350errval_t xphi_bench_memcpy(struct dma_device *dev,
351                           void *dst,
352                           void *src,
353                           size_t size,
354                           lpaddr_t pdst,
355                           lpaddr_t psrc)
356{
357    errval_t err;
358    uint64_t tscperus;
359
360    bench_init();
361
362    err = sys_debug_get_tsc_per_ms(&tscperus);
363    assert(err_is_ok(err));
364    tscperus /= 1000;
365
366    debug_printf("Starting memcpy benchmark. tsc/us=%lu, cpysize=%lu bytes\n",
367                 tscperus, (uint64_t) size);
368
369    if (0) {
370        measure_memcpy(dst, src);
371
372        measure_forloop(dst, src);
373    }
374    measure_dma(dev, pdst, psrc);
375
376    return SYS_ERR_OK;
377}
378
379void xphi_bench_start_echo(struct bench_bufs *bufs,
380                           struct ump_chan *uc)
381{
382    errval_t err;
383
384    volatile struct ump_message *msg;
385    volatile struct ump_message *msg_recv;
386
387    struct ump_control ctrl;
388    msg = ump_chan_get_next(uc, &ctrl);
389
390    // send initiator message
391    debug_printf("signal ready.\n");
392    msg->data[0] = 123;
393    msg->header.control = ctrl;
394
395    debug_printf("xphi_bench_start_echo: receiving messages.\n");
396#ifdef XPHI_BENCH_CHECK_STOP
397    uint64_t data = 0x0;
398    while (data != XPHI_BENCH_STOP_FLAG) {
399#else
400        while(true) {
401#endif
402        err = ump_chan_recv(uc, &msg_recv);
403        if (err_is_ok(err)) {
404            XPHI_BENCH_DBG("received ump message [%p]\n", msg_recv);
405            msg = ump_chan_get_next(uc, &ctrl);
406            msg->header.control = ctrl;
407#ifdef XPHI_BENCH_CHECK_STOP
408            data = msg_recv->data[0];
409#endif
410        }
411    }
412    if (data == XPHI_BENCH_STOP_FLAG) {
413        debug_printf("xphi_bench_start_echo: received stop flag.\n");
414    }
415}
416
417void xphi_bench_start_processor(struct bench_bufs *bufs,
418                                struct ump_chan *uc)
419{
420    errval_t err;
421
422    volatile struct ump_message *msg;
423
424    uint64_t buf_idx = 0;
425
426    struct ump_control ctrl;
427    msg = ump_chan_get_next(uc, &ctrl);
428
429    // send initiator message
430    debug_printf("signal ready.\n");
431    msg->data[0] = 123;
432    msg->header.control = ctrl;
433
434    debug_printf("xphi_bench_start_processor: receiving messages.\n");
435#ifdef XPHI_BENCH_CHECK_STOP
436    while (buf_idx != XPHI_BENCH_STOP_FLAG) {
437#else
438        while(true) {
439#endif
440        err = ump_chan_recv(uc, &msg);
441        if (err_is_ok(err)) {
442            buf_idx = msg->data[0];
443            XPHI_BENCH_DBG("received ump message [%016lx]\n", buf_idx);
444            struct bench_buf *buf = &bufs->buf[buf_idx];
445            xphi_bench_fill_buffer(buf, XPHI_BENCH_PROCESS_RUNS);
446            msg = ump_chan_get_next(uc, &ctrl);
447            msg->data[0] = buf_idx;
448            msg->header.control = ctrl;
449        }
450    }
451    if (buf_idx == XPHI_BENCH_STOP_FLAG) {
452        debug_printf("xphi_bench_start_processor: received stop flag\n");
453    }
454}
455
456errval_t xphi_bench_start_initator_rtt(struct bench_bufs *bufs,
457                                       struct ump_chan *uc)
458{
459    errval_t err;
460    cycles_t tsc_start, tsc_end;
461    cycles_t result;
462    uint64_t tscperus;
463    bench_ctl_t *ctl;
464
465    volatile struct ump_message *msg;
466
467    bench_init();
468
469    err = sys_debug_get_tsc_per_ms(&tscperus);
470    assert(err_is_ok(err));
471    tscperus /= 1000;
472
473    ctl = bench_ctl_init(BENCH_MODE_FIXEDRUNS, 1,
474    XPHI_BENCH_NUM_REPS * XPHI_BENCH_NUM_RUNS);
475
476    debug_printf("RTT benchmark: waiting for ready signal.\n");
477    while (1) {
478        err = ump_chan_recv(uc, &msg);
479        if (err_is_ok(err)) {
480            break;
481        }
482    }
483
484    struct ump_control ctrl;
485
486    debug_printf("Starting RTT benchmark tsc/us=%lu\n", tscperus);
487    uint32_t rep_counter = 0;
488    do {
489        if (!(rep_counter++ % XPHI_BENCH_NUM_RUNS)) {
490            debug_printf("  > run %u of %u...\n", rep_counter,
491            XPHI_BENCH_NUM_REPS * XPHI_BENCH_NUM_RUNS);
492        }
493        tsc_start = bench_tsc();
494        msg = ump_chan_get_next(uc, &ctrl);
495        msg->header.control = ctrl;
496        do {
497            err = ump_chan_recv(uc, &msg);
498        } while (err_is_fail(err));
499        tsc_end = bench_tsc();
500        result = calculate_time(tsc_start, tsc_end);
501
502    } while (!bench_ctl_add_run(ctl, &result));
503
504#ifdef XPHI_BENCH_CHECK_STOP
505    msg = ump_chan_get_next(uc, &ctrl);
506    msg->data[0] = XPHI_BENCH_STOP_FLAG;
507    msg->header.control = ctrl;
508#endif
509    xphi_bench_print_settings();
510    // bench_ctl_dump_csv(ctl, "", tscperus);
511    bench_ctl_dump_analysis(ctl, 0, "RTT", tscperus);
512
513    return SYS_ERR_OK;
514}
515
516errval_t xphi_bench_start_initator_sync(struct bench_bufs *bufs,
517                                        struct ump_chan *uc)
518{
519    errval_t err;
520
521    cycles_t tsc_start, tsc_end;
522    cycles_t result;
523    uint64_t tscperus;
524    bench_ctl_t *ctl;
525
526    volatile struct ump_message *msg;
527    uint64_t buf_idx;
528
529    bench_init();
530
531    uint32_t n_recv = 0;
532
533    err = sys_debug_get_tsc_per_ms(&tscperus);
534    assert(err_is_ok(err));
535    tscperus /= 1000;
536
537    ctl = bench_ctl_init(BENCH_MODE_FIXEDRUNS, 1, XPHI_BENCH_NUM_REPS);
538
539    debug_printf("Sync Throughput Benchmark: waiting for ready signal...\n");
540    while (1) {
541        err = ump_chan_recv(uc, &msg);
542        if (err_is_ok(err)) {
543            break;
544        }
545    }
546
547    struct ump_control ctrl;
548
549    debug_printf("Starting sync throughput benchmark. tsc/us=%lu\n", tscperus);
550    uint32_t rep_counter = 0;
551    do {
552        uint64_t b_idx = 0;
553
554        debug_printf("  > run %u of %u with %u moves...\n", rep_counter++,
555        XPHI_BENCH_NUM_REPS,
556                     XPHI_BENCH_NUM_RUNS);
557
558        tsc_start = bench_tsc();
559
560        msg = ump_chan_get_next(uc, &ctrl);
561        struct bench_buf *buf = &bufs->buf[b_idx];
562        xphi_bench_fill_buffer(buf, 1);
563
564        // send initiator message
565        XPHI_BENCH_DBG("sending message [%lu]\n", b_idx);
566        msg->data[0] = b_idx;
567        msg->header.control = ctrl;
568        n_recv = 0;
569        for (uint32_t irun = 0; irun < (XPHI_BENCH_NUM_RUNS - 1); ++irun) {
570            do {
571                err = ump_chan_recv(uc, &msg);
572            } while (err_is_fail(err));
573
574            n_recv++;
575            buf_idx = msg->data[0];
576            uint32_t ret_count = 0;
577            buf = &bufs->buf[b_idx];
578            xphi_bench_read_buffer(buf, 1, &ret_count);
579            XPHI_BENCH_DBG("received message [%lu]\n", buf_idx);
580            assert(buf_idx == b_idx);
581            b_idx = (b_idx + 1) & (bufs->num - 1);
582
583            buf = &bufs->buf[b_idx];
584            xphi_bench_fill_buffer(buf, 1);
585
586            XPHI_BENCH_DBG("sending message [%lu]\n", b_idx);
587            msg = ump_chan_get_next(uc, &ctrl);
588            assert(msg);
589            msg->data[0] = b_idx;
590            msg->header.control = ctrl;
591        }
592
593        while (n_recv < XPHI_BENCH_NUM_RUNS) {
594            err = ump_chan_recv(uc, &msg);
595            if (err_is_ok(err)) {
596                buf_idx = msg->data[0];
597                XPHI_BENCH_DBG("received message [%"PRIu64"]\n", buf_idx);
598                buf = &bufs->buf[buf_idx];
599                uint32_t ret_count = 0;
600                xphi_bench_read_buffer(buf, 1, &ret_count);
601                n_recv++;
602            }
603        }
604        tsc_end = bench_tsc();
605        result = calculate_time(tsc_start, tsc_end);
606    } while (!bench_ctl_add_run(ctl, &result));
607
608#ifdef XPHI_BENCH_CHECK_STOP
609    msg = ump_chan_get_next(uc, &ctrl);
610    msg->data[0] = XPHI_BENCH_STOP_FLAG;
611    msg->header.control = ctrl;
612#endif
613
614    double avg_s = bench_avg(ctl->data, ctl->result_count) / tscperus;
615    avg_s /= 1000000;
616    xphi_bench_print_settings();
617// bench_ctl_dump_csv(ctl, "", tscperus);
618    bench_ctl_dump_analysis(ctl, 0, "Sync Throughput", tscperus);
619    printf("Average seconds: %f\n", avg_s);
620    printf("Average throughput: %f GByte/s\n",
621           (((double) (XPHI_BENCH_NUM_RUNS * XPHI_BENCH_BUF_SIZE)) / 1024 / 1024
622            / 1024)
623           / (avg_s));
624    printf("Average throughput (with processing): %f GByte/s\n",
625           (XPHI_BENCH_PROCESS_RUNS * ((double) (XPHI_BENCH_NUM_RUNS
626                           * XPHI_BENCH_BUF_SIZE))
627            / 1024 / 1024 / 1024)
628           / (avg_s));
629
630    return SYS_ERR_OK;
631}
632
633errval_t xphi_bench_start_initator_async(struct bench_bufs *bufs,
634                                         struct ump_chan *uc)
635{
636    volatile struct ump_message *msg;
637    uint64_t buf_idx;
638    uint32_t in_transit = 0;
639
640    errval_t err;
641
642    bench_init();
643
644    cycles_t tsc_start;
645    cycles_t result;
646    uint64_t tscperus;
647    bench_ctl_t *ctl;
648
649    err = sys_debug_get_tsc_per_ms(&tscperus);
650    assert(err_is_ok(err));
651    tscperus /= 1000;
652
653    debug_printf("tscperus = %lu\n", tscperus);
654
655    ctl = bench_ctl_init(BENCH_MODE_FIXEDRUNS, 1, XPHI_BENCH_NUM_REPS);
656
657    debug_printf("waiting for ready signal\n");
658    while (1) {
659        err = ump_chan_recv(uc, &msg);
660        if (err_is_ok(err)) {
661            break;
662        }
663    }
664
665    debug_printf("starting benchmark ASYNC...\n");
666
667    struct ump_control ctrl;
668
669    uint32_t rep_counter = 0;
670    do {
671        uint64_t b_idx = 0;
672        debug_printf("  > run %u of %u with %u moves...\n", rep_counter++,
673        XPHI_BENCH_NUM_REPS,
674                     XPHI_BENCH_NUM_RUNS);
675        tsc_start = bench_tsc();
676
677        uint32_t irun = 0;
678        uint32_t n_recv = 0;
679        struct bench_buf *buf;
680        while (irun < XPHI_BENCH_NUM_RUNS) {
681            if (in_transit < XPHI_BENCH_MSG_NUM) {
682                msg = ump_chan_get_next(uc, &ctrl);
683                if (!msg) {
684                    continue;
685                }
686                buf = &bufs->buf[b_idx];
687                xphi_bench_fill_buffer(buf, 1);
688                XPHI_BENCH_DBG("sending message [%lu] %p\n", b_idx, msg);
689                msg->data[0] = b_idx;
690                msg->header.control = ctrl;
691                irun++;
692                in_transit++;
693                b_idx = (b_idx + 1) & (bufs->num - 1);
694            }
695
696            err = ump_chan_recv(uc, &msg);
697            if (err_is_ok(err)) {
698                buf_idx = msg->data[0];
699                XPHI_BENCH_DBG("receiving message [%"PRIu64"]\n", buf_idx);
700                buf = &bufs->buf[buf_idx];
701                uint32_t ret_count = 0;
702                xphi_bench_read_buffer(buf, 1, &ret_count);
703                in_transit--;
704                n_recv++;
705            }
706        }
707
708        while (n_recv < XPHI_BENCH_NUM_RUNS) {
709            err = ump_chan_recv(uc, &msg);
710            if (err_is_ok(err)) {
711                buf_idx = msg->data[0];
712                buf = &bufs->buf[buf_idx];
713                uint32_t ret_count = 0;
714                XPHI_BENCH_DBG("receiving message [%lu]\n", buf_idx);
715                xphi_bench_read_buffer(buf, 1, &ret_count);
716                in_transit--;
717                n_recv++;
718            }
719        }
720
721        result = bench_tsc();
722        if (result - tsc_start > bench_tscoverhead()) {
723            debug_printf("%lu %lu", result - tsc_start, bench_tscoverhead());
724        }
725        if (result < tsc_start) {
726            result = (LONG_MAX - tsc_start) + result - bench_tscoverhead();
727        } else {
728            result = (result - tsc_start - bench_tscoverhead());
729        }
730
731        assert(in_transit == 0);
732    } while (!bench_ctl_add_run(ctl, &result));
733
734#ifdef XPHI_BENCH_CHECK_STOP
735    msg = ump_chan_get_next(uc, &ctrl);
736    msg->data[0] = XPHI_BENCH_STOP_FLAG;
737    msg->header.control = ctrl;
738#endif
739
740    double avg_s = bench_avg(ctl->data, ctl->result_count) / tscperus;
741    avg_s /= 1000000;
742    xphi_bench_print_settings();
743// bench_ctl_dump_csv(ctl, "", tscperus);
744    bench_ctl_dump_analysis(ctl, 0, "ASync Throughput", tscperus);
745    printf("Average seconds: %f\n", avg_s);
746    printf("Average throughput: %f GByte/s\n",
747           (((double) (XPHI_BENCH_NUM_RUNS * XPHI_BENCH_BUF_SIZE)) / 1024 / 1024
748            / 1024)
749           / (avg_s));
750    printf("Average throughput (with processing): %f GByte/s\n",
751           (XPHI_BENCH_PROCESS_RUNS * ((double) (XPHI_BENCH_NUM_RUNS
752                           * XPHI_BENCH_BUF_SIZE))
753            / 1024 / 1024 / 1024)
754           / (avg_s));
755
756    return SYS_ERR_OK;
757}
758
759