1/*
2    Copyright (c) 2014 Intel Corporation.  All Rights Reserved.
3
4    Redistribution and use in source and binary forms, with or without
5    modification, are permitted provided that the following conditions
6    are met:
7
8      * Redistributions of source code must retain the above copyright
9        notice, this list of conditions and the following disclaimer.
10      * Redistributions in binary form must reproduce the above copyright
11        notice, this list of conditions and the following disclaimer in the
12        documentation and/or other materials provided with the distribution.
13      * Neither the name of Intel Corporation nor the names of its
14        contributors may be used to endorse or promote products derived
15        from this software without specific prior written permission.
16
17    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21    HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28*/
29
30
// These two functions are declared as friends in offload_engine.h.
// They are forward-declared here, before any header is included, because
// Clang does not accept a 'static' declaration that comes after the
// friend declaration.
static void __offload_init_library_once();
static void __offload_fini_library();
35
36#include "offload_host.h"
37#ifdef MYO_SUPPORT
38#include "offload_myo_host.h"
39#endif
40
41#include <malloc.h>
42#ifndef TARGET_WINNT
43#include <alloca.h>
44#include <elf.h>
45#endif // TARGET_WINNT
46#include <errno.h>
47#include <fcntl.h>
48#include <stdlib.h>
49#include <string.h>
50#include <sys/stat.h>
51#include <sys/types.h>
52#include <sys/stat.h>
53
54#include <algorithm>
55#include <bitset>
56
// Path-list separator string: ";" when HOST_WINNT is defined, ":" otherwise.
#if defined(HOST_WINNT)
#define PATH_SEPARATOR ";"
#else
#define PATH_SEPARATOR ":"
#endif

// Yields timer_data->offload_number, or 0 when timer_data is null.
// The argument and the whole expansion are parenthesized so the conditional
// cannot combine with operators that surround the macro at the call site.
// Note that timer_data is evaluated twice when it is non-null.
#define GET_OFFLOAD_NUMBER(timer_data) \
    ((timer_data) ? (timer_data)->offload_number : 0)
65
#ifdef TARGET_WINNT
// Minimal set of ELF declarations needed to compile this file on Windows.
// The ELF header tells what kind of binary the target image contains:
// a shared library or an executable.

// Size of the e_ident identification array.
#define EI_NIDENT   16

// e_type values used by this file.
#define ET_EXEC     2   // executable
#define ET_DYN      3   // shared library

// Fixed-width scalar types used by the 64-bit ELF header.
typedef uint16_t Elf64_Half;
typedef uint32_t Elf64_Word;
typedef uint64_t Elf64_Addr;
typedef uint64_t Elf64_Off;

// 64-bit ELF file header.
typedef struct
{
    unsigned char e_ident[EI_NIDENT];   // identification bytes
    Elf64_Half    e_type;               // object file type (ET_EXEC, ET_DYN)
    Elf64_Half    e_machine;            // target architecture
    Elf64_Word    e_version;            // object file version
    Elf64_Addr    e_entry;              // entry point address
    Elf64_Off     e_phoff;              // program header table file offset
    Elf64_Off     e_shoff;              // section header table file offset
    Elf64_Word    e_flags;              // processor-specific flags
    Elf64_Half    e_ehsize;             // size of this header
    Elf64_Half    e_phentsize;          // size of a program header entry
    Elf64_Half    e_phnum;              // number of program header entries
    Elf64_Half    e_shentsize;          // size of a section header entry
    Elf64_Half    e_shnum;              // number of section header entries
    Elf64_Half    e_shstrndx;           // index of section name string table
} Elf64_Ehdr;
#endif // TARGET_WINNT
99
100// Host console and file logging
101const char *prefix;
102int console_enabled = 0;
103int offload_number = 0;
104
105static const char *htrace_envname = "H_TRACE";
106static const char *offload_report_envname = "OFFLOAD_REPORT";
107static char *timer_envname = "H_TIME";
108
109// Trace information
110static const char* vardesc_direction_as_string[] = {
111    "NOCOPY",
112    "IN",
113    "OUT",
114    "INOUT"
115};
116static const char* vardesc_type_as_string[] = {
117    "unknown",
118    "data",
119    "data_ptr",
120    "func_ptr",
121    "void_ptr",
122    "string_ptr",
123    "dv",
124    "dv_data",
125    "dv_data_slice",
126    "dv_ptr",
127    "dv_ptr_data",
128    "dv_ptr_data_slice",
129    "cean_var",
130    "cean_var_ptr",
131    "c_data_ptr_array",
132    "c_func_ptr_array",
133    "c_void_ptr_array",
134    "c_string_ptr_array"
135};
136
137Engine*         mic_engines = 0;
138uint32_t        mic_engines_total = 0;
139pthread_key_t   mic_thread_key;
140MicEnvVar       mic_env_vars;
141uint64_t        cpu_frequency = 0;
142
143// MIC_STACKSIZE
144uint32_t mic_stack_size = 12 * 1024 * 1024;
145
146// MIC_BUFFERSIZE
147uint64_t mic_buffer_size = 0;
148
149// MIC_LD_LIBRARY_PATH
150char* mic_library_path = 0;
151
152// MIC_PROXY_IO
153bool mic_proxy_io = true;
154
155// MIC_PROXY_FS_ROOT
156char* mic_proxy_fs_root = 0;
157
158// Threshold for creating buffers with large pages. Buffer is created
159// with large pages hint if its size exceeds the threshold value.
160// By default large pages are disabled right now (by setting default
161// value for threshold to MAX) due to HSD 4114629.
162uint64_t __offload_use_2mb_buffers = 0xffffffffffffffffULL;
163static const char *mic_use_2mb_buffers_envname  =
164    "MIC_USE_2MB_BUFFERS";
165
166static uint64_t __offload_use_async_buffer_write = 2 * 1024 * 1024;
167static const char *mic_use_async_buffer_write_envname  =
168    "MIC_USE_ASYNC_BUFFER_WRITE";
169
170static uint64_t __offload_use_async_buffer_read = 2 * 1024 * 1024;
171static const char *mic_use_async_buffer_read_envname  =
172    "MIC_USE_ASYNC_BUFFER_READ";
173
174// device initialization type
175OffloadInitType __offload_init_type = c_init_on_offload_all;
176static const char *offload_init_envname = "OFFLOAD_INIT";
177
178// active wait
179static bool __offload_active_wait = true;
180static const char *offload_active_wait_envname = "OFFLOAD_ACTIVE_WAIT";
181
182// OMP_DEFAULT_DEVICE
183int __omp_device_num = 0;
184static const char *omp_device_num_envname = "OMP_DEFAULT_DEVICE";
185
186// The list of pending target libraries
187static bool            __target_libs;
188static TargetImageList __target_libs_list;
189static mutex_t         __target_libs_lock;
190static mutex_t         stack_alloc_lock;
191
192// Target executable
193TargetImage*           __target_exe;
194
195static char * offload_get_src_base(void * ptr, uint8_t type)
196{
197    char *base;
198    if (VAR_TYPE_IS_PTR(type)) {
199        base = *static_cast<char**>(ptr);
200    }
201    else if (VAR_TYPE_IS_SCALAR(type)) {
202        base = static_cast<char*>(ptr);
203    }
204    else if (VAR_TYPE_IS_DV_DATA_SLICE(type) || VAR_TYPE_IS_DV_DATA(type)) {
205        ArrDesc *dvp;
206        if (VAR_TYPE_IS_DV_DATA_SLICE(type)) {
207            const arr_desc *ap = static_cast<const arr_desc*>(ptr);
208            dvp = (type == c_dv_data_slice) ?
209                  reinterpret_cast<ArrDesc*>(ap->base) :
210                  *reinterpret_cast<ArrDesc**>(ap->base);
211        }
212        else {
213            dvp = (type == c_dv_data) ?
214                  static_cast<ArrDesc*>(ptr) :
215                  *static_cast<ArrDesc**>(ptr);
216        }
217        base = reinterpret_cast<char*>(dvp->Base);
218    }
219    else {
220        base = NULL;
221    }
222    return base;
223}
224
225void OffloadDescriptor::report_coi_error(error_types msg, COIRESULT res)
226{
227    // special case for the 'process died' error
228    if (res == COI_PROCESS_DIED) {
229        m_device.fini_process(true);
230    }
231    else {
232        switch (msg) {
233            case c_buf_create:
234                if (res == COI_OUT_OF_MEMORY) {
235                    msg = c_buf_create_out_of_mem;
236                }
237                /* fallthru */
238
239            case c_buf_create_from_mem:
240            case c_buf_get_address:
241            case c_pipeline_create:
242            case c_pipeline_run_func:
243                LIBOFFLOAD_ERROR(msg, m_device.get_logical_index(), res);
244                break;
245
246            case c_buf_read:
247            case c_buf_write:
248            case c_buf_copy:
249            case c_buf_map:
250            case c_buf_unmap:
251            case c_buf_destroy:
252            case c_buf_set_state:
253                LIBOFFLOAD_ERROR(msg, res);
254                break;
255
256            default:
257                break;
258        }
259    }
260
261    exit(1);
262}
263
264_Offload_result OffloadDescriptor::translate_coi_error(COIRESULT res) const
265{
266    switch (res) {
267        case COI_SUCCESS:
268            return OFFLOAD_SUCCESS;
269
270        case COI_PROCESS_DIED:
271            return OFFLOAD_PROCESS_DIED;
272
273        case COI_OUT_OF_MEMORY:
274            return OFFLOAD_OUT_OF_MEMORY;
275
276        default:
277            return OFFLOAD_ERROR;
278    }
279}
280
281bool OffloadDescriptor::alloc_ptr_data(
282    PtrData* &ptr_data,
283    void *base,
284    int64_t disp,
285    int64_t size,
286    int64_t alloc_disp,
287    int align
288)
289{
290    // total length of base
291    int64_t length = disp + size;
292    bool is_new;
293
294    OFFLOAD_TRACE(3, "Creating association for data: addr %p, length %lld\n",
295                  base, length);
296
297    // add new entry
298    ptr_data = m_device.insert_ptr_data(base, length, is_new);
299    if (is_new) {
300
301        OFFLOAD_TRACE(3, "Added new association\n");
302
303        if (length > 0) {
304            OffloadTimer timer(get_timer_data(), c_offload_host_alloc_buffers);
305            COIRESULT res;
306
307            // align should be a power of 2
308            if (align > 0 && (align & (align - 1)) == 0) {
309                // offset within mic_buffer. Can do offset optimization
310                // only when source address alignment satisfies requested
311                // alignment on the target (cq172736).
312                if ((reinterpret_cast<intptr_t>(base) & (align - 1)) == 0) {
313                    ptr_data->mic_offset = reinterpret_cast<intptr_t>(base) & 4095;
314                }
315            }
316
317            // buffer size and flags
318            uint64_t buffer_size = length + ptr_data->mic_offset;
319            uint32_t buffer_flags = 0;
320
321            // create buffer with large pages if data length exceeds
322            // large page threshold
323            if (length >= __offload_use_2mb_buffers) {
324                buffer_flags = COI_OPTIMIZE_HUGE_PAGE_SIZE;
325            }
326
327            // create CPU buffer
328            OFFLOAD_DEBUG_TRACE_1(3,
329                          GET_OFFLOAD_NUMBER(get_timer_data()),
330                          c_offload_create_buf_host,
331                          "Creating buffer from source memory %p, "
332                          "length %lld\n", base, length);
333
334            // result is not checked because we can continue without cpu
335            // buffer. In this case we will use COIBufferRead/Write instead
336            // of COIBufferCopy.
337            COI::BufferCreateFromMemory(length,
338                                        COI_BUFFER_NORMAL,
339                                        0,
340                                        base,
341                                        1,
342                                        &m_device.get_process(),
343                                        &ptr_data->cpu_buf);
344
345            OFFLOAD_DEBUG_TRACE_1(3,
346                          GET_OFFLOAD_NUMBER(get_timer_data()),
347                          c_offload_create_buf_mic,
348                          "Creating buffer for sink: size %lld, offset %d, "
349                          "flags =0x%x\n", buffer_size - alloc_disp,
350                          ptr_data->mic_offset, buffer_flags);
351
352            // create MIC buffer
353            res = COI::BufferCreate(buffer_size - alloc_disp,
354                                    COI_BUFFER_NORMAL,
355                                    buffer_flags,
356                                    0,
357                                    1,
358                                    &m_device.get_process(),
359                                    &ptr_data->mic_buf);
360            if (res != COI_SUCCESS) {
361                if (m_status != 0) {
362                    m_status->result = translate_coi_error(res);
363                }
364                else if (m_is_mandatory) {
365                    report_coi_error(c_buf_create, res);
366                }
367                ptr_data->alloc_ptr_data_lock.unlock();
368                return false;
369            }
370
371            // make buffer valid on the device.
372            res = COI::BufferSetState(ptr_data->mic_buf,
373                                      m_device.get_process(),
374                                      COI_BUFFER_VALID,
375                                      COI_BUFFER_NO_MOVE,
376                                      0, 0, 0);
377            if (res != COI_SUCCESS) {
378                if (m_status != 0) {
379                    m_status->result = translate_coi_error(res);
380                }
381                else if (m_is_mandatory) {
382                    report_coi_error(c_buf_set_state, res);
383                }
384                ptr_data->alloc_ptr_data_lock.unlock();
385                return false;
386            }
387
388            res = COI::BufferSetState(ptr_data->mic_buf,
389                                      COI_PROCESS_SOURCE,
390                                      COI_BUFFER_INVALID,
391                                      COI_BUFFER_NO_MOVE,
392                                      0, 0, 0);
393            if (res != COI_SUCCESS) {
394                if (m_status != 0) {
395                    m_status->result = translate_coi_error(res);
396                }
397                else if (m_is_mandatory) {
398                    report_coi_error(c_buf_set_state, res);
399                }
400                ptr_data->alloc_ptr_data_lock.unlock();
401                return false;
402            }
403        }
404
405        ptr_data->alloc_disp = alloc_disp;
406        ptr_data->alloc_ptr_data_lock.unlock();
407    }
408    else {
409        mutex_locker_t locker(ptr_data->alloc_ptr_data_lock);
410
411        OFFLOAD_TRACE(3, "Found existing association: addr %p, length %lld, "
412                      "is_static %d\n",
413                      ptr_data->cpu_addr.start(), ptr_data->cpu_addr.length(),
414                      ptr_data->is_static);
415
416        // This is not a new entry. Make sure that provided address range fits
417        // into existing one.
418        MemRange addr_range(base, length - ptr_data->alloc_disp);
419        if (!ptr_data->cpu_addr.contains(addr_range)) {
420            LIBOFFLOAD_ERROR(c_bad_ptr_mem_range);
421            exit(1);
422        }
423
424        // if the entry is associated with static data it may not have buffers
425        // created because they are created on demand.
426        if (ptr_data->is_static && !init_static_ptr_data(ptr_data)) {
427            return false;
428        }
429    }
430
431    return true;
432}
433
434bool OffloadDescriptor::find_ptr_data(
435    PtrData* &ptr_data,
436    void *base,
437    int64_t disp,
438    int64_t size,
439    bool report_error
440)
441{
442    // total length of base
443    int64_t length = disp + size;
444
445    OFFLOAD_TRACE(3, "Looking for association for data: addr %p, "
446                  "length %lld\n", base, length);
447
448    // find existing association in pointer table
449    ptr_data = m_device.find_ptr_data(base);
450    if (ptr_data == 0) {
451        if (report_error) {
452            LIBOFFLOAD_ERROR(c_no_ptr_data, base);
453            exit(1);
454        }
455        OFFLOAD_TRACE(3, "Association does not exist\n");
456        return true;
457    }
458
459    OFFLOAD_TRACE(3, "Found association: base %p, length %lld, is_static %d\n",
460                  ptr_data->cpu_addr.start(), ptr_data->cpu_addr.length(),
461                  ptr_data->is_static);
462
463    // make sure that provided address range fits into existing one
464    MemRange addr_range(base, length);
465    if (!ptr_data->cpu_addr.contains(addr_range)) {
466        if (report_error) {
467            LIBOFFLOAD_ERROR(c_bad_ptr_mem_range);
468            exit(1);
469        }
470        OFFLOAD_TRACE(3, "Existing association partially overlaps with "
471                      "data address range\n");
472        ptr_data = 0;
473        return true;
474    }
475
476    // if the entry is associated with static data it may not have buffers
477    // created because they are created on demand.
478    if (ptr_data->is_static && !init_static_ptr_data(ptr_data)) {
479        return false;
480    }
481
482    return true;
483}
484
485bool OffloadDescriptor::init_static_ptr_data(PtrData *ptr_data)
486{
487    OffloadTimer timer(get_timer_data(), c_offload_host_alloc_buffers);
488
489    if (ptr_data->cpu_buf == 0) {
490        OFFLOAD_TRACE(3, "Creating buffer from source memory %llx\n",
491                      ptr_data->cpu_addr.start());
492
493        COIRESULT res = COI::BufferCreateFromMemory(
494            ptr_data->cpu_addr.length(),
495            COI_BUFFER_NORMAL,
496            0,
497            const_cast<void*>(ptr_data->cpu_addr.start()),
498            1, &m_device.get_process(),
499            &ptr_data->cpu_buf);
500
501        if (res != COI_SUCCESS) {
502            if (m_status != 0) {
503                m_status->result = translate_coi_error(res);
504                return false;
505            }
506            report_coi_error(c_buf_create_from_mem, res);
507        }
508    }
509
510    if (ptr_data->mic_buf == 0) {
511        OFFLOAD_TRACE(3, "Creating buffer from sink memory %llx\n",
512                      ptr_data->mic_addr);
513
514        COIRESULT res = COI::BufferCreateFromMemory(
515            ptr_data->cpu_addr.length(),
516            COI_BUFFER_NORMAL,
517            COI_SINK_MEMORY,
518            reinterpret_cast<void*>(ptr_data->mic_addr),
519            1, &m_device.get_process(),
520            &ptr_data->mic_buf);
521
522        if (res != COI_SUCCESS) {
523            if (m_status != 0) {
524                m_status->result = translate_coi_error(res);
525                return false;
526            }
527            report_coi_error(c_buf_create_from_mem, res);
528        }
529    }
530
531    return true;
532}
533
534bool OffloadDescriptor::init_mic_address(PtrData *ptr_data)
535{
536    if (ptr_data->mic_buf != 0 && ptr_data->mic_addr == 0) {
537        COIRESULT res = COI::BufferGetSinkAddress(ptr_data->mic_buf,
538                                                  &ptr_data->mic_addr);
539        if (res != COI_SUCCESS) {
540            if (m_status != 0) {
541                m_status->result = translate_coi_error(res);
542            }
543            else if (m_is_mandatory) {
544                report_coi_error(c_buf_get_address, res);
545            }
546            return false;
547        }
548    }
549    return true;
550}
551
552bool OffloadDescriptor::nullify_target_stack(
553    COIBUFFER targ_buf,
554    uint64_t size
555)
556{
557    char * ptr = (char*)malloc(size);
558    if (ptr == NULL)
559      LIBOFFLOAD_ERROR(c_malloc);
560    COIRESULT res;
561
562    memset(ptr, 0, size);
563    res = COI::BufferWrite(
564        targ_buf,
565        0,
566        ptr,
567        size,
568        COI_COPY_UNSPECIFIED,
569        0, 0, 0);
570    free(ptr);
571    if (res != COI_SUCCESS) {
572        if (m_status != 0) {
573            m_status->result = translate_coi_error(res);
574            return false;
575        }
576        report_coi_error(c_buf_write, res);
577    }
578    return true;
579}
580
581bool OffloadDescriptor::offload_stack_memory_manager(
582    const void * stack_begin,
583    int  routine_id,
584    int  buf_size,
585    int  align,
586    bool *is_new)
587{
588    mutex_locker_t locker(stack_alloc_lock);
589
590    PersistData * new_el;
591    PersistDataList::iterator it_begin = m_device.m_persist_list.begin();
592    PersistDataList::iterator it_end;
593    int erase = 0;
594
595    *is_new = false;
596
597    for (PersistDataList::iterator it = m_device.m_persist_list.begin();
598        it != m_device.m_persist_list.end(); it++) {
599        PersistData cur_el = *it;
600
601        if (stack_begin > it->stack_cpu_addr) {
602            // this stack data must be destroyed
603            m_destroy_stack.push_front(cur_el.stack_ptr_data);
604            it_end = it;
605            erase++;
606        }
607        else if (stack_begin == it->stack_cpu_addr) {
608            if (routine_id != it-> routine_id) {
609                // this stack data must be destroyed
610                m_destroy_stack.push_front(cur_el.stack_ptr_data);
611                it_end = it;
612                erase++;
613                break;
614            }
615            else {
616                // stack data is reused
617                m_stack_ptr_data = it->stack_ptr_data;
618                if (erase > 0) {
619                    // all obsolete stack sections must be erased from the list
620                    m_device.m_persist_list.erase(it_begin, ++it_end);
621
622                    m_in_datalen +=
623                        erase * sizeof(new_el->stack_ptr_data->mic_addr);
624                }
625                OFFLOAD_TRACE(3, "Reuse of stack buffer with addr %p\n",
626                                 m_stack_ptr_data->mic_addr);
627                return true;
628            }
629        }
630        else if (stack_begin < it->stack_cpu_addr) {
631            break;
632        }
633    }
634
635    if (erase > 0) {
636        // all obsolete stack sections must be erased from the list
637        m_device.m_persist_list.erase(it_begin, ++it_end);
638        m_in_datalen += erase * sizeof(new_el->stack_ptr_data->mic_addr);
639    }
640    // new stack table is created
641    new_el = new PersistData(stack_begin, routine_id, buf_size);
642    // create MIC buffer
643    COIRESULT res;
644    uint32_t buffer_flags = 0;
645
646    // create buffer with large pages if data length exceeds
647    // large page threshold
648    if (buf_size >= __offload_use_2mb_buffers) {
649        buffer_flags = COI_OPTIMIZE_HUGE_PAGE_SIZE;
650    }
651    res = COI::BufferCreate(buf_size,
652        COI_BUFFER_NORMAL,
653        buffer_flags,
654        0,
655        1,
656        &m_device.get_process(),
657        &new_el->stack_ptr_data->mic_buf);
658    if (res != COI_SUCCESS) {
659        if (m_status != 0) {
660            m_status->result = translate_coi_error(res);
661        }
662        else if (m_is_mandatory) {
663            report_coi_error(c_buf_create, res);
664        }
665        return false;
666    }
667    // make buffer valid on the device.
668    res = COI::BufferSetState(new_el->stack_ptr_data->mic_buf,
669        m_device.get_process(),
670        COI_BUFFER_VALID,
671        COI_BUFFER_NO_MOVE,
672        0, 0, 0);
673    if (res != COI_SUCCESS) {
674        if (m_status != 0) {
675            m_status->result = translate_coi_error(res);
676        }
677        else if (m_is_mandatory) {
678            report_coi_error(c_buf_set_state, res);
679        }
680        return false;
681    }
682    res = COI::BufferSetState(new_el->stack_ptr_data->mic_buf,
683        COI_PROCESS_SOURCE,
684        COI_BUFFER_INVALID,
685        COI_BUFFER_NO_MOVE,
686        0, 0, 0);
687    if (res != COI_SUCCESS) {
688        if (m_status != 0) {
689            m_status->result = translate_coi_error(res);
690        }
691        else if (m_is_mandatory) {
692            report_coi_error(c_buf_set_state, res);
693        }
694        return false;
695    }
696    // persistence algorithm requires target stack initialy to be nullified
697    if (!nullify_target_stack(new_el->stack_ptr_data->mic_buf, buf_size)) {
698        return false;
699    }
700
701    m_stack_ptr_data = new_el->stack_ptr_data;
702    init_mic_address(m_stack_ptr_data);
703    OFFLOAD_TRACE(3, "Allocating stack buffer with addr %p\n",
704                      m_stack_ptr_data->mic_addr);
705    m_device.m_persist_list.push_front(*new_el);
706    init_mic_address(new_el->stack_ptr_data);
707    *is_new = true;
708    return true;
709}
710
711bool OffloadDescriptor::setup_descriptors(
712    VarDesc *vars,
713    VarDesc2 *vars2,
714    int vars_total,
715    int entry_id,
716    const void *stack_addr
717)
718{
719    COIRESULT res;
720
721    OffloadTimer timer(get_timer_data(), c_offload_host_setup_buffers);
722
723    // make a copy of variable descriptors
724    m_vars_total = vars_total;
725    if (vars_total > 0) {
726        m_vars = (VarDesc*) malloc(m_vars_total * sizeof(VarDesc));
727        if (m_vars == NULL)
728          LIBOFFLOAD_ERROR(c_malloc);
729        memcpy(m_vars, vars, m_vars_total * sizeof(VarDesc));
730        m_vars_extra = (VarExtra*) malloc(m_vars_total * sizeof(VarExtra));
731        if (m_vars_extra == NULL)
732          LIBOFFLOAD_ERROR(c_malloc);
733    }
734
735    // dependencies
736    m_in_deps = (COIEVENT*) malloc(sizeof(COIEVENT) * (m_vars_total  + 1));
737    if (m_in_deps == NULL)
738      LIBOFFLOAD_ERROR(c_malloc);
739    if (m_vars_total > 0) {
740        m_out_deps = (COIEVENT*) malloc(sizeof(COIEVENT) * m_vars_total);
741        if (m_out_deps == NULL)
742          LIBOFFLOAD_ERROR(c_malloc);
743    }
744
745    // copyin/copyout data length
746    m_in_datalen = 0;
747    m_out_datalen = 0;
748
749    // First pass over variable descriptors
750    // - Calculate size of the input and output non-pointer data
751    // - Allocate buffers for input and output pointers
752    for (int i = 0; i < m_vars_total; i++) {
753        void*   alloc_base = NULL;
754        int64_t alloc_disp = 0;
755        int64_t alloc_size;
756        bool    src_is_for_mic = (m_vars[i].direction.out ||
757                                  m_vars[i].into == NULL);
758
759        const char *var_sname = "";
760        if (vars2 != NULL && i < vars_total) {
761            if (vars2[i].sname != NULL) {
762                var_sname = vars2[i].sname;
763            }
764        }
765        OFFLOAD_TRACE(2, "   VarDesc %d, var=%s, %s, %s\n",
766            i, var_sname,
767            vardesc_direction_as_string[m_vars[i].direction.bits],
768            vardesc_type_as_string[m_vars[i].type.src]);
769        if (vars2 != NULL && i < vars_total && vars2[i].dname != NULL) {
770            OFFLOAD_TRACE(2, "              into=%s, %s\n", vars2[i].dname,
771                vardesc_type_as_string[m_vars[i].type.dst]);
772        }
773        OFFLOAD_TRACE(2,
774            "              type_src=%d, type_dstn=%d, direction=%d, "
775            "alloc_if=%d, free_if=%d, align=%d, mic_offset=%d, flags=0x%x, "
776            "offset=%lld, size=%lld, count/disp=%lld, ptr=%p, into=%p\n",
777            m_vars[i].type.src,
778            m_vars[i].type.dst,
779            m_vars[i].direction.bits,
780            m_vars[i].alloc_if,
781            m_vars[i].free_if,
782            m_vars[i].align,
783            m_vars[i].mic_offset,
784            m_vars[i].flags.bits,
785            m_vars[i].offset,
786            m_vars[i].size,
787            m_vars[i].count,
788            m_vars[i].ptr,
789            m_vars[i].into);
790
791        if (m_vars[i].alloc != NULL) {
792            // array descriptor
793            const arr_desc *ap =
794                static_cast<const arr_desc*>(m_vars[i].alloc);
795
796            // debug dump
797            __arr_desc_dump("    ", "ALLOC", ap, 0);
798
799            __arr_data_offset_and_length(ap, alloc_disp, alloc_size);
800
801            alloc_base = reinterpret_cast<void*>(ap->base);
802        }
803
804        m_vars_extra[i].cpu_disp = 0;
805        m_vars_extra[i].cpu_offset = 0;
806        m_vars_extra[i].src_data = 0;
807        m_vars_extra[i].read_rng_src = 0;
808        m_vars_extra[i].read_rng_dst = 0;
809        // flag is_arr_ptr_el is 1 only for var_descs generated
810        // for c_data_ptr_array type
811        if (i < vars_total) {
812            m_vars_extra[i].is_arr_ptr_el = 0;
813        }
814
815        switch (m_vars[i].type.src) {
816            case c_data_ptr_array:
817                {
818                    const arr_desc *ap;
819                    const VarDesc3 *vd3 =
820                        static_cast<const VarDesc3*>(m_vars[i].ptr);
821                    int flags = vd3->array_fields;
822                    OFFLOAD_TRACE(2,
823                        "              pointer array flags = %04x\n", flags);
824                    OFFLOAD_TRACE(2,
825                        "              pointer array type is %s\n",
826                        vardesc_type_as_string[flags & 0x3f]);
827                    ap = static_cast<const arr_desc*>(vd3->ptr_array);
828                    __arr_desc_dump("              ", "ptr array", ap, 0);
829                    if (m_vars[i].into) {
830                        ap = static_cast<const arr_desc*>(m_vars[i].into);
831                        __arr_desc_dump(
832                            "              ", "into array", ap, 0);
833                    }
834                    if ((flags & (1<<flag_align_is_array)) != 0) {
835                        ap = static_cast<const arr_desc*>(vd3->align_array);
836                        __arr_desc_dump(
837                            "              ", "align array", ap, 0);
838                    }
839                    if ((flags & (1<<flag_alloc_if_is_array)) != 0) {
840                        ap = static_cast<const arr_desc*>(vd3->alloc_if_array);
841                        __arr_desc_dump(
842                            "              ", "alloc_if array", ap, 0);
843                    }
844                    if ((flags & (1<<flag_free_if_is_array)) != 0) {
845                        ap = static_cast<const arr_desc*>(vd3->free_if_array);
846                        __arr_desc_dump(
847                            "              ", "free_if array", ap, 0);
848                    }
849                    if ((flags & (1<<flag_extent_start_is_array)) != 0) {
850                        ap = static_cast<const arr_desc*>(vd3->extent_start);
851                        __arr_desc_dump(
852                            "              ", "extent_start array", ap, 0);
853                    } else if ((flags &
854                        (1<<flag_extent_start_is_scalar)) != 0) {
855                        OFFLOAD_TRACE(2,
856                            "              extent_start scalar = %d\n",
857                            (int64_t)vd3->extent_start);
858                    }
859                    if ((flags & (1<<flag_extent_elements_is_array)) != 0) {
860                        ap = static_cast<const arr_desc*>
861                            (vd3->extent_elements);
862                        __arr_desc_dump(
863                            "              ", "extent_elements array", ap, 0);
864                    } else if ((flags &
865                        (1<<flag_extent_elements_is_scalar)) != 0) {
866                        OFFLOAD_TRACE(2,
867                            "              extent_elements scalar = %d\n",
868                            (int64_t)vd3->extent_elements);
869                    }
870                    if ((flags & (1<<flag_into_start_is_array)) != 0) {
871                        ap = static_cast<const arr_desc*>(vd3->into_start);
872                        __arr_desc_dump(
873                            "              ", "into_start array", ap, 0);
874                    } else if ((flags &
875                        (1<<flag_into_start_is_scalar)) != 0) {
876                        OFFLOAD_TRACE(2,
877                            "              into_start scalar = %d\n",
878                            (int64_t)vd3->into_start);
879                    }
880                    if ((flags & (1<<flag_into_elements_is_array)) != 0) {
881                        ap = static_cast<const arr_desc*>(vd3->into_elements);
882                        __arr_desc_dump(
883                            "              ", "into_elements array", ap, 0);
884                    } else if ((flags &
885                        (1<<flag_into_elements_is_scalar)) != 0) {
886                        OFFLOAD_TRACE(2,
887                            "              into_elements scalar = %d\n",
888                            (int64_t)vd3->into_elements);
889                    }
890                    if ((flags & (1<<flag_alloc_start_is_array)) != 0) {
891                        ap = static_cast<const arr_desc*>(vd3->alloc_start);
892                        __arr_desc_dump(
893                            "              ", "alloc_start array", ap, 0);
894                    } else if ((flags &
895                        (1<<flag_alloc_start_is_scalar)) != 0) {
896                        OFFLOAD_TRACE(2,
897                            "              alloc_start scalar = %d\n",
898                            (int64_t)vd3->alloc_start);
899                    }
900                    if ((flags & (1<<flag_alloc_elements_is_array)) != 0) {
901                        ap = static_cast<const arr_desc*>(vd3->alloc_elements);
902                        __arr_desc_dump(
903                            "              ", "alloc_elements array", ap, 0);
904                    } else if ((flags &
905                        (1<<flag_alloc_elements_is_scalar)) != 0) {
906                        OFFLOAD_TRACE(2,
907                            "              alloc_elements scalar = %d\n",
908                            (int64_t)vd3->alloc_elements);
909                    }
910                }
911                if (!gen_var_descs_for_pointer_array(i)) {
912                    return false;
913                }
914                break;
915
916            case c_data:
917            case c_void_ptr:
918            case c_cean_var:
919                // In all uses later
920                // VarDesc.size will have the length of the data to be
921                // transferred
922                // VarDesc.disp will have an offset from base
923                if (m_vars[i].type.src == c_cean_var) {
924                    // array descriptor
925                    const arr_desc *ap =
926                        static_cast<const arr_desc*>(m_vars[i].ptr);
927
928                    // debug dump
929                    __arr_desc_dump("", "IN/OUT", ap, 0);
930
931                    // offset and length are derived from the array descriptor
932                    __arr_data_offset_and_length(ap, m_vars[i].disp,
933                                                 m_vars[i].size);
934                    if (!is_arr_desc_contiguous(ap)) {
935                        m_vars[i].flags.is_noncont_src = 1;
936                        m_vars_extra[i].read_rng_src =
937                            init_read_ranges_arr_desc(ap);
938                    }
939                    // all necessary information about length and offset is
940                    // transferred in var descriptor. There is no need to send
941                    // array descriptor to the target side.
942                    m_vars[i].ptr = reinterpret_cast<void*>(ap->base);
943                }
944                else {
945                    m_vars[i].size *= m_vars[i].count;
946                    m_vars[i].disp = 0;
947                }
948
949                if (m_vars[i].direction.bits) {
950                    // make sure that transfer size > 0
951                    if (m_vars[i].size <= 0) {
952                        LIBOFFLOAD_ERROR(c_zero_or_neg_transfer_size);
953                        exit(1);
954                    }
955
956                    if (m_vars[i].flags.is_static) {
957                        PtrData *ptr_data;
958
959                        // find data associated with variable
960                        if (!find_ptr_data(ptr_data,
961                                           m_vars[i].ptr,
962                                           m_vars[i].disp,
963                                           m_vars[i].size,
964                                           false)) {
965                            return false;
966                        }
967
968                        if (ptr_data != 0) {
969                            // offset to base from the beginning of the buffer
970                            // memory
971                            m_vars[i].offset =
972                                (char*) m_vars[i].ptr -
973                                (char*) ptr_data->cpu_addr.start();
974                        }
975                        else {
976                            m_vars[i].flags.is_static = false;
977                            if (m_vars[i].into == NULL) {
978                                m_vars[i].flags.is_static_dstn = false;
979                            }
980                        }
981                        m_vars_extra[i].src_data = ptr_data;
982                    }
983
984                    if (m_is_openmp) {
985                        if (m_vars[i].flags.is_static) {
986                            // Static data is transferred only by omp target
987                            // update construct which passes zeros for
988                            // alloc_if and free_if.
989                            if (m_vars[i].alloc_if || m_vars[i].free_if) {
990                                m_vars[i].direction.bits = c_parameter_nocopy;
991                            }
992                        }
993                        else {
994                            AutoData *auto_data;
995                            if (m_vars[i].alloc_if) {
996                                auto_data = m_device.insert_auto_data(
997                                    m_vars[i].ptr, m_vars[i].size);
998                                auto_data->add_reference();
999                            }
1000                            else {
1001                                // TODO: what should be done if var is not in
1002                                // the table?
1003                                auto_data = m_device.find_auto_data(
1004                                    m_vars[i].ptr);
1005                            }
1006
1007                            // For automatic variables data is transferred
1008                            // only if alloc_if == 0 && free_if == 0
1009                            // or reference count is 1
1010                            if ((m_vars[i].alloc_if || m_vars[i].free_if) &&
1011                                auto_data != 0 &&
1012                                auto_data->get_reference() != 1) {
1013                                m_vars[i].direction.bits = c_parameter_nocopy;
1014                            }
1015
1016                            // save data for later use
1017                            m_vars_extra[i].auto_data = auto_data;
1018                        }
1019                    }
1020
1021                    if (m_vars[i].direction.in &&
1022                        !m_vars[i].flags.is_static) {
1023                        m_in_datalen += m_vars[i].size;
1024
1025                        // for non-static target destination defined as CEAN
1026                        // expression we pass to target its size and dist
1027                        if (m_vars[i].into == NULL &&
1028                            m_vars[i].type.src == c_cean_var) {
1029                            m_in_datalen += 2 * sizeof(uint64_t);
1030                        }
1031                        m_need_runfunction = true;
1032                    }
1033                    if (m_vars[i].direction.out &&
1034                        !m_vars[i].flags.is_static) {
1035                        m_out_datalen += m_vars[i].size;
1036                        m_need_runfunction = true;
1037                    }
1038                }
1039                break;
1040
1041            case c_dv:
1042                if (m_vars[i].direction.bits ||
1043                    m_vars[i].alloc_if ||
1044                    m_vars[i].free_if) {
1045                    ArrDesc *dvp = static_cast<ArrDesc*>(m_vars[i].ptr);
1046
1047                    // debug dump
1048                    __dv_desc_dump("IN/OUT", dvp);
1049
1050                    // send dope vector contents excluding base
1051                    m_in_datalen += m_vars[i].size - sizeof(uint64_t);
1052                    m_need_runfunction = true;
1053                }
1054                break;
1055
1056            case c_string_ptr:
1057                if ((m_vars[i].direction.bits ||
1058                     m_vars[i].alloc_if ||
1059                     m_vars[i].free_if) &&
1060                    m_vars[i].size == 0) {
1061                    m_vars[i].size = 1;
1062                    m_vars[i].count =
1063                        strlen(*static_cast<char**>(m_vars[i].ptr)) + 1;
1064                }
1065                /* fallthru */
1066
1067            case c_data_ptr:
1068                if (m_vars[i].flags.is_stack_buf &&
1069                    !m_vars[i].direction.bits &&
1070                    m_vars[i].alloc_if) {
1071                    // this var_desc is for stack buffer
1072                    bool is_new;
1073
1074                    if (!offload_stack_memory_manager(
1075                            stack_addr, entry_id,
1076                            m_vars[i].count, m_vars[i].align, &is_new)) {
1077                        return false;
1078                    }
1079                    if (is_new) {
1080                        m_compute_buffers.push_back(
1081                            m_stack_ptr_data->mic_buf);
1082                        m_device.m_persist_list.front().cpu_stack_addr =
1083                            static_cast<char*>(m_vars[i].ptr);
1084                    }
1085                    else {
1086                        m_vars[i].flags.sink_addr = 1;
1087                        m_in_datalen += sizeof(m_stack_ptr_data->mic_addr);
1088                    }
1089                    m_vars[i].size = m_destroy_stack.size();
1090                    m_vars_extra[i].src_data = m_stack_ptr_data;
1091                    // need to add reference for buffer
1092                    m_need_runfunction = true;
1093                    break;
1094                }
1095                /* fallthru */
1096
1097            case c_cean_var_ptr:
1098            case c_dv_ptr:
1099                if (m_vars[i].type.src == c_cean_var_ptr) {
1100                    // array descriptor
1101                    const arr_desc *ap =
1102                        static_cast<const arr_desc*>(m_vars[i].ptr);
1103
1104                    // debug dump
1105                    __arr_desc_dump("", "IN/OUT", ap, 1);
1106
1107                    // offset and length are derived from the array descriptor
1108                    __arr_data_offset_and_length(ap, m_vars[i].disp,
1109                                                 m_vars[i].size);
1110
1111                    if (!is_arr_desc_contiguous(ap)) {
1112                        m_vars[i].flags.is_noncont_src = 1;
1113                        m_vars_extra[i].read_rng_src =
1114                            init_read_ranges_arr_desc(ap);
1115                    }
1116                    // all necessary information about length and offset is
1117                    // transferred in var descriptor. There is no need to send
1118                    // array descriptor to the target side.
1119                    m_vars[i].ptr = reinterpret_cast<void*>(ap->base);
1120                }
1121                else if (m_vars[i].type.src == c_dv_ptr) {
1122                    // need to send DV to the device unless it is 'nocopy'
1123                    if (m_vars[i].direction.bits ||
1124                        m_vars[i].alloc_if ||
1125                        m_vars[i].free_if) {
1126                        ArrDesc *dvp = *static_cast<ArrDesc**>(m_vars[i].ptr);
1127
1128                        // debug dump
1129                        __dv_desc_dump("IN/OUT", dvp);
1130
1131                        m_vars[i].direction.bits = c_parameter_in;
1132                    }
1133
1134                    // no displacement
1135                    m_vars[i].disp = 0;
1136                }
1137                else {
1138                    // c_data_ptr or c_string_ptr
1139                    m_vars[i].size *= m_vars[i].count;
1140                    m_vars[i].disp = 0;
1141                }
1142
1143                if (m_vars[i].direction.bits ||
1144                    m_vars[i].alloc_if ||
1145                    m_vars[i].free_if) {
1146                    PtrData *ptr_data;
1147
1148                    // check that buffer length >= 0
1149                    if (m_vars[i].alloc_if &&
1150                        m_vars[i].disp + m_vars[i].size < 0) {
1151                        LIBOFFLOAD_ERROR(c_zero_or_neg_ptr_len);
1152                        exit(1);
1153                    }
1154
1155                    // base address
1156                    void *base = *static_cast<void**>(m_vars[i].ptr);
1157
1158                    // allocate buffer if we have no INTO and don't need
1159                    // allocation for the ptr at target
1160                    if (src_is_for_mic) {
1161                        if (m_vars[i].flags.is_stack_buf) {
1162                            // for stack persistent objects ptr data is created
1163                            // by var_desc with number 0.
1164                            // Its ptr_data is stored at m_stack_ptr_data
1165                            ptr_data = m_stack_ptr_data;
1166                            m_vars[i].flags.sink_addr = 1;
1167                        }
1168                        else if (m_vars[i].alloc_if) {
1169                            // add new entry
1170                            if (!alloc_ptr_data(
1171                                    ptr_data,
1172                                    base,
1173                                    (alloc_base != NULL) ?
1174                                        alloc_disp : m_vars[i].disp,
1175                                    (alloc_base != NULL) ?
1176                                        alloc_size : m_vars[i].size,
1177                                    alloc_disp,
1178                                    (alloc_base != NULL) ?
1179                                        0 : m_vars[i].align)) {
1180                                return false;
1181                            }
1182
1183                            if (ptr_data->add_reference() == 0 &&
1184                                ptr_data->mic_buf != 0) {
1185                                // add buffer to the list of buffers that
1186                                // are passed to dispatch call
1187                                m_compute_buffers.push_back(
1188                                    ptr_data->mic_buf);
1189                            }
1190                            else {
1191                                // will send buffer address to device
1192                                m_vars[i].flags.sink_addr = 1;
1193                            }
1194
1195                            if (!ptr_data->is_static) {
1196                                // need to add reference for buffer
1197                                m_need_runfunction = true;
1198                            }
1199                        }
1200                        else {
1201                            bool error_if_not_found = true;
1202                            if (m_is_openmp) {
1203                                // For omp target update variable is ignored
1204                                // if it does not exist.
1205                                if (!m_vars[i].alloc_if &&
1206                                    !m_vars[i].free_if) {
1207                                    error_if_not_found = false;
1208                                }
1209                            }
1210
1211                            // use existing association from pointer table
1212                            if (!find_ptr_data(ptr_data,
1213                                               base,
1214                                               m_vars[i].disp,
1215                                               m_vars[i].size,
1216                                               error_if_not_found)) {
1217                                return false;
1218                            }
1219
1220                            if (m_is_openmp) {
1221                                // make var nocopy if it does not exist
1222                                if (ptr_data == 0) {
1223                                    m_vars[i].direction.bits =
1224                                        c_parameter_nocopy;
1225                                }
1226                            }
1227
1228                            if (ptr_data != 0) {
1229                                m_vars[i].flags.sink_addr = 1;
1230                            }
1231                        }
1232
1233                        if (ptr_data != 0) {
1234                            if (m_is_openmp) {
1235                                // data is transferred only if
1236                                // alloc_if == 0 && free_if == 0
1237                                // or reference count is 1
1238                                if ((m_vars[i].alloc_if ||
1239                                     m_vars[i].free_if) &&
1240                                    ptr_data->get_reference() != 1) {
1241                                    m_vars[i].direction.bits =
1242                                        c_parameter_nocopy;
1243                                }
1244                            }
1245
1246                            if (ptr_data->alloc_disp != 0) {
1247                                m_vars[i].flags.alloc_disp = 1;
1248                                m_in_datalen += sizeof(alloc_disp);
1249                            }
1250
1251                            if (m_vars[i].flags.sink_addr) {
1252                                // get buffer's address on the sink
1253                                if (!init_mic_address(ptr_data)) {
1254                                    return false;
1255                                }
1256
1257                                m_in_datalen += sizeof(ptr_data->mic_addr);
1258                            }
1259
1260                            if (!ptr_data->is_static && m_vars[i].free_if) {
1261                                // need to decrement buffer reference on target
1262                                m_need_runfunction = true;
1263                            }
1264
1265                            // offset to base from the beginning of the buffer
1266                            // memory
1267                            m_vars[i].offset = (char*) base -
1268                                (char*) ptr_data->cpu_addr.start();
1269
1270                            // copy other pointer properties to var descriptor
1271                            m_vars[i].mic_offset = ptr_data->mic_offset;
1272                            m_vars[i].flags.is_static = ptr_data->is_static;
1273                        }
1274                    }
1275                    else {
1276                        if (!find_ptr_data(ptr_data,
1277                                           base,
1278                                           m_vars[i].disp,
1279                                           m_vars[i].size,
1280                                           false)) {
1281                            return false;
1282                        }
1283                        if (ptr_data) {
1284                            m_vars[i].offset =
1285                                (char*) base -
1286                                (char*) ptr_data->cpu_addr.start();
1287                        }
1288                    }
1289
1290                    // save pointer data
1291                    m_vars_extra[i].src_data = ptr_data;
1292                }
1293                break;
1294
1295            case c_func_ptr:
1296                if (m_vars[i].direction.in) {
1297                    m_in_datalen += __offload_funcs.max_name_length();
1298                }
1299                if (m_vars[i].direction.out) {
1300                    m_out_datalen += __offload_funcs.max_name_length();
1301                }
1302                m_need_runfunction = true;
1303                break;
1304
1305            case c_dv_data:
1306            case c_dv_ptr_data:
1307            case c_dv_data_slice:
1308            case c_dv_ptr_data_slice:
1309                ArrDesc *dvp;
1310                if (VAR_TYPE_IS_DV_DATA_SLICE(m_vars[i].type.src)) {
1311                    const arr_desc *ap;
1312                    ap = static_cast<const arr_desc*>(m_vars[i].ptr);
1313
1314                    dvp = (m_vars[i].type.src == c_dv_data_slice) ?
1315                          reinterpret_cast<ArrDesc*>(ap->base) :
1316                          *reinterpret_cast<ArrDesc**>(ap->base);
1317                }
1318                else {
1319                    dvp = (m_vars[i].type.src == c_dv_data) ?
1320                          static_cast<ArrDesc*>(m_vars[i].ptr) :
1321                          *static_cast<ArrDesc**>(m_vars[i].ptr);
1322                }
1323
1324                // if allocatable dope vector isn't allocated don't
1325                // transfer its data
1326                if (!__dv_is_allocated(dvp)) {
1327                    m_vars[i].direction.bits = c_parameter_nocopy;
1328                    m_vars[i].alloc_if = 0;
1329                    m_vars[i].free_if = 0;
1330                }
1331                if (m_vars[i].direction.bits ||
1332                    m_vars[i].alloc_if ||
1333                    m_vars[i].free_if) {
1334                    const arr_desc *ap;
1335
1336                    if (VAR_TYPE_IS_DV_DATA_SLICE(m_vars[i].type.src)) {
1337                        ap = static_cast<const arr_desc*>(m_vars[i].ptr);
1338
1339                        // debug dump
1340                        __arr_desc_dump("", "IN/OUT", ap, 0);
1341                    }
1342                    if (!__dv_is_contiguous(dvp)) {
1343                        m_vars[i].flags.is_noncont_src = 1;
1344                        m_vars_extra[i].read_rng_src =
1345                            init_read_ranges_dv(dvp);
1346                    }
1347
1348                    // size and displacement
1349                    if (VAR_TYPE_IS_DV_DATA_SLICE(m_vars[i].type.src)) {
1350                        // offset and length are derived from the
1351                        // array descriptor
1352                        __arr_data_offset_and_length(ap,
1353                                                     m_vars[i].disp,
1354                                                     m_vars[i].size);
1355                        if (m_vars[i].direction.bits) {
1356                            if (!is_arr_desc_contiguous(ap)) {
1357                                if (m_vars[i].flags.is_noncont_src) {
1358                                    LIBOFFLOAD_ERROR(c_slice_of_noncont_array);
1359                                    return false;
1360                                }
1361                                m_vars[i].flags.is_noncont_src = 1;
1362                                m_vars_extra[i].read_rng_src =
1363                                    init_read_ranges_arr_desc(ap);
1364                            }
1365                        }
1366                    }
1367                    else {
1368                        if (m_vars[i].flags.has_length) {
1369                            m_vars[i].size =
1370                                __dv_data_length(dvp, m_vars[i].count);
1371                        }
1372                        else {
1373                            m_vars[i].size = __dv_data_length(dvp);
1374                        }
1375                        m_vars[i].disp = 0;
1376                    }
1377
1378                    // check that length >= 0
1379                    if (m_vars[i].alloc_if &&
1380                        (m_vars[i].disp + m_vars[i].size < 0)) {
1381                        LIBOFFLOAD_ERROR(c_zero_or_neg_ptr_len);
1382                        exit(1);
1383                    }
1384
1385                    // base address
1386                    void *base = reinterpret_cast<void*>(dvp->Base);
1387                    PtrData *ptr_data;
1388
1389                    // allocate buffer if we have no INTO and don't need
1390                    // allocation for the ptr at target
1391                    if (src_is_for_mic) {
1392                        if (m_vars[i].alloc_if) {
1393                            // add new entry
1394                            if (!alloc_ptr_data(
1395                                    ptr_data,
1396                                    base,
1397                                    (alloc_base != NULL) ?
1398                                        alloc_disp : m_vars[i].disp,
1399                                    (alloc_base != NULL) ?
1400                                        alloc_size : m_vars[i].size,
1401                                    alloc_disp,
1402                                    (alloc_base != NULL) ?
1403                                        0 : m_vars[i].align)) {
1404                                return false;
1405                            }
1406
1407                            if (ptr_data->add_reference() == 0 &&
1408                                ptr_data->mic_buf != 0) {
1409                                // add buffer to the list of buffers
1410                                // that are passed to dispatch call
1411                                m_compute_buffers.push_back(
1412                                    ptr_data->mic_buf);
1413                            }
1414                            else {
1415                                // will send buffer address to device
1416                                m_vars[i].flags.sink_addr = 1;
1417                            }
1418
1419                            if (!ptr_data->is_static) {
1420                                // need to add reference for buffer
1421                                m_need_runfunction = true;
1422                            }
1423                        }
1424                        else {
1425                            bool error_if_not_found = true;
1426                            if (m_is_openmp) {
1427                                // For omp target update variable is ignored
1428                                // if it does not exist.
1429                                if (!m_vars[i].alloc_if &&
1430                                    !m_vars[i].free_if) {
1431                                    error_if_not_found = false;
1432                                }
1433                            }
1434
1435                            // use existing association from pointer table
1436                            if (!find_ptr_data(ptr_data,
1437                                               base,
1438                                               m_vars[i].disp,
1439                                               m_vars[i].size,
1440                                               error_if_not_found)) {
1441                                return false;
1442                            }
1443
1444                            if (m_is_openmp) {
1445                                // make var nocopy if it does not exist
1446                                if (ptr_data == 0) {
1447                                    m_vars[i].direction.bits =
1448                                        c_parameter_nocopy;
1449                                }
1450                            }
1451
1452                            if (ptr_data != 0) {
1453                                // need to update base in dope vector on device
1454                                m_vars[i].flags.sink_addr = 1;
1455                            }
1456                        }
1457
1458                        if (ptr_data != 0) {
1459                            if (m_is_openmp) {
1460                                // data is transferred only if
1461                                // alloc_if == 0 && free_if == 0
1462                                // or reference count is 1
1463                                if ((m_vars[i].alloc_if ||
1464                                     m_vars[i].free_if) &&
1465                                    ptr_data->get_reference() != 1) {
1466                                    m_vars[i].direction.bits =
1467                                        c_parameter_nocopy;
1468                                }
1469                            }
1470
1471                            if (ptr_data->alloc_disp != 0) {
1472                                m_vars[i].flags.alloc_disp = 1;
1473                                m_in_datalen += sizeof(alloc_disp);
1474                            }
1475
1476                            if (m_vars[i].flags.sink_addr) {
1477                                // get buffer's address on the sink
1478                                if (!init_mic_address(ptr_data)) {
1479                                    return false;
1480                                }
1481
1482                                m_in_datalen += sizeof(ptr_data->mic_addr);
1483                            }
1484
1485                            if (!ptr_data->is_static && m_vars[i].free_if) {
1486                                // need to decrement buffer reference on target
1487                                m_need_runfunction = true;
1488                            }
1489
1490                            // offset to base from the beginning of the buffer
1491                            // memory
1492                            m_vars[i].offset =
1493                                (char*) base -
1494                                (char*) ptr_data->cpu_addr.start();
1495
1496                            // copy other pointer properties to var descriptor
1497                            m_vars[i].mic_offset = ptr_data->mic_offset;
1498                            m_vars[i].flags.is_static = ptr_data->is_static;
1499                        }
1500                    }
1501                    else { // !src_is_for_mic
1502                        if (!find_ptr_data(ptr_data,
1503                                           base,
1504                                           m_vars[i].disp,
1505                                           m_vars[i].size,
1506                                           false)) {
1507                            return false;
1508                        }
1509                        m_vars[i].offset = !ptr_data ? 0 :
1510                                (char*) base -
1511                                (char*) ptr_data->cpu_addr.start();
1512                    }
1513
1514                    // save pointer data
1515                    m_vars_extra[i].src_data = ptr_data;
1516                }
1517                break;
1518
1519            default:
1520                LIBOFFLOAD_ERROR(c_unknown_var_type, m_vars[i].type.src);
1521                LIBOFFLOAD_ABORT;
1522        }
1523        if (m_vars[i].type.src == c_data_ptr_array) {
1524            continue;
1525        }
1526
1527        if (src_is_for_mic && m_vars[i].flags.is_stack_buf) {
1528            m_vars[i].offset = static_cast<char*>(m_vars[i].ptr) -
1529                m_device.m_persist_list.front().cpu_stack_addr;
1530        }
1531        // if source is used at CPU save its offset and disp
1532        if (m_vars[i].into == NULL || m_vars[i].direction.in) {
1533            m_vars_extra[i].cpu_offset = m_vars[i].offset;
1534            m_vars_extra[i].cpu_disp   = m_vars[i].disp;
1535        }
1536
1537        // If "into" is define we need to do the similar work for it
1538        if (!m_vars[i].into) {
1539            continue;
1540        }
1541
1542        int64_t into_disp =0, into_offset = 0;
1543
1544        switch (m_vars[i].type.dst) {
1545            case c_data_ptr_array:
1546                break;
1547            case c_data:
1548            case c_void_ptr:
1549            case c_cean_var: {
1550                int64_t size = m_vars[i].size;
1551
1552                if (m_vars[i].type.dst == c_cean_var) {
1553                    // array descriptor
1554                    const arr_desc *ap =
1555                        static_cast<const arr_desc*>(m_vars[i].into);
1556
1557                    // debug dump
1558                    __arr_desc_dump("    ", "INTO", ap, 0);
1559
1560                    // offset and length are derived from the array descriptor
1561                    __arr_data_offset_and_length(ap, into_disp, size);
1562
1563                    if (!is_arr_desc_contiguous(ap)) {
1564                        m_vars[i].flags.is_noncont_dst = 1;
1565                        m_vars_extra[i].read_rng_dst =
1566                            init_read_ranges_arr_desc(ap);
1567                        if (!cean_ranges_match(
1568                            m_vars_extra[i].read_rng_src,
1569                            m_vars_extra[i].read_rng_dst)) {
1570                            LIBOFFLOAD_ERROR(c_ranges_dont_match);
1571                            exit(1);
1572                        }
1573                    }
1574                    m_vars[i].into = reinterpret_cast<void*>(ap->base);
1575                }
1576
1577                int64_t size_src = m_vars_extra[i].read_rng_src ?
1578                    cean_get_transf_size(m_vars_extra[i].read_rng_src) :
1579                    m_vars[i].size;
1580                int64_t size_dst = m_vars_extra[i].read_rng_dst ?
1581                    cean_get_transf_size(m_vars_extra[i].read_rng_dst) :
1582                    size;
1583                // It's supposed that "into" size must be not less
1584                // than src size
1585                if (size_src > size_dst) {
1586                    LIBOFFLOAD_ERROR(c_different_src_and_dstn_sizes,
1587                                     size_src, size_dst);
1588                    exit(1);
1589                }
1590
1591                if (m_vars[i].direction.bits) {
1592                    if (m_vars[i].flags.is_static_dstn) {
1593                        PtrData *ptr_data;
1594
1595                        // find data associated with variable
1596                        if (!find_ptr_data(ptr_data, m_vars[i].into,
1597                                           into_disp, size, false)) {
1598                            return false;
1599                        }
1600                        if (ptr_data != 0) {
1601                            // offset to base from the beginning of the buffer
1602                            // memory
1603                            into_offset =
1604                                (char*) m_vars[i].into -
1605                                (char*) ptr_data->cpu_addr.start();
1606                        }
1607                        else {
1608                            m_vars[i].flags.is_static_dstn = false;
1609                        }
1610                        m_vars_extra[i].dst_data = ptr_data;
1611                    }
1612                }
1613
1614                if (m_vars[i].direction.in &&
1615                    !m_vars[i].flags.is_static_dstn) {
1616                    m_in_datalen += m_vars[i].size;
1617
1618                    // for non-static target destination defined as CEAN
1619                    // expression we pass to target its size and dist
1620                    if (m_vars[i].type.dst == c_cean_var) {
1621                        m_in_datalen += 2 * sizeof(uint64_t);
1622                    }
1623                    m_need_runfunction = true;
1624                }
1625                break;
1626            }
1627
1628            case c_dv:
1629                if (m_vars[i].direction.bits ||
1630                    m_vars[i].alloc_if ||
1631                    m_vars[i].free_if) {
1632                    ArrDesc *dvp = static_cast<ArrDesc*>(m_vars[i].into);
1633
1634                    // debug dump
1635                    __dv_desc_dump("INTO", dvp);
1636
1637                    // send dope vector contents excluding base
1638                    m_in_datalen += m_vars[i].size - sizeof(uint64_t);
1639                    m_need_runfunction = true;
1640                }
1641                break;
1642
1643            case c_string_ptr:
1644            case c_data_ptr:
1645            case c_cean_var_ptr:
1646            case c_dv_ptr: {
1647                int64_t size = m_vars[i].size;
1648
1649                if (m_vars[i].type.dst == c_cean_var_ptr) {
1650                    // array descriptor
1651                    const arr_desc *ap =
1652                        static_cast<const arr_desc*>(m_vars[i].into);
1653
1654                    // debug dump
1655                    __arr_desc_dump("    ", "INTO", ap, 1);
1656
1657                    // offset and length are derived from the array descriptor
1658                    __arr_data_offset_and_length(ap, into_disp, size);
1659
1660                    if (!is_arr_desc_contiguous(ap)) {
1661                        m_vars[i].flags.is_noncont_src = 1;
1662                        m_vars_extra[i].read_rng_dst =
1663                            init_read_ranges_arr_desc(ap);
1664                        if (!cean_ranges_match(
1665                            m_vars_extra[i].read_rng_src,
1666                            m_vars_extra[i].read_rng_dst)) {
1667                            LIBOFFLOAD_ERROR(c_ranges_dont_match);
1668                        }
1669                    }
1670                    m_vars[i].into = reinterpret_cast<char**>(ap->base);
1671                }
1672                else if (m_vars[i].type.dst == c_dv_ptr) {
1673                    // need to send DV to the device unless it is 'nocopy'
1674                    if (m_vars[i].direction.bits ||
1675                        m_vars[i].alloc_if ||
1676                        m_vars[i].free_if) {
1677                        ArrDesc *dvp = *static_cast<ArrDesc**>(m_vars[i].into);
1678
1679                        // debug dump
1680                        __dv_desc_dump("INTO", dvp);
1681
1682                        m_vars[i].direction.bits = c_parameter_in;
1683                    }
1684                }
1685
1686                int64_t size_src = m_vars_extra[i].read_rng_src ?
1687                    cean_get_transf_size(m_vars_extra[i].read_rng_src) :
1688                    m_vars[i].size;
1689                int64_t size_dst = m_vars_extra[i].read_rng_dst ?
1690                    cean_get_transf_size(m_vars_extra[i].read_rng_dst) :
1691                    size;
1692                // It's supposed that "into" size must be not less than
1693                // src size
1694                if (size_src > size_dst) {
1695                    LIBOFFLOAD_ERROR(c_different_src_and_dstn_sizes,
1696                                     size_src, size_dst);
1697                    exit(1);
1698                }
1699
1700                if (m_vars[i].direction.bits) {
1701                    PtrData *ptr_data;
1702
1703                    // base address
1704                    void *base = *static_cast<void**>(m_vars[i].into);
1705
1706                    if (m_vars[i].direction.in) {
1707                        // allocate buffer
1708                        if (m_vars[i].flags.is_stack_buf) {
1709                            // for stack persistent objects ptr data is created
1710                            // by var_desc with number 0.
1711                            // Its ptr_data is stored at m_stack_ptr_data
1712                            ptr_data = m_stack_ptr_data;
1713                            m_vars[i].flags.sink_addr = 1;
1714                        }
1715                        else if (m_vars[i].alloc_if) {
1716                            // add new entry
1717                            if (!alloc_ptr_data(
1718                                    ptr_data,
1719                                    base,
1720                                    (alloc_base != NULL) ?
1721                                        alloc_disp : into_disp,
1722                                    (alloc_base != NULL) ?
1723                                        alloc_size : size,
1724                                    alloc_disp,
1725                                    (alloc_base != NULL) ?
1726                                        0 : m_vars[i].align)) {
1727                                return false;
1728                            }
1729
1730                            if (ptr_data->add_reference() == 0 &&
1731                                ptr_data->mic_buf != 0) {
1732                                // add buffer to the list of buffers that
1733                                // are passed to dispatch call
1734                                m_compute_buffers.push_back(
1735                                    ptr_data->mic_buf);
1736                            }
1737                            else {
1738                                // will send buffer address to device
1739                                m_vars[i].flags.sink_addr = 1;
1740                            }
1741
1742                            if (!ptr_data->is_static) {
1743                                // need to add reference for buffer
1744                                m_need_runfunction = true;
1745                            }
1746                        }
1747                        else {
1748                            // use existing association from pointer table
1749                            if (!find_ptr_data(ptr_data, base, into_disp, size)) {
1750                                return false;
1751                            }
1752                            m_vars[i].flags.sink_addr = 1;
1753                        }
1754
1755                        if (ptr_data->alloc_disp != 0) {
1756                            m_vars[i].flags.alloc_disp = 1;
1757                            m_in_datalen += sizeof(alloc_disp);
1758                        }
1759
1760                        if (m_vars[i].flags.sink_addr) {
1761                            // get buffers's address on the sink
1762                            if (!init_mic_address(ptr_data)) {
1763                                return false;
1764                            }
1765
1766                            m_in_datalen += sizeof(ptr_data->mic_addr);
1767                        }
1768
1769                        if (!ptr_data->is_static && m_vars[i].free_if) {
1770                            // need to decrement buffer reference on target
1771                            m_need_runfunction = true;
1772                        }
1773
1774                        // copy other pointer properties to var descriptor
1775                        m_vars[i].mic_offset = ptr_data->mic_offset;
1776                        m_vars[i].flags.is_static_dstn = ptr_data->is_static;
1777                    }
1778                    else {
1779                        if (!find_ptr_data(ptr_data,
1780                                           base,
1781                                           into_disp,
1782                                           m_vars[i].size,
1783                                           false)) {
1784                            return false;
1785                        }
1786                    }
1787                    if (ptr_data) {
1788                        into_offset = ptr_data ?
1789                            (char*) base -
1790                            (char*) ptr_data->cpu_addr.start() :
1791                            0;
1792                    }
1793                    // save pointer data
1794                    m_vars_extra[i].dst_data = ptr_data;
1795                }
1796                break;
1797            }
1798
1799            case c_func_ptr:
1800                break;
1801
1802            case c_dv_data:
1803            case c_dv_ptr_data:
1804            case c_dv_data_slice:
1805            case c_dv_ptr_data_slice:
1806                if (m_vars[i].direction.bits ||
1807                    m_vars[i].alloc_if ||
1808                    m_vars[i].free_if) {
1809                    const arr_desc *ap;
1810                    ArrDesc *dvp;
1811                    PtrData *ptr_data;
1812                    int64_t disp;
1813                    int64_t size;
1814
1815                    if (VAR_TYPE_IS_DV_DATA_SLICE(m_vars[i].type.dst)) {
1816                        ap = static_cast<const arr_desc*>(m_vars[i].into);
1817
1818                        // debug dump
1819                        __arr_desc_dump("    ", "INTO", ap, 0);
1820
1821                        dvp = (m_vars[i].type.dst == c_dv_data_slice) ?
1822                              reinterpret_cast<ArrDesc*>(ap->base) :
1823                              *reinterpret_cast<ArrDesc**>(ap->base);
1824                    }
1825                    else {
1826                        dvp = (m_vars[i].type.dst == c_dv_data) ?
1827                              static_cast<ArrDesc*>(m_vars[i].into) :
1828                              *static_cast<ArrDesc**>(m_vars[i].into);
1829                    }
1830                    if (!__dv_is_contiguous(dvp)) {
1831                        m_vars[i].flags.is_noncont_dst = 1;
1832                        m_vars_extra[i].read_rng_dst =
1833                            init_read_ranges_dv(dvp);
1834                    }
1835                    // size and displacement
1836                    if (VAR_TYPE_IS_DV_DATA_SLICE(m_vars[i].type.dst)) {
1837                        // offset and length are derived from the array
1838                        // descriptor
1839                        __arr_data_offset_and_length(ap, into_disp, size);
1840                        if (m_vars[i].direction.bits) {
1841                            if (!is_arr_desc_contiguous(ap)) {
1842                                if (m_vars[i].flags.is_noncont_dst) {
1843                                    LIBOFFLOAD_ERROR(c_slice_of_noncont_array);
1844                                    return false;
1845                                }
1846                                m_vars[i].flags.is_noncont_dst = 1;
1847                                m_vars_extra[i].read_rng_dst =
1848                                    init_read_ranges_arr_desc(ap);
1849                                if (!cean_ranges_match(
1850                                    m_vars_extra[i].read_rng_src,
1851                                    m_vars_extra[i].read_rng_dst)) {
1852                                    LIBOFFLOAD_ERROR(c_ranges_dont_match);
1853                                }
1854                            }
1855                        }
1856                    }
1857                    else {
1858                        if (m_vars[i].flags.has_length) {
1859                            size = __dv_data_length(dvp, m_vars[i].count);
1860                        }
1861                        else {
1862                            size = __dv_data_length(dvp);
1863                        }
1864                        disp = 0;
1865                    }
1866
1867                    int64_t size_src =
1868                        m_vars_extra[i].read_rng_src ?
1869                        cean_get_transf_size(m_vars_extra[i].read_rng_src) :
1870                        m_vars[i].size;
1871                    int64_t size_dst =
1872                        m_vars_extra[i].read_rng_dst ?
1873                        cean_get_transf_size(m_vars_extra[i].read_rng_dst) :
1874                        size;
1875                    // It's supposed that "into" size must be not less
1876                    // than src size
1877                    if (size_src > size_dst) {
1878                        LIBOFFLOAD_ERROR(c_different_src_and_dstn_sizes,
1879                            size_src, size_dst);
1880                        exit(1);
1881                    }
1882
1883                    // base address
1884                    void *base = reinterpret_cast<void*>(dvp->Base);
1885
1886                    // allocate buffer
1887                    if (m_vars[i].direction.in) {
1888                        if (m_vars[i].alloc_if) {
1889                            // add new entry
1890                            if (!alloc_ptr_data(
1891                                    ptr_data,
1892                                    base,
1893                                    (alloc_base != NULL) ?
1894                                        alloc_disp : into_disp,
1895                                    (alloc_base != NULL) ?
1896                                        alloc_size : size,
1897                                    alloc_disp,
1898                                    (alloc_base != NULL) ?
1899                                        0 : m_vars[i].align)) {
1900                                return false;
1901                            }
1902                            if (ptr_data->add_reference() == 0 &&
1903                                ptr_data->mic_buf !=0) {
1904                                // add buffer to the list of buffers
1905                                // that are passed to dispatch call
1906                                m_compute_buffers.push_back(
1907                                    ptr_data->mic_buf);
1908                            }
1909                            else {
1910                                // will send buffer address to device
1911                                m_vars[i].flags.sink_addr = 1;
1912                            }
1913
1914                            if (!ptr_data->is_static) {
1915                                // need to add reference for buffer
1916                                m_need_runfunction = true;
1917                            }
1918                        }
1919                        else {
1920                            // use existing association from pointer table
1921                            if (!find_ptr_data(ptr_data, base, into_disp, size)) {
1922                                return false;
1923                            }
1924
1925                            // need to update base in dope vector on device
1926                            m_vars[i].flags.sink_addr = 1;
1927                        }
1928
1929                        if (ptr_data->alloc_disp != 0) {
1930                            m_vars[i].flags.alloc_disp = 1;
1931                            m_in_datalen += sizeof(alloc_disp);
1932                        }
1933
1934                        if (m_vars[i].flags.sink_addr) {
1935                            // get buffers's address on the sink
1936                            if (!init_mic_address(ptr_data)) {
1937                                return false;
1938                            }
1939                            m_in_datalen += sizeof(ptr_data->mic_addr);
1940                        }
1941
1942                        if (!ptr_data->is_static && m_vars[i].free_if) {
1943                            // need to decrement buffer reference on target
1944                            m_need_runfunction = true;
1945                        }
1946
1947                        // offset to base from the beginning of the buffer
1948                        // memory
1949                        into_offset =
1950                            (char*) base - (char*) ptr_data->cpu_addr.start();
1951
1952                        // copy other pointer properties to var descriptor
1953                        m_vars[i].mic_offset = ptr_data->mic_offset;
1954                        m_vars[i].flags.is_static_dstn = ptr_data->is_static;
1955                    }
1956                    else { // src_is_for_mic
1957                        if (!find_ptr_data(ptr_data,
1958                                           base,
1959                                           into_disp,
1960                                           size,
1961                                           false)) {
1962                            return false;
1963                        }
1964                        into_offset = !ptr_data ?
1965                            0 :
1966                            (char*) base - (char*) ptr_data->cpu_addr.start();
1967                    }
1968
1969                    // save pointer data
1970                    m_vars_extra[i].dst_data = ptr_data;
1971                }
1972                break;
1973
1974            default:
1975                LIBOFFLOAD_ERROR(c_unknown_var_type, m_vars[i].type.src);
1976                LIBOFFLOAD_ABORT;
1977        }
1978        // if into is used at CPU save its offset and disp
1979        if (m_vars[i].direction.out) {
1980            m_vars_extra[i].cpu_offset = into_offset;
1981            m_vars_extra[i].cpu_disp   = into_disp;
1982        }
1983        else {
1984            if (m_vars[i].flags.is_stack_buf) {
1985                into_offset = static_cast<char*>(m_vars[i].into) -
1986                    m_device.m_persist_list.front().cpu_stack_addr;
1987            }
1988            m_vars[i].offset = into_offset;
1989            m_vars[i].disp   = into_disp;
1990        }
1991    }
1992
1993    return true;
1994}
1995
1996bool OffloadDescriptor::setup_misc_data(const char *name)
1997{
1998    OffloadTimer timer(get_timer_data(), c_offload_host_setup_misc_data);
1999
2000    // we can skip run functon call together with wait if offloaded
2001    // region is empty and there is no user defined non-pointer IN/OUT data
2002    if (m_need_runfunction) {
2003        // variable descriptors are sent as input data
2004        m_in_datalen += m_vars_total * sizeof(VarDesc);
2005
2006        // timer data is sent as a part of the output data
2007        m_out_datalen += OFFLOAD_TIMER_DATALEN();
2008
2009        // max from input data and output data length
2010        uint64_t data_len = m_in_datalen > m_out_datalen ? m_in_datalen :
2011                                                           m_out_datalen;
2012
2013        // Misc data has the following layout
2014        //     <Function Descriptor>
2015        //     <Function Name>
2016        //     <In/Out Data>            (optional)
2017        //
2018        // We can transfer copyin/copyout data in misc/return data which can
2019        // be passed to run function call if its size does not exceed
2020        // COI_PIPELINE_MAX_IN_MISC_DATA_LEN. Otherwise we have to allocate
2021        // buffer for it.
2022
2023        m_func_desc_size = sizeof(FunctionDescriptor) + strlen(name) + 1;
2024        m_func_desc_size = (m_func_desc_size + 7) & ~7;
2025
2026        int misc_data_offset = 0;
2027        int misc_data_size = 0;
2028        if (data_len > 0) {
2029            if (m_func_desc_size +
2030                m_in_datalen <= COI_PIPELINE_MAX_IN_MISC_DATA_LEN &&
2031                m_out_datalen <= COI_PIPELINE_MAX_IN_MISC_DATA_LEN) {
2032                // use misc/return data for copyin/copyout
2033                misc_data_offset = m_func_desc_size;
2034                misc_data_size = data_len;
2035            }
2036            else {
2037                OffloadTimer timer_buf(get_timer_data(),
2038                                       c_offload_host_alloc_data_buffer);
2039
2040                // send/receive data using buffer
2041                COIRESULT res = COI::BufferCreate(data_len,
2042                                                  COI_BUFFER_NORMAL,
2043                                                  0, 0,
2044                                                  1, &m_device.get_process(),
2045                                                  &m_inout_buf);
2046                if (res != COI_SUCCESS) {
2047                    if (m_status != 0) {
2048                        m_status->result = translate_coi_error(res);
2049                        return false;
2050                    }
2051                    report_coi_error(c_buf_create, res);
2052                }
2053
2054                m_compute_buffers.push_back(m_inout_buf);
2055                m_destroy_buffers.push_back(m_inout_buf);
2056            }
2057        }
2058
2059        // initialize function descriptor
2060        m_func_desc = (FunctionDescriptor*) malloc(m_func_desc_size +
2061                                                   misc_data_size);
2062        if (m_func_desc == NULL)
2063          LIBOFFLOAD_ERROR(c_malloc);
2064        m_func_desc->console_enabled = console_enabled;
2065        m_func_desc->timer_enabled =
2066            timer_enabled || (offload_report_level && offload_report_enabled);
2067        m_func_desc->offload_report_level = offload_report_level;
2068        m_func_desc->offload_number = GET_OFFLOAD_NUMBER(get_timer_data());
2069        m_func_desc->in_datalen = m_in_datalen;
2070        m_func_desc->out_datalen = m_out_datalen;
2071        m_func_desc->vars_num = m_vars_total;
2072        m_func_desc->data_offset = misc_data_offset;
2073
2074        // append entry name
2075        strcpy(m_func_desc->data, name);
2076    }
2077
2078    return true;
2079}
2080
2081bool OffloadDescriptor::wait_dependencies(
2082    const void **waits,
2083    int num_waits
2084)
2085{
2086    OffloadTimer timer(get_timer_data(), c_offload_host_wait_deps);
2087    bool ret = true;
2088
2089    for (int i = 0; i < num_waits; i++) {
2090
2091        OffloadDescriptor *task = m_device.find_signal(waits[i], true);
2092        if (task == 0) {
2093            LIBOFFLOAD_ERROR(c_offload1, m_device.get_logical_index(),
2094                             waits[i]);
2095            LIBOFFLOAD_ABORT;
2096        }
2097
2098        if (!task->offload_finish()) {
2099            ret = false;
2100        }
2101
2102        task->cleanup();
2103        delete task;
2104    }
2105
2106    return ret;
2107}
2108
2109bool OffloadDescriptor::offload(
2110    const char *name,
2111    bool is_empty,
2112    VarDesc *vars,
2113    VarDesc2 *vars2,
2114    int vars_total,
2115    const void **waits,
2116    int num_waits,
2117    const void **signal,
2118    int entry_id,
2119    const void *stack_addr
2120)
2121{
2122    if (signal == 0) {
2123        OFFLOAD_DEBUG_TRACE_1(1,
2124                      GET_OFFLOAD_NUMBER(get_timer_data()),
2125                      c_offload_init_func,
2126                      "Offload function %s, is_empty=%d, #varDescs=%d, "
2127                      "#waits=%d, signal=none\n",
2128                      name, is_empty, vars_total, num_waits);
2129        OFFLOAD_REPORT(3, GET_OFFLOAD_NUMBER(get_timer_data()),
2130                      c_offload_sent_pointer_data,
2131                      "#Wait : %d \n", num_waits);
2132        OFFLOAD_REPORT(3, GET_OFFLOAD_NUMBER(get_timer_data()),
2133                      c_offload_signal,
2134                      "none %d\n", 0);
2135    }
2136    else {
2137        OFFLOAD_DEBUG_TRACE_1(1,
2138                      GET_OFFLOAD_NUMBER(get_timer_data()),
2139                      c_offload_init_func,
2140                      "Offload function %s, is_empty=%d, #varDescs=%d, "
2141                      "#waits=%d, signal=%p\n",
2142                      name, is_empty, vars_total, num_waits,
2143                      *signal);
2144
2145        OFFLOAD_REPORT(3, GET_OFFLOAD_NUMBER(get_timer_data()),
2146                      c_offload_signal,
2147                      "%d\n", signal);
2148    }
2149    OFFLOAD_REPORT(3, GET_OFFLOAD_NUMBER(get_timer_data()),
2150                      c_offload_wait,
2151                      "#Wait : %d  %p\n", num_waits, waits);
2152
2153    if (m_status != 0) {
2154        m_status->result = OFFLOAD_SUCCESS;
2155        m_status->device_number = m_device.get_logical_index();
2156    }
2157
2158    m_need_runfunction = !is_empty;
2159
2160    // wait for dependencies to finish
2161    if (!wait_dependencies(waits, num_waits)) {
2162        cleanup();
2163        return false;
2164    }
2165
2166    // setup buffers
2167    if (!setup_descriptors(vars, vars2, vars_total, entry_id, stack_addr)) {
2168        cleanup();
2169        return false;
2170    }
2171
2172    // initiate send for pointers. Want to do it as early as possible.
2173    if (!send_pointer_data(signal != 0)) {
2174        cleanup();
2175        return false;
2176    }
2177
2178    // setup misc data for run function
2179    if (!setup_misc_data(name)) {
2180        cleanup();
2181        return false;
2182    }
2183
2184    // gather copyin data into buffer
2185    if (!gather_copyin_data()) {
2186        cleanup();
2187        return false;
2188    }
2189
2190    // Start the computation
2191    if (!compute()) {
2192        cleanup();
2193        return false;
2194    }
2195
2196    // initiate receive for pointers
2197    if (!receive_pointer_data(signal != 0)) {
2198        cleanup();
2199        return false;
2200    }
2201
2202    // if there is a signal save descriptor for the later use.
2203    if (signal != 0) {
2204        m_device.add_signal(*signal, this);
2205        return true;
2206    }
2207
2208    // wait for the offload to finish.
2209    if (!offload_finish()) {
2210        cleanup();
2211        return false;
2212    }
2213
2214    cleanup();
2215    return true;
2216}
2217
2218bool OffloadDescriptor::offload_finish()
2219{
2220    COIRESULT res;
2221
2222    // wait for compute dependencies to become signaled
2223    if (m_in_deps_total > 0) {
2224        OffloadTimer timer(get_timer_data(), c_offload_host_wait_compute);
2225
2226        if (__offload_active_wait) {
2227            // keep CPU busy
2228            do {
2229                res = COI::EventWait(m_in_deps_total, m_in_deps, 0, 1, 0, 0);
2230            }
2231            while (res == COI_TIME_OUT_REACHED);
2232        }
2233        else {
2234            res = COI::EventWait(m_in_deps_total, m_in_deps, -1, 1, 0, 0);
2235        }
2236
2237        if (res != COI_SUCCESS) {
2238            if (m_status != 0) {
2239                m_status->result = translate_coi_error(res);
2240                return false;
2241            }
2242            report_coi_error(c_event_wait, res);
2243        }
2244    }
2245
2246    // scatter copyout data received from target
2247    if (!scatter_copyout_data()) {
2248        return false;
2249    }
2250    // wait for receive dependencies to become signaled
2251    if (m_out_deps_total > 0) {
2252        OffloadTimer timer(get_timer_data(), c_offload_host_wait_buffers_reads);
2253
2254        if (__offload_active_wait) {
2255            // keep CPU busy
2256            do {
2257                res = COI::EventWait(m_out_deps_total, m_out_deps, 0, 1, 0, 0);
2258            }
2259            while (res == COI_TIME_OUT_REACHED);
2260        }
2261        else {
2262            res = COI::EventWait(m_out_deps_total, m_out_deps, -1, 1, 0, 0);
2263        }
2264
2265        if (res != COI_SUCCESS) {
2266            if (m_status != 0) {
2267                m_status->result = translate_coi_error(res);
2268                return false;
2269            }
2270            report_coi_error(c_event_wait, res);
2271        }
2272    }
2273
2274    // destroy buffers
2275    {
2276        OffloadTimer timer(get_timer_data(), c_offload_host_destroy_buffers);
2277
2278        for (BufferList::const_iterator it = m_destroy_buffers.begin();
2279             it != m_destroy_buffers.end(); it++) {
2280            res = COI::BufferDestroy(*it);
2281            if (res != COI_SUCCESS) {
2282                if (m_status != 0) {
2283                    m_status->result = translate_coi_error(res);
2284                    return false;
2285                }
2286                report_coi_error(c_buf_destroy, res);
2287            }
2288        }
2289    }
2290
2291    return true;
2292}
2293
2294void OffloadDescriptor::cleanup()
2295{
2296    // release device in orsl
2297    ORSL::release(m_device.get_logical_index());
2298
2299    OFFLOAD_TIMER_STOP(get_timer_data(), c_offload_host_total_offload);
2300
2301    // report stuff
2302    Offload_Report_Epilog(get_timer_data());
2303}
2304
2305bool OffloadDescriptor::is_signaled()
2306{
2307    bool signaled = true;
2308    COIRESULT res;
2309
2310    // check compute and receive dependencies
2311    if (m_in_deps_total > 0) {
2312        res = COI::EventWait(m_in_deps_total, m_in_deps, 0, 1, 0, 0);
2313        signaled = signaled && (res == COI_SUCCESS);
2314    }
2315    if (m_out_deps_total > 0) {
2316        res = COI::EventWait(m_out_deps_total, m_out_deps, 0, 1, 0, 0);
2317        signaled = signaled && (res == COI_SUCCESS);
2318    }
2319
2320    return signaled;
2321}
2322
2323// Send pointer data if source or destination or both of them are
2324// noncontiguous. There is guarantee that length of destination enough for
2325// transfered data.
2326bool OffloadDescriptor::send_noncontiguous_pointer_data(
2327    int i,
2328    PtrData* src_data,
2329    PtrData* dst_data,
2330    COIEVENT *event
2331    )
2332{
2333    int64_t offset_src, offset_dst;
2334    int64_t length_src, length_dst;
2335    int64_t length_src_cur, length_dst_cur;
2336    int64_t send_size, data_sent = 0;
2337    COIRESULT res;
2338    bool dst_is_empty = true;
2339    bool src_is_empty = true;
2340
2341    // Set length_src and length_dst
2342    length_src = (m_vars_extra[i].read_rng_src) ?
2343        m_vars_extra[i].read_rng_src->range_size : m_vars[i].size;
2344    length_dst = !m_vars[i].into ? length_src :
2345                     (m_vars_extra[i].read_rng_dst) ?
2346                     m_vars_extra[i].read_rng_dst->range_size : m_vars[i].size;
2347    send_size = (length_src < length_dst) ? length_src : length_dst;
2348
2349    // consequently get contiguous ranges,
2350    // define corresponded destination offset and send data
2351    do {
2352        if (src_is_empty) {
2353            if (m_vars_extra[i].read_rng_src) {
2354                if (!get_next_range(m_vars_extra[i].read_rng_src,
2355                         &offset_src)) {
2356                    // source ranges are over - nothing to send
2357                    break;
2358                }
2359            }
2360            else if (data_sent == 0) {
2361                offset_src = m_vars_extra[i].cpu_disp;
2362            }
2363            else {
2364                break;
2365            }
2366            length_src_cur = length_src;
2367        }
2368        else {
2369            // if source is contiguous or its contiguous range is greater
2370            // than destination one
2371            offset_src += send_size;
2372        }
2373        length_src_cur -= send_size;
2374        src_is_empty = length_src_cur == 0;
2375
2376        if (dst_is_empty) {
2377            if (m_vars[i].into) {
2378                if (m_vars_extra[i].read_rng_dst) {
2379                    if (!get_next_range(m_vars_extra[i].read_rng_dst,
2380                             &offset_dst)) {
2381                        // destination ranges are over
2382                        LIBOFFLOAD_ERROR(c_destination_is_over);
2383                        return false;
2384                    }
2385                }
2386                // into is contiguous.
2387                else {
2388                    offset_dst = m_vars[i].disp;
2389                }
2390                length_dst_cur = length_dst;
2391            }
2392            // same as source
2393            else {
2394                offset_dst = offset_src;
2395                length_dst_cur = length_src;
2396            }
2397        }
2398        else {
2399            // if destination is contiguous or its contiguous range is greater
2400            // than source one
2401            offset_dst += send_size;
2402        }
2403        length_dst_cur -= send_size;
2404        dst_is_empty = length_dst_cur == 0;
2405
2406        if (src_data != 0 && src_data->cpu_buf != 0) {
2407            res = COI::BufferCopy(
2408                dst_data->mic_buf,
2409                src_data->cpu_buf,
2410                m_vars[i].mic_offset - dst_data->alloc_disp +
2411                m_vars[i].offset + offset_dst,
2412                m_vars_extra[i].cpu_offset + offset_src,
2413                send_size,
2414                COI_COPY_UNSPECIFIED,
2415                0, 0,
2416                event);
2417            if (res != COI_SUCCESS) {
2418                if (m_status != 0) {
2419                    m_status->result = translate_coi_error(res);
2420                    return false;
2421                }
2422                report_coi_error(c_buf_copy, res);
2423            }
2424        }
2425        else {
2426            char *base = offload_get_src_base(m_vars[i].ptr,
2427                m_vars[i].type.src);
2428
2429            res = COI::BufferWrite(
2430                dst_data->mic_buf,
2431                m_vars[i].mic_offset - dst_data->alloc_disp +
2432                m_vars[i].offset + offset_dst,
2433                base + offset_src,
2434                send_size,
2435                COI_COPY_UNSPECIFIED,
2436                0, 0,
2437                event);
2438            if (res != COI_SUCCESS) {
2439                if (m_status != 0) {
2440                    m_status->result = translate_coi_error(res);
2441                    return false;
2442                }
2443                report_coi_error(c_buf_write, res);
2444            }
2445        }
2446        data_sent += length_src;
2447    }
2448    while (true);
2449    return true;
2450}
2451
2452bool OffloadDescriptor::send_pointer_data(bool is_async)
2453{
2454    OffloadTimer timer(get_timer_data(), c_offload_host_send_pointers);
2455
2456    uint64_t ptr_sent = 0;
2457    COIRESULT res;
2458
2459    // Initiate send for pointer data
2460    for (int i = 0; i < m_vars_total; i++) {
2461        switch (m_vars[i].type.dst) {
2462            case c_data_ptr_array:
2463                break;
2464            case c_data:
2465            case c_void_ptr:
2466            case c_cean_var:
2467                if (m_vars[i].direction.in &&
2468                    m_vars[i].flags.is_static_dstn) {
2469                    COIEVENT *event =
2470                        (is_async ||
2471                         m_vars[i].size >= __offload_use_async_buffer_write) ?
2472                        &m_in_deps[m_in_deps_total++] : 0;
2473                    PtrData* dst_data = m_vars[i].into ?
2474                                            m_vars_extra[i].dst_data :
2475                                            m_vars_extra[i].src_data;
2476                    PtrData* src_data =
2477                        VAR_TYPE_IS_PTR(m_vars[i].type.src) ||
2478                        VAR_TYPE_IS_SCALAR(m_vars[i].type.src) &&
2479                        m_vars[i].flags.is_static ?
2480                           m_vars_extra[i].src_data : 0;
2481
2482                    if (m_vars[i].flags.is_noncont_src ||
2483                        m_vars[i].flags.is_noncont_dst) {
2484                        if (!send_noncontiguous_pointer_data(
2485                                i, src_data, dst_data, event)) {
2486                            return false;
2487                        }
2488                    }
2489                    else if (src_data != 0 && src_data->cpu_buf != 0) {
2490                        res = COI::BufferCopy(
2491                            dst_data->mic_buf,
2492                            src_data->cpu_buf,
2493                            m_vars[i].mic_offset - dst_data->alloc_disp +
2494                            m_vars[i].offset + m_vars[i].disp,
2495                            m_vars_extra[i].cpu_offset +
2496                            m_vars_extra[i].cpu_disp,
2497                            m_vars[i].size,
2498                            COI_COPY_UNSPECIFIED,
2499                            0, 0,
2500                            event);
2501                        if (res != COI_SUCCESS) {
2502                            if (m_status != 0) {
2503                                m_status->result = translate_coi_error(res);
2504                                return false;
2505                            }
2506                            report_coi_error(c_buf_copy, res);
2507                        }
2508                    }
2509                    else {
2510                        char *base = offload_get_src_base(m_vars[i].ptr,
2511                                                          m_vars[i].type.src);
2512                        res = COI::BufferWrite(
2513                            dst_data->mic_buf,
2514                            m_vars[i].mic_offset - dst_data->alloc_disp +
2515                            m_vars[i].offset + m_vars[i].disp,
2516                            base + m_vars_extra[i].cpu_disp,
2517                            m_vars[i].size,
2518                            COI_COPY_UNSPECIFIED,
2519                            0, 0,
2520                            event);
2521                        if (res != COI_SUCCESS) {
2522                            if (m_status != 0) {
2523                                m_status->result = translate_coi_error(res);
2524                                return false;
2525                            }
2526                            report_coi_error(c_buf_write, res);
2527                        }
2528                    }
2529                    ptr_sent += m_vars[i].size;
2530                }
2531                break;
2532
2533            case c_string_ptr:
2534            case c_data_ptr:
2535            case c_cean_var_ptr:
2536            case c_dv_ptr:
2537                if (m_vars[i].direction.in && m_vars[i].size > 0) {
2538                    COIEVENT *event =
2539                        (is_async ||
2540                         m_vars[i].size >= __offload_use_async_buffer_write) ?
2541                        &m_in_deps[m_in_deps_total++] : 0;
2542                    PtrData* dst_data = m_vars[i].into ?
2543                                            m_vars_extra[i].dst_data :
2544                                            m_vars_extra[i].src_data;
2545                    PtrData* src_data =
2546                        VAR_TYPE_IS_PTR(m_vars[i].type.src) ||
2547                        VAR_TYPE_IS_SCALAR(m_vars[i].type.src) &&
2548                        m_vars[i].flags.is_static ?
2549                            m_vars_extra[i].src_data : 0;
2550
2551                    if (m_vars[i].flags.is_noncont_src ||
2552                        m_vars[i].flags.is_noncont_dst) {
2553                        send_noncontiguous_pointer_data(
2554                            i, src_data, dst_data, event);
2555                    }
2556                    else if (src_data != 0 && src_data->cpu_buf != 0) {
2557                        res = COI::BufferCopy(
2558                            dst_data->mic_buf,
2559                            src_data->cpu_buf,
2560                            m_vars[i].mic_offset - dst_data->alloc_disp +
2561                            m_vars[i].offset + m_vars[i].disp,
2562                            m_vars_extra[i].cpu_offset +
2563                            m_vars_extra[i].cpu_disp,
2564                            m_vars[i].size,
2565                            COI_COPY_UNSPECIFIED,
2566                            0, 0,
2567                            event);
2568                        if (res != COI_SUCCESS) {
2569                            if (m_status != 0) {
2570                                m_status->result = translate_coi_error(res);
2571                                return false;
2572                            }
2573                            report_coi_error(c_buf_copy, res);
2574                        }
2575                    }
2576                    else {
2577                        char *base = offload_get_src_base(m_vars[i].ptr,
2578                                                          m_vars[i].type.src);
2579                        res = COI::BufferWrite(
2580                            dst_data->mic_buf,
2581                            m_vars[i].mic_offset - dst_data->alloc_disp +
2582                            m_vars[i].offset + m_vars[i].disp,
2583                            base + m_vars_extra[i].cpu_disp,
2584                            m_vars[i].size,
2585                            COI_COPY_UNSPECIFIED,
2586                            0, 0,
2587                            event);
2588                        if (res != COI_SUCCESS) {
2589                            if (m_status != 0) {
2590                                m_status->result = translate_coi_error(res);
2591                                return false;
2592                            }
2593                            report_coi_error(c_buf_write, res);
2594                        }
2595                    }
2596
2597                    ptr_sent += m_vars[i].size;
2598                }
2599                break;
2600
2601            case c_dv_data:
2602            case c_dv_ptr_data:
2603                if (m_vars[i].direction.in &&
2604                    m_vars[i].size > 0) {
2605                    PtrData *ptr_data = m_vars[i].into ?
2606                                        m_vars_extra[i].dst_data :
2607                                        m_vars_extra[i].src_data;
2608                    PtrData* src_data = m_vars_extra[i].src_data;
2609
2610                    COIEVENT *event =
2611                        (is_async ||
2612                         m_vars[i].size >= __offload_use_async_buffer_write) ?
2613                        &m_in_deps[m_in_deps_total++] : 0;
2614
2615                    if (m_vars[i].flags.is_noncont_src ||
2616                        m_vars[i].flags.is_noncont_dst) {
2617                        send_noncontiguous_pointer_data(
2618                            i, src_data, ptr_data, event);
2619                    }
2620                    else if (src_data && src_data->cpu_buf != 0) {
2621                        res = COI::BufferCopy(
2622                            ptr_data->mic_buf,
2623                            src_data->cpu_buf,
2624                            m_vars[i].offset + ptr_data->mic_offset -
2625                            ptr_data->alloc_disp +
2626                            m_vars[i].disp,
2627                            m_vars_extra[i].cpu_offset +
2628                            m_vars_extra[i].cpu_disp,
2629                            m_vars[i].size,
2630                            COI_COPY_UNSPECIFIED,
2631                            0, 0,
2632                            event);
2633                        if (res != COI_SUCCESS) {
2634                            if (m_status != 0) {
2635                                m_status->result = translate_coi_error(res);
2636                                return false;
2637                            }
2638                            report_coi_error(c_buf_copy, res);
2639                        }
2640                    }
2641                    else {
2642                        char *base = offload_get_src_base(m_vars[i].ptr,
2643                                                          m_vars[i].type.src);
2644                        res = COI::BufferWrite(
2645                            ptr_data->mic_buf,
2646                            ptr_data->mic_offset - ptr_data->alloc_disp +
2647                            m_vars[i].offset + m_vars[i].disp,
2648                            base + m_vars_extra[i].cpu_disp,
2649                            m_vars[i].size,
2650                            COI_COPY_UNSPECIFIED,
2651                            0, 0,
2652                            event);
2653                        if (res != COI_SUCCESS) {
2654                            if (m_status != 0) {
2655                                m_status->result = translate_coi_error(res);
2656                                return false;
2657                            }
2658                            report_coi_error(c_buf_write, res);
2659                        }
2660                    }
2661                    ptr_sent += m_vars[i].size;
2662                }
2663                break;
2664
2665            case c_dv_data_slice:
2666            case c_dv_ptr_data_slice:
2667                if (m_vars[i].direction.in &&
2668                    m_vars[i].size > 0) {
2669                    PtrData *dst_data = m_vars[i].into ?
2670                                        m_vars_extra[i].dst_data :
2671                                        m_vars_extra[i].src_data;
2672                    PtrData* src_data =
2673                        (VAR_TYPE_IS_PTR(m_vars[i].type.src) ||
2674                        VAR_TYPE_IS_DV_DATA(m_vars[i].type.src) ||
2675                        VAR_TYPE_IS_DV_DATA_SLICE(m_vars[i].type.src) ||
2676                        VAR_TYPE_IS_SCALAR(m_vars[i].type.src) &&
2677                        m_vars[i].flags.is_static) ?
2678                            m_vars_extra[i].src_data : 0;
2679                    COIEVENT *event =
2680                        (is_async ||
2681                         m_vars[i].size >= __offload_use_async_buffer_write) ?
2682                        &m_in_deps[m_in_deps_total++] : 0;
2683                    if (m_vars[i].flags.is_noncont_src ||
2684                        m_vars[i].flags.is_noncont_dst) {
2685                        send_noncontiguous_pointer_data(
2686                            i, src_data, dst_data, event);
2687                    }
2688                    else if (src_data && src_data->cpu_buf != 0) {
2689                        res = COI::BufferCopy(
2690                            dst_data->mic_buf,
2691                            src_data->cpu_buf,
2692                            m_vars[i].offset - dst_data->alloc_disp +
2693                            dst_data->mic_offset +
2694                            m_vars[i].disp,
2695                            m_vars_extra[i].cpu_offset +
2696                            m_vars_extra[i].cpu_disp,
2697                            m_vars[i].size,
2698                            COI_COPY_UNSPECIFIED,
2699                            0, 0,
2700                            event);
2701                        if (res != COI_SUCCESS) {
2702                            if (m_status != 0) {
2703                                m_status->result = translate_coi_error(res);
2704                                return false;
2705                            }
2706                            report_coi_error(c_buf_copy, res);
2707                        }
2708                    }
2709                    else {
2710                        char *base = offload_get_src_base(m_vars[i].ptr,
2711                                                          m_vars[i].type.src);
2712                        res = COI::BufferWrite(
2713                            dst_data->mic_buf,
2714                            dst_data->mic_offset - dst_data->alloc_disp +
2715                            m_vars[i].offset + m_vars[i].disp,
2716                            base + m_vars_extra[i].cpu_disp,
2717                            m_vars[i].size,
2718                            COI_COPY_UNSPECIFIED,
2719                            0, 0,
2720                            event);
2721                        if (res != COI_SUCCESS) {
2722                            if (m_status != 0) {
2723                                m_status->result = translate_coi_error(res);
2724                                return false;
2725                            }
2726                            report_coi_error(c_buf_write, res);
2727                        }
2728                    }
2729
2730                    ptr_sent += m_vars[i].size;
2731                }
2732                break;
2733
2734            default:
2735                break;
2736        }
2737
2738        // alloc field isn't used at target.
2739        // We can reuse it for offset of array pointers.
2740        if (m_vars_extra[i].is_arr_ptr_el) {
2741            m_vars[i].ptr_arr_offset = m_vars_extra[i].ptr_arr_offset;
2742        }
2743    }
2744
2745    if (m_status) {
2746        m_status->data_sent += ptr_sent;
2747    }
2748
2749    OFFLOAD_TIMER_HOST_SDATA(get_timer_data(), ptr_sent);
2750    OFFLOAD_DEBUG_TRACE_1(1, GET_OFFLOAD_NUMBER(get_timer_data()),
2751                  c_offload_sent_pointer_data,
2752                  "Total pointer data sent to target: [%lld] bytes\n",
2753                  ptr_sent);
2754
2755    return true;
2756}
2757
2758bool OffloadDescriptor::gather_copyin_data()
2759{
2760    OffloadTimer timer(get_timer_data(), c_offload_host_gather_inputs);
2761
2762    if (m_need_runfunction && m_in_datalen > 0) {
2763        COIMAPINSTANCE map_inst;
2764        char *data;
2765
2766        // init marshaller
2767        if (m_inout_buf != 0) {
2768            OffloadTimer timer_map(get_timer_data(),
2769                                   c_offload_host_map_in_data_buffer);
2770
2771            COIRESULT res = COI::BufferMap(m_inout_buf, 0, m_in_datalen,
2772                                           COI_MAP_WRITE_ENTIRE_BUFFER,
2773                                           0, 0, 0, &map_inst,
2774                                           reinterpret_cast<void**>(&data));
2775            if (res != COI_SUCCESS) {
2776                if (m_status != 0) {
2777                    m_status->result = translate_coi_error(res);
2778                    return false;
2779                }
2780                report_coi_error(c_buf_map, res);
2781            }
2782        }
2783        else {
2784            data = (char*) m_func_desc + m_func_desc->data_offset;
2785        }
2786
2787        // send variable descriptors
2788        memcpy(data, m_vars, m_vars_total * sizeof(VarDesc));
2789        data += m_vars_total * sizeof(VarDesc);
2790
2791        // init marshaller
2792        m_in.init_buffer(data, m_in_datalen);
2793
2794        // Gather copy data into buffer
2795        for (int i = 0; i < m_vars_total; i++) {
2796            bool src_is_for_mic = (m_vars[i].direction.out ||
2797                                   m_vars[i].into == NULL);
2798            PtrData* ptr_data = src_is_for_mic ?
2799                                m_vars_extra[i].src_data :
2800                                m_vars_extra[i].dst_data;
2801            if (m_vars[i].flags.alloc_disp) {
2802                m_in.send_data(&ptr_data->alloc_disp,
2803                               sizeof(ptr_data->alloc_disp));
2804            }
2805
2806            // send sink address to the target
2807            if (m_vars[i].flags.sink_addr) {
2808                m_in.send_data(&ptr_data->mic_addr,
2809                               sizeof(ptr_data->mic_addr));
2810            }
2811
2812            switch (m_vars[i].type.dst) {
2813                case c_data_ptr_array:
2814                    break;
2815                case c_data:
2816                case c_void_ptr:
2817                case c_cean_var:
2818                    if (m_vars[i].direction.in &&
2819                        !m_vars[i].flags.is_static_dstn) {
2820
2821                        char *ptr = offload_get_src_base(m_vars[i].ptr,
2822                                                         m_vars[i].type.src);
2823                        if (m_vars[i].type.dst == c_cean_var) {
2824                            // offset and length are derived from the array
2825                            // descriptor
2826                            int64_t size = m_vars[i].size;
2827                            int64_t disp = m_vars[i].disp;
2828                            m_in.send_data(reinterpret_cast<char*>(&size),
2829                                           sizeof(int64_t));
2830                            m_in.send_data(reinterpret_cast<char*>(&disp),
2831                                           sizeof(int64_t));
2832                        }
2833
2834                        m_in.send_data(ptr + m_vars_extra[i].cpu_disp,
2835                                       m_vars[i].size);
2836                    }
2837                    break;
2838
2839                case c_dv:
2840                    if (m_vars[i].direction.bits ||
2841                        m_vars[i].alloc_if ||
2842                        m_vars[i].free_if) {
2843                        // send dope vector excluding base
2844                        char *ptr = static_cast<char*>(m_vars[i].ptr);
2845                        m_in.send_data(ptr + sizeof(uint64_t),
2846                                       m_vars[i].size - sizeof(uint64_t));
2847                    }
2848                    break;
2849
2850                case c_data_ptr:
2851                    // send to target addresses of obsolete
2852                    // stacks to be released
2853                    if (m_vars[i].flags.is_stack_buf &&
2854                        !m_vars[i].direction.bits &&
2855                        m_vars[i].alloc_if &&
2856                        m_vars[i].size != 0) {
2857                        for (PtrDataList::iterator it =
2858                            m_destroy_stack.begin();
2859                            it != m_destroy_stack.end(); it++) {
2860                            PtrData * ptr_data = *it;
2861                            m_in.send_data(&(ptr_data->mic_addr),
2862                                sizeof(ptr_data->mic_addr));
2863                        }
2864                    }
2865                    break;
2866                case c_func_ptr:
2867                    if (m_vars[i].direction.in) {
2868                        m_in.send_func_ptr(*((const void**) m_vars[i].ptr));
2869                    }
2870                    break;
2871
2872                default:
2873                    break;
2874            }
2875        }
2876
2877        if (m_status) {
2878            m_status->data_sent += m_in.get_tfr_size();
2879        }
2880
2881        if (m_func_desc->data_offset == 0) {
2882            OffloadTimer timer_unmap(get_timer_data(),
2883                                     c_offload_host_unmap_in_data_buffer);
2884            COIRESULT res = COI::BufferUnmap(map_inst, 0, 0, 0);
2885            if (res != COI_SUCCESS) {
2886                if (m_status != 0) {
2887                    m_status->result = translate_coi_error(res);
2888                    return false;
2889                }
2890                report_coi_error(c_buf_unmap, res);
2891            }
2892        }
2893    }
2894
2895    OFFLOAD_TIMER_HOST_SDATA(get_timer_data(), m_in.get_tfr_size());
2896    OFFLOAD_DEBUG_TRACE_1(1,
2897                  GET_OFFLOAD_NUMBER(get_timer_data()), c_offload_copyin_data,
2898                  "Total copyin data sent to target: [%lld] bytes\n",
2899                  m_in.get_tfr_size());
2900
2901    return true;
2902}
2903
2904bool OffloadDescriptor::compute()
2905{
2906    OffloadTimer timer(get_timer_data(), c_offload_host_start_compute);
2907
2908    if (m_need_runfunction) {
2909        OFFLOAD_DEBUG_TRACE_1(2, GET_OFFLOAD_NUMBER(get_timer_data()),
2910                              c_offload_compute, "Compute task on MIC\n");
2911
2912        void* misc = m_func_desc;
2913        int   misc_len = m_func_desc_size;
2914        void* ret = 0;
2915        int   ret_len = 0;
2916
2917        if (m_func_desc->data_offset != 0) {
2918            misc_len += m_in_datalen;
2919
2920            if (m_out_datalen > 0) {
2921                ret = (char*) m_func_desc + m_func_desc->data_offset;
2922                ret_len = m_out_datalen;
2923            }
2924        }
2925
2926        // dispatch task
2927        COIRESULT res;
2928        COIEVENT event;
2929        res = m_device.compute(m_compute_buffers,
2930                               misc, misc_len,
2931                               ret, ret_len,
2932                               m_in_deps_total,
2933                               m_in_deps_total > 0 ? m_in_deps : 0,
2934                               &event);
2935        if (res != COI_SUCCESS) {
2936            if (m_status != 0) {
2937                m_status->result = translate_coi_error(res);
2938                return false;
2939            }
2940            report_coi_error(c_pipeline_run_func, res);
2941        }
2942
2943        m_in_deps_total = 1;
2944        m_in_deps[0] = event;
2945    }
2946
2947    return true;
2948}
2949
// Receive pointer data if the source, the destination, or both of them are
// noncontiguous. The destination is guaranteed to be long enough for the
// transferred data.
2953bool OffloadDescriptor::recieve_noncontiguous_pointer_data(
2954    int i,
2955    char* base,
2956    COIBUFFER dst_buf,
2957    COIEVENT *event
2958)
2959{
2960    int64_t offset_src, offset_dst;
2961    int64_t length_src, length_dst;
2962    int64_t length_src_cur, length_dst_cur;
2963    int64_t recieve_size, data_recieved = 0;
2964    COIRESULT res;
2965    bool dst_is_empty = true;
2966    bool src_is_empty = true;
2967
2968    // Set length_src and length_dst
2969    length_src = (m_vars_extra[i].read_rng_src) ?
2970        m_vars_extra[i].read_rng_src->range_size : m_vars[i].size;
2971    length_dst = !m_vars[i].into ? length_src :
2972                     (m_vars_extra[i].read_rng_dst) ?
2973                     m_vars_extra[i].read_rng_dst->range_size : m_vars[i].size;
2974    recieve_size = (length_src < length_dst) ? length_src : length_dst;
2975
2976    // consequently get contiguous ranges,
2977    // define corresponded destination offset and recieve data
2978    do {
2979        // get sorce offset
2980        if (src_is_empty) {
2981            if (m_vars_extra[i].read_rng_src) {
2982                if (!get_next_range(m_vars_extra[i].read_rng_src,
2983                         &offset_src)) {
2984                    // source ranges are over - nothing to send
2985                    break;
2986                }
2987            }
2988            else if (data_recieved == 0) {
2989                offset_src = 0;
2990            }
2991            else {
2992                break;
2993            }
2994            length_src_cur = length_src;
2995        }
2996        else {
2997            // if source is contiguous or its contiguous range is greater
2998            // than destination one
2999            offset_src += recieve_size;
3000        }
3001        length_src_cur -= recieve_size;
3002        src_is_empty = length_src_cur == 0;
3003
3004        // get destination offset
3005        if (dst_is_empty) {
3006            if (m_vars[i].into) {
3007                if (m_vars_extra[i].read_rng_dst) {
3008                    if (!get_next_range(m_vars_extra[i].read_rng_dst,
3009                             &offset_dst)) {
3010                        // destination ranges are over
3011                        LIBOFFLOAD_ERROR(c_destination_is_over);
3012                        return false;
3013                    }
3014                }
3015                // destination is contiguous.
3016                else {
3017                    offset_dst = m_vars_extra[i].cpu_disp;
3018                }
3019                length_dst_cur = length_dst;
3020            }
3021            // same as source
3022            else {
3023                offset_dst = offset_src;
3024                length_dst_cur = length_src;
3025            }
3026        }
3027        else {
3028            // if destination is contiguous or its contiguous range is greater
3029            // than source one
3030            offset_dst += recieve_size;
3031        }
3032        length_dst_cur -= recieve_size;
3033        dst_is_empty = length_dst_cur == 0;
3034
3035        if (dst_buf != 0) {
3036            res = COI::BufferCopy(
3037                dst_buf,
3038                m_vars_extra[i].src_data->mic_buf,
3039                m_vars_extra[i].cpu_offset + offset_dst,
3040                m_vars[i].offset + offset_src +
3041                m_vars[i].mic_offset -
3042                m_vars_extra[i].src_data->alloc_disp,
3043                recieve_size,
3044                COI_COPY_UNSPECIFIED,
3045                m_in_deps_total,
3046                m_in_deps_total > 0 ? m_in_deps : 0,
3047                event);
3048            if (res != COI_SUCCESS) {
3049                if (m_status != 0) {
3050                    m_status->result = translate_coi_error(res);
3051                    return false;
3052                }
3053                report_coi_error(c_buf_copy, res);
3054            }
3055        }
3056        else {
3057            res = COI::BufferRead(
3058                m_vars_extra[i].src_data->mic_buf,
3059                m_vars[i].offset + offset_src +
3060                m_vars[i].mic_offset -
3061                m_vars_extra[i].src_data->alloc_disp,
3062                base + offset_dst,
3063                recieve_size,
3064                COI_COPY_UNSPECIFIED,
3065                m_in_deps_total,
3066                m_in_deps_total > 0 ? m_in_deps : 0,
3067                event);
3068            if (res != COI_SUCCESS) {
3069                if (m_status != 0) {
3070                    m_status->result = translate_coi_error(res);
3071                    return false;
3072                }
3073                report_coi_error(c_buf_read, res);
3074            }
3075        }
3076        data_recieved += recieve_size;
3077    }
3078    while (true);
3079    return true;
3080}
3081
3082bool OffloadDescriptor::receive_pointer_data(bool is_async)
3083{
3084    OffloadTimer timer(get_timer_data(), c_offload_host_start_buffers_reads);
3085
3086    uint64_t ptr_received = 0;
3087    COIRESULT res;
3088
3089    for (int i = 0; i < m_vars_total; i++) {
3090        switch (m_vars[i].type.src) {
3091            case c_data_ptr_array:
3092                break;
3093            case c_data:
3094            case c_void_ptr:
3095            case c_cean_var:
3096                if (m_vars[i].direction.out &&
3097                    m_vars[i].flags.is_static) {
3098                    COIEVENT *event =
3099                        (is_async ||
3100                         m_in_deps_total > 0 ||
3101                         m_vars[i].size >= __offload_use_async_buffer_read) ?
3102                        &m_out_deps[m_out_deps_total++] : 0;
3103                    PtrData *ptr_data = NULL;
3104                    COIBUFFER dst_buf = NULL; // buffer at host
3105                    char *base;
3106
3107                    if (VAR_TYPE_IS_PTR(m_vars[i].type.dst)) {
3108                        ptr_data = m_vars[i].into ?
3109                                   m_vars_extra[i].dst_data :
3110                                   m_vars_extra[i].src_data;
3111                    }
3112                    else if (VAR_TYPE_IS_SCALAR(m_vars[i].type.dst)) {
3113                        if (m_vars[i].flags.is_static_dstn) {
3114                            ptr_data = m_vars[i].into ?
3115                                       m_vars_extra[i].dst_data :
3116                                       m_vars_extra[i].src_data;
3117                        }
3118                    }
3119                    dst_buf = ptr_data ? ptr_data->cpu_buf : NULL;
3120                    if (dst_buf == NULL) {
3121                        base = offload_get_src_base(
3122                            m_vars[i].into ?
3123                            static_cast<char*>(m_vars[i].into) :
3124                            static_cast<char*>(m_vars[i].ptr),
3125                            m_vars[i].type.dst);
3126                    }
3127
3128                    if (m_vars[i].flags.is_noncont_src ||
3129                        m_vars[i].flags.is_noncont_dst) {
3130                        recieve_noncontiguous_pointer_data(
3131                            i, base, dst_buf, event);
3132                    }
3133                    else if (dst_buf != 0) {
3134                        res = COI::BufferCopy(
3135                            dst_buf,
3136                            m_vars_extra[i].src_data->mic_buf,
3137                            m_vars_extra[i].cpu_offset +
3138                            m_vars_extra[i].cpu_disp,
3139                            m_vars[i].offset + m_vars[i].disp,
3140                            m_vars[i].size,
3141                            COI_COPY_UNSPECIFIED,
3142                            m_in_deps_total,
3143                            m_in_deps_total > 0 ? m_in_deps : 0,
3144                            event);
3145                        if (res != COI_SUCCESS) {
3146                            if (m_status != 0) {
3147                                m_status->result = translate_coi_error(res);
3148                                return false;
3149                            }
3150                            report_coi_error(c_buf_copy, res);
3151                        }
3152                    }
3153                    else {
3154                       res = COI::BufferRead(
3155                            m_vars_extra[i].src_data->mic_buf,
3156                            m_vars[i].offset + m_vars[i].disp,
3157                            base + m_vars_extra[i].cpu_offset +
3158                            m_vars_extra[i].cpu_disp,
3159                            m_vars[i].size,
3160                            COI_COPY_UNSPECIFIED,
3161                            m_in_deps_total,
3162                            m_in_deps_total > 0 ? m_in_deps : 0,
3163                            event);
3164                        if (res != COI_SUCCESS) {
3165                            if (m_status != 0) {
3166                                m_status->result = translate_coi_error(res);
3167                                return false;
3168                            }
3169                            report_coi_error(c_buf_read, res);
3170                        }
3171                    }
3172                    ptr_received += m_vars[i].size;
3173                }
3174                break;
3175
3176            case c_string_ptr:
3177            case c_data_ptr:
3178            case c_cean_var_ptr:
3179            case c_dv_data:
3180            case c_dv_ptr_data:
3181            case c_dv_data_slice:
3182            case c_dv_ptr_data_slice:
3183            case c_dv_ptr: {
3184                COIBUFFER dst_buf = NULL; // buffer on host
3185                if (m_vars[i].direction.out && m_vars[i].size > 0) {
3186                    COIEVENT *event =
3187                        (is_async ||
3188                         m_in_deps_total > 0 ||
3189                         m_vars[i].size >= __offload_use_async_buffer_read) ?
3190                        &m_out_deps[m_out_deps_total++] : 0;
3191
3192                    uint64_t dst_offset = 0;
3193                    char *base = static_cast<char*>(m_vars[i].ptr);
3194
3195                    if (VAR_TYPE_IS_PTR(m_vars[i].type.dst)) {
3196                        PtrData *ptr_data = m_vars[i].into ?
3197                                            m_vars_extra[i].dst_data :
3198                                            m_vars_extra[i].src_data;
3199                        dst_buf = ptr_data ? ptr_data->cpu_buf : NULL;
3200                        if (dst_buf == NULL) {
3201                            base = m_vars[i].into ?
3202                                   *static_cast<char**>(m_vars[i].into) :
3203                                   *static_cast<char**>(m_vars[i].ptr);
3204                        }
3205                        dst_offset = m_vars_extra[i].cpu_offset +
3206                                     m_vars_extra[i].cpu_disp;
3207                    }
3208                    else if (VAR_TYPE_IS_SCALAR(m_vars[i].type.dst)) {
3209                        if (m_vars[i].flags.is_static_dstn) {
3210                            dst_buf = m_vars[i].into ?
3211                                        m_vars_extra[i].dst_data->cpu_buf :
3212                                        m_vars_extra[i].src_data->cpu_buf;
3213                        }
3214                        if (dst_buf == NULL) {
3215                            base = offload_get_src_base(
3216                                m_vars[i].into ?
3217                                static_cast<char*>(m_vars[i].into) :
3218                                static_cast<char*>(m_vars[i].ptr),
3219                                m_vars[i].type.dst);
3220                        }
3221                        dst_offset = m_vars_extra[i].cpu_offset +
3222                                     m_vars_extra[i].cpu_disp;
3223                    }
3224                    else if (VAR_TYPE_IS_DV_DATA(m_vars[i].type.dst) ||
3225                             VAR_TYPE_IS_DV_DATA_SLICE(m_vars[i].type.dst)) {
3226                        PtrData *ptr_data = m_vars[i].into != 0 ?
3227                                            m_vars_extra[i].dst_data :
3228                                            m_vars_extra[i].src_data;
3229                        dst_buf = ptr_data != 0 ? ptr_data->cpu_buf : 0;
3230                        if (dst_buf == NULL) {
3231                            base = offload_get_src_base(
3232                                m_vars[i].into ?
3233                                static_cast<char*>(m_vars[i].into) :
3234                                static_cast<char*>(m_vars[i].ptr),
3235                                m_vars[i].type.dst);
3236
3237                        }
3238                        dst_offset = m_vars_extra[i].cpu_offset +
3239                                     m_vars_extra[i].cpu_disp;
3240                    }
3241
3242                    if (m_vars[i].flags.is_noncont_src ||
3243                        m_vars[i].flags.is_noncont_dst) {
3244                        recieve_noncontiguous_pointer_data(
3245                            i, base, dst_buf, event);
3246                    }
3247                    else if (dst_buf != 0) {
3248                        res = COI::BufferCopy(
3249                            dst_buf,
3250                            m_vars_extra[i].src_data->mic_buf,
3251                            dst_offset,
3252                            m_vars[i].offset + m_vars[i].disp +
3253                                m_vars[i].mic_offset -
3254                                m_vars_extra[i].src_data->alloc_disp,
3255                            m_vars[i].size,
3256                            COI_COPY_UNSPECIFIED,
3257                            m_in_deps_total,
3258                            m_in_deps_total > 0 ? m_in_deps : 0,
3259                            event);
3260                        if (res != COI_SUCCESS) {
3261                            if (m_status != 0) {
3262                                m_status->result = translate_coi_error(res);
3263                                return false;
3264                            }
3265                            report_coi_error(c_buf_copy, res);
3266                        }
3267                    }
3268                    else {
3269                        res = COI::BufferRead(
3270                            m_vars_extra[i].src_data->mic_buf,
3271                            m_vars[i].offset + m_vars[i].disp +
3272                                m_vars[i].mic_offset -
3273                                m_vars_extra[i].src_data->alloc_disp,
3274                            base + dst_offset,
3275                            m_vars[i].size,
3276                            COI_COPY_UNSPECIFIED,
3277                            m_in_deps_total,
3278                            m_in_deps_total > 0 ? m_in_deps : 0,
3279                            event);
3280                        if (res != COI_SUCCESS) {
3281                            if (m_status != 0) {
3282                                m_status->result = translate_coi_error(res);
3283                                return false;
3284                            }
3285                            report_coi_error(c_buf_read, res);
3286                        }
3287                    }
3288                    ptr_received += m_vars[i].size;
3289                }
3290                break;
3291            }
3292
3293            default:
3294                break;
3295        }
3296
3297        // destroy buffers for obsolete stacks
3298        if (m_destroy_stack.size() != 0) {
3299            for (PtrDataList::iterator it = m_destroy_stack.begin();
3300                it != m_destroy_stack.end(); it++) {
3301                PtrData *ptr_data = *it;
3302                m_destroy_buffers.push_back(ptr_data->mic_buf);
3303                OFFLOAD_TRACE(3, "Removing stack buffer with addr %p\n",
3304                                  ptr_data->mic_addr);
3305            }
3306            m_destroy_stack.clear();
3307        }
3308        if (m_vars[i].free_if) {
3309            // remove association for automatic variables
3310            if (m_is_openmp && !m_vars[i].flags.is_static &&
3311                (m_vars[i].type.src == c_data ||
3312                 m_vars[i].type.src == c_void_ptr ||
3313                 m_vars[i].type.src == c_cean_var)) {
3314                AutoData *auto_data = m_vars_extra[i].auto_data;
3315                if (auto_data != 0 && auto_data->remove_reference() == 0) {
3316                    m_device.remove_auto_data(auto_data->cpu_addr.start());
3317                }
3318            }
3319
3320            // destroy buffers
3321            if (m_vars[i].direction.out || m_vars[i].into == NULL) {
3322                if (!VAR_TYPE_IS_PTR(m_vars[i].type.src) &&
3323                    !VAR_TYPE_IS_DV_DATA_SLICE(m_vars[i].type.src) &&
3324                    !VAR_TYPE_IS_DV_DATA(m_vars[i].type.src)) {
3325                    continue;
3326                }
3327
3328                PtrData *ptr_data = m_vars_extra[i].src_data;
3329                if (ptr_data->remove_reference() == 0) {
3330                    // destroy buffers
3331                    if (ptr_data->cpu_buf != 0) {
3332                        m_destroy_buffers.push_back(ptr_data->cpu_buf);
3333                    }
3334                    if (ptr_data->mic_buf != 0) {
3335                        m_destroy_buffers.push_back(ptr_data->mic_buf);
3336                    }
3337                    OFFLOAD_TRACE(3, "Removing association for addr %p\n",
3338                                  ptr_data->cpu_addr.start());
3339
3340                    // remove association from map
3341                    m_device.remove_ptr_data(ptr_data->cpu_addr.start());
3342                }
3343            }
3344            else if (VAR_TYPE_IS_PTR(m_vars[i].type.dst) ||
3345                     VAR_TYPE_IS_DV_DATA_SLICE(m_vars[i].type.dst) ||
3346                     VAR_TYPE_IS_DV_DATA(m_vars[i].type.dst)) {
3347                PtrData *ptr_data = m_vars_extra[i].dst_data;
3348                if (ptr_data->remove_reference() == 0) {
3349                    // destroy buffers
3350                    if (ptr_data->cpu_buf != 0) {
3351                        m_destroy_buffers.push_back(ptr_data->cpu_buf);
3352                    }
3353                    if (ptr_data->mic_buf != 0) {
3354                        m_destroy_buffers.push_back(ptr_data->mic_buf);
3355                    }
3356                    OFFLOAD_TRACE(3, "Removing association for addr %p\n",
3357                                  ptr_data->cpu_addr.start());
3358
3359                    // remove association from map
3360                    m_device.remove_ptr_data(ptr_data->cpu_addr.start());
3361                }
3362            }
3363        }
3364    }
3365
3366    if (m_status) {
3367        m_status->data_received += ptr_received;
3368    }
3369
3370    OFFLOAD_TIMER_HOST_RDATA(get_timer_data(), ptr_received);
3371    OFFLOAD_DEBUG_TRACE_1(1, GET_OFFLOAD_NUMBER(get_timer_data()),
3372                  c_offload_received_pointer_data,
3373                  "Total pointer data received from target: [%lld] bytes\n",
3374                  ptr_received);
3375
3376    return true;
3377}
3378
3379bool OffloadDescriptor::scatter_copyout_data()
3380{
3381    OffloadTimer timer(get_timer_data(), c_offload_host_scatter_outputs);
3382
3383    if (m_need_runfunction && m_out_datalen > 0) {
3384
3385        // total size that need to be transferred from target to host
3386        COIMAPINSTANCE map_inst;
3387        COIRESULT res;
3388        char *data;
3389
3390        // output data buffer
3391        if (m_func_desc->data_offset == 0) {
3392            OffloadTimer timer_map(get_timer_data(),
3393                                   c_offload_host_map_out_data_buffer);
3394
3395            COIRESULT res = COI::BufferMap(m_inout_buf, 0, m_out_datalen,
3396                                           COI_MAP_READ_ONLY, 0, 0, 0,
3397                                           &map_inst,
3398                                            reinterpret_cast<void**>(&data));
3399            if (res != COI_SUCCESS) {
3400                if (m_status != 0) {
3401                    m_status->result = translate_coi_error(res);
3402                    return false;
3403                }
3404                report_coi_error(c_buf_map, res);
3405            }
3406        }
3407        else {
3408            data = (char*) m_func_desc + m_func_desc->data_offset;
3409        }
3410
3411        // get timing data
3412        OFFLOAD_TIMER_TARGET_DATA(get_timer_data(), data);
3413        data += OFFLOAD_TIMER_DATALEN();
3414
3415        // initialize output marshaller
3416        m_out.init_buffer(data, m_out_datalen);
3417
3418        for (int i = 0; i < m_vars_total; i++) {
3419            switch (m_vars[i].type.src) {
3420                case c_data_ptr_array:
3421                    break;
3422                case c_data:
3423                case c_void_ptr:
3424                case c_cean_var:
3425                    if (m_vars[i].direction.out &&
3426                        !m_vars[i].flags.is_static) {
3427
3428                        if (m_vars[i].into) {
3429                            char *ptr = offload_get_src_base(
3430                                static_cast<char*>(m_vars[i].into),
3431                                m_vars[i].type.dst);
3432                            m_out.receive_data(ptr + m_vars_extra[i].cpu_disp,
3433                                               m_vars[i].size);
3434                        }
3435                        else {
3436                            m_out.receive_data(
3437                                static_cast<char*>(m_vars[i].ptr) +
3438                                    m_vars_extra[i].cpu_disp,
3439                                m_vars[i].size);
3440                        }
3441                    }
3442                    break;
3443
3444                case c_func_ptr:
3445                    if (m_vars[i].direction.out) {
3446                        m_out.receive_func_ptr((const void**) m_vars[i].ptr);
3447                    }
3448                    break;
3449
3450                default:
3451                    break;
3452            }
3453        }
3454
3455        if (m_status) {
3456            m_status->data_received += m_out.get_tfr_size();
3457        }
3458
3459        if (m_func_desc->data_offset == 0) {
3460            OffloadTimer timer_unmap(get_timer_data(),
3461                                     c_offload_host_unmap_out_data_buffer);
3462
3463            COIRESULT res = COI::BufferUnmap(map_inst, 0, 0, 0);
3464            if (res != COI_SUCCESS) {
3465                if (m_status != 0) {
3466                    m_status->result = translate_coi_error(res);
3467                    return false;
3468                }
3469                report_coi_error(c_buf_unmap, res);
3470            }
3471        }
3472    }
3473
3474    OFFLOAD_TIMER_HOST_RDATA(get_timer_data(), m_out.get_tfr_size());
3475    OFFLOAD_TRACE(1, "Total copyout data received from target: [%lld] bytes\n",
3476                  m_out.get_tfr_size());
3477
3478    return true;
3479}
3480
3481void get_arr_desc_numbers(
3482    const arr_desc *ap,
3483    int64_t el_size,
3484    int64_t &offset,
3485    int64_t &size,
3486    int     &el_number,
3487    CeanReadRanges* &ptr_ranges
3488)
3489{
3490    if (is_arr_desc_contiguous(ap)) {
3491        ptr_ranges = NULL;
3492        __arr_data_offset_and_length(ap, offset, size);
3493        el_number = size / el_size;
3494    }
3495    else {
3496        ptr_ranges = init_read_ranges_arr_desc(ap);
3497        el_number = (ptr_ranges->range_size / el_size) *
3498                    ptr_ranges->range_max_number;
3499        size = ptr_ranges->range_size;
3500    }
3501}
3502
3503arr_desc * make_arr_desc(
3504    void*   ptr_val,
3505    int64_t extent_start_val,
3506    int64_t extent_elements_val,
3507    int64_t size
3508)
3509{
3510    arr_desc *res;
3511    res = (arr_desc *)malloc(sizeof(arr_desc));
3512    if (res == NULL)
3513      LIBOFFLOAD_ERROR(c_malloc);
3514    res->base = reinterpret_cast<int64_t>(ptr_val);
3515    res->rank = 1;
3516    res->dim[0].size = size;
3517    res->dim[0].lindex = 0;
3518    res->dim[0].lower = extent_start_val;
3519    res->dim[0].upper = extent_elements_val + extent_start_val - 1;
3520    res->dim[0].stride = 1;
3521    return res;
3522}
3523
3524bool OffloadDescriptor::gen_var_descs_for_pointer_array(int i)
3525{
3526    int             pointers_number;
3527    int             tmp_val;
3528    int             new_index = m_vars_total;
3529    const arr_desc *ap;
3530    const VarDesc3 *vd3 = static_cast<const VarDesc3*>(m_vars[i].ptr);
3531    int             flags = vd3->array_fields;
3532    bool            src_is_for_mic = (m_vars[i].direction.out ||
3533                                      m_vars[i].into == NULL);
3534
3535    ReadArrElements<void *>  ptr;
3536    ReadArrElements<void *>  into;
3537    ReadArrElements<int64_t> ext_start;
3538    ReadArrElements<int64_t> ext_elements;
3539    ReadArrElements<int64_t> align;
3540    ReadArrElements<int64_t> alloc_if;
3541    ReadArrElements<int64_t> free_if;
3542    ReadArrElements<int64_t> into_start;
3543    ReadArrElements<int64_t> into_elem;
3544    ReadArrElements<int64_t> alloc_start;
3545    ReadArrElements<int64_t> alloc_elem;
3546
3547
3548    ap = static_cast<const arr_desc*>(vd3->ptr_array);
3549
3550    // "pointers_number" for total number of transfered pointers.
3551    // For each of them we create new var_desc and put it at the bottom
3552    // of the var_desc's array
3553    get_arr_desc_numbers(ap, sizeof(void *), ptr.offset, ptr.size,
3554        pointers_number, ptr.ranges);
3555    ptr.base = reinterpret_cast<char*>(ap->base);
3556
3557    // 2. prepare memory for new var_descs
3558    m_vars_total += pointers_number;
3559    m_vars       = (VarDesc*)realloc(m_vars, m_vars_total * sizeof(VarDesc));
3560    if (m_vars == NULL)
3561      LIBOFFLOAD_ERROR(c_malloc);
3562    m_vars_extra =
3563        (VarExtra*)realloc(m_vars_extra, m_vars_total * sizeof(VarExtra));
3564    if (m_vars_extra == NULL)
3565      LIBOFFLOAD_ERROR(c_malloc);
3566    m_in_deps    =
3567        (COIEVENT*)realloc(m_in_deps, sizeof(COIEVENT) * (m_vars_total + 1));
3568    if (m_in_deps == NULL)
3569      LIBOFFLOAD_ERROR(c_malloc);
3570    m_out_deps   =
3571        (COIEVENT*)realloc(m_out_deps, sizeof(COIEVENT) * m_vars_total);
3572    if (m_out_deps == NULL)
3573      LIBOFFLOAD_ERROR(c_malloc);
3574
3575    // 3. Prepare for reading new var_desc's fields
3576    //    EXTENT START
3577    if ((flags & (1<<flag_extent_start_is_array)) != 0) {
3578        ap = static_cast<const arr_desc*>(vd3->extent_start);
3579        get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size, ext_start.offset,
3580            ext_start.size, tmp_val, ext_start.ranges);
3581        ext_start.base = reinterpret_cast<char*>(ap->base);
3582        ext_start.el_size = ap->dim[ap->rank - 1].size;
3583
3584        if (tmp_val < pointers_number) {
3585            LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "extent start");
3586            return false;
3587        }
3588    }
3589    else if ((flags & (1<<flag_extent_start_is_scalar)) != 0) {
3590        ext_start.val = (int64_t)vd3->extent_start;
3591    }
3592    else {
3593        ext_start.val = 0;
3594    }
3595
3596    //    EXTENT ELEMENTS NUMBER
3597    if ((flags & (1<<flag_extent_elements_is_array)) != 0) {
3598        ap = static_cast<const arr_desc*>(vd3->extent_elements);
3599        get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size,
3600            ext_elements.offset, ext_elements.size,
3601            tmp_val, ext_elements.ranges);
3602        ext_elements.base = reinterpret_cast<char*>(ap->base);
3603        ext_elements.el_size = ap->dim[ap->rank - 1].size;
3604
3605        if (tmp_val < pointers_number) {
3606            LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "extent elements");
3607            return false;
3608        }
3609    }
3610    else if ((flags & (1<<flag_extent_elements_is_scalar)) != 0) {
3611        ext_elements.val = (int64_t)vd3->extent_elements;
3612    }
3613    else {
3614        ext_elements.val = m_vars[i].count;
3615    }
3616
3617    //    ALLOC_IF
3618    if ((flags & (1<<flag_alloc_if_is_array)) != 0) {
3619        ap = static_cast<const arr_desc*>(vd3->alloc_if_array);
3620        get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size, alloc_if.offset,
3621            alloc_if.size, tmp_val, alloc_if.ranges);
3622        alloc_if.base = reinterpret_cast<char*>(ap->base);
3623        alloc_if.el_size = ap->dim[ap->rank - 1].size;
3624
3625        if (tmp_val < pointers_number) {
3626            LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "alloc_if");
3627            return false;
3628        }
3629    }
3630    else {
3631        alloc_if.val = m_vars[i].count;
3632    }
3633
3634    //    FREE_IF
3635    if ((flags & (1<<flag_free_if_is_array)) != 0) {
3636        ap = static_cast<const arr_desc*>(vd3->free_if_array);
3637        get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size, free_if.offset,
3638            free_if.size, tmp_val, free_if.ranges);
3639        free_if.base = reinterpret_cast<char*>(ap->base);
3640        free_if.el_size = ap->dim[ap->rank - 1].size;
3641
3642        if (tmp_val < pointers_number) {
3643            LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "free_if");
3644            return false;
3645        }
3646    }
3647    else {
3648        free_if.val = m_vars[i].count;
3649    }
3650
3651    //    ALIGN
3652
3653    if ((flags & (1<<flag_align_is_array)) != 0) {
3654        ap = static_cast<const arr_desc*>(vd3->align_array);
3655        get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size, align.offset,
3656            align.size, tmp_val, align.ranges);
3657        align.base = reinterpret_cast<char*>(ap->base);
3658        align.el_size = ap->dim[ap->rank - 1].size;
3659
3660        if (tmp_val < pointers_number) {
3661            LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "align");
3662            return false;
3663        }
3664    }
3665    else {
3666        align.val = m_vars[i].align;
3667    }
3668
3669    // 3.1 INTO
3670
3671    if (m_vars[i].into) {
3672        ap = static_cast<const arr_desc*>(m_vars[i].into);
3673        get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size, into.offset,
3674            into.size, tmp_val, into.ranges);
3675        into.base = reinterpret_cast<char*>(ap->base);
3676
3677        if (tmp_val < pointers_number) {
3678            LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "into");
3679            return false;
3680        }
3681    }
3682
3683    // 3.2 INTO_START
3684
3685    if ((flags & (1<<flag_into_start_is_array)) != 0) {
3686        ap = static_cast<const arr_desc*>(vd3->into_start);
3687        get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size, into_start.offset,
3688            into_start.size, tmp_val, into_start.ranges);
3689        into_start.base = reinterpret_cast<char*>(ap->base);
3690        into_start.el_size = ap->dim[ap->rank - 1].size;
3691
3692        if (tmp_val < pointers_number) {
3693            LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "into_extent start");
3694            return false;
3695        }
3696    }
3697    else if ((flags & (1<<flag_into_start_is_scalar)) != 0) {
3698        into_start.val = (int64_t)vd3->into_start;
3699    }
3700    else {
3701        into_start.val = 0;
3702    }
3703
3704    // 3.3 INTO_ELEMENTS
3705
3706    if ((flags & (1<<flag_into_elements_is_array)) != 0) {
3707        ap = static_cast<const arr_desc*>(vd3->into_elements);
3708        get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size, into_elem.offset,
3709            into_elem.size, tmp_val, into_elem.ranges);
3710        into_elem.base = reinterpret_cast<char*>(ap->base);
3711        into_elem.el_size = ap->dim[ap->rank - 1].size;
3712
3713        if (tmp_val < pointers_number) {
3714            LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "into_extent elements");
3715            return false;
3716        }
3717    }
3718    else if ((flags & (1<<flag_into_elements_is_scalar)) != 0) {
3719        into_elem.val = (int64_t)vd3->into_elements;
3720    }
3721    else {
3722        into_elem.val = m_vars[i].count;
3723    }
3724
3725    //    alloc_start
3726
3727    if ((flags & (1<<flag_alloc_start_is_array)) != 0) {
3728        ap = static_cast<const arr_desc*>(vd3->alloc_start);
3729        get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size,
3730            alloc_start.offset, alloc_start.size, tmp_val,
3731            alloc_start.ranges);
3732        alloc_start.base = reinterpret_cast<char*>(ap->base);
3733        alloc_start.el_size = ap->dim[ap->rank - 1].size;
3734
3735        if (tmp_val < pointers_number) {
3736            LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "alloc_extent start");
3737            return false;
3738        }
3739    }
3740    else if ((flags & (1<<flag_alloc_start_is_scalar)) != 0) {
3741        alloc_start.val = (int64_t)vd3->alloc_start;
3742    }
3743    else {
3744        alloc_start.val = 0;
3745    }
3746
3747    //    alloc_elem
3748
3749    if ((flags & (1<<flag_alloc_elements_is_array)) != 0) {
3750        ap = static_cast<const arr_desc*>(vd3->alloc_elements);
3751        get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size, alloc_elem.offset,
3752            alloc_elem.size, tmp_val, alloc_elem.ranges);
3753        alloc_elem.base = reinterpret_cast<char*>(ap->base);
3754        alloc_elem.el_size = ap->dim[ap->rank - 1].size;
3755        if (tmp_val < pointers_number) {
3756            LIBOFFLOAD_ERROR(c_pointer_array_mismatch,
3757                             "alloc_extent elements");
3758            return false;
3759        }
3760    }
3761    else if ((flags & (1<<flag_alloc_elements_is_scalar)) != 0) {
3762        alloc_elem.val = (int64_t)vd3->alloc_elements;
3763    }
3764    else {
3765        alloc_elem.val = 0;
3766    }
3767
3768    for (int k = 0; k < pointers_number; k++) {
3769        int type = flags & 0x3f;
3770        int type_src, type_dst;
3771        //  Get new values
3772        // type_src, type_dst
3773        type_src = type_dst = (type == c_data_ptr_array) ?
3774                              c_data_ptr   : (type == c_func_ptr_array) ?
3775                              c_func_ptr   : (type == c_void_ptr_array) ?
3776                              c_void_ptr   : (type == c_string_ptr_array) ?
3777                              c_string_ptr : 0;
3778
3779        // Get ptr val
3780        if (!ptr.read_next(true)) {
3781            break;
3782        }
3783        else {
3784            ptr.val = (void*)(ptr.base + ptr.offset);
3785        }
3786
3787        // !!! If we got error at phase of reading - it's an internal
3788        // !!! error, as we must detect mismatch before
3789
3790        // Get into val
3791        if (m_vars[i].into) {
3792            if (!into.read_next(true)) {
3793                LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "into");
3794                LIBOFFLOAD_ABORT;
3795            }
3796            else {
3797                into.val = (void*)(into.base + into.offset);
3798            }
3799        }
3800
3801        // Get other components of the clause
3802        if (!ext_start.read_next(flags & (1<<flag_extent_start_is_array))) {
3803            LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "extent start");
3804            LIBOFFLOAD_ABORT;
3805        }
3806        if (!ext_elements.read_next(
3807                flags & (1<<flag_extent_elements_is_array))) {
3808            LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "extent elements");
3809            LIBOFFLOAD_ABORT;
3810        }
3811        if (!alloc_if.read_next(flags & (1<<flag_alloc_if_is_array))) {
3812            LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "alloc_if");
3813            LIBOFFLOAD_ABORT;
3814        }
3815        if (!free_if.read_next(flags & (1<<flag_free_if_is_array))) {
3816            LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "free_if");
3817            LIBOFFLOAD_ABORT;
3818        }
3819        if (!align.read_next(flags & (1<<flag_align_is_array))) {
3820            LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "align");
3821            LIBOFFLOAD_ABORT;
3822        }
3823        if (!into_start.read_next(flags & (1<<flag_into_start_is_array))) {
3824            LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "into_extent start");
3825            LIBOFFLOAD_ABORT;
3826        }
3827        if (!into_elem.read_next(flags & (1<<flag_into_elements_is_array))) {
3828            LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "into_extent elements");
3829            LIBOFFLOAD_ABORT;
3830        }
3831        if (!alloc_start.read_next(flags & (1<<flag_alloc_start_is_array))) {
3832            LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "alloc_extent start");
3833            LIBOFFLOAD_ABORT;
3834        }
3835        if (!alloc_elem.read_next(
3836                 flags & (1<<flag_alloc_elements_is_array))) {
3837            LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "alloc_extent elements");
3838            LIBOFFLOAD_ABORT;
3839        }
3840
3841        m_vars[new_index + k].direction.bits = m_vars[i].direction.bits;
3842        m_vars[new_index + k].alloc_if = alloc_if.val;
3843        m_vars[new_index + k].free_if = free_if.val;
3844        m_vars[new_index + k].align = align.val;
3845        m_vars[new_index + k].mic_offset = 0;
3846        m_vars[new_index + k].flags.bits = m_vars[i].flags.bits;
3847        m_vars[new_index + k].offset = 0;
3848        m_vars[new_index + k].size = m_vars[i].size;
3849
3850        if (ext_start.val == 0) {
3851            m_vars[new_index + k].count = ext_elements.val;
3852            m_vars[new_index + k].ptr = ptr.val;
3853            if (type_src == c_string_ptr) {
3854                m_vars[new_index + k].size = 0;
3855            }
3856        }
3857        else {
3858            m_vars[new_index + k].count = 0;
3859            m_vars[new_index + k].ptr =
3860                static_cast<void*>(make_arr_desc(
3861                ptr.val,
3862                ext_start.val,
3863                ext_elements.val,
3864                m_vars[i].size));
3865
3866            type_src = type_src == c_data_ptr ? c_cean_var_ptr :
3867                                   c_string_ptr ? c_cean_var_ptr :
3868                                   type_src;
3869            if (!m_vars[i].into) {
3870                type_dst = type_src;
3871            }
3872        }
3873
3874        if (m_vars[i].into && into_elem.val != 0) {
3875            m_vars[new_index + k].into =
3876                static_cast<void*>(make_arr_desc(
3877                into.val,
3878                into_start.val,
3879                into_elem.val,
3880                m_vars[i].size));
3881            type_dst = (type == c_data_ptr_array) ? c_cean_var_ptr :
3882                       (type == c_string_ptr_array) ? c_cean_var_ptr :
3883                        type_src;
3884        }
3885        else {
3886            m_vars[new_index + k].into = NULL;
3887        }
3888
3889        if (alloc_elem.val != 0) {
3890            m_vars[new_index + k].alloc =
3891                static_cast<void*>(make_arr_desc(
3892                ptr.val,
3893                alloc_start.val,
3894                alloc_elem.val,
3895                m_vars[i].size));
3896        }
3897        else {
3898            m_vars[new_index + k].alloc = NULL;
3899        }
3900
3901        m_vars[new_index + k].type.src = type_src;
3902        m_vars[new_index + k].type.dst = type_dst;
3903
3904        m_vars_extra[new_index + k].is_arr_ptr_el = 1;
3905        m_vars_extra[new_index + k].ptr_arr_offset =
3906            src_is_for_mic ? ptr.offset : into.offset;
3907    }
3908    // count and alloc fields are useless at target. They can be reused
3909    // for pointer arrays.
3910    m_vars[i].count = pointers_number;
3911    m_vars[i].ptr_arr_offset = new_index;
3912    return true;
3913}
3914
3915static void __offload_fini_library(void)
3916{
3917    OFFLOAD_DEBUG_TRACE(2, "Cleanup offload library ...\n");
3918    if (mic_engines_total > 0) {
3919        delete[] mic_engines;
3920
3921        if (mic_proxy_fs_root != 0) {
3922            free(mic_proxy_fs_root);
3923            mic_proxy_fs_root = 0;
3924        }
3925
3926        if (mic_library_path != 0) {
3927            free(mic_library_path);
3928            mic_library_path = 0;
3929        }
3930
3931        // destroy thread key
3932        thread_key_delete(mic_thread_key);
3933    }
3934
3935    // unload COI library
3936    if (COI::is_available) {
3937        COI::fini();
3938    }
3939
3940    OFFLOAD_DEBUG_TRACE(2, "Cleanup offload library ... done\n");
3941}
3942
// One-time initialization of the host offload library; it is passed to
// __offload_run_once by __offload_init_library.
//
// Steps, in order:
//   1. read trace / report / timer settings from the environment;
//   2. initialize COI and query the number of KNC devices (returns early,
//      before any engine is created, if either step fails);
//   3. build the set of usable physical devices (OFFLOAD_DEVICES, or every
//      device for which an engine handle can be obtained) and store its
//      size in mic_engines_total (returns early if the set is empty);
//   4. allocate the Engine table and assign logical/physical indexes;
//   5. read the MIC_* and offload tuning variables, create the thread data
//      key, query the cycle frequency and initialize ORSL.
//
// Note that a failure to create the thread key returns after the engine
// table has been allocated, skipping every step that follows it.
static void __offload_init_library_once(void)
{
    COIRESULT res;
    uint32_t num_devices;
    // bit p is set when physical device p may be used for offloading
    std::bitset<MIC_ENGINES_MAX> devices;

    prefix = report_get_message_str(c_report_host);

    // initialize trace
    const char *env_var = getenv(htrace_envname);
    if (env_var != 0 && *env_var != '\0') {
        int64_t new_val;
        if (__offload_parse_int_string(env_var, new_val)) {
            // only the low four bits of the value are kept;
            // an unparsable value is ignored without a diagnostic
            console_enabled = new_val & 0x0f;
        }
    }

    env_var = getenv(offload_report_envname);
    if (env_var != 0 && *env_var != '\0') {
        int64_t env_val;
        if (__offload_parse_int_string(env_var, env_val)) {
            // only the three known report levels are accepted
            if (env_val == OFFLOAD_REPORT_1 ||
                env_val == OFFLOAD_REPORT_2 ||
                env_val == OFFLOAD_REPORT_3) {
                offload_report_level = env_val;
            }
            else {
                LIBOFFLOAD_ERROR(c_invalid_env_report_value,
                                 offload_report_envname);
            }
        }
        else {
            LIBOFFLOAD_ERROR(c_invalid_env_var_int_value,
                             offload_report_envname);
        }
    }
    // the timer variable is consulted only when the report variable is
    // unset or empty and no report level is currently in effect
    else if (!offload_report_level) {
        env_var = getenv(timer_envname);
        if (env_var != 0 && *env_var != '\0') {
            timer_enabled = atoi(env_var);
        }
    }

    // initialize COI
    if (!COI::init()) {
        return;
    }

    // get number of devices installed in the system
    res = COI::EngineGetCount(COI_ISA_KNC, &num_devices);
    if (res != COI_SUCCESS) {
        return;
    }

    // clamp to the capacity of the 'devices' bitset
    if (num_devices > MIC_ENGINES_MAX) {
        num_devices = MIC_ENGINES_MAX;
    }

    // fill in the list of devices that can be used for offloading
    env_var = getenv("OFFLOAD_DEVICES");
    if (env_var != 0) {
        // "none" (compared case-insensitively) leaves the device set empty
        if (strcasecmp(env_var, "none") != 0) {
            // value is composed of comma separated physical device indexes
            // (tokenized on a private copy, which is freed below)
            char *buf = strdup(env_var);
            char *str, *ptr;
            for (str = strtok_r(buf, ",", &ptr); str != 0;
                 str = strtok_r(0, ",", &ptr)) {
                // convert string to an int
                int64_t num;
                if (!__offload_parse_int_string(str, num)) {
                    LIBOFFLOAD_ERROR(c_mic_init5);

                    // fallback to using all installed devices
                    // (any indexes collected so far are discarded and the
                    // rest of the list is not examined)
                    devices.reset();
                    for (int i = 0; i < num_devices; i++) {
                        devices.set(i);
                    }
                    break;
                }
                // an out-of-range index is reported and skipped; parsing
                // continues with the next token
                if (num < 0 || num >= num_devices) {
                    LIBOFFLOAD_ERROR(c_mic_init6, num);
                    continue;
                }
                devices.set(num);
            }
            free(buf);
        }
    }
    else {
        // use all available devices
        // (a device is included only if its engine handle can be obtained)
        for (int i = 0; i < num_devices; i++) {
            COIENGINE engine;
            res = COI::EngineGetHandle(COI_ISA_KNC, i, &engine);
            if (res == COI_SUCCESS) {
                devices.set(i);
            }
        }
    }

    mic_engines_total = devices.count();

    // no need to continue if there are no devices to offload to
    if (mic_engines_total <= 0) {
        return;
    }

    // initialize indexes for available devices
    // (l_idx is the dense logical index, p_idx the physical device index)
    mic_engines = new Engine[mic_engines_total];
    for (int p_idx = 0, l_idx = 0; p_idx < num_devices; p_idx++) {
        if (devices[p_idx]) {
            mic_engines[l_idx].set_indexes(l_idx, p_idx);
            l_idx++;
        }
    }

    // library search path for device binaries
    // (copied with strdup; released in __offload_fini_library)
    env_var = getenv("MIC_LD_LIBRARY_PATH");
    if (env_var != 0) {
        mic_library_path = strdup(env_var);
    }

    // memory size reserved for COI buffers
    // (unlike most variables below there is no empty-string check here, so
    // the value is handed to the parser whenever the variable is set)
    env_var = getenv("MIC_BUFFERSIZE");
    if (env_var != 0) {
        uint64_t new_size;
        if (__offload_parse_size_string(env_var, new_size)) {
            mic_buffer_size = new_size;
        }
        else {
            LIBOFFLOAD_ERROR(c_invalid_env_var_value, "MIC_BUFFERSIZE");
        }
    }

    // determine stacksize for the pipeline on the device
    // (accepted only if at least 16384 and a multiple of 4096)
    env_var = getenv("MIC_STACKSIZE");
    if (env_var != 0 && *env_var != '\0') {
        uint64_t new_size;
        if (__offload_parse_size_string(env_var, new_size) &&
            (new_size >= 16384) && ((new_size & 4095) == 0)) {
            mic_stack_size = new_size;
        }
        else {
            LIBOFFLOAD_ERROR(c_mic_init3);
        }
    }

    // proxy I/O
    env_var = getenv("MIC_PROXY_IO");
    if (env_var != 0 && *env_var != '\0') {
        int64_t new_val;
        if (__offload_parse_int_string(env_var, new_val)) {
            mic_proxy_io = new_val;
        }
        else {
            LIBOFFLOAD_ERROR(c_invalid_env_var_int_value, "MIC_PROXY_IO");
        }
    }
    // (copied with strdup; released in __offload_fini_library)
    env_var = getenv("MIC_PROXY_FS_ROOT");
    if (env_var != 0 && *env_var != '\0') {
        mic_proxy_fs_root = strdup(env_var);
    }

    // Prepare environment for the target process using the following
    // rules
    // - If MIC_ENV_PREFIX is set then any environment variable on the
    //   host which has that prefix are copied to the device without
    //   the prefix.
    //   All other host environment variables are ignored.
    // - If MIC_ENV_PREFIX is not set or if MIC_ENV_PREFIX="" then host
    //   environment is duplicated.
    env_var = getenv("MIC_ENV_PREFIX");
    if (env_var != 0 && *env_var != '\0') {
        mic_env_vars.set_prefix(env_var);

        int len = strlen(env_var);
        for (int i = 0; environ[i] != 0; i++) {
            // an entry is analyzed when it starts with the prefix, is not
            // MIC_LD_LIBRARY_PATH, and the character right after the prefix
            // is not '=' (i.e. its name is longer than the bare prefix)
            if (strncmp(environ[i], env_var, len) == 0 &&
                strncmp(environ[i], "MIC_LD_LIBRARY_PATH", 19) != 0 &&
                environ[i][len] != '=') {
                mic_env_vars.analyze_env_var(environ[i]);
            }
        }
    }

    // create key for thread data
    // (on failure everything below, including ORSL::init, is skipped)
    if (thread_key_create(&mic_thread_key, Engine::destroy_thread_data)) {
        LIBOFFLOAD_ERROR(c_mic_init4, errno);
        return;
    }

    // cpu frequency
    cpu_frequency = COI::PerfGetCycleFrequency();

    env_var = getenv(mic_use_2mb_buffers_envname);
    if (env_var != 0 && *env_var != '\0') {
        uint64_t new_size;
        if (__offload_parse_size_string(env_var, new_size)) {
            __offload_use_2mb_buffers = new_size;
        }
        else {
            LIBOFFLOAD_ERROR(c_invalid_env_var_value,
                             mic_use_2mb_buffers_envname);
        }
    }

    // the two async buffer settings below ignore an unparsable value
    // without reporting an error
    env_var = getenv(mic_use_async_buffer_write_envname);
    if (env_var != 0 && *env_var != '\0') {
        uint64_t new_size;
        if (__offload_parse_size_string(env_var, new_size)) {
            __offload_use_async_buffer_write = new_size;
        }
    }

    env_var = getenv(mic_use_async_buffer_read_envname);
    if (env_var != 0 && *env_var != '\0') {
        uint64_t new_size;
        if (__offload_parse_size_string(env_var, new_size)) {
            __offload_use_async_buffer_read = new_size;
        }
    }

    // mic initialization type
    env_var = getenv(offload_init_envname);
    if (env_var != 0 && *env_var != '\0') {
        if (strcmp(env_var, "on_offload") == 0) {
            __offload_init_type = c_init_on_offload;
        }
        else if (strcmp(env_var, "on_offload_all") == 0) {
            __offload_init_type = c_init_on_offload_all;
        }
        // "on_start" is recognized only when TARGET_WINNT is not defined;
        // otherwise it falls through to the error branch
#ifndef TARGET_WINNT
        else if (strcmp(env_var, "on_start") == 0) {
            __offload_init_type = c_init_on_start;
        }
#endif // TARGET_WINNT
        else {
            LIBOFFLOAD_ERROR(c_invalid_env_var_value, offload_init_envname);
        }
    }

    // active wait
    env_var = getenv(offload_active_wait_envname);
    if (env_var != 0 && *env_var != '\0') {
        int64_t new_val;
        if (__offload_parse_int_string(env_var, new_val)) {
            __offload_active_wait = new_val;
        }
        else {
            LIBOFFLOAD_ERROR(c_invalid_env_var_int_value,
                             offload_active_wait_envname);
        }
    }

    // omp device num
    // (negative values are rejected together with unparsable ones)
    env_var = getenv(omp_device_num_envname);
    if (env_var != 0 && *env_var != '\0') {
        int64_t new_val;
        if (__offload_parse_int_string(env_var, new_val) && new_val >= 0) {
            __omp_device_num = new_val;
        }
        else {
            LIBOFFLOAD_ERROR(c_omp_invalid_device_num_env,
                             omp_device_num_envname);
        }
    }

    // init ORSL
    ORSL::init();
}
4212
4213extern int __offload_init_library(void)
4214{
4215    // do one time intialization
4216    static OffloadOnceControl ctrl = OFFLOAD_ONCE_CONTROL_INIT;
4217    __offload_run_once(&ctrl, __offload_init_library_once);
4218
4219    // offload is available if COI is available and the number of devices > 0
4220    bool is_available = COI::is_available && (mic_engines_total > 0);
4221
4222    // register pending libraries if there are any
4223    if (is_available && __target_libs) {
4224        mutex_locker_t locker(__target_libs_lock);
4225
4226        for (TargetImageList::iterator it = __target_libs_list.begin();
4227             it != __target_libs_list.end(); it++) {
4228            // Register library in COI
4229            COI::ProcessRegisterLibraries(1, &it->data, &it->size,
4230                                          &it->origin, &it->offset);
4231
4232            // add lib to all engines
4233            for (int i = 0; i < mic_engines_total; i++) {
4234                mic_engines[i].add_lib(*it);
4235            }
4236        }
4237
4238        __target_libs = false;
4239        __target_libs_list.clear();
4240    }
4241
4242    return is_available;
4243}
4244
4245extern "C" void __offload_register_image(const void *target_image)
4246{
4247    const struct Image *image = static_cast<const struct Image*>(target_image);
4248
4249    // decode image
4250    const char *name = image->data;
4251    const void *data = image->data + strlen(image->data) + 1;
4252    uint64_t    size = image->size;
4253    const char *origin = 0;
4254    uint64_t    offset = 0;
4255
4256    // our actions depend on the image type
4257    const Elf64_Ehdr *hdr = static_cast<const Elf64_Ehdr*>(data);
4258    switch (hdr->e_type) {
4259        case ET_EXEC:
4260            // Each offload application is supposed to have only one target
4261            // image representing target executable.
4262            // No thread synchronization is required here as the initialization
4263            // code is always executed in a single thread.
4264            if (__target_exe != 0) {
4265                LIBOFFLOAD_ERROR(c_multiple_target_exes);
4266                exit(1);
4267            }
4268            __target_exe = new TargetImage(name, data, size, origin, offset);
4269
4270            // Registration code for execs is always called from the context
4271            // of main and thus we can safely call any function here,
4272            // including LoadLibrary API on windows. This is the place where
4273            // we do the offload library initialization.
4274            if (__offload_init_library()) {
4275                // initialize engine if init_type is on_start
4276                if (__offload_init_type == c_init_on_start) {
4277                    for (int i = 0; i < mic_engines_total; i++) {
4278                        mic_engines[i].init();
4279                    }
4280                }
4281            }
4282            break;
4283
4284        case ET_DYN:
4285            // Registration code for libraries is called from the DllMain
4286            // context (on windows) and thus we cannot do anything usefull
4287            // here. So we just add it to the list of pending libraries for
4288            // the later use.
4289            __target_libs_lock.lock();
4290            __target_libs = true;
4291            __target_libs_list.push_back(TargetImage(name, data, size,
4292                                                     origin, offset));
4293            __target_libs_lock.unlock();
4294            break;
4295
4296        default:
4297            // something is definitely wrong, issue an error and exit
4298            LIBOFFLOAD_ERROR(c_unknown_binary_type);
4299            exit(1);
4300    }
4301}
4302
4303extern "C" void __offload_unregister_image(const void *target_image)
4304{
4305    // Target image is packed as follows:
4306    //      8 bytes                - size of the target binary
4307    //      null-terminated string - binary name
4308    //      <size> bytes           - binary contents
4309    const struct Image {
4310         int64_t size;
4311         char data[];
4312    } *image = static_cast<const struct Image*>(target_image);
4313
4314    // decode image
4315    const char *name = image->data;
4316    const void *data = image->data + strlen(image->data) + 1;
4317
4318    // our actions depend on the image type
4319    const Elf64_Ehdr *hdr = static_cast<const Elf64_Ehdr*>(data);
4320    if (hdr->e_type == ET_EXEC) {
4321        // We are executing exec's desctructors.
4322        // It is time to do a library cleanup.
4323        if (timer_enabled) {
4324            Offload_Timer_Print();
4325        }
4326
4327#ifdef MYO_SUPPORT
4328        __offload_myoFini();
4329#endif // MYO_SUPPORT
4330
4331        __offload_fini_library();
4332    }
4333}
4334
4335// Runtime trace interface for user programs
4336
// Sets the console trace level at run time by storing 'level' in
// console_enabled. The value is stored as given; unlike the value read
// from the environment during initialization, it is not masked.
void __offload_console_trace(int level)
{
    console_enabled = level;
}
4341
4342// User-visible offload API
4343
4344int _Offload_number_of_devices(void)
4345{
4346    __offload_init_library();
4347    return mic_engines_total;
4348}
4349
// This implementation unconditionally returns -1.
// NOTE(review): presumably -1 tells the caller that the code is not
// running on a device - confirm against the API documentation.
int _Offload_get_device_number(void)
{
    const int no_device = -1;
    return no_device;
}
4354
// This implementation unconditionally returns -1.
// NOTE(review): presumably -1 tells the caller that the code is not
// running on a device - confirm against the API documentation.
int _Offload_get_physical_device_number(void)
{
    const int no_physical_device = -1;
    return no_physical_device;
}
4359
4360int _Offload_signaled(int index, void *signal)
4361{
4362    __offload_init_library();
4363
4364    // check index value
4365    if (index < 0 || mic_engines_total <= 0) {
4366        LIBOFFLOAD_ERROR(c_offload_signaled1, index);
4367        LIBOFFLOAD_ABORT;
4368    }
4369
4370    // find associated async task
4371    OffloadDescriptor *task =
4372        mic_engines[index % mic_engines_total].find_signal(signal, false);
4373    if (task == 0) {
4374        LIBOFFLOAD_ERROR(c_offload_signaled2, signal);
4375        LIBOFFLOAD_ABORT;
4376    }
4377
4378    return task->is_signaled();
4379}
4380
4381void _Offload_report(int val)
4382{
4383    if (val == OFFLOAD_REPORT_ON ||
4384        val == OFFLOAD_REPORT_OFF) {
4385        offload_report_enabled = val;
4386    }
4387}
4388
// IDB support
// Globals with external linkage for debugger support. They are only
// defined here; the code visible in this part of the file does not read or
// modify them.
// NOTE(review): presumably the debugger and/or other parts of the library
// access them by name - confirm before renaming or changing their types.
int   __dbg_is_attached = 0;                         // initially 0
int   __dbg_target_id = -1;                          // initially -1
pid_t __dbg_target_so_pid = -1;                      // initially -1
char  __dbg_target_exe_name[MAX_TARGET_NAME] = {0};  // initially empty
// version of the debugger interface: 1.0
const int __dbg_api_major_version = 1;
const int __dbg_api_minor_version = 0;
4396
// Intentionally empty.
// NOTE(review): presumably a hook on which the debugger places a
// breakpoint to learn that a target shared object was loaded - confirm.
void __dbg_target_so_loaded()
{
}
// Intentionally empty.
// NOTE(review): presumably a hook on which the debugger places a
// breakpoint to learn that a target shared object was unloaded - confirm.
void __dbg_target_so_unloaded()
{
}
4403