1/*
2    Copyright (c) 2014 Intel Corporation.  All Rights Reserved.
3
4    Redistribution and use in source and binary forms, with or without
5    modification, are permitted provided that the following conditions
6    are met:
7
8      * Redistributions of source code must retain the above copyright
9        notice, this list of conditions and the following disclaimer.
10      * Redistributions in binary form must reproduce the above copyright
11        notice, this list of conditions and the following disclaimer in the
12        documentation and/or other materials provided with the distribution.
13      * Neither the name of Intel Corporation nor the names of its
14        contributors may be used to endorse or promote products derived
15        from this software without specific prior written permission.
16
17    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21    HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28*/
29
30
31#ifndef OFFLOAD_ENGINE_H_INCLUDED
32#define OFFLOAD_ENGINE_H_INCLUDED
33
34#include <limits.h>
35
36#include <list>
37#include <set>
38#include <map>
39#include "offload_common.h"
40#include "coi/coi_client.h"
41
// Address range: the half-open interval [start, start + length).
class MemRange {
public:
    // Empty range located at the null address.
    MemRange() : m_start(0), m_length(0) {}

    // Range of len bytes beginning at addr.
    MemRange(const void *addr, uint64_t len) : m_start(addr), m_length(len) {}

    // First address of the range.
    const void* start() const { return m_start; }

    // Address one byte past the last byte of the range.
    const void* end() const {
        const char *first_byte = static_cast<const char*>(m_start);
        return first_byte + m_length;
    }

    // Size of the range in bytes.
    uint64_t length() const { return m_length; }

    // Returns true if this range and other share at least one address.
    // Two ranges A[start, end) and B[start, end) overlap exactly when
    // A.start < B.end and A.end > B.start.
    bool overlaps(const MemRange &other) const {
        if (!(start() < other.end())) {
            return false;
        }
        return end() > other.start();
    }

    // Returns true if other lies entirely inside this range.
    bool contains(const MemRange &other) const {
        if (!(start() <= other.start())) {
            return false;
        }
        return other.end() <= end();
    }

private:
    const void* m_start;
    uint64_t    m_length;
};
76
77// Data associated with a pointer variable
78class PtrData {
79public:
80    PtrData(const void *addr, uint64_t len) :
81        cpu_addr(addr, len), cpu_buf(0),
82        mic_addr(0), alloc_disp(0), mic_buf(0), mic_offset(0),
83        ref_count(0), is_static(false)
84    {}
85
86    //
87    // Copy constructor
88    //
89    PtrData(const PtrData& ptr):
90        cpu_addr(ptr.cpu_addr), cpu_buf(ptr.cpu_buf),
91        mic_addr(ptr.mic_addr), alloc_disp(ptr.alloc_disp),
92        mic_buf(ptr.mic_buf), mic_offset(ptr.mic_offset),
93        ref_count(ptr.ref_count), is_static(ptr.is_static)
94    {}
95
96    bool operator<(const PtrData &o) const {
97        // Variables are sorted by the CPU start address.
98        // Overlapping memory ranges are considered equal.
99        return (cpu_addr.start() < o.cpu_addr.start()) &&
100               !cpu_addr.overlaps(o.cpu_addr);
101    }
102
103    long add_reference() {
104        if (is_static) {
105            return LONG_MAX;
106        }
107#ifndef TARGET_WINNT
108        return __sync_fetch_and_add(&ref_count, 1);
109#else // TARGET_WINNT
110        return _InterlockedIncrement(&ref_count) - 1;
111#endif // TARGET_WINNT
112    }
113
114    long remove_reference() {
115        if (is_static) {
116            return LONG_MAX;
117        }
118#ifndef TARGET_WINNT
119        return __sync_sub_and_fetch(&ref_count, 1);
120#else // TARGET_WINNT
121        return _InterlockedDecrement(&ref_count);
122#endif // TARGET_WINNT
123    }
124
125    long get_reference() const {
126        if (is_static) {
127            return LONG_MAX;
128        }
129        return ref_count;
130    }
131
132public:
133    // CPU address range
134    const MemRange  cpu_addr;
135
136    // CPU and MIC buffers
137    COIBUFFER       cpu_buf;
138    COIBUFFER       mic_buf;
139
140    // placeholder for buffer address on mic
141    uint64_t        mic_addr;
142
143    uint64_t        alloc_disp;
144
145    // additional offset to pointer data on MIC for improving bandwidth for
146    // data which is not 4K aligned
147    uint32_t        mic_offset;
148
149    // if true buffers are created from static memory
150    bool            is_static;
151    mutex_t         alloc_ptr_data_lock;
152
153private:
154    // reference count for the entry
155    long            ref_count;
156};
157
// List of pointers to PtrData entries
typedef std::list<PtrData*> PtrDataList;
159
160// Data associated with automatic variable
161class AutoData {
162public:
163    AutoData(const void *addr, uint64_t len) :
164        cpu_addr(addr, len), ref_count(0)
165    {}
166
167    bool operator<(const AutoData &o) const {
168        // Variables are sorted by the CPU start address.
169        // Overlapping memory ranges are considered equal.
170        return (cpu_addr.start() < o.cpu_addr.start()) &&
171               !cpu_addr.overlaps(o.cpu_addr);
172    }
173
174    long add_reference() {
175#ifndef TARGET_WINNT
176        return __sync_fetch_and_add(&ref_count, 1);
177#else // TARGET_WINNT
178        return _InterlockedIncrement(&ref_count) - 1;
179#endif // TARGET_WINNT
180    }
181
182    long remove_reference() {
183#ifndef TARGET_WINNT
184        return __sync_sub_and_fetch(&ref_count, 1);
185#else // TARGET_WINNT
186        return _InterlockedDecrement(&ref_count);
187#endif // TARGET_WINNT
188    }
189
190    long get_reference() const {
191        return ref_count;
192    }
193
194public:
195    // CPU address range
196    const MemRange cpu_addr;
197
198private:
199    // reference count for the entry
200    long ref_count;
201};
202
// Set of automatic variables, ordered by AutoData::operator< (entries with
// overlapping CPU ranges count as the same element)
typedef std::set<AutoData> AutoSet;
205
// Target image data: plain record describing one library image. It only
// stores the pointers it is given; nothing is copied or freed here.
struct TargetImage
{
    TargetImage(const char *lib_name,
                const void *contents,
                uint64_t    contents_size,
                const char *origin_file,
                uint64_t    origin_offset) :
        name(lib_name),
        data(contents),
        size(contents_size),
        origin(origin_file),
        offset(origin_offset)
    {}

    // library name
    const char* name;

    // image contents and their size
    const void* data;
    uint64_t    size;

    // file the image came from and the offset within that file
    const char* origin;
    uint64_t    offset;
};
226
// List of target images (TargetImage records stored by value)
typedef std::list<TargetImage> TargetImageList;
228
229// Data associated with persistent auto objects
230struct PersistData
231{
232    PersistData(const void *addr, uint64_t routine_num, uint64_t size) :
233        stack_cpu_addr(addr), routine_id(routine_num)
234    {
235        stack_ptr_data = new PtrData(0, size);
236    }
237    // 1-st key value - begining of the stack at CPU
238    const void *   stack_cpu_addr;
239    // 2-nd key value - identifier of routine invocation at CPU
240    uint64_t   routine_id;
241    // corresponded PtrData; only stack_ptr_data->mic_buf is used
242    PtrData * stack_ptr_data;
243    // used to get offset of the variable in stack buffer
244    char * cpu_stack_addr;
245};
246
// List of PersistData records (stored by value)
typedef std::list<PersistData> PersistDataList;
248
249// class representing a single engine
250struct Engine {
251    friend void __offload_init_library_once(void);
252    friend void __offload_fini_library(void);
253
254#define check_result(res, tag, ...) \
255    { \
256        if (res == COI_PROCESS_DIED) { \
257            fini_process(true); \
258            exit(1); \
259        } \
260        if (res != COI_SUCCESS) { \
261            __liboffload_error_support(tag, __VA_ARGS__); \
262            exit(1); \
263        } \
264    }
265
266    int get_logical_index() const {
267        return m_index;
268    }
269
270    int get_physical_index() const {
271        return m_physical_index;
272    }
273
274    const COIPROCESS& get_process() const {
275        return m_process;
276    }
277
278    // initialize device
279    void init(void);
280
281    // add new library
282    void add_lib(const TargetImage &lib)
283    {
284        m_lock.lock();
285        m_ready = false;
286        m_images.push_back(lib);
287        m_lock.unlock();
288    }
289
290    COIRESULT compute(
291        const std::list<COIBUFFER> &buffers,
292        const void*         data,
293        uint16_t            data_size,
294        void*               ret,
295        uint16_t            ret_size,
296        uint32_t            num_deps,
297        const COIEVENT*     deps,
298        COIEVENT*           event
299    );
300
301#ifdef MYO_SUPPORT
302    // temporary workaround for blocking behavior for myoiLibInit/Fini calls
303    void init_myo(COIEVENT *event) {
304        COIRESULT res;
305        res = COI::PipelineRunFunction(get_pipeline(),
306                                       m_funcs[c_func_myo_init],
307                                       0, 0, 0, 0, 0, 0, 0, 0, 0,
308                                       event);
309        check_result(res, c_pipeline_run_func, m_index, res);
310    }
311
312    void fini_myo(COIEVENT *event) {
313        COIRESULT res;
314        res = COI::PipelineRunFunction(get_pipeline(),
315                                       m_funcs[c_func_myo_fini],
316                                       0, 0, 0, 0, 0, 0, 0, 0, 0,
317                                       event);
318        check_result(res, c_pipeline_run_func, m_index, res);
319    }
320#endif // MYO_SUPPORT
321
322    //
323    // Memory association table
324    //
325    PtrData* find_ptr_data(const void *ptr) {
326        m_ptr_lock.lock();
327        PtrSet::iterator res = m_ptr_set.find(PtrData(ptr, 0));
328        m_ptr_lock.unlock();
329        if (res == m_ptr_set.end()) {
330            return 0;
331        }
332        return const_cast<PtrData*>(res.operator->());
333    }
334
335    PtrData* insert_ptr_data(const void *ptr, uint64_t len, bool &is_new) {
336        m_ptr_lock.lock();
337        std::pair<PtrSet::iterator, bool> res =
338            m_ptr_set.insert(PtrData(ptr, len));
339        PtrData* ptr_data = const_cast<PtrData*>(res.first.operator->());
340        m_ptr_lock.unlock();
341
342        is_new = res.second;
343        if (is_new) {
344            // It's necessary to lock as soon as possible.
345            // unlock must be done at call site of insert_ptr_data at
346            // branch for is_new
347            ptr_data->alloc_ptr_data_lock.lock();
348        }
349        return ptr_data;
350    }
351
352    void remove_ptr_data(const void *ptr) {
353        m_ptr_lock.lock();
354        m_ptr_set.erase(PtrData(ptr, 0));
355        m_ptr_lock.unlock();
356    }
357
358    //
359    // Automatic variables
360    //
361    AutoData* find_auto_data(const void *ptr) {
362        AutoSet &auto_vars = get_auto_vars();
363        AutoSet::iterator res = auto_vars.find(AutoData(ptr, 0));
364        if (res == auto_vars.end()) {
365            return 0;
366        }
367        return const_cast<AutoData*>(res.operator->());
368    }
369
370    AutoData* insert_auto_data(const void *ptr, uint64_t len) {
371        AutoSet &auto_vars = get_auto_vars();
372        std::pair<AutoSet::iterator, bool> res =
373            auto_vars.insert(AutoData(ptr, len));
374        return const_cast<AutoData*>(res.first.operator->());
375    }
376
377    void remove_auto_data(const void *ptr) {
378        get_auto_vars().erase(AutoData(ptr, 0));
379    }
380
381    //
382    // Signals
383    //
384    void add_signal(const void *signal, OffloadDescriptor *desc) {
385        m_signal_lock.lock();
386        m_signal_map[signal] = desc;
387        m_signal_lock.unlock();
388    }
389
390    OffloadDescriptor* find_signal(const void *signal, bool remove) {
391        OffloadDescriptor *desc = 0;
392
393        m_signal_lock.lock();
394        {
395            SignalMap::iterator it = m_signal_map.find(signal);
396            if (it != m_signal_map.end()) {
397                desc = it->second;
398                if (remove) {
399                    m_signal_map.erase(it);
400                }
401            }
402        }
403        m_signal_lock.unlock();
404
405        return desc;
406    }
407
408    // stop device process
409    void fini_process(bool verbose);
410
411    // list of stacks active at the engine
412    PersistDataList m_persist_list;
413
414private:
415    Engine() : m_index(-1), m_physical_index(-1), m_process(0), m_ready(false),
416               m_proc_number(0)
417    {}
418
419    ~Engine() {
420        if (m_process != 0) {
421            fini_process(false);
422        }
423    }
424
425    // set indexes
426    void set_indexes(int logical_index, int physical_index) {
427        m_index = logical_index;
428        m_physical_index = physical_index;
429    }
430
431    // start process on device
432    void init_process();
433
434    void load_libraries(void);
435    void init_ptr_data(void);
436
437    // performs library intialization on the device side
438    pid_t init_device(void);
439
440private:
441    // get pipeline associated with a calling thread
442    COIPIPELINE get_pipeline(void);
443
444    // get automatic vars set associated with the calling thread
445    AutoSet& get_auto_vars(void);
446
447    // destructor for thread data
448    static void destroy_thread_data(void *data);
449
450private:
451    typedef std::set<PtrData> PtrSet;
452    typedef std::map<const void*, OffloadDescriptor*> SignalMap;
453
454    // device indexes
455    int         m_index;
456    int         m_physical_index;
457
458    // number of COI pipes created for the engine
459    long        m_proc_number;
460
461    // process handle
462    COIPROCESS  m_process;
463
464    // If false, device either has not been initialized or new libraries
465    // have been added.
466    bool        m_ready;
467    mutex_t     m_lock;
468
469    // List of libraries to be loaded
470    TargetImageList m_images;
471
472    // var table
473    PtrSet      m_ptr_set;
474    mutex_t     m_ptr_lock;
475
476    // signals
477    SignalMap m_signal_map;
478    mutex_t   m_signal_lock;
479
480    // constants for accessing device function handles
481    enum {
482        c_func_compute = 0,
483#ifdef MYO_SUPPORT
484        c_func_myo_init,
485        c_func_myo_fini,
486#endif // MYO_SUPPORT
487        c_func_init,
488        c_func_var_table_size,
489        c_func_var_table_copy,
490        c_funcs_total
491    };
492    static const char* m_func_names[c_funcs_total];
493
494    // device function handles
495    COIFUNCTION m_funcs[c_funcs_total];
496
497    // int -> name mapping for device signals
498    static const int   c_signal_max = 32;
499    static const char* c_signal_names[c_signal_max];
500};
501
502#endif // OFFLOAD_ENGINE_H_INCLUDED
503