1/* Barrelfish THC language extensions */
2
3/*
4 * Copyright (c) 2015, ETH Zurich.
5 * Copyright (c) 2015, Hewlett Packard Enterprise Development LP.
6 * All rights reserved.
7 *
8 * This file is distributed under the terms in the attached LICENSE file.
9 * If you do not find this file, copies can be found by writing to:
10 * ETH Zurich D-INFK, Universitaetstrasse 6, CH-8092 Zurich. Attn: Systems Group.
11 */
12
13#ifndef _THC_INTERNAL_H_
14#define _THC_INTERNAL_H_
15
16/***********************************************************************/
17
// Forward declarations of the core runtime types (defined below).
typedef struct ptstate_t PTState_t;
// NOTE(review): "stack_t" collides with the POSIX stack_t from
// <signal.h> on some platforms -- confirm this header is never
// included alongside it, or consider renaming the tag.
typedef struct stack_t stack_t;
typedef struct finish_t finish_t;
21
22// Definition of an AWE, asynchronous work element.  This definition must
23// match the assembly-language definitions at the bottom of thc.c which
24// access fields in the AWE structure.
25
// Lifecycle status of an AWE.  CALL_CONT creates EAGER_AWE entries and
// CALL_CONT_LAZY creates LAZY_AWE entries (see the macros below); the
// two *_LAZY_STACK states track lazy stack allocation via
// awe_t.lazy_stack -- exact transitions live in thc.c.
enum awe_status {
  EAGER_AWE = 0,
  LAZY_AWE,
  NEEDS_LAZY_STACK,
  ALLOCATED_LAZY_STACK
};
32
struct awe_t {
  // Fields representing the code to run when the AWE is executed:
  // saved instruction pointer, frame pointer and stack pointer.
  // (x86 register names are used regardless of architecture.)
  void  *eip;
  void  *ebp;
  void  *esp;

  // Can be EAGER_AWE, LAZY_AWE, NEEDS_LAZY_STACK or ALLOCATED_LAZY_STACK
  enum awe_status status;

  // Stack which is lazily allocated if the caller yields to this AWE.
  void  *lazy_stack;

  // Link from an AWE to the per-thread state for the thread it
  // runs in.
  PTState_t *pts;

  // Link from an AWE to the immediately-enclosing finish
  finish_t *current_fb;

  // Fields used by the runtime system to link together AWEs, e.g.,
  // on a thread's run-queue, or on a list of waiters on a
  // synchronization object.
  awe_t *prev;
  awe_t *next;
};
58
59/***********************************************************************/
60
61// Definition of a finish block's data structure.
62//
63// Finish blocks are held on a linked list threaded through the start_node
64// and end_node fields.  The blocks dynamically nested within a given
65// finish block are held between these two nodes.  (This enables easy
66// iteration through all these dynamically nested blocks).
67
typedef struct finish_list_t finish_list_t;

// Node in the doubly-linked list threading finish blocks together;
// each finish_t embeds two of these (its start_node and end_node),
// and fb points back at the owning finish block.
struct finish_list_t {
  finish_list_t  *prev;   // previous node in the list
  finish_list_t  *next;   // next node in the list
  finish_t       *fb;     // finish block that owns this node
};
75
// Runtime state of one "do {} finish" block.  Field semantics below
// are partly inferred from names -- confirm against thc.c where noted.
struct finish_t {
  void           *old_sp;    /* stack pointer when entering do {} finish */
  unsigned long   count;     /* outstanding async calls -- TODO confirm */
  awe_t          *finish_awe;       /* AWE waiting for the block to drain -- TODO confirm */
  int             fb_kind;          /* kind passed to _thc_startfinishblock */
  int             cancel_requested; /* presumably set by _thc_do_cancel_request -- confirm */
  finish_list_t   start_node;       /* start sentinel on the finish-block list */
  finish_list_t   end_node;         /* end sentinel on the finish-block list */
  finish_t       *enclosing_fb;     /* dynamically-enclosing finish block */
  void           *enclosing_lazy_stack; /* lazy stack active at entry -- TODO confirm */
  cancel_item_t  *cancel_item;      /* cancel item attached to this block -- TODO confirm */
};
88
89/***********************************************************************/
90
91// Per-thread runtime system state
92
// A free stack kept on the pts->free_stacks list for re-use.  The link
// appears to be stored in the (otherwise unused) stack memory itself --
// NOTE(review): confirm against _thc_allocstack/_thc_freestack in thc.c.
struct stack_t {
  stack_t *next;
};
96
struct ptstate_t {

  // Thread-local fields: .............................................

  // Head/tail sentinels of the dispatch list
  awe_t aweHead;
  awe_t aweTail;

  // Immediately-enclosing finish block for the currently running code
  finish_t *current_fb;

  // Initialization / termination flags
  int doneInit;
  int shouldExit;

  // Stack that the thread's dispatch loop will run on
  void *dispatchStack;

  // If we are running on a lazily allocated stack, this will point to its start
  void *curr_lazy_stack;

  // Function to execute whenever the dispatch loop is idle (e.g.,
  // to block the thread until an incoming message which might change
  // the state of the dispatch loop).
  THCIdleFn_t idle_fn;
  void *idle_args;
  void *idle_stack;

  // Stack to be de-allocated on the next execution of the dispatch loop
  // (an async call terminates by re-entering the dispatch loop with
  // pendingFree set to the stack it was using.  It cannot deallocate
  // its own stack while it is in use).
  void *pendingFree;

  // AWE to enter for the dispatch loop on this thread
  awe_t dispatch_awe;

  // Free stacks for re-use
  stack_t *free_stacks;


#ifndef NDEBUG
  // Debugging statistics (counters only; not consulted by the
  // runtime's control flow)
  int stackMemoriesAllocated;
  int stackMemoriesDeallocated;
  int stacksAllocated;
  int stacksDeallocated;
  int finishBlocksStarted;
  int finishBlocksEnded;
  int asyncCallsStarted;
  int asyncCallsEnded;
  int aweCreated;
  int aweResumed;
  int idleStarted;
  int idleComplete;
  int cancelsRequested;
  int cancelsAdded;
  int cancelsRun;
  int cancelsRemoved;
  int getTls;
  int lock;
  int sendCount;
  int recvCount;
#endif

  // Shared fields: ...................................................

  // Latch protecting the dispatch list
  struct thc_latch latch;

  // Head/tail sentinels of the remote dispatch list on which other
  // threads place AWEs that they have unblocked but which belong to
  // this thread
  awe_t aweRemoteHead;
  awe_t aweRemoteTail;
};
173
// Continuation function passed to _thc_callcont: invoked with the
// just-captured AWE ("cont") and the caller-supplied argument.
typedef void (*THCContFn_t)(void *cont, void *args);

// Runtime entry points implemented in thc.c (some with assembly
// support).  NOTE(review): one-line summaries below are inferred from
// names and the macro call sites in this header -- confirm in thc.c.
void *_thc_allocstack(void);                 // obtain a stack (fresh or re-used)
void _thc_freestack(void *s);                // return a stack for re-use
void _thc_onaltstack(void *s, void *fn, void *args); // run fn(args) on stack s
void _thc_startasync(void *f, void *stack);
void _thc_endasync(void *f, void *s);
void _thc_startfinishblock(finish_t *fb, int fb_kind);
void _thc_endfinishblock(finish_t *fb, void *stack);
void _thc_do_cancel_request(finish_t *fb);
// Capture the current continuation into *awe, then call fn(awe, args);
// returns_twice because the saved continuation may later be resumed.
void _thc_callcont(awe_t *awe, THCContFn_t fn, void *args) __attribute__((returns_twice));
int  _thc_schedulecont(awe_t *awe) __attribute__((returns_twice));
void _thc_lazy_awe_marker(void);             // marker return address for lazy AWEs
void _thc_pendingfree(void);                 // free the stack stashed in pendingFree
188
189/***********************************************************************/
190
191// Symbols declared in the .text.nx section
192
// Section boundary symbols provided by the linker script; only their
// addresses are meaningful, never their int values.
extern int _start_text_nx;
extern int _end_text_nx;
195
196/***********************************************************************/
197
/* Macro to force callee-saves to be spilled to the stack */

#if defined(__x86_64__)
// x86-64 SysV callee-saves (rbx, r12-r15); rbp is omitted since it is
// in use as the frame pointer.
#define KILL_CALLEE_SAVES()						\
  __asm__ volatile ("" : : : "rbx", "r12", "r13", "r14", "r15",         \
		    "memory", "cc")
#elif defined(__i386__)
#ifdef __pic__
// Under PIC, ebx holds the GOT pointer and must not be clobbered.
#define KILL_CALLEE_SAVES()					        \
  __asm__ volatile ("" : : : "edi", "esi", "esp", "memory", "cc")
#else
#define KILL_CALLEE_SAVES()						\
  __asm__ volatile ("" : : : "ebx", "edi", "esi", "esp", "memory", "cc")
#endif
#elif defined(__arm__)
// see ARM Procedure Call Standard (APCS): 5.1 Machine Registers
// NB: gcc complains about clobbering two registers:
//  . v8 (i.e., r11), is the frame pointer in ARM and cannot be clobbered
//  . v6 is the PIC register
//
// NOTE(review): the bare "31" entry below looks like a typo for "s31"
// (the list runs s16..s30 and then "31") -- confirm on an ARM build
// before changing the clobber list.
#if defined(__pic__)
    #define KILL_CALLEE_SAVES()                                           \
    __asm__ volatile ("" : : : "sp",                                      \
                         "v1", "v2", "v3", "v4", "v5", "v7",              \
                         "s16", "s17", "s18", "s19", "s20", "s21", "s22", \
                         "s23", "s24", "s25", "s26", "s27", "s28", "s29", \
                         "s30", "31",                                     \
                         "memory")
#else // same as before, but including v6
    #define KILL_CALLEE_SAVES()                                           \
    __asm__ volatile ("" : : : "sp",                                      \
                         "v1", "v2", "v3", "v4", "v5", "v6", "v7",        \
                         "s16", "s17", "s18", "s19", "s20", "s21", "s22", \
                         "s23", "s24", "s25", "s26", "s27", "s28", "s29", \
                         "s30", "31",                                     \
                         "memory")

#endif
#elif defined(__aarch64__)
// AArch64 callee-saved GPRs x19-x28.
// NOTE(review): "31" is not a valid AArch64 register name (x31 is
// sp/xzr) and looks copied from the ARM list above -- confirm whether
// it should be dropped or replaced (e.g. by the d8-d15 FP saves).
    #define KILL_CALLEE_SAVES()                                           \
    __asm__ volatile ("" : : :                                      \
                         "x19", "x20", "x21", "x22", "x23", "x24", "x25",        \
                         "x26", "x27", "x28", \
                         "31",                                     \
                         "memory")

#else
#error "Need definition of KILL_CALLEE_SAVES"
#endif

// NOTE(review): identifiers starting with a double underscore are
// reserved for the implementation; kept for compatibility with
// existing users of __WORD_SIZE.
#define __WORD_SIZE (sizeof(void*))
249
250
251/***********************************************************************/
252
253#ifdef CONFIG_LAZY_THC
254
255/***********************************************************************/
256
#if defined(__x86_64__)
/* Force args on stack - there must be a better way of doing this, but */
/* regparam(0) doesn't work on x86_64                                  */
#define FORCE_ARGS_STACK      void*__a, void*__b, void*__c, void*__d, \
                              void*__e, void*__f,
#define FORCE_ARGS_STACK_CALL NULL, NULL, NULL, NULL, NULL, NULL,
#elif defined(__i386__)
/* On i386 all arguments are already passed on the stack. */
#define FORCE_ARGS_STACK
#define FORCE_ARGS_STACK_CALL
#elif defined(__arm__) || defined(__aarch64__)
/* Lazy THC is not implemented on ARM/AArch64 (see the porting notes  */
/* further down); fail loudly if these macros are ever expanded.      */
/* Note: this branch also matched __aarch64__ before, so the old      */
/* duplicate "#elif defined(__aarch64__)" branch was unreachable and  */
/* has been removed.                                                  */
#define FORCE_ARGS_STACK assert(0 && "THC not yet implemented on ARM")
#define FORCE_ARGS_STACK_CALL assert(0 && "THC not yet implemented on ARM")
#else
#error "Need definition of FORCE_ARGS_STACK"
#endif
275
#define FORCE_FRAME_POINTER_USE                                         \
    /* Do a zero byte alloca to force local variable access via ebp  */ \
    /* Note, this does not add any code (even with -O0).             */ \
    __builtin_alloca(0)
280
// GET_STACK_POINTER stores the current stack pointer into the given
// memory operand; RESTORE_OLD_STACK_POINTER loads it back into sp.
#if defined(__x86_64__)
#define GET_STACK_POINTER(STACK_PTR)					\
  __asm__ volatile ("movq %%rsp, %0       \n\t"				\
		    : "=m"(STACK_PTR) : )
#define RESTORE_OLD_STACK_POINTER(OLD_STACK_PTR)			\
  __asm__ volatile ("movq %0, %%rsp       \n\t"				\
		    : : "m"(OLD_STACK_PTR))
#elif defined(__i386__)
#define GET_STACK_POINTER(STACK_PTR)					\
  __asm__ volatile ("movl %%esp, %0       \n\t"				\
		    : "=m"(STACK_PTR) : )
#define RESTORE_OLD_STACK_POINTER(OLD_STACK_PTR)			\
  __asm__ volatile ("movl %0, %%esp       \n\t"				\
		    : : "m"(OLD_STACK_PTR))
#elif defined(__arm__) || defined(__aarch64__)
#define GET_STACK_POINTER(_) assert(0 && "THC not yet implemented on ARM")
#define RESTORE_OLD_STACK_POINTER(_) assert(0 && "THC not yet implemented on ARM")
#else
#error "Need definition of GET_STACK_POINTER and RESTORE_OLD_STACK_POINTER"
#endif
301
302
#if defined(__x86_64__) || defined(__i386__)
// INIT_LAZY_AWE() is used in the beginning of the nested function in ASYNC_.
// The signature of the nested function is:
//   void _thc_nested_async(FORCE_ARGS_STACK awe_t *awe)
//
// So in INIT_LAZY_AWE, the stack in x86 looks like:
//  sp ->
//        .......
//  rbp-> [ saved rbp ] rbp[0]
//        [ RET ]       rbp[1]
//        [ awe ]       rbp[2] (passed as first arg)
//
// FRAME_PTR is treated as a word pointer, hence the +0/+1/+2 offsets.
#define THC_LAZY_FRAME_PREV(FRAME_PTR) *((FRAME_PTR)+0)
#define THC_LAZY_FRAME_RET(FRAME_PTR)  *((FRAME_PTR)+1)
#define THC_LAZY_FRAME_AWE(FRAME_PTR)  *((FRAME_PTR)+2)
#endif
318
#if defined(__x86_64__)
// Save our return address and saved frame pointer into the AWE, then
// overwrite the on-stack return address with LAZY_MARKER so the
// dispatcher can detect (and lazily initialize) this AWE later.
#define INIT_LAZY_AWE(AWE_PTR, LAZY_MARKER)				\
  __asm__ volatile (							\
    " movq 8(%%rbp), %%rsi       \n\t"					\
    " movq %%rsi,    0(%0)       \n\t" /* RIP   (our return address) */	\
    " movq 0(%%rbp), %%rsi       \n\t"					\
    " movq %%rsi,    8(%0)       \n\t" /* RBP                        */	\
    " movq %1,       8(%%rbp)    \n\t" /* put marker as ret address  */ \
    : : "r"((AWE_PTR)), "r"((LAZY_MARKER)) : "rsi" );
// Tear down the current frame and jump (not call) to the continuation.
#define RETURN_CONT(JMP_ADDR)			                        \
  __asm__ volatile (							\
    " movq %rbp, %rsp            \n\t" /* free frame                 */ \
    " popq %rbp                  \n\t" /* restore rbp                */ \
    " addq $8, %rsp              \n\t" /* pop old ret address        */ \
    " jmp  " JMP_ADDR "          \n\t" /* jump to continuation       */ \
    );
#elif defined(__i386__)
// i386 versions of the same two macros (4-byte words, eip/ebp/esp).
#define INIT_LAZY_AWE(AWE_PTR, LAZY_MARKER)				\
  __asm__ volatile (							\
    " movl 4(%%ebp), %%esi       \n\t"					\
    " movl %%esi,    0(%0)       \n\t" /* EIP   (our return address) */	\
    " movl 0(%%ebp), %%esi       \n\t"					\
    " movl %%esi,    4(%0)       \n\t" /* EBP                        */	\
    " movl %1,       4(%%ebp)    \n\t" /* put marker as ret address  */ \
    : : "r"((AWE_PTR)), "r"((LAZY_MARKER)) : "esi" );
#define RETURN_CONT(JMP_ADDR)			                        \
  __asm__ volatile (							\
    " movl %ebp, %esp            \n\t" /* free frame                 */ \
    " popl %ebp                  \n\t" /* restore ebp                */ \
    " addl $4, %esp              \n\t" /* clean up stack for callee  */ \
    " jmp  " JMP_ADDR "          \n\t" /* jump to continuation       */ \
    );
#elif defined(__arm__) || defined(__aarch64__)

// *** NOTEs for the adventurous: porting lazy THC to ARM
//
// INIT_LAZY_AWE puts a marker in place of the returned address, which is saved
// in the awe structure. check_for_lazy_awe() checks for this  marker and lazily
// initializes an awe if needed.
//
// In ARM, the caller passes the return address via lr and not the stack.
// Gcc (4.7) usually compiles functions the following way:
//   mov     ip, sp
//   push    {rXX, rYY, fp, ip, lr, pc}
//   sub     fp, ip, #4
//   ....
//   ldm     sp, {rXX, rYY, fp, sp, pc}
//
// So the return address is pushed on the stack by the callee, but I'm not sure
// how consistent is this even if we only consider gcc.
//
// check_for_lazy_awe() and init_lazy_awe() also need to change.

// NOTE(review): this branch also covers 32-bit __arm__, although the
// assertion text below only mentions AARCH64.
#define INIT_LAZY_AWE(_) assert(0 && "THC not yet implemented on AARCH64")
#define RETURN_CONT(_) assert(0 && "THC not yet implemented on AARCH64")
#define GET_LAZY_AWE(_) assert(0 && "THC not yet implemented on AARCH64")
#else
#error "Need definition of INIT_LAZY_AWE & GET_LAZY_AWE"
#endif
378
379/***********************************************************************/
380
// Spill callee-saves, then invoke the nested function with the dummy
// on-stack arguments (lazy version; see FORCE_ARGS_STACK_CALL above).
#define SCHEDULE_CONT(_AWE_PTR, NESTED_FUNC)			\
  ({								\
    KILL_CALLEE_SAVES();					\
    NESTED_FUNC(FORCE_ARGS_STACK_CALL _AWE_PTR);               \
  })

// Capture the current continuation into an eagerly-initialized AWE on
// the stack and call _FN(&awe, _ARG) via _thc_callcont.
#define CALL_CONT(_FN,_ARG)                                     \
  do {                                                          \
    awe_t _awe;                                                 \
    _awe.status     = EAGER_AWE;				\
    _awe.lazy_stack = NULL;					\
    KILL_CALLEE_SAVES();                                        \
    _thc_callcont(&_awe, (THCContFn_t)(_FN), (_ARG));           \
  } while (0)


// As CALL_CONT, but the AWE is marked LAZY_AWE and will be filled in
// lazily (see INIT_LAZY_AWE above).
#define CALL_CONT_LAZY(_FN,_ARG)                                \
  do {                                                          \
    awe_t _awe;                                                 \
    _awe.status     = LAZY_AWE;					\
    _awe.lazy_stack = NULL;					\
    KILL_CALLEE_SAVES();                                        \
    _thc_callcont(&_awe, (THCContFn_t)(_FN), (_ARG));           \
  } while (0)
405
406/***********************************************************************/
407
408#else /* EAGER_THC */
409
410/***********************************************************************/
411
// The lazy-stack helpers are not required in the eager version; they
// are defined as no-ops so shared code can use them unconditionally.
#define FORCE_FRAME_POINTER_USE      /* Not used */ do {} while(0)
#define GET_STACK_POINTER(_)         /* Not used */
#define RESTORE_OLD_STACK_POINTER(_) /* Not used */
416
417
// SWIZZLE_DEF:
//  - _NAME: name of the function
//  - _NS:   new stack, address just above top of committed region
//  - _FN:   (nested) function to call:  void _FN(void)
//
// Each variant saves the old stack pointer at the top of the new
// stack, switches to the new stack, calls _FN, then restores the old
// stack pointer from where it was saved.

#if (defined(__x86_64__) && (defined(linux) || defined(BARRELFISH)))
#define SWIZZLE_DEF_(_NAME,_NS,_FN)                                     \
  __attribute__((noinline)) void _NAME(void) {                          \
    __asm__ volatile("movq %0, %%rdi      \n\t" /* put NS to %rdi   */  \
                     "subq $8, %%rdi      \n\t" /* fix NS address   */  \
                     "movq %%rsp, (%%rdi) \n\t" /* store sp to NS   */  \
                     "movq %%rdi, %%rsp   \n\t" /* set sp to NS     */  \
                     "call " _FN "        \n\t" /* call _FN         */  \
                     "popq %%rsp          \n\t" /* restore old sp   */  \
                     :                                                  \
                     : "m" (_NS)                                        \
                     : "memory", "cc", "rsi", "rdi");                   \
  }
#define SWIZZLE_DEF(_NAME,_NS,_FN) SWIZZLE_DEF_(_NAME,_NS,_FN)
#elif (defined(__i386__) && (defined(linux) || defined(BARRELFISH)))
// NOTE(review): "eax" is listed as clobbered although the asm only
// uses %edx directly; presumably it guards against _FN's return-value
// register -- confirm before trimming the clobber list.
#define SWIZZLE_DEF(_NAME,_NS,_FN)                                      \
  __attribute__((noinline)) void _NAME(void) {                          \
    __asm__ volatile("movl %0, %%edx           \n\t"			\
                     "subl $4, %%edx           \n\t"			\
                     "movl %%esp, (%%edx)      \n\t"			\
                     "movl %%edx, %%esp        \n\t"			\
                     "call " _FN "             \n\t"			\
                     "pop %%esp                \n\t"			\
                     :							\
                     : "m" (_NS)                                        \
                     : "memory", "cc", "eax", "edx");			\
  }
#elif defined(__arm__) && (defined(linux) || defined(BARRELFISH))

// Notes:
// - ARM Architecture Reference Manual ARMv7-A and ARMv7-R:
//   STMDB:
//   "The SP and PC can be in the list in ARM code, but not in Thumb code.
//   However, ARM instructions that include the SP or the PC in the list are
//   deprecated."
// - This can probably be optimized
//
#define SWIZZLE_DEF(_NAME, _NS, _FN)                                          \
    __attribute__((noinline)) void _NAME(void) {                              \
    __asm__ volatile("ldr r0, %0      \n\t" /* set r0 to new stack */         \
                     "mov r1, sp      \n\t" /* set r1 to old stack */         \
                     "stmdb r0!, {r1} \n\t" /* save old stack to new stack */ \
                     "mov sp, r0      \n\t" /* set sp to new stack */         \
                     "bl " _FN "      \n\t" /* call _FN */                    \
                     "ldmia sp, {r1}  \n\t" /* old stack pointer to r1 */     \
                     "mov sp, r1      \n\t" /* restore stack pointer */       \
                     :                                                        \
                     : "m" (_NS)                                              \
                     : "memory", "r0", "r1");                                 \
    }
#elif defined(__aarch64__) && (defined(linux) || defined(BARRELFISH))

// - NYI
#define SWIZZLE_DEF(_NAME, _NS, _FN) assert(0 && "THC not yet implemented on AARCH64")

#else
#error "No definition of SWIZZLE_DEF for THC"
#endif
481
482/***********************************************************************/
483
// Spill callee-saves and hand the AWE to _thc_schedulecont (which may
// return twice: once now, and once when the AWE is resumed).
#define SCHEDULE_CONT(_AWE_PTR)                 \
  ({                                            \
    KILL_CALLEE_SAVES();                        \
    _thc_schedulecont((awe_t*)_AWE_PTR);        \
  })

// Capture the current continuation and call _FN(&awe, _ARG).
// NOTE(review): unlike the lazy version, _awe.status/_awe.lazy_stack
// are left uninitialized here; presumably the eager path never reads
// them -- confirm against thc.c.
#define CALL_CONT(_FN,_ARG)                                     \
  do {                                                          \
    awe_t _awe;                                                 \
    KILL_CALLEE_SAVES();                                        \
    _thc_callcont(&_awe, (THCContFn_t)(_FN), (_ARG));           \
  } while (0)

// no lazy CALL_CONT in the eager version
#define CALL_CONT_LAZY CALL_CONT
499
500#endif // LAZY / EAGER THC
501
502#endif // _THC_INTERNAL_H_
503