1\documentclass[a4paper,twoside]{report} % for a report (default)
2
3\usepackage{amsmath}
4\usepackage{bftn} % You need this
5\usepackage{cite}
6\usepackage{color}
7\usepackage{listings}
8
9\title{Inter-dispatcher communication in Barrelfish}   % title of report
10\author{Andrew Baumann}	% author
11\tnnumber{011}  % give the number of the tech report
12\tnkey{IDC} % Short title, will appear in footer
13
14% \date{Month Year} % Not needed - will be taken from version history
15
16\begin{document}
17\maketitle
18
19%
20% Include version history first
21%
22\begin{versionhistory}
23\vhEntry{0.0}{06.08.2010}{AB}{Initial version}
24\vhEntry{0.1}{01.09.2010}{AB}{Documented Flounder IDL, stub API and waitset}
25\vhEntry{0.2}{21.10.2010}{AB}{Completed runtime support chapter}
26\vhEntry{0.3}{05.12.2011}{MN}{Added meta-parameter note, AHCI references}
27\end{versionhistory}
28
29% \intro{Abstract}		% Insert abstract here
30% \intro{Acknowledgements}	% Uncomment (if needed) for acknowledgements
31% \tableofcontents		% Uncomment (if needed) for final draft
32% \listoffigures		% Uncomment (if needed) for final draft
33% \listoftables			% Uncomment (if needed) for final draft
34
35\lstset{
36  language=C,
37  basicstyle=\ttfamily \small,
38  flexiblecolumns=false,
39  basewidth={0.5em,0.45em},
40  boxpos=t,
41}
42
43\newcommand{\note}[1]{[\textcolor{red}{\emph{#1}}]}
44
45\tableofcontents
46
47%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
48\chapter{Introduction}
49
50TODO
51
52\chapter{Flounder stub compiler}\label{cha:flounder}
53
54\section{Overview}
55
56\section{Interface definition language}\label{sec:idl}
57
58\subsection{Lexical conventions}
59The following convention, derived from Mackerel~\cite{btn002-mackerel}, has
60been adopted for Flounder. It is similar to the convention in
61modern-day programming languages like C and Java.
62
63\begin{description}
64\item[Whitespace:]  As in C and Java, Flounder considers sequences of
65  space, newline, tab, and carriage return characters to be
66  whitespace.  Whitespace is generally not significant. 
67
68\item[Comments:] Flounder supports C-style comments.  Single line comments
69  start with \texttt{//} and continue until the end of the line.
70  Multiline comments are enclosed between \texttt{/*} and \texttt{*/};
71  anything in between is ignored and treated as whitespace.
72
73\item[Identifiers:] Valid Flounder identifiers are sequences of numbers
74  (0-9), letters (a-z, A-Z) and the underscore character ``\texttt{\_}''.  They
75  must start with a letter or ``\texttt{\_}''.
76  \begin{align*}
77    identifier & \rightarrow ( letter \mid \_ ) (letter \mid digit \mid \_)^* \\
78    letter & \rightarrow (\textsf{A \ldots Z} \mid  \textsf{a \ldots z})\\
79    digit & \rightarrow (\textsf{0 \ldots 9})
80\end{align*}
81  
82\item[Integer Literals:] A Flounder integer literal is a sequence of
83  digits, optionally preceded by a radix specifier.  As in C, decimal (base 10)
84  literals have no specifier and hexadecimal literals start with
85  \texttt{0x}.
86\begin{align*}
87decimal & \rightarrow (\textsf{0 \ldots 9})^+\\
88hexadecimal & \rightarrow (\textsf{0x})(\textsf{0 \ldots 9} \mid \textsf{A
89\ldots F} \mid \textsf{a \ldots f})^+
90\end{align*}
91
92
93\item[Reserved words:] The following are reserved words in Flounder:
94\begin{alltt}\begin{tabbing}
95xxxxxxxxx \= xxxxxxxxx \= xxxxxxxxx \= xxxxxxxxx \= xxxxxxxxx \= xxxxxxxxx \kill
96alias \> call \> enum \> in \> interface \> message \\
97out \> response \> rpc \> struct \> typedef 
98\end{tabbing}\end{alltt}
99
100\item[Builtin types:] The following identifiers are recognised as builtin types:
101\begin{alltt}\begin{tabbing}
102xxxxxxxxxxxx \= xxxxxxxxxxxx \= xxxxxxxxxxxx \= xxxxxxxxxxxx \= xxxxxxxxxxxx \=
103xxxxxxxxxxxx \kill
104bool \> cap \> char \> give_away_cap \> int \> int8 \\
105int16 \> int32 \> int64 \> intptr \> iref \> size \\
106string \> uint8 \> uint16 \> uint32 \> uint64 \> uintptr
107\end{tabbing}\end{alltt}
108
109\item[Special characters:] The following characters are used as operators,
110  separators, terminators or for other special purposes in Flounder:
111\begin{alltt}
112  \{ \} [ ] ( ) ; ,
113\end{alltt}
114
115\end{description}
116
117
118\subsection{Interface specification}
119
120A Flounder input file must specify a single interface, of the form:
121
122\begin{syntax}
123interface \synit{name} ["\synit{description}"]
124\verb+{+
125  \synit{types};
126  \ldots
127  \synit{messages};
128  \ldots
129\verb+}+;
130\end{syntax}
131
132\begin{description}
133\item[name] is an identifier for the interface type, and will be used to
134  generate identifiers in the target language (typically C).  While
135  not currently enforced by the Flounder compiler, by
136  convention this should only differ from the name of the file in case
137  and suffix, e.g.\ the file \texttt{xapic.if} defines an interface type
138  of name \texttt{xAPIC}.
139
140\item[description] is a string literal in double quotes, which
141  describes the interface being specified, for example
142  \texttt{"Name service"}.
143\end{description}
144
145The description string is followed by a series of declarations, enclosed in
146curly brackets, which specify types, messages and their arguments. Each
147declaration must be one of the following:
148
149\begin{itemize}
150
151  \item A transparent type alias, using the \texttt{alias} declaration,
152    described in Section~\ref{sec:lang:alias}.
153
154  \item A type definition, using the \texttt{typedef} declaration,
155    described in Section~\ref{sec:lang:typedef}.
156
157  \item A simple message specification, using either the \texttt{message},
158    \texttt{call} or \texttt{response} declarations, describing a
159    single unidirectional message with typed arguments,
160    and described in Section~\ref{sec:lang:message}.
161
162  \item An \texttt{rpc} specification, which describes two related
163    messages (a call and the response) with a single declaration,
164    and described in Section~\ref{sec:lang:rpc}.
165
166\end{itemize}
167
168Type aliases and definitions (\texttt{alias} and \texttt{typedef}) are
169optional, but if present must precede all messages. There must be at least
170one message in every interface.
171
172\subsection{Transparent type alias}\label{sec:lang:alias}
173
174\begin{syntax}
175alias \synit{aliasname} \synit{builtintype};
176\end{syntax}
177
178This simply declares a given alias name that may henceforth be used in the
179interface file in place of the given builtin type. It does not alter the
180generated code.
181
182\subsection{Type definition}\label{sec:lang:typedef}
183
184In contrast to an alias, the \texttt{typedef} keyword causes a new type
185to be created in the context of both the interface file and the programming
186language where the generated stubs are used.
187
188\subsubsection{Structure}
189
190\begin{syntax}
191typedef struct \verb+{+
192  \synit{typename} \synit{fieldname};
193  \ldots
194\verb+}+ \synit{structname};
195\end{syntax}
196
197This declares a compound type, akin to a structure in C.
198
199\subsubsection{Fixed-size array}
200
201\begin{syntax}
202typedef \synit{typename} \synit{arrayname}\verb+[+\synit{size}\verb+]+;
203\end{syntax}
204
205This defines an array type with the given \emph{arrayname}, where
206\emph{typename} is a previously-defined type and \emph{size} is an integer
207literal. The name refers to an array of a fixed size.
208
209\subsubsection{Enumeration}
210
211\begin{syntax}
212typedef enum \verb+{+
213  \synit{fieldname},
214  \ldots
215\verb+}+ \synit{enumname};
216\end{syntax}
217
218An enumeration, as in C, is represented as an integer type that may take
219one of the predefined symbolic values.
220
221\subsubsection{Alias}
222
223\begin{syntax}
224typedef \synit{aliasname} \synit{typename};
225\end{syntax}
226
227This form creates a new name (in both Flounder and C) for a
228previously-defined or builtin type.
229
230
231\subsection{Simple message}\label{sec:lang:message}
232
233A simple message specification takes the form:
234
235\begin{syntax}
236@\synit{context}\verb+(+\synit{name}\verb+=+\synit{value}, \ldots \verb+)+ \synit{(optional)}
237message|call|response \synit{name} \verb+(+\synit{argdecl}, \ldots \verb+)+;
238\end{syntax}
239
240The three identifiers \texttt{message}, \texttt{call}, and \texttt{response}
241exist for legacy reasons, and are treated identically by the stub compiler.
242They may be used as a simple form of documentation, when messages are intended
243to travel in a particular direction between client and server.
244
245The meta-parameters beginning with the context identifier are optional and may
246be used by individual stub compilers for additional information about messages
247that is not part of the payload. Multiple such contexts may also be provided.
248
249Message arguments use a comma-separated C-like syntax, and may take one of the
250following forms:
251
252\begin{description}
253 \item[Simple arguments] are denoted by type name and argument name literals,
254separated by whitespace.
255 \item[Dynamic array arguments] include the type name, argument name, and size
256name literals, as follows:
257\begin{syntax}
258\synit{typename} \synit{argname}\verb+[+\synit{sizename}\verb+]+
259\end{syntax}
260\end{description}
261
262For example, the following defines a message named ``dummy'' with three
263arguments: an integer, a character string, and a dynamic array of unsigned 8-bit
264integers (i.e.\ a data buffer).
265
266\begin{example}
267  message dummy(int32 arg, string s, uint8 buf[buflen]);
268\end{example}
269
270\subsection{RPC}\label{sec:lang:rpc}
271
272\begin{syntax}
273@\synit{context}\verb+(+\synit{name}\verb+=+\synit{value}, \ldots \verb+)+ \synit{(optional)}
274rpc \synit{name} \verb+(+in|out \synit{argdecl}, \ldots \verb+)+;
275\end{syntax}
276
277An \texttt{rpc} declaration defines a pair of related one-way messages for the
278RPC \emph{call} and its \emph{response}. Argument declarations take the same
279syntax as for a simple message, but are prefixed with either \texttt{in} or
280\texttt{out}. All \texttt{in} arguments are arguments of the call, and all
281\texttt{out} arguments are arguments of the response.
282
283For example, the following defines an RPC that takes as input a string
284and returns an \texttt{errval} type (which must previously have been defined):
285
286\begin{example}
287  rpc testrpc(in string s, out errval reterr);
288\end{example}
289
290With the exception of its special treatment by the RPC client backend
291(described in Section~\ref{sec:rpcclient}), this RPC is exactly equivalent
292to the following message specifications:
293
294\begin{example}
295  message testrpc_call(string s);
296  message testrpc_response(errval reterr);
297\end{example}
298
299\section{Invocation}
300
301When invoked, Flounder processes a single interface file, and generates into a
302single output file the results of one or more \emph{backends}. The names of the
303input and output files and the backends to use are determined by the
304command-line arguments, which are as follows:
305
306\begin{verbatim}
307Usage: flounder [OPTION...] input.if output
308  -G       --generic-header    Create a generic header file
309           --generic-stub      Create generic part of stub implemention
310  -a ARCH  --arch=ARCH         Architecture for stubs
311  -i FILE  --import=FILE       Include a given file before processing
312           --lmp-header        Create a header file for LMP
313           --lmp-stub          Create a stub file for LMP
314           --ump-header        Create a header file for UMP
315           --ump-stub          Create a stub file for UMP
316           --rck-header        Create a header file for RCK
317           --rck-stub          Create a stub file for RCK
318           --bmp-header        Create a header file for BMP
319           --bmp-stub          Create a stub file for BMP
320           --loopback-header   Create a header file for loopback
321           --loopback-stub     Create a stub file for loopback
322           --rpcclient-header  Create a header file for RPC
323           --rpcclient-stub    Create a stub file for RPC
324           --msgbuf-header     Create a header file for message buffers
325           --msgbuf-stub       Create a stub file for message buffers
326  -T       --thc-header        Create a THC header file
327  -B       --thc-stubs         Create a THC stubs C file
328           --ahci-header       Create a header file for AHCI
329           --ahci-stub         Create a stub file for AHCI
330\end{verbatim}
331
332Zero or more files may be specified as \emph{imports}. An import file
333provides a simplistic mechanism for sharing type definitions among multiple
334interfaces: it consists of one or more \texttt{alias} or \texttt{typedef}
335statements. All import files are processed in the order in which they occur on
336the command line, before parsing of the interface definition file begins.
337Otherwise, types defined in import files are equivalent to types appearing in
338an interface definition.
339
340The architecture option must be specified when generating stubs for the LMP,
341UMP, or RCK backends; when specified, it must be one of the architectures
342Flounder understands (see \texttt{Arch.hs} in the flounder source), which is
343typically the same as the set of architectures supported by Barrelfish.
344
345All other options cause Flounder to emit the C code generated by one of its
346backends. The \emph{generic} backend defines the common interface and support
347code that is shared by all other backends, and is always required.
348Chapter~\ref{cha:hake} describes the location and contents of the generated
349files that are used in the Barrelfish build system.
350
351\chapter{Flounder stub API}
352
353Most Flounder backends generate message stubs for specific interconnect
354drivers, with implementations varying widely according to the properties of
355the interconnect driver. However, common to all interconnect drivers for a
356given interface is the interface to the \emph{binding object}, which allows
357the typed messages defined in the interface definition to be sent and received
358in an event-driven programming style. The interface to the binding object for
359a given interface is generated by the \texttt{--generic-header} option to
360Flounder, and is documented here.
361
362Most interaction with bindings involves interface-type-specific functions.
363In the examples that follow, we assume that a Flounder interface
364named \texttt{iface} is used, with the following specification:
365
366\begin{example}
367interface iface \{
368  message basic(uint32 arg);
369  message str(uint32 arg, string s);
370  message caps(uint32 arg, cap cap1, cap cap2);
371  message buf(uint8 buf[buflen]);
372\};
373\end{example}
374
375
376\section{Binding process}\label{sec:binding_bindingproc}
377
378In order to communicate, two dispatchers must establish a communication channel
379using a common interconnect driver, and must acquire binding objects for each
380endpoint of the channel. This process is known as \emph{binding}, and operates
381in two modes:
382
383\begin{enumerate}
384 \item A \emph{client} dispatcher initiates a binding to a service by calling
385       the interface's \emph{bind} function
386 \item A \emph{server} dispatcher accepts an incoming binding request from a
387       client, on a service which it has previously \emph{exported}
388\end{enumerate}
389
390In any binding attempt, one dispatcher must act as the server and the other as
391the client; however, once a binding is established, the binding objects and the
392process of communication on both sides of the binding are equivalent and
393indistinguishable.
394
395\subsection{Server-side: export}
396
397In order to accept binding requests, a dispatcher must first export a service.
398This is done by calling the \emph{export} function for the given interface:
399
400\begin{lstlisting}
401typedef void idc_export_callback_fn(void *st, errval_t err, iref_t iref);
402typedef errval_t iface_connect_fn(void *st, struct iface_binding *binding);
403
404errval_t iface_export(void *st, idc_export_callback_fn *export_cb,
405                      iface_connect_fn *connect_cb, struct waitset *ws,
406                      idc_export_flags_t flags);
407\end{lstlisting}
408
409The \lstinline+iface_export()+ function initiates the export of a new service.
410It takes a state pointer, export callback, connection callback, waitset
411pointer, and flags. If it returns success, the export has been successfully
412initiated.
413
414When the export either succeeds or fails, the export callback will be invoked. 
415It takes the state pointer (which was passed to \lstinline+iface_export()+), an
416error code indicating success or failure of the export, and an \emph{interface
417reference} (iref). The iref is only meaningful if the export succeeds; if this
418is the case, it identifies the exported service in such a way that a client can
419use the iref to initiate a binding to the service. In order for the service to
420be usable, the iref must be communicated to the potential clients in some
421manner; typically by registering it in the name service, which is described in
422Chapter~\ref{cha:nameservice}.
423
424Once a service has been exported, clients may attempt to bind to it. When this
425occurs, the connection callback will be invoked. This function takes the state
426pointer used at export time, and a pointer to the new partly-constructed binding
427object. If it returns success, the binding request will be accepted, and the
428binding process will complete. If it returns an error code indicating failure,
429the binding attempt will be rejected, and the error code will be returned to
430the client. When accepting a binding request, the connection callback also has
431the task of filling in the vtable of receive handlers, as described in
432Section~\ref{sec:binding_rx}, and the error handler, described in
433Section~\ref{sec:binding_errors}.
434
435\subsection{Client-side: bind}
436
437In order to initiate a binding, a client dispatcher calls the \emph{bind}
438function for a given interface:
439
440\begin{lstlisting}
441typedef void iface_bind_continuation_fn(void *st, errval_t err,
442                                        struct iface_binding *binding);
443
444errval_t iface_bind(iref_t i, iface_bind_continuation_fn *continuation,
445                    void *st, struct waitset *waitset, idc_bind_flags_t flags);
446\end{lstlisting}
447
448This function takes the iref for the service (obtained, for example, by a name
449service query), a bind continuation function, state pointer, waitset pointer,
450and flags. If it returns success, a binding attempt has been initiated.
451
452At some later time, when the binding either completes or fails, the bind
453continuation function will be run. This takes the state pointer passed to
454\lstinline+iface_bind()+, an error code indicating success or failure, and a
455pointer to the newly-constructed binding, which is only valid if the error code
456indicates success.
457
458\section{Binding objects}\label{sec:binding_objs}
459
460The end result of the binding process (either client-side or server-side) is a
461binding object, which is the abstract interface to a specific interconnect
462driver for a specific interface type. It is implemented in C as the
463\lstinline+struct iface_binding+ type, the defined portion of which is shown
464below, and described in the following sections:
465
466\begin{lstlisting}
467struct iface_binding {
468    /* Arbitrary user state pointer */
469    void *st;
470
471    /* Waitset used for receive handlers and send continuations */
472    struct waitset *waitset;
473
474    /* Mutex for the use of user code. */
475    /* Must be held before any operation where there is a possibility of
476     * concurrent access to the same binding (eg. multiple threads, or
477     * asynchronous event handlers that use the same binding object). */
478    struct event_mutex mutex;
479
480    /* returns true iff a message could currently be accepted by the binding */
481    iface_can_send_fn *can_send;
482
483    /* register an event for when a message is likely to be able to be sent */
484    iface_register_send_fn *register_send;
485
486    /* change the waitset used by a binding */
487    iface_change_waitset_fn *change_waitset;
488
489    /* perform control operations */
490    iface_control_fn *control;
491
492    /* error handler for any async errors associated with this binding */
493    /* must be filled-in by user */
494    iface_error_handler_fn *error_handler;
495
496    /* Incoming message handlers (filled in by user) */
497    struct iface_rx_vtbl rx_vtbl;
498};
499\end{lstlisting}
500
501The \lstinline+st+ pointer is available for users to associate arbitrary local
502state with each binding object. It is not used by the binding implementation.
503
504The \lstinline+waitset+ pointer identifies the waitset used by the binding for
505executing receive handler functions, send continuations, and internal logic. It
506is read-only for users of the binding; to change the actual waitset, the
507\lstinline+change_waitset()+ control function must be used.
508
509Binding implementations are not thread-safe, and it is the responsibility of
510user code to ensure that no concurrent access to the same binding occurs.
511The \lstinline+mutex+ field is not used by the binding implementation, but is
512provided for user code to ensure safety of concurrent accesses to the binding
513object. It is used, for example, to arbitrate access to the shared monitor
514binding, as described in Section~\ref{sec:monitor}.
515
516The remaining fields are documented in the following sections.
517
518
519\section{Receiving messages}\label{sec:binding_rx}
520
521When a message arrives, the binding implementation calls the \emph{receive
522handler} function located in the binding's \lstinline+rx_vtbl+ table of function
523pointers. This table has an entry for each message type, with arguments
524corresponding to those of the message, as shown below:
525
526\begin{lstlisting}
527/*
528 * Message type signatures (receive)
529 */
530typedef void iface_basic__rx_method_fn(struct iface_binding *_binding, uint32_t arg);
531typedef void iface_str__rx_method_fn(struct iface_binding *_binding, uint32_t arg, char *s);
532typedef void iface_caps__rx_method_fn(struct iface_binding *_binding, uint32_t arg,
533                                      struct capref cap1, struct capref cap2);
534typedef void iface_buf__rx_method_fn(struct iface_binding *_binding, uint8_t *buf, size_t buflen);
535
536/*
537 * VTable struct definition for the interface (receive)
538 */
539struct iface_rx_vtbl {
540    iface_basic__rx_method_fn *basic;
541    iface_str__rx_method_fn *str;
542    iface_caps__rx_method_fn *caps;
543    iface_buf__rx_method_fn *buf;
544};
545\end{lstlisting}
546
547It is the responsibility of the user to fill in the \lstinline+rx_vtbl+ when
548the binding is initialised, either in the connection callback on the
549server side, or in the bind continuation on the client side. This is typically
550achieved by copying a static table of function pointers. It is a fatal error
551if a message arrives on a binding and the handler function for that message
552type is uninitialised or NULL.
553
554Any message arguments passed by reference become the property of the user, and
555must be freed by the appropriate memory management mechanisms:
556
557\begin{itemize}
558 \item Strings and arrays live on the heap, and should be
559    released by a call to \lstinline+free()+
560 \item Capabilities refer to an occupied capability slot allocated from the
561    standard capability space allocator, and should be released by a call to
562    \lstinline+cap_destroy()+
563\end{itemize}
564
565
566\section{Sending messages}
567
568A message may be sent on the binding by calling the appropriate transmit
569function. For our example interface, these are as follows:
570
571\begin{lstlisting}
572errval_t iface_basic__tx(struct iface_binding *_binding, struct event_closure _continuation,
573                         uint32_t arg);
574errval_t iface_str__tx(struct iface_binding *_binding, struct event_closure _continuation,
575                       uint32_t arg, const char *s);
576errval_t iface_caps__tx(struct iface_binding *_binding, struct event_closure _continuation,
577                        uint32_t arg, struct capref cap1, struct capref cap2);
578errval_t iface_buf__tx(struct iface_binding *_binding, struct event_closure _continuation,
579                       const uint8_t *buf, size_t buflen);
580\end{lstlisting}
581
582Each function takes as arguments the binding pointer and a
583\emph{continuation} closure (described below), followed by the message payload.
584If it returns success, the message has been successfully enqueued for
585transmission, and will eventually be sent if no asynchronous errors occur.
586
587The continuation is optional. When specified, it provides a generic closure
588which will be executed after the given message has been successfully sent.
589Two macros are provided to construct continuations: \lstinline+NOP_CONT+
590provides a no-op continuation, to be used when this functionality is not
591required, whereas \lstinline+MKCONT(func,arg)+ constructs a one-off continuation
592given a function pointer and its argument.
593
594All binding implementations can buffer exactly one outgoing message. If another
595send function is called on the same binding while the buffer is full, the
596\lstinline+FLOUNDER_ERR_TX_BUSY+ error will be returned, indicating that the
597binding is not currently able to accept another message. In this case, the user
598must arrange to retry the send at a later time, typically from a send
599continuation function, or by using the \lstinline+register_send()+ control
600function (described below).
601
602Any message arguments passed by reference (strings, arrays, and capabilities)
603are borrowed by the binding, and must remain live for as long as the message
604is buffered but not yet completely sent. The send continuation may be used to
605determine when it is safe to reclaim the memory used by reference parameters.
606
607
608\section{Control functions}
609
610Four standard control functions are defined for all binding implementations,
611and are invoked through function pointers in the binding object:
612\lstinline+can_send+, \lstinline+register_send+, \lstinline+change_waitset+
613and \lstinline+control+.
614
615\subsubsection*{can\_send}
616
617\begin{lstlisting}
618typedef bool iface_can_send_fn(struct iface_binding *binding);
619\end{lstlisting}
620
621This function returns true if and only if the binding could currently accept
622an outgoing message for transmission. Note this does not indicate the message
623could immediately be delivered to the underlying channel (further queuing
624may be involved), simply that a send function would succeed.
625
626\subsubsection*{register\_send}
627
628\begin{lstlisting}
629typedef errval_t iface_register_send_fn(struct iface_binding *binding,
630                         struct waitset *ws, struct event_closure continuation);
631\end{lstlisting}
632
633This function arranges for the given continuation closure to be invoked on the
634given waitset when the binding is able to accept another outgoing message
635(i.e.\ when \lstinline+can_send+ would next return true).
636It may fail if a continuation is already registered and has not yet executed;
637only one continuation may be registered at a time.
638
639\subsubsection*{change\_waitset}
640
641\begin{lstlisting}
642typedef errval_t iface_change_waitset_fn(struct iface_binding *binding, struct waitset *ws);
643\end{lstlisting}
644
645This function changes the waitset used by the binding for receive handlers,
646send continuations, and running its internal event handlers.
647
648\subsubsection*{control}
649
650\begin{lstlisting}
651typedef errval_t iface_control_fn(struct iface_binding *binding, idc_control_t control);
652\end{lstlisting}
653
654This function provides a generic control mechanism. The currently-defined
655control operations are:
656
657\begin{description}
658 \item[\texttt{IDC\_CONTROL\_TEARDOWN}] Initiate connection teardown.
659 \item[\texttt{IDC\_CONTROL\_SET\_SYNC}] Enable synchronous optimisations.
660   See below.
661 \item[\texttt{IDC\_CONTROL\_CLEAR\_SYNC}] Disable synchronous optimisations.
662   Currently only implemented for the LMP interconnect driver, this operation
663   disables the (default) behaviour of yielding the CPU timeslice to the
664   receiver on every message send.
665\end{description}
666
667
668\section{Error handling}\label{sec:binding_errors}
669
670The binding structure contains an \lstinline+error_handler+ function pointer
671field, of the following type:
672
673\begin{lstlisting}
674typedef void iface_error_handler_fn(struct iface_binding *binding, errval_t err);
675\end{lstlisting}
676
677This function is called to report any asynchronous errors occurring on the
678binding, including connection teardown, and should be filled-in by the user
679during the binding process (i.e.\ whenever the \lstinline+rx_vtbl+ is installed).
680The default implementation of the error handler for every binding simply
681prints a message and terminates the current dispatcher.
682
683
684\chapter{Runtime support}
685
686This chapter describes the IDC runtime support that is part of
687\texttt{libbarrelfish} which is used by the Flounder-generated stubs.
688
689\section{Waitsets and event handling}
690\label{sec:waitset}
691
692At the core of the Barrelfish event handling mechanism lies the \emph{waitset},
693represented by the \lstinline+struct waitset+ type in C. Informally, waitsets
694facilitate the rendezvous of \emph{threads} with \emph{events}, where an event
695is represented simply as a closure (function and argument pointers), as follows:
696
697\begin{lstlisting}
698struct event_closure {
699    void (*handler)(void *arg);
700    void *arg;
701};
702\end{lstlisting}
703
704These events are typically raised by message channels in response to activity,
705such as the ability to receive a message on a specific channel. In the
706following we introduce the abstract properties of channels as they interact
707with the waitset; the concrete channel interfaces are specific to each
708interconnect driver and are described in the respective parts of
709Chapter~\ref{cha:icds}.
710
711All waitsets are local to a specific dispatcher, and may not be accessed from
712other dispatchers. Although there is no limit to the number of waitsets used,
713there always exists one \emph{default waitset} for each dispatcher, which
714may be located by calling the following function:
715
716\begin{lstlisting}
717struct waitset *get_default_waitset(void);
718\end{lstlisting}
719
720
721\subsection{(Abstract) event channels}
722
723All events managed by a waitset originate in \emph{channels},\footnote{The
724term ``channel'' is now something of a misnomer in this context, as events are
725no longer used exclusively for message channels.} represented internally by the
726\lstinline+struct waitset_chanstate+ type. This struct is typically embedded
727into the state of a real message channel or other event mechanism.
728
729Channels are used to track the registration of handlers for and occurrence
730of a single specific event. Multiple events (such as the ability to send a
731message, as distinct from the ability to receive a message) are supported by
732distinct channels -- an interconnect driver will typically use multiple event
733channels in its implementation. Moreover, each event channel supports only one
734registration -- registration of multiple event handlers on a single event may
735be implemented above this mechanism.
736
737Every channel exists in one of the following states:
738
739\begin{description}
740 \item[Unregistered:] the channel is initialised, but no event handler has been
741              registered, and it is not associated with a waitset
742 \item[Idle:] an event handler closure has been registered with the
743              channel on a specific waitset, but the event has not yet occurred
744 \item[Polled:] this is similar to an idle channel, in that an event has been
745              registered and not yet occurred, but due to the nature of the
746              channel implementation, it must be periodically polled for
747              activity to detect the occurrence of the event
748 \item[Pending:] the event for which a handler was registered has occurred, and
749              the channel is now waiting to deliver the event
750\end{description}
751
752The following operations on channels are used by interconnect drivers to change
753their state in the waitset:
754
755\begin{description}
756 \item[Register:] Associates an event handler with a previously-unregistered
757               channel, and moves it to either the \emph{idle} or \emph{polled}
758               state
759 \item[Deregister:] Cancels the previously-registered event handler of a
760               channel, moving it from either \emph{idle}, \emph{polled} or
761               \emph{pending} state back to unregistered
 \item[Trigger:] Signals occurrence of an event on an \emph{idle} or
               \emph{polled} channel, and changes it to the \emph{pending} state
764\end{description}
765
766These operations are available only to interconnect drivers, using
767the ``private'' API defined in \texttt{waitset\_chan.h}.
768
Once a channel is in the \emph{pending} state, it will eventually be
\emph{delivered} to a user of the waitset (a thread calling a function such as
\lstinline+get_next_event()+ or \lstinline+event_dispatch()+), unless it is
deregistered beforehand. Once the event is delivered, the channel
reverts to the \emph{unregistered} state. Event registrations are therefore
single-shot; an event handler function that wishes to handle future events will
typically re-register itself.
776
777
778\subsection{Waitset API}
779
780\subsubsection{Waitset creation/destruction}
781
782\begin{lstlisting}
783void waitset_init(struct waitset *ws);
784errval_t waitset_destroy(struct waitset *ws);
785\end{lstlisting}
786
787These functions initialise the state of a new waitset, or destroy the state
788of an existing waitset.
789
790\subsubsection{Event dispatch}
791
792\begin{lstlisting}
793errval_t get_next_event(struct waitset *ws, struct event_closure *retclosure);
794\end{lstlisting}
795
796This function is the core of the event handling mechanism. It returns the next
797pending event on the given waitset, if necessary by blocking the calling
798thread until an event is triggered.
799
800\begin{lstlisting}
801errval_t event_dispatch(struct waitset *ws);
802\end{lstlisting}
803
804This function is a wrapper for \lstinline+get_next_event()+ which also invokes
805the event (by calling \lstinline+event.handler(event.arg)+) before returning.
806It is typically used within an infinite loop on an event-handling thread.
807
808
809\subsection{Implementation}
810
811Internally, the waitset maintains a queue of waiting threads, and three queues
812of channels: one each for channels in the \emph{idle}, \emph{pending} and
813\emph{polled} state. The majority of the implementation consists of manipulating
these queues, and implementing the state transitions described above for
event channels.
816
817The core logic of the waitset lies in the \lstinline+get_next_event()+
818function, whose operation is as follows:
819
820\begin{enumerate}
821 \item If there are any channels in the pending queue, remove the next channel
822   from the queue, mark it unregistered, and return its event closure.
823 \item Otherwise, if there are any channels in the polled queue, and no other
824   thread is already polling channels on this waitset, poll channels in the
   polled queue for as long as the pending queue remains empty. The
826   polling mechanism is presently used only by the UMP interconnect driver,
827   and is described in more detail in Section~\ref{sec:ump_icd}.
828 \item Otherwise, block on the queue of waiting threads. When unblocked
829   (either when a channel becomes pending or when needed to begin polling),
830   resume from step 1.
831\end{enumerate}
832
833\section{Interconnect drivers}
834\label{sec:icds}
835
836Interconnect drivers implement the runtime support needed by specific
837message transport mechanisms. Chapter~\ref{cha:icds} describes the specifics
838of each interconnect driver. Although there is no uniform interface to the
839interconnect drivers, they typically support the following:
840
841\begin{itemize}
842 \item Binding (connection setup), as described in Section~\ref{sec:binding}
843 \item A mechanism to \emph{send} a message on a channel
844 \item A mechanism to \emph{receive} messages from a channel
845 \item An interface to the waitset, allowing the user to \emph{register} for
846       event notifications when it is possible to send or receive on a channel
847 \item Various channel-specific control operations
848\end{itemize}
849
850In general, messages at this level of the system are word-granularity
851fixed-size (or variable-size with an upper bound) untyped data.
852
853The interconnect drivers that are available in a given domain are determined by
854the compile-time configuration and processor architecture. The only driver
855that is always present is local message passing (LMP), which provides
856kernel-mediated communication with other dispatchers on the same core,
857and is used to communicate with the local Monitor.
858
859\section{Export and binding}
860\label{sec:binding}
861
862The export and binding process for all interconnect drivers is mediated by
863the monitors. Section~\ref{sec:binding_bindingproc} documents the user-level
864API provided by the binding objects, while here we describe what happens under
865the covers.
866
867The monitors are responsible for allocating interface references, or irefs,
868tracking the association of irefs to the dispatcher implementing that service,
869and mediating the bind process when a client wishes to bind to an iref.
870In order to boot-strap this mechanism, every dispatcher is always created
871with a binding to its local monitor, as described in Section~\ref{sec:monitor}.
872
873\subsection{Export}
874
875In order to export a new service, a dispatcher sends an
876\lstinline+alloc_iref_request+ message to its local monitor, passing an opaque
877service identifier (which is unique to the dispatcher). The monitor constructs
878an iref, and returns it to the user along with the service identifier in an
879\lstinline+alloc_iref_reply+ message.
880
881\subsection{Binding}
882
883In order to bind, a client dispatcher on a core presents the iref to its local
884monitor, and requests to bind with a specific interconnect driver. Although
885different interconnect drivers use different messages and parameters, the
886process is very similar for all, so by way of example we present here
887the binding mechanism for the UMP interconnect driver.
888
889\begin{figure}
890  \includegraphics[width=\columnwidth]{figures/ump_bind}
891  \caption{UMP bind process}\label{fig:ump_bind}
892\end{figure}
893
Figure~\ref{fig:ump_bind} shows the steps in setting up a UMP binding:
895\begin{enumerate}
896 \item The client sends \lstinline+bind_ump_client_request+ to its local monitor,
897      with the iref, the client's (opaque) connection ID, and various parameters
898      that are specific to the interconnect driver. For UMP, this includes a
899      capability to a shared frame, which will be used for message transfer.
900 \item The local monitor determines (from the iref) the core on which the
901      service resides, and forwards the request to the monitor on that core
902      using a \lstinline+bind_ump_request+ message (NB: mislabelled
903      \lstinline+remote_bind_request+ in the figure).
904 \item The monitor on the server's core determines the local dispatcher which
905      has exported this iref, and forwards the bind request to it using a
906      \lstinline+bind_ump_service_request+ message. It also provides the
907      opaque service identifier which the dispatcher gave when originally
908      exporting the service.
909 \item The server dispatcher uses the service identifier to locate and invoke
910      the connection callback for the given service. Assuming this succeeds, it
911      allocates its local state for the binding (e.g. the binding object). It
912      then replies to its local monitor with a
913      \lstinline+bind_ump_reply_monitor+ message (labelled
914      \lstinline+bind_service_reply+ in the figure).
 \item The server's monitor proxies the reply back to the monitor on the client's core.
916 \item The monitor on the client core proxies the reply back to the client,
917      providing the client's connection ID, which allows it to locate the
918      binding object and run the bind callback. The binding has now completed.
919\end{enumerate}
920
921The binding mechanisms for other interconnect drivers (e.g. LMP, BMP) are
922similar, in that they involve an exchange of capabilities between client and
923server mediated by the monitors. Once the binding is established, direct
924communication can take place using the exchanged resources.
925
926\section{Indirect capability transfer}
927\label{sec:capxfer}
928
929Because capabilities are maintained as references to per-core state in the
930CPU drivers, only the LMP interconnect driver, which traverses kernel-mode
931code, can directly deliver a capability along with untyped message payload.
932For other interconnect drivers, such as UMP, capabilities travel out-of-band
933from message payload through the monitors.
934
935The process of indirect transmission of capabilities is as follows:
936\begin{enumerate}
937 \item The binding object waits to acquire a mutex on its local monitor binding.
938     This is necessary, because it must send a message to its local monitor, and
939     the local monitor binding is shared by all bindings (and binding-related
940     code) executing on the same dispatcher. While waiting, it sends the
941     non-capability payload of the message as usual.
942
943  \item Upon receipt of the first message payload, the receiver sends back an
944     acknowledgement that it is ready to receive a capability. This handshake
945     is necessary to avoid over-running the receiver's buffers, and to ensure
946     that capabilities can be associated with the correct messages.
947
948  \item Once the monitor binding mutex is acquired, and the capability
949     acknowledgement has been seen, a \lstinline+cap_send_request+ message
950     is sent to the local monitor, containing the capability and an identifier
951     for the channel on which the capability is to be sent.
952
953  \item The monitor determines whether the cap may be sent to the remote core.
954    A full discussion of the rules used is beyond the scope of this document,
955    but in general capability types that refer to inherently local state (such
956    as LMP endpoints) may not be sent, nor may capabilities that are currently
957    being revoked. If the check fails, a \lstinline+cap_send_reply+ error is
958    sent back to the local dispatcher; if it succeeds, the capability is
959    serialised and proxied to the remote monitor as another
960    \lstinline+cap_send_request+ message on the inter-monitor channel.
961
962  \item The remote monitor reconstructs the capability from its serialised
963    representation, and forwards it on to the destination dispatcher with a
964    \lstinline+cap_receive_request+ message.
965
966  \item The receiver identifies the binding to which the capability belongs,
967    and invokes a callback on that binding. This in turn stores the capability
968    in the appropriate location.
969
970  \item When the last capability and message payload are received, the binding
971   object delivers the message to the user as usual.
972\end{enumerate}
973
974
975\chapter{Interconnect drivers}\label{cha:icds}
976
977\section{LMP: local (intra-core) message passing}
978\label{sec:lmp_icd}
979
980IDC between dispatchers on the same core uses LMP (local message passing).  LMP
981uses endpoint capabilities for communication.  In this case the channel struct
982contains a reference to a remote endpoint cap (for the sender) and a local
983endpoint cap (for the receiver). It also contains a reference to an endpoint
984structure. This is a struct contained in the receiver's dispatcher and refers to
985a buffer that is used to store incoming messages. The dispatcher can have
986multiple such endpoint buffers.
987
988\begin{figure}
989  \begin{center}
990    \includegraphics[width=0.5\columnwidth]{LMP.pdf}
991  \end{center}
992  \caption{Local Message Passing}
993  \label{fig:lmp}
994\end{figure}
995
996
997The sender's remote endpoint cap is derived from the receiver's dispatcher cap,
998and contains a pointer back to the DCB and an offset into the associated
999endpoint buffer.
1000
1001When a message is sent, the sender invokes the remote endpoint cap with the
1002message to send. This causes the kernel to copy the message into the associated
1003endpoint buffer and then make the receiver dispatcher runnable (which will cause
its \texttt{run()} upcall to be invoked when it is scheduled). The run function
1005then finds the endpoint with a message in it and invokes the trigger on the
1006appropriate waitset. The channel's event handler is responsible for getting the
1007rest of the message out of the buffer and invoking appropriate user-defined
1008handlers.
1009
1010Sender: \texttt{lmp\_deliver\_payload (dispatch.c)}
1011
1012Receiver: \texttt{disp\_run (dispatch.c)}
1013          \texttt{lmp\_endpoints\_poll\_disabled (lmp\_endpoints.c)}
1014
1015Event handler: \texttt{myappif\_lmp\_rx\_handler (myappif\_flounder\_bindings.c)}
1016
1017\textbf{Binding:} The process of binding for LMP involves the sender creating
1018the endpoint capability and appropriate buffers, then invoking
1019\texttt{lmp\_bind\_request} on the monitor. The monitor sends the request to the
1020receiver, where it sets up its own endpoint cap and buffer and binding struct,
1021and returns the cap through the monitor to the sender.
1022
\textbf{Passing capabilities - LMP:} When a capability is sent in a message,
the kernel must additionally copy the cap to the receiver's cspace (a slot for
this is set up in the endpoint). The receive handler must ensure that it copies
the cap out of this cspace slot in order to free it for future transfers.
1027
1028
1029
1030\subsection{Messaging with the local Monitor}
1031\label{sec:monitor}
1032
1033
1034\section{UMP: user-level message passing with coherent shared memory}
1035\label{sec:ump_icd}
1036
On x86, IDC between dispatchers on different cores uses UMP (user-level message
passing).
1039
1040\begin{figure}
1041  \begin{center}
1042    \includegraphics[width=0.5\columnwidth]{UMP.pdf}
1043  \end{center}
1044  \caption{User-level Message Passing}
1045  \label{fig:ump}
1046\end{figure}
1047
UMP uses shared memory, polling, and inter-processor interrupts for
1049communication. The shared memory is split into a send buffer and a receive
1050buffer (Figure~\ref{fig:ump}), with each entry in the buffers being the size of
1051a cache line. The sender writes into the send buffer, and the receiver reads
1052from it. The receiver polls to determine whether new messages have been
1053received.
1054
The binding struct for a UMP channel contains UMP-specific channel structs
1056(nested within various included structs) which contain a reference to the shared
1057memory, as well as a counter specifying the current read or write position.
1058
1059In order to detect whether the write position has been reached when reading, the
1060channel also contains an epoch flag. Every buffer entry has an epoch bit that is
flipped every time a new entry is added. When reading a new entry, the channel's
epoch is compared to the epoch bit of the buffer slot to be read. If it is the
same, then the entry is valid; if it is different, then it is not valid. The
1064channel's epoch flag is flipped every time the counter wraps around.
1065
The sender also has to know whether it is going to run up against the
receiver's read position. Rather than flipping epoch bits on a read (the buffer
is kept write-only for the sender and read-only for the receiver, in order to
avoid bouncing cache lines), each message has a sequence id. When replying to the
sender, the receiver sends its latest read sequence id. The sender then knows how
many messages are outstanding and, together with knowledge of the buffer size,
can avoid overwriting messages that haven't been read yet.
1073
1074\textbf{Binding:} The UMP binding process involves the source creating the
1075channel buffers, then sending a cap to the buffer along with a bind request
1076through LMP to the monitor. The monitor serialises the cap and sends it through
1077to the destination monitor. The destination monitor recreates the cap, and sends
it on to the receiver, which sets up its channel struct accordingly and finalises
1079the binding.
1080
1081\textbf{Passing capabilities - UMP:} UMP messages containing capabilities must
1082be sent in two phases. The non-cap message is sent as normal. The capability
1083must be sent through the monitor as only the monitor is able to invoke a syscall
1084to serialise and deserialise a capability. The sender sends the capability with
1085a request to the monitor, the monitor serialises it and sends it to the
destination monitor, where it is deserialised and sent through LMP to the
1087destination.
1088
1089\section{RCK: message passing on the SCC}
1090\label{sec:rck_icd}
1091
1092\section{BMP: Beehive message passing}
1093\label{sec:bmp_icd}
1094
1095\chapter{Additional Flounder backends}
1096
1097\section{Loopback}
1098
1099\section{Message buffers}
1100
1101\section{RPC Client}
1102\label{sec:rpcclient}
1103
1104\section{THC}
1105\label{sec:thc}
1106
1107\section{AHCI Command Translator}
1108\label{sec:ahci}
1109
1110This backend is documented in the AHCI TN.
1111
1112\chapter{Name service}
1113\label{cha:nameservice}
1114
1115TODO
1116
1117
1118\chapter{Interaction with Hake build system}
1119\label{cha:hake}
1120
1121TODO
1122
1123\begin{itemize}
1124 \item Configuration options
1125 \item Contents of Hakefiles
1126 \item Location/contents of generated files
1127 \item Dependencies
1128\end{itemize}
1129
1130
1131\chapter{Unsupported features and future work}
1132
1133TODO
1134
1135\begin{itemize}
1136 \item Connection/binding teardown
1137 \item Variable-length arrays of types other than \lstinline+char+,
1138        \lstinline+int8+ and \lstinline+uint8+
1139 \item Types shared by multiple interfaces, control over C naming of generated
1140        type definitions
1141 \item Non-blocking variant of \lstinline+get_next_event()+
1142 \item Fast LRPC optimisations (combines waitset, thread switch and send)
1143\end{itemize}
1144
1145%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1146\bibliographystyle{plain} 
1147\bibliography{defs,barrelfish}
1148
1149\end{document}
1150