1%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2% Copyright (c) 2015, ETH Zurich.
3% All rights reserved.
4%
5% This file is distributed under the terms in the attached LICENSE file.
6% If you do not find this file, copies can be found by writing to:
7% ETH Zurich D-INFK, Universitaetstr 6, CH-8092 Zurich. Attn: Systems Group.
8%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
9
10\documentclass[a4paper,11pt,twoside]{report}
11\usepackage{bftn}
12\usepackage{calc}
13\usepackage{verbatim}
14\usepackage{xspace}
15\usepackage{pifont}
16\usepackage{pxfonts}
17\usepackage{textcomp}
18\usepackage{amsmath}
19\usepackage{multirow}
20\usepackage{listings}
21\usepackage[framemethod=default]{mdframed}
22\usepackage[shortlabels]{enumitem}
23\usepackage{parskip}
24\usepackage{xparse}
25\usepackage{ctable}
26\usepackage[pdftex]{hyperref}
27
28% hyperref setup
29\definecolor{linkcol}{rgb}{0,0,0.7}
30\hypersetup{
31  pdftitle={Barrelfish Specification},
32  plainpages=false,
33  linktocpage,
34  colorlinks,
35  linkcolor=linkcol,citecolor=linkcol,pagecolor=linkcol,urlcolor=linkcol
36  %breaklinks=true,pagebackref=true
37}
38
39\mdfdefinestyle{mdsyscall}{
40rightline=true,
41innerleftmargin=10,
42innerrightmargin=10,
43frametitlerulewidth=2pt,
44frametitlefont={\color{white}\varname},
45skipbelow=1em,
46skipabove=1em,
47}
48
49\DeclareDocumentEnvironment{arguments}{}
50{
51\newcommand{\argument}[2]{ \item[\varname{##1}] ##2 }
52\textbf{Arguments}\parskip
53\begin{description}[leftmargin=!,labelwidth=\widthof{count},labelindent=1em]
54}
55{
56\end{description}
57}
58
59\DeclareDocumentEnvironment{api}{o O {stable}}
60{
61\definecolor{unused}{RGB}{215,25,28}
62\definecolor{unstable}{RGB}{253,174,97}
63\definecolor{todo}{RGB}{255,255,191}
64\definecolor{lightblue}{RGB}{171,217,233}
65\definecolor{stable}{RGB}{44,123,182}
66\newcommand{\brief}[1]{##1\\}
67\newcommand{\note}[1]{\\\textbf{Note}\\##1\parskip}
68
69\begin{mdframed}[style=mdsyscall,frametitle=#1,frametitlebackgroundcolor=#2]
70}
71{
72\end{mdframed}
73}
74
75\newcommand{\fnname}[1]{\textit{\texttt{#1}}}%
76\newcommand{\datatype}[1]{\textit{\texttt{#1}}}%
77\newcommand{\varname}[1]{\texttt{#1}}%
78\newcommand{\keywname}[1]{\textbf{\texttt{#1}}}%
79\newcommand{\pathname}[1]{\texttt{#1}}%
80\newcommand{\tabindent}{\hspace*{3ex}}%
81\newcommand{\sockeye}{\lstinline[language=sockeye]}
82\newcommand{\ccode}{\lstinline[language=C]}
83
84% Default language for code listings is C
85\lstset{
86  language=C,
87  basicstyle=\small,
88  frame=lines,
89  breaklines=true,
90  showstringspaces=false,
91  texcl=true,
92  columns=flexible
93}
94
95\lstdefinelanguage{Mackerel}{
96  morekeywords={datatype,device,register,regtype,constants,type,at,
97              many,edit,io,also},
98  sensitive=false,
99  morecomment=[l]{//},
100  morecomment=[s]{/*}{*/},
101  morestring=[b]",
102}
103
104% sans-serif URLs
105\urlstyle{sf}
106
107\newcommand{\todo}[1]{[\textcolor{red}{\emph{#1}}]}
108
109\newcommand{\noarginvocation}[1]{\paragraph{#1 invocation}}
110\newcounter{invocArgCnt}
111\newenvironment{invocation}[1]{%
112  \noarginvocation{#1}
113
114  \begin{list}{\emph{Argument~\arabic{invocArgCnt}:}}{%
115    \usecounter{invocArgCnt}%
116    \setlength{\rightmargin}{\leftmargin}%
117    \setlength{\itemsep}{0ex}%
118  }
119  \renewcommand{\arg}{\item}
120}{%
121  \end{list}
122}
123
124
125% autoref (from hyperref) setup
126\def\chapterautorefname{Chapter}
127\def\sectionautorefname{Section}
128\def\subsectionautorefname{Section}
129\expandafter\def\csname section*autorefname\endcsname{Section}
130
131
132\title{Barrelfish Specification}
133\author{Barrelfish project}
134\tnnumber{10}
135\tnkey{Specification}
136
137\presetkeys{todonotes}{inline}{}
138
139\begin{document}
140
141\maketitle
142
143\begin{versionhistory}
144\vhEntry{0.1}{01.01.2009}{AB,SP,TR,AS,AK}{Initial Version}
145\vhEntry{0.2}{01.01.2015}{GZ}{Update ABI etc.}
146\end{versionhistory}
147
148\cleardoublepage
149\tableofcontents
150
151\chapter{Barrelfish Kernel API}
152
153\section{System Calls}\label{sec:syscalls}
154
155The section defines the specification of the common system call
156API that is provided by a Barrelfish CPU driver. Currently
157we have the following system calls:
158
159\begin{tabular}{|p{5cm}|>{\raggedright\arraybackslash}p{7cm}|}
160\hline
161
162SYSCALL\_INVOKE & Invoke a capability. \\
163SYSCALL\_YIELD & Yield the CPU. \\
164SYSCALL\_LRPC & Fast LRPC. \\
165SYSCALL\_DEBUG & Benchmarking and debug syscalls. \\
166SYSCALL\_REBOOT & Reboot the machine. \\
167SYSCALL\_NOP & No operation. \\
168SYSCALL\_PRINT & Write to console. \\
169SYSCALL\_SUSPEND & Suspend the CPU. \\
170SYSCALL\_GET\_ABS\_TIME & Get time elapsed since boot. \\
171\hline
172\end{tabular}
173
174\subsection{SYSCALL\_INVOKE -- Capability Invocation Interface}\label{sec:sys_invoke}
175
176The invoke call acts as a generic system call to apply operation on various OS
177objects (also known as capabilities). For any given object, a distinct set of
178operations are applicable depending on the capability type.
179
180This system call takes at least one argument, which must be the address of a
181capability in the caller's CSpace. The remaining arguments, if any, are
182interpreted based on the type of this first capability.
183
184Other than yielding, all kernel operations including IDC are (or should be)
185provided by capability invocation, and make use of this call. The
186possible invocations for every capability type are described in
187the capability management document (TN-013).
188
189This system call may only be used while the caller is
190\emph{enabled}. The reason is that the caller must be prepared to
191receive a reply immediately and that is only possible when
192enabled, as it requires the kernel to enter the dispatcher at the
193IDC entry point.
194
195\subsection{SYSCALL\_YIELD -- Yield the CPU}\label{sec:sys_yield}
196
197This system call yields the CPU. It takes a single argument, which must be
198either the CSpace address of a dispatcher capability, or \verb CPTR_NULL .
199In the first case, the given dispatcher is run unconditionally; in the
200latter case, the scheduler picks which dispatcher to run.
201
202This system call may only be used while the caller is \emph{disabled}.
203Furthermore, it clears the caller's \emph{disabled} flag, so the next time
204it will be entered is at the run entry point.
205
206\subsection{SYSCALL\_DEBUG -- Debug system calls}
207
208The debug system call (SYSCALL\_DEBUG) de-multiplexes using the second system
209call argument and is defined for the following operations.
210Those calls may not be supported, depending on build options,
211and are not part of the regular kernel interface.
212
213\begin{api}[DEBUG\_CONTEXT\_COUNTER\_RESET]
214\brief{Sets the \varname{context\_switch\_counter} to 0.}
215\end{api}
216
217\begin{api}[DEBUG\_CONTEXT\_COUNTER\_READ]
218\brief{Returns \varname{context\_switch\_counter}.}
219\end{api}
220
221\begin{api}[DEBUG\_TIMESLICE\_COUNTER\_READ]
222\brief{Returns \varname{kernel\_now}.}
223\end{api}
224
225\begin{api}[DEBUG\_FLUSH\_CACHE]
226\brief{Executes \fnname{wbinvd} on x86-64.}
227\end{api}
228
229\begin{api}[DEBUG\_SEND\_IPI][unstable]
230\brief{Sends an interrupt to a remote core.}
231\begin{arguments}
232\argument{destination}{Target core.}
233\argument{shorthand}{?}
234\argument{vector}{IRQ number.}
235\end{arguments}
236\note{Is this needed with the IPI capability?}
237\end{api}
238
239\begin{api}[DEBUG\_SET\_BREAKPOINT]
240\brief{Sets a hardware breakpoint at an address.}
241\begin{arguments}
242\argument{addr}{Where to break.}
243\argument{mode}{?}
244\argument{length}{?}
245\end{arguments}
246\note{Use dr7 and dr0 on x86-64.}
247\end{api}
248
249\begin{api}[DEBUG\_SEND\_NOTIFY][unused]
250\brief{Does only exist as a definition?}
251\end{api}
252
253\begin{api}[DEBUG\_SLEEP][unused]
254\brief{Does only exist as a definition?}
255\end{api}
256
257\begin{api}[DEBUG\_HARDWARE\_TIMER\_READ][unstable]
258\brief{Returns \fnname{tsc\_read}.}
259\note{Exists only for ARM.}
260\end{api}
261
262\begin{api}[DEBUG\_HARDWARE\_TIMER\_HERTZ\_READ][unstable]
263\brief{Returns \fnname{tsc\_get\_hz}.}
264\note{Exists only on ARM.}
265\end{api}
266
267\begin{api}[DEBUG\_HARDWARE\_GLOBAL\_TIMER\_LOW][unstable]
268\brief{Returns \fnname{gt\_read\_low}. The lower 32 bits of the timer.}
269\note{Exists only in OMAP, and returns 0 on GEM 5.}
270\end{api}
271
272\begin{api}[DEBUG\_HARDWARE\_GLOBAL\_TIMER\_HIGH][unstable]
273\brief{Returns global timer \fnname{gt\_read\_high}. The higher 32 bits of the timer.}
274\note{Exists only in OMAP, and returns 0 on GEM 5.}
275\end{api}
276
277\begin{api}[DEBUG\_GET\_TSC\_PER\_MS][unstable]
278\brief{Returns TSC (\fnname{rdtsc}) clock rate in ticks per ms.}
279\note{Implemention for x86 only.}
280\end{api}
281
282\begin{api}[DEBUG\_GET\_APIC\_TIMER][unstable]
283\brief{Returns the XAPIC timer counter.}
284\note{Implemention for x86-64 only.}
285\end{api}
286
287\begin{api}[DEBUG\_GET\_APIC\_TICKS\_PER\_SEC][unstable]
288\brief{Returns ticks per seconds of the APIC timer.}
289\note{Calibrated against RTC clock. Implemention for x86-64 only.}
290\end{api}
291
292\begin{api}[DEBUG\_FEIGN\_FRAME\_CAP][unused]
293\brief{Fabricates an arbitrary DevFrame cap.}
294\note{Implemention for x86-32 bit only. Not used?}
295\end{api}
296
297\begin{api}[DEBUG\_TRACE\_PMEM\_CTRL]
298\brief{Enables tracing for capabilities.}
299\begin{arguments}
300\argument{types}{?}
301\argument{start}{?}
302\argument{size}{?}
303\end{arguments}
304\note{Implemention for x86-64 and aarch64 only.}
305\end{api}
306
307\begin{api}[DEBUG\_GET\_APIC\_ID]
308\brief{Returns the xAPIC ID.}
309\note{Implemention for x86-64 only.}
310\end{api}
311
312\subsection{SYSCALL\_REBOOT -- Reboot the system}
313This call unconditionally hard reboots the system.
314\todo{This call should be removed -AB}
315
316\subsection{SYSCALL\_NOP}
317This call takes no arguments, and returns directly to the
318caller. It always succeeds.
319
320\subsection{SYSCALL\_PRINT}
321This call takes two arguments: an address in the caller's vspace, which
322must be mapped, and a size, and prints the string found at that address
323to the console. It may fail if any part of the string is not accessible
324to the calling domain.
325
326\subsection{SYSCALL\_SUSPEND}
327\todo{should probably be a cap invocation}
328
329\subsection{SYSCALL\_GET\_ABS\_TIME}
330\todo{Figure out proper time API, they appear in various
331DEBUG syscalls as well.}
332
333
334\section{Dispatch and Execution}\label{sec:dispatch}
335
336A dispatcher consists of code executing at user-level and a data
337structure located in pinned memory, split into two regions. One
338region is only accessible from the kernel, the other region is
339shared read/write between user and kernel. The fields in the
340kernel-defined part of the structure are described in
341\autoref{tab:dispcb}.
342
343\ctable[
344caption=Dispatcher control structure,
345label=tab:dispcb,
346width=\textwidth
347]{lll>{\raggedright}X}{}{
348  \FL
349  Field name & Size & Kernel R/W & Short description
350  \ML
351  \lstinline+disabled+ & word & R/W & If non-zero, the kernel will not
352  upcall the dispatcher, except to deliver a trap.
353  \NN
354  \lstinline+haswork+ & pointer & R & If non-zero, the kernel will
355  consider this dispatcher eligible to run.
356  \NN
357  \lstinline+crit_pc_low+ & pointer & R & Address of first instruction
358  in dispatcher's critical code section.
359  \NN
360  \lstinline+crit_pc_high+ & pointer & R & Address immediately after
361  last instruction of dispatcher's critical code section.
362  \NN
363  entry points & 4 function descriptors & R & Functions at which
364  the dispatcher code may be invoked
365  \NN
366  \lstinline+enabled_save_area+ & arch specific & W & Area for kernel
367  to save register state when enabled
368  \NN
369  \lstinline+disabled_save_area+ & arch specific & R/W & Area for
370  kernel to save and restore register state when disabled
371  \NN
372  \lstinline+trap_save_area+ & arch specific & W & Area for kernel to
373  save register state when a trap or a pagefault while disabled occurs
374  \NN
375  \lstinline+recv_cptr+ & capability pointer & R & Address of CNode to
376  store received capabilities of next local IDC into
377  \NN
378  \lstinline+recv_bits+ & word & R & Number of valid bits within
379  \lstinline+recv_cptr+
380  \NN
381  \lstinline+recv_slot+ & word & R & Slot within CNode to store
382  received capability of next local IDC into
383  \LL
384}
385
386Beyond these fields, the user may define and use their own data
387structures (eg. a stack for the dispatcher code to execute on,
388thread management structures, etc).
389
390\subsection{Disabled}
391
392A dispatcher is considered disabled by the kernel if either of the
393following conditions is true:
394
395\begin{itemize}
396\item its disabled word is non-zero
397\item its program counter is within the range specified by the
398\lstinline+crit_pc_low+ and \lstinline+crit_pc_high+ fields
399\end{itemize}
400
401The disabled state of a dispatcher controls where the kernel saves
402its registers, and is described in the following subsection. When
403the kernel resumes a dispatcher that was last running while
404disabled, it restores its machine state and resumes execution at the
405saved instruction, rather than upcalling it at an entry point.
406
407\subsection{Register save areas}
408
409The dispatcher structure contains enough space for three full copies
410of the machine register state to be saved. The \lstinline+trap_save_area+
411is used whenever the dispatcher takes a trap, regardless of whether
412it is enabled or disabled. Otherwise, the \lstinline+disabled_save_area+
413is used whenever the dispatcher is disabled (see above), and the
414\lstinline+enabled_save_area+ is used in all other cases.
415
416\autoref{fig:dispstatesaves} (Trap and PageFault states have
417been left out for brevity) shows important dispatcher states and into
418which register save area state is saved upon a state transition. The
419starting state for a domain is ``notrunning'' and depicted with a
420bold border in the Figure.
421
422\begin{figure}
423\centering
424\includegraphics[width=\textwidth]{disp_states_simple_save_area_analysis}
425\caption[Dispatcher state save areas]{Dispatcher state save areas.
426 Trap and PageFault states
427 omitted for brevity. Regular text and lines denote state changes
428 by the kernel. Dashed lines and italic text denote state changes
429 by user-space, which do not necessarily have to use the denoted
430 save area. The starting state is in the bold
431 node.}\label{fig:dispstatesaves}
432\end{figure}
433
434Arrows from right to left involve saving state into the labeled
435area. Arrows from left to right involve restoring state from the
436labeled area. It can be seen that no state can be overwritten. The
437kernel can recognize a disabled dispatcher by looking at the
438disabled flag, as well as the domain's instruction pointer. Nothing
439else needs to be examined.
440
441The dispatcher states are also depicted in \autoref{fig:dispstates}.
442
443\subsection{Dispatcher Entry Points}
444
445Unless restoring it from a disabled context, the kernel always
446enters a dispatcher at one of the following entry
447points. Whenever the kernel invokes a dispatcher at any of its entry
448points, it sets the disabled bit on. One (ABI-specific) register
449always points to the dispatcher structure. The value of all other
450registers depends on the entry point at which the dispatcher is
451invoked, and is described below.
452
453The entry points are:
454
455\begin{description}
456\item[Run] A dispatcher is entered at this entry point when it was
457not previously running, the last time it ran it was either enabled or
458yielded the CPU, and the kernel has given it the CPU. Other than the
459register that holds a pointer to the dispatcher itself, all other registers
460are undefined. The dispatcher's last machine state is saved in the
461\lstinline+enabled_save_area+.
462
463\item[PageFault] A dispatcher is entered at this entry point when it
464suffers a page fault while enabled. On entry, the dispatcher register is
465set, and the argument registers contain information about the cause of
466the fault. Volatile registers are saved in the
467\lstinline+enabled_save_area+; all other registers contain the user
468state at the time of the fault.
469
470\item[PageFault\_Disabled] A dispatcher is entered at this entry point when it
471suffers a page fault while disabled. On entry, the dispatcher register is
472set, and the argument registers contain information about the cause of
473the fault. Volatile registers are saved in the
474\lstinline+trap_save_area+; all other registers contain the user
475state at the time of the fault.
476
477\item[Trap] A dispatcher is entered at this entry point when it is
478running and it raises an exception (for example, illegal
479instruction, divide by zero, breakpoint, etc.). Unlike the other
480entry points, a dispatcher may be entered at its trap entry even
481when it was running disabled. The machine state at the time of the
482trap is saved in the \lstinline+trap_save_area+, and the argument
483registers convey information about the cause of the trap.
484
485\item[LRPC] A dispatcher is entered at this entry point when an
486LRPC message (see below) is delivered to it. This can only
487happen when it was not previously running, and was enabled. On
488entry, four registers are delivered containing the message payload,
489one stores the endpoint offset, and another contains the dispatcher pointer.
490\end{description}
491
492This diagram shows the states a \emph{dispatcher} can be in and how it
493gets there. The exceptional states Trap and PageFault have been
494omitted for brevity.
495
496\begin{figure}
497\centering
498\includegraphics[width=.7\columnwidth]{disp_states_simple}
499\caption[Typical dispatcher states]{Typical dispatcher states.
500  Trap and PageFault states
501  omitted for brevity. Regular text and lines denote state changes
502  by the kernel. Dashed lines and italic text denote state changes
503  by user-space. The starting state is in bold.}
504\label{fig:dispstates}
505\end{figure}
506
507\subsection{Interrupt delivery}\label{sec:interrupts}
508
509Hardware interrupts are delivered by the kernel as asynchronous IDC
510messages to a registered dispatcher. A dispatcher can be registered
511as for a specific IRQ by invoking the IRQTable capability,
512passing it an IDC endpoint to the dispatcher and the IRQ
513number. It is not possible for multiple IDC endpoints to be
514registered with the same IRQ number at any one time.
515
516Henceforth, the kernel will send an IDC message using asynchronous
517delivery to the registered endpoint. Asynchronous
518IDC is used as it does not cause priority inversion by directly
519dispatching the target dispatcher.
520
521\subsection{Exception delivery}
522
523When a CPU exception happens in user-space, it is reflected to the
524dispatcher on which it appeared. Page
525faults are dispatched to the page-fault entry point of the
526dispatcher. All other exceptions are dispatched to the trap entry
527point of the dispatcher. The disabled flag of the dispatcher is
528ignored in all cases and state is saved to the trap save area.
529
530\section{Scheduling}
531
532Upon reception of a timer interrupt, the kernel calls `schedule()`,
533which selects the next dispatcher to run. At the moment, a simple
534round-robin scheduler is implemented that walks a circular
535singly-linked list forever.
536\todo{RBED, gang-scheduling}
537
538\section{TODO}
539\begin{itemize}
540\item virtual machine support
541\item timers
542\item resource management
543\item thread migration
544\item event tracing / performance monitoring
545\end{itemize}
546
547\chapter{Barrelfish Library API}
548\todo{Documentation of libbarrelfish}
549
550\subsection{Initial Capability Space}
551
552The initial capability space of other domains is similar, but lacks the other
553cnodes in the root cnode, as illustrated in \autoref{fig:app_cspace}.
554
555\begin{figure}
556\centering
557\includegraphics[width=\textwidth]{app_cspace}
558\caption{initial capability space layout of user tasks}
559\label{fig:app_cspace}
560\end{figure}
561
562
563\chapter*{Acknowledgements}
564Paul, Rebecca, Tim, et al.
565
566
567\bibliographystyle{plain}
568\bibliography{defs,barrelfish}
569\end{document}
570