1%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2% Copyright (c) 2015, ETH Zurich. 3% All rights reserved. 4% 5% This file is distributed under the terms in the attached LICENSE file. 6% If you do not find this file, copies can be found by writing to: 7% ETH Zurich D-INFK, Universitaetstr 6, CH-8092 Zurich. Attn: Systems Group. 8%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 9 10\documentclass[a4paper,11pt,twoside]{report} 11\usepackage{bftn} 12\usepackage{calc} 13\usepackage{verbatim} 14\usepackage{xspace} 15\usepackage{pifont} 16\usepackage{pxfonts} 17\usepackage{textcomp} 18\usepackage{amsmath} 19\usepackage{multirow} 20\usepackage{listings} 21\usepackage[framemethod=default]{mdframed} 22\usepackage[shortlabels]{enumitem} 23\usepackage{parskip} 24\usepackage{xparse} 25\usepackage{ctable} 26\usepackage[pdftex]{hyperref} 27 28% hyperref setup 29\definecolor{linkcol}{rgb}{0,0,0.7} 30\hypersetup{ 31 pdftitle={Barrelfish Specification}, 32 plainpages=false, 33 linktocpage, 34 colorlinks, 35 linkcolor=linkcol,citecolor=linkcol,pagecolor=linkcol,urlcolor=linkcol 36 %breaklinks=true,pagebackref=true 37} 38 39\mdfdefinestyle{mdsyscall}{ 40rightline=true, 41innerleftmargin=10, 42innerrightmargin=10, 43frametitlerulewidth=2pt, 44frametitlefont={\color{white}\varname}, 45skipbelow=1em, 46skipabove=1em, 47} 48 49\DeclareDocumentEnvironment{arguments}{} 50{ 51\newcommand{\argument}[2]{ \item[\varname{##1}] ##2 } 52\textbf{Arguments}\parskip 53\begin{description}[leftmargin=!,labelwidth=\widthof{count},labelindent=1em] 54} 55{ 56\end{description} 57} 58 59\DeclareDocumentEnvironment{api}{o O {stable}} 60{ 61\definecolor{unused}{RGB}{215,25,28} 62\definecolor{unstable}{RGB}{253,174,97} 63\definecolor{todo}{RGB}{255,255,191} 64\definecolor{lightblue}{RGB}{171,217,233} 65\definecolor{stable}{RGB}{44,123,182} 66\newcommand{\brief}[1]{##1\\} 67\newcommand{\note}[1]{\\\textbf{Note}\\##1\parskip} 68 69\begin{mdframed}[style=mdsyscall,frametitle=#1,frametitlebackgroundcolor=#2] 70} 71{ 72\end{mdframed} 73} 74 75\newcommand{\fnname}[1]{\textit{\texttt{#1}}}% 76\newcommand{\datatype}[1]{\textit{\texttt{#1}}}% 77\newcommand{\varname}[1]{\texttt{#1}}% 78\newcommand{\keywname}[1]{\textbf{\texttt{#1}}}% 79\newcommand{\pathname}[1]{\texttt{#1}}% 80\newcommand{\tabindent}{\hspace*{3ex}}% 81\newcommand{\sockeye}{\lstinline[language=sockeye]} 82\newcommand{\ccode}{\lstinline[language=C]} 83 84% Default language for code listings is C 85\lstset{ 86 language=C, 87 basicstyle=\small, 88 frame=lines, 89 breaklines=true, 90 showstringspaces=false, 91 texcl=true, 92 columns=flexible 93} 94 95\lstdefinelanguage{Mackerel}{ 96 morekeywords={datatype,device,register,regtype,constants,type,at, 97 many,edit,io,also}, 98 sensitive=false, 99 morecomment=[l]{//}, 100 morecomment=[s]{/*}{*/}, 101 morestring=[b]", 102} 103 104% sans-serif URLs 105\urlstyle{sf} 106 107\newcommand{\todo}[1]{[\textcolor{red}{\emph{#1}}]} 108 109\newcommand{\noarginvocation}[1]{\paragraph{#1 invocation}} 110\newcounter{invocArgCnt} 111\newenvironment{invocation}[1]{% 112 \noarginvocation{#1} 113 114 \begin{list}{\emph{Argument~\arabic{invocArgCnt}:}}{% 115 \usecounter{invocArgCnt}% 116 \setlength{\rightmargin}{\leftmargin}% 117 \setlength{\itemsep}{0ex}% 118 } 119 \renewcommand{\arg}{\item} 120}{% 121 \end{list} 122} 123 124 125% autoref (from hyperref) setup 126\def\chapterautorefname{Chapter} 127\def\sectionautorefname{Section} 128\def\subsectionautorefname{Section} 129\expandafter\def\csname section*autorefname\endcsname{Section} 130 131 132\title{Barrelfish Specification} 133\author{Barrelfish project} 134\tnnumber{10} 135\tnkey{Specification} 136 137\presetkeys{todonotes}{inline}{} 138 139\begin{document} 140 141\maketitle 142 143\begin{versionhistory} 144\vhEntry{0.1}{01.01.2009}{AB,SP,TR,AS,AK}{Initial Version} 145\vhEntry{0.2}{01.01.2015}{GZ}{Update ABI etc.} 146\end{versionhistory} 147 148\cleardoublepage 149\tableofcontents 150 151\chapter{Barrelfish Kernel API} 152 153\section{System Calls}\label{sec:syscalls} 154 155The section defines the specification of the common system call 156API that is provided by a Barrelfish CPU driver. Currently 157we have the following system calls: 158 159\begin{tabular}{|p{5cm}|>{\raggedright\arraybackslash}p{7cm}|} 160\hline 161 162SYSCALL\_INVOKE & Invoke a capability. \\ 163SYSCALL\_YIELD & Yield the CPU. \\ 164SYSCALL\_LRPC & Fast LRPC. \\ 165SYSCALL\_DEBUG & Benchmarking and debug syscalls. \\ 166SYSCALL\_REBOOT & Reboot the machine. \\ 167SYSCALL\_NOP & No operation. \\ 168SYSCALL\_PRINT & Write to console. \\ 169SYSCALL\_SUSPEND & Suspend the CPU. \\ 170SYSCALL\_GET\_ABS\_TIME & Get time elapsed since boot. \\ 171\hline 172\end{tabular} 173 174\subsection{SYSCALL\_INVOKE -- Capability Invocation Interface}\label{sec:sys_invoke} 175 176The invoke call acts as a generic system call to apply operation on various OS 177objects (also known as capabilities). For any given object, a distinct set of 178operations are applicable depending on the capability type. 179 180This system call takes at least one argument, which must be the address of a 181capability in the caller's CSpace. The remaining arguments, if any, are 182interpreted based on the type of this first capability. 183 184Other than yielding, all kernel operations including IDC are (or should be) 185provided by capability invocation, and make use of this call. The 186possible invocations for every capability type are described in 187the capability management document (TN-013). 188 189This system call may only be used while the caller is 190\emph{enabled}. The reason is that the caller must be prepared to 191receive a reply immediately and that is only possible when 192enabled, as it requires the kernel to enter the dispatcher at the 193IDC entry point. 194 195\subsection{SYSCALL\_YIELD -- Yield the CPU}\label{sec:sys_yield} 196 197This system call yields the CPU. It takes a single argument, which must be 198either the CSpace address of a dispatcher capability, or \verb CPTR_NULL . 199In the first case, the given dispatcher is run unconditionally; in the 200latter case, the scheduler picks which dispatcher to run. 201 202This system call may only be used while the caller is \emph{disabled}. 203Furthermore, it clears the caller's \emph{disabled} flag, so the next time 204it will be entered is at the run entry point. 205 206\subsection{SYSCALL\_DEBUG -- Debug system calls} 207 208The debug system call (SYSCALL\_DEBUG) de-multiplexes using the second system 209call argument and is defined for the following operations. 210Those calls may not be supported, depending on build options, 211and are not part of the regular kernel interface. 212 213\begin{api}[DEBUG\_CONTEXT\_COUNTER\_RESET] 214\brief{Sets the \varname{context\_switch\_counter} to 0.} 215\end{api} 216 217\begin{api}[DEBUG\_CONTEXT\_COUNTER\_READ] 218\brief{Returns \varname{context\_switch\_counter}.} 219\end{api} 220 221\begin{api}[DEBUG\_TIMESLICE\_COUNTER\_READ] 222\brief{Returns \varname{kernel\_now}.} 223\end{api} 224 225\begin{api}[DEBUG\_FLUSH\_CACHE] 226\brief{Executes \fnname{wbinvd} on x86-64.} 227\end{api} 228 229\begin{api}[DEBUG\_SEND\_IPI][unstable] 230\brief{Sends an interrupt to a remote core.} 231\begin{arguments} 232\argument{destination}{Target core.} 233\argument{shorthand}{?} 234\argument{vector}{IRQ number.} 235\end{arguments} 236\note{Is this needed with the IPI capability?} 237\end{api} 238 239\begin{api}[DEBUG\_SET\_BREAKPOINT] 240\brief{Sets a hardware breakpoint at an address.} 241\begin{arguments} 242\argument{addr}{Where to break.} 243\argument{mode}{?} 244\argument{length}{?} 245\end{arguments} 246\note{Use dr7 and dr0 on x86-64.} 247\end{api} 248 249\begin{api}[DEBUG\_SEND\_NOTIFY][unused] 250\brief{Does only exist as a definition?} 251\end{api} 252 253\begin{api}[DEBUG\_SLEEP][unused] 254\brief{Does only exist as a definition?} 255\end{api} 256 257\begin{api}[DEBUG\_HARDWARE\_TIMER\_READ][unstable] 258\brief{Returns \fnname{tsc\_read}.} 259\note{Exists only for ARM.} 260\end{api} 261 262\begin{api}[DEBUG\_HARDWARE\_TIMER\_HERTZ\_READ][unstable] 263\brief{Returns \fnname{tsc\_get\_hz}.} 264\note{Exists only on ARM.} 265\end{api} 266 267\begin{api}[DEBUG\_HARDWARE\_GLOBAL\_TIMER\_LOW][unstable] 268\brief{Returns \fnname{gt\_read\_low}. The lower 32 bits of the timer.} 269\note{Exists only in OMAP, and returns 0 on GEM 5.} 270\end{api} 271 272\begin{api}[DEBUG\_HARDWARE\_GLOBAL\_TIMER\_HIGH][unstable] 273\brief{Returns global timer \fnname{gt\_read\_high}. The higher 32 bits of the timer.} 274\note{Exists only in OMAP, and returns 0 on GEM 5.} 275\end{api} 276 277\begin{api}[DEBUG\_GET\_TSC\_PER\_MS][unstable] 278\brief{Returns TSC (\fnname{rdtsc}) clock rate in ticks per ms.} 279\note{Implemention for x86 only.} 280\end{api} 281 282\begin{api}[DEBUG\_GET\_APIC\_TIMER][unstable] 283\brief{Returns the XAPIC timer counter.} 284\note{Implemention for x86-64 only.} 285\end{api} 286 287\begin{api}[DEBUG\_GET\_APIC\_TICKS\_PER\_SEC][unstable] 288\brief{Returns ticks per seconds of the APIC timer.} 289\note{Calibrated against RTC clock. Implemention for x86-64 only.} 290\end{api} 291 292\begin{api}[DEBUG\_FEIGN\_FRAME\_CAP][unused] 293\brief{Fabricates an arbitrary DevFrame cap.} 294\note{Implemention for x86-32 bit only. Not used?} 295\end{api} 296 297\begin{api}[DEBUG\_TRACE\_PMEM\_CTRL] 298\brief{Enables tracing for capabilities.} 299\begin{arguments} 300\argument{types}{?} 301\argument{start}{?} 302\argument{size}{?} 303\end{arguments} 304\note{Implemention for x86-64 and aarch64 only.} 305\end{api} 306 307\begin{api}[DEBUG\_GET\_APIC\_ID] 308\brief{Returns the xAPIC ID.} 309\note{Implemention for x86-64 only.} 310\end{api} 311 312\subsection{SYSCALL\_REBOOT -- Reboot the system} 313This call unconditionally hard reboots the system. 314\todo{This call should be removed -AB} 315 316\subsection{SYSCALL\_NOP} 317This call takes no arguments, and returns directly to the 318caller. It always succeeds. 319 320\subsection{SYSCALL\_PRINT} 321This call takes two arguments: an address in the caller's vspace, which 322must be mapped, and a size, and prints the string found at that address 323to the console. It may fail if any part of the string is not accessible 324to the calling domain. 325 326\subsection{SYSCALL\_SUSPEND} 327\todo{should probably be a cap invocation} 328 329\subsection{SYSCALL\_GET\_ABS\_TIME} 330\todo{Figure out proper time API, they appear in various 331DEBUG syscalls as well.} 332 333 334\section{Dispatch and Execution}\label{sec:dispatch} 335 336A dispatcher consists of code executing at user-level and a data 337structure located in pinned memory, split into two regions. One 338region is only accessible from the kernel, the other region is 339shared read/write between user and kernel. The fields in the 340kernel-defined part of the structure are described in 341\autoref{tab:dispcb}. 342 343\ctable[ 344caption=Dispatcher control structure, 345label=tab:dispcb, 346width=\textwidth 347]{lll>{\raggedright}X}{}{ 348 \FL 349 Field name & Size & Kernel R/W & Short description 350 \ML 351 \lstinline+disabled+ & word & R/W & If non-zero, the kernel will not 352 upcall the dispatcher, except to deliver a trap. 353 \NN 354 \lstinline+haswork+ & pointer & R & If non-zero, the kernel will 355 consider this dispatcher eligible to run. 356 \NN 357 \lstinline+crit_pc_low+ & pointer & R & Address of first instruction 358 in dispatcher's critical code section. 359 \NN 360 \lstinline+crit_pc_high+ & pointer & R & Address immediately after 361 last instruction of dispatcher's critical code section. 362 \NN 363 entry points & 4 function descriptors & R & Functions at which 364 the dispatcher code may be invoked 365 \NN 366 \lstinline+enabled_save_area+ & arch specific & W & Area for kernel 367 to save register state when enabled 368 \NN 369 \lstinline+disabled_save_area+ & arch specific & R/W & Area for 370 kernel to save and restore register state when disabled 371 \NN 372 \lstinline+trap_save_area+ & arch specific & W & Area for kernel to 373 save register state when a trap or a pagefault while disabled occurs 374 \NN 375 \lstinline+recv_cptr+ & capability pointer & R & Address of CNode to 376 store received capabilities of next local IDC into 377 \NN 378 \lstinline+recv_bits+ & word & R & Number of valid bits within 379 \lstinline+recv_cptr+ 380 \NN 381 \lstinline+recv_slot+ & word & R & Slot within CNode to store 382 received capability of next local IDC into 383 \LL 384} 385 386Beyond these fields, the user may define and use their own data 387structures (eg. a stack for the dispatcher code to execute on, 388thread management structures, etc). 389 390\subsection{Disabled} 391 392A dispatcher is considered disabled by the kernel if either of the 393following conditions is true: 394 395\begin{itemize} 396\item its disabled word is non-zero 397\item its program counter is within the range specified by the 398\lstinline+crit_pc_low+ and \lstinline+crit_pc_high+ fields 399\end{itemize} 400 401The disabled state of a dispatcher controls where the kernel saves 402its registers, and is described in the following subsection. When 403the kernel resumes a dispatcher that was last running while 404disabled, it restores its machine state and resumes execution at the 405saved instruction, rather than upcalling it at an entry point. 406 407\subsection{Register save areas} 408 409The dispatcher structure contains enough space for three full copies 410of the machine register state to be saved. The \lstinline+trap_save_area+ 411is used whenever the dispatcher takes a trap, regardless of whether 412it is enabled or disabled. Otherwise, the \lstinline+disabled_save_area+ 413is used whenever the dispatcher is disabled (see above), and the 414\lstinline+enabled_save_area+ is used in all other cases. 415 416\autoref{fig:dispstatesaves} (Trap and PageFault states have 417been left out for brevity) shows important dispatcher states and into 418which register save area state is saved upon a state transition. The 419starting state for a domain is ``notrunning'' and depicted with a 420bold border in the Figure. 421 422\begin{figure} 423\centering 424\includegraphics[width=\textwidth]{disp_states_simple_save_area_analysis} 425\caption[Dispatcher state save areas]{Dispatcher state save areas. 426 Trap and PageFault states 427 omitted for brevity. Regular text and lines denote state changes 428 by the kernel. Dashed lines and italic text denote state changes 429 by user-space, which do not necessarily have to use the denoted 430 save area. The starting state is in the bold 431 node.}\label{fig:dispstatesaves} 432\end{figure} 433 434Arrows from right to left involve saving state into the labeled 435area. Arrows from left to right involve restoring state from the 436labeled area. It can be seen that no state can be overwritten. The 437kernel can recognize a disabled dispatcher by looking at the 438disabled flag, as well as the domain's instruction pointer. Nothing 439else needs to be examined. 440 441The dispatcher states are also depicted in \autoref{fig:dispstates}. 442 443\subsection{Dispatcher Entry Points} 444 445Unless restoring it from a disabled context, the kernel always 446enters a dispatcher at one of the following entry 447points. Whenever the kernel invokes a dispatcher at any of its entry 448points, it sets the disabled bit on. One (ABI-specific) register 449always points to the dispatcher structure. The value of all other 450registers depends on the entry point at which the dispatcher is 451invoked, and is described below. 452 453The entry points are: 454 455\begin{description} 456\item[Run] A dispatcher is entered at this entry point when it was 457not previously running, the last time it ran it was either enabled or 458yielded the CPU, and the kernel has given it the CPU. Other than the 459register that holds a pointer to the dispatcher itself, all other registers 460are undefined. The dispatcher's last machine state is saved in the 461\lstinline+enabled_save_area+. 462 463\item[PageFault] A dispatcher is entered at this entry point when it 464suffers a page fault while enabled. On entry, the dispatcher register is 465set, and the argument registers contain information about the cause of 466the fault. Volatile registers are saved in the 467\lstinline+enabled_save_area+; all other registers contain the user 468state at the time of the fault. 469 470\item[PageFault\_Disabled] A dispatcher is entered at this entry point when it 471suffers a page fault while disabled. On entry, the dispatcher register is 472set, and the argument registers contain information about the cause of 473the fault. Volatile registers are saved in the 474\lstinline+trap_save_area+; all other registers contain the user 475state at the time of the fault. 476 477\item[Trap] A dispatcher is entered at this entry point when it is 478running and it raises an exception (for example, illegal 479instruction, divide by zero, breakpoint, etc.). Unlike the other 480entry points, a dispatcher may be entered at its trap entry even 481when it was running disabled. The machine state at the time of the 482trap is saved in the \lstinline+trap_save_area+, and the argument 483registers convey information about the cause of the trap. 484 485\item[LRPC] A dispatcher is entered at this entry point when an 486LRPC message (see below) is delivered to it. This can only 487happen when it was not previously running, and was enabled. On 488entry, four registers are delivered containing the message payload, 489one stores the endpoint offset, and another contains the dispatcher pointer. 490\end{description} 491 492This diagram shows the states a \emph{dispatcher} can be in and how it 493gets there. The exceptional states Trap and PageFault have been 494omitted for brevity. 495 496\begin{figure} 497\centering 498\includegraphics[width=.7\columnwidth]{disp_states_simple} 499\caption[Typical dispatcher states]{Typical dispatcher states. 500 Trap and PageFault states 501 omitted for brevity. Regular text and lines denote state changes 502 by the kernel. Dashed lines and italic text denote state changes 503 by user-space. The starting state is in bold.} 504\label{fig:dispstates} 505\end{figure} 506 507\subsection{Interrupt delivery}\label{sec:interrupts} 508 509Hardware interrupts are delivered by the kernel as asynchronous IDC 510messages to a registered dispatcher. A dispatcher can be registered 511as for a specific IRQ by invoking the IRQTable capability, 512passing it an IDC endpoint to the dispatcher and the IRQ 513number. It is not possible for multiple IDC endpoints to be 514registered with the same IRQ number at any one time. 515 516Henceforth, the kernel will send an IDC message using asynchronous 517delivery to the registered endpoint. Asynchronous 518IDC is used as it does not cause priority inversion by directly 519dispatching the target dispatcher. 520 521\subsection{Exception delivery} 522 523When a CPU exception happens in user-space, it is reflected to the 524dispatcher on which it appeared. Page 525faults are dispatched to the page-fault entry point of the 526dispatcher. All other exceptions are dispatched to the trap entry 527point of the dispatcher. The disabled flag of the dispatcher is 528ignored in all cases and state is saved to the trap save area. 529 530\section{Scheduling} 531 532Upon reception of a timer interrupt, the kernel calls `schedule()`, 533which selects the next dispatcher to run. At the moment, a simple 534round-robin scheduler is implemented that walks a circular 535singly-linked list forever. 536\todo{RBED, gang-scheduling} 537 538\section{TODO} 539\begin{itemize} 540\item virtual machine support 541\item timers 542\item resource management 543\item thread migration 544\item event tracing / performance monitoring 545\end{itemize} 546 547\chapter{Barrelfish Library API} 548\todo{Documentation of libbarrelfish} 549 550\subsection{Initial Capability Space} 551 552The initial capability space of other domains is similar, but lacks the other 553cnodes in the root cnode, as illustrated in \autoref{fig:app_cspace}. 554 555\begin{figure} 556\centering 557\includegraphics[width=\textwidth]{app_cspace} 558\caption{initial capability space layout of user tasks} 559\label{fig:app_cspace} 560\end{figure} 561 562 563\chapter*{Acknowledgements} 564Paul, Rebecca, Tim, et al. 565 566 567\bibliographystyle{plain} 568\bibliography{defs,barrelfish} 569\end{document} 570