vfs_aio.c revision 280258
1/*-
2 * Copyright (c) 1997 John S. Dyson.  All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 *    notice, this list of conditions and the following disclaimer.
9 * 2. John S. Dyson's name may not be used to endorse or promote products
10 *    derived from this software without specific prior written permission.
11 *
12 * DISCLAIMER:  This code isn't warranted to do anything useful.  Anything
13 * bad that happens because of using this software isn't the responsibility
14 * of the author.  This software is distributed AS-IS.
15 */
16
17/*
18 * This file contains support for the POSIX 1003.1B AIO/LIO facility.
19 */
20
21#include <sys/cdefs.h>
22__FBSDID("$FreeBSD: stable/10/sys/kern/vfs_aio.c 280258 2015-03-19 13:37:36Z rwatson $");
23
24#include "opt_compat.h"
25
26#include <sys/param.h>
27#include <sys/systm.h>
28#include <sys/malloc.h>
29#include <sys/bio.h>
30#include <sys/buf.h>
31#include <sys/capsicum.h>
32#include <sys/eventhandler.h>
33#include <sys/sysproto.h>
34#include <sys/filedesc.h>
35#include <sys/kernel.h>
36#include <sys/module.h>
37#include <sys/kthread.h>
38#include <sys/fcntl.h>
39#include <sys/file.h>
40#include <sys/limits.h>
41#include <sys/lock.h>
42#include <sys/mutex.h>
43#include <sys/unistd.h>
44#include <sys/posix4.h>
45#include <sys/proc.h>
46#include <sys/resourcevar.h>
47#include <sys/signalvar.h>
48#include <sys/protosw.h>
49#include <sys/rwlock.h>
50#include <sys/sema.h>
51#include <sys/socket.h>
52#include <sys/socketvar.h>
53#include <sys/syscall.h>
54#include <sys/sysent.h>
55#include <sys/sysctl.h>
56#include <sys/sx.h>
57#include <sys/taskqueue.h>
58#include <sys/vnode.h>
59#include <sys/conf.h>
60#include <sys/event.h>
61#include <sys/mount.h>
62
63#include <machine/atomic.h>
64
65#include <vm/vm.h>
66#include <vm/vm_extern.h>
67#include <vm/pmap.h>
68#include <vm/vm_map.h>
69#include <vm/vm_object.h>
70#include <vm/uma.h>
71#include <sys/aio.h>
72
73#include "opt_vfs_aio.h"
74
75/*
76 * Counter for allocating reference ids to new jobs.  Wrapped to 1 on
77 * overflow. (XXX will be removed soon.)
78 */
79static u_long jobrefid;
80
81/*
82 * Counter for aio_fsync.
83 */
84static uint64_t jobseqno;
85
86#define JOBST_NULL		0
87#define JOBST_JOBQSOCK		1
88#define JOBST_JOBQGLOBAL	2
89#define JOBST_JOBRUNNING	3
90#define JOBST_JOBFINISHED	4
91#define JOBST_JOBQBUF		5
92#define JOBST_JOBQSYNC		6
93
94#ifndef MAX_AIO_PER_PROC
95#define MAX_AIO_PER_PROC	32
96#endif
97
98#ifndef MAX_AIO_QUEUE_PER_PROC
99#define MAX_AIO_QUEUE_PER_PROC	256 /* Bigger than AIO_LISTIO_MAX */
100#endif
101
102#ifndef MAX_AIO_PROCS
103#define MAX_AIO_PROCS		32
104#endif
105
106#ifndef MAX_AIO_QUEUE
107#define	MAX_AIO_QUEUE		1024 /* Bigger than AIO_LISTIO_MAX */
108#endif
109
110#ifndef TARGET_AIO_PROCS
111#define TARGET_AIO_PROCS	4
112#endif
113
114#ifndef MAX_BUF_AIO
115#define MAX_BUF_AIO		16
116#endif
117
118#ifndef AIOD_TIMEOUT_DEFAULT
119#define	AIOD_TIMEOUT_DEFAULT	(10 * hz)
120#endif
121
122#ifndef AIOD_LIFETIME_DEFAULT
123#define AIOD_LIFETIME_DEFAULT	(30 * hz)
124#endif
125
126FEATURE(aio, "Asynchronous I/O");
127
128static MALLOC_DEFINE(M_LIO, "lio", "listio aio control block list");
129
130static SYSCTL_NODE(_vfs, OID_AUTO, aio, CTLFLAG_RW, 0, "Async IO management");
131
132static int max_aio_procs = MAX_AIO_PROCS;
133SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_procs,
134	CTLFLAG_RW, &max_aio_procs, 0,
135	"Maximum number of kernel threads to use for handling async IO");
136
137static int num_aio_procs = 0;
138SYSCTL_INT(_vfs_aio, OID_AUTO, num_aio_procs,
139	CTLFLAG_RD, &num_aio_procs, 0,
140	"Number of presently active kernel threads for async IO");
141
142/*
143 * The code will adjust the actual number of AIO processes towards this
144 * number when it gets a chance.
145 */
146static int target_aio_procs = TARGET_AIO_PROCS;
147SYSCTL_INT(_vfs_aio, OID_AUTO, target_aio_procs, CTLFLAG_RW, &target_aio_procs,
148	0, "Preferred number of ready kernel threads for async IO");
149
150static int max_queue_count = MAX_AIO_QUEUE;
151SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue, CTLFLAG_RW, &max_queue_count, 0,
152    "Maximum number of aio requests to queue, globally");
153
154static int num_queue_count = 0;
155SYSCTL_INT(_vfs_aio, OID_AUTO, num_queue_count, CTLFLAG_RD, &num_queue_count, 0,
156    "Number of queued aio requests");
157
158static int num_buf_aio = 0;
159SYSCTL_INT(_vfs_aio, OID_AUTO, num_buf_aio, CTLFLAG_RD, &num_buf_aio, 0,
160    "Number of aio requests presently handled by the buf subsystem");
161
162/* Number of async I/O threads in the process of being started */
163/* XXX This should be local to aio_aqueue() */
164static int num_aio_resv_start = 0;
165
166static int aiod_timeout;
167SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_timeout, CTLFLAG_RW, &aiod_timeout, 0,
168    "Timeout value for synchronous aio operations");
169
170static int aiod_lifetime;
171SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_lifetime, CTLFLAG_RW, &aiod_lifetime, 0,
172    "Maximum lifetime for idle aiod");
173
174static int unloadable = 0;
175SYSCTL_INT(_vfs_aio, OID_AUTO, unloadable, CTLFLAG_RW, &unloadable, 0,
176    "Allow unload of aio (not recommended)");
177
178
179static int max_aio_per_proc = MAX_AIO_PER_PROC;
180SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_per_proc, CTLFLAG_RW, &max_aio_per_proc,
181    0, "Maximum active aio requests per process (stored in the process)");
182
183static int max_aio_queue_per_proc = MAX_AIO_QUEUE_PER_PROC;
184SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue_per_proc, CTLFLAG_RW,
185    &max_aio_queue_per_proc, 0,
186    "Maximum queued aio requests per process (stored in the process)");
187
188static int max_buf_aio = MAX_BUF_AIO;
189SYSCTL_INT(_vfs_aio, OID_AUTO, max_buf_aio, CTLFLAG_RW, &max_buf_aio, 0,
190    "Maximum buf aio requests per process (stored in the process)");
191
192typedef struct oaiocb {
193	int	aio_fildes;		/* File descriptor */
194	off_t	aio_offset;		/* File offset for I/O */
195	volatile void *aio_buf;         /* I/O buffer in process space */
196	size_t	aio_nbytes;		/* Number of bytes for I/O */
197	struct	osigevent aio_sigevent;	/* Signal to deliver */
198	int	aio_lio_opcode;		/* LIO opcode */
199	int	aio_reqprio;		/* Request priority -- ignored */
200	struct	__aiocb_private	_aiocb_private;
201} oaiocb_t;
202
203/*
204 * Below is a key of locks used to protect each member of struct aiocblist,
205 * aioliojob, and kaioinfo, and any backends.
206 *
207 * * - need not be protected
208 * a - locked by kaioinfo lock
209 * b - locked by the backend lock; the backend lock can be null in some
210 *     cases, for example, BIO belongs to this type, in which case the
211 *     proc lock is reused.
212 * c - locked by aio_job_mtx, the lock for the generic file I/O backend.
213 */
214
215/*
216 * Currently, there are only two backends: BIO and generic file I/O.
217 * Socket I/O is served by the generic file I/O backend, which is not a good
218 * idea: disk file I/O and any other type without the O_NONBLOCK flag can
219 * block the daemon threads, and if no thread is left to serve socket I/O,
220 * the socket I/O will be delayed too long or starved.  We should create some
221 * threads dedicated to sockets to do non-blocking I/O, and the same goes for
222 * pipes and fifos; for these I/O systems we really need a non-blocking
223 * interface.  Fiddling with O_NONBLOCK in the file structure is not safe
224 * because there is a race between userland and the aio daemons.
225 */
226
227struct aiocblist {
228	TAILQ_ENTRY(aiocblist) list;	/* (b) internal list for the backend */
229	TAILQ_ENTRY(aiocblist) plist;	/* (a) list of jobs for each backend */
230	TAILQ_ENTRY(aiocblist) allist;  /* (a) list of all jobs in proc */
231	int	jobflags;		/* (a) job flags */
232	int	jobstate;		/* (b) job state */
233	int	inputcharge;		/* (*) input blocks */
234	int	outputcharge;		/* (*) output blocks */
235	struct	buf *bp;		/* (*) private to BIO backend,
236				  	 * buffer pointer
237					 */
238	struct	proc *userproc;		/* (*) user process */
239	struct  ucred *cred;		/* (*) active credential when created */
240	struct	file *fd_file;		/* (*) pointer to file structure */
241	struct	aioliojob *lio;		/* (*) optional lio job */
242	struct	aiocb *uuaiocb;		/* (*) pointer in userspace of aiocb */
243	struct	knlist klist;		/* (a) list of knotes */
244	struct	aiocb uaiocb;		/* (*) kernel I/O control block */
245	ksiginfo_t ksi;			/* (a) realtime signal info */
246	struct	task biotask;		/* (*) private to BIO backend */
247	uint64_t seqno;			/* (*) job number */
248	int	pending;		/* (a) number of pending I/O, aio_fsync only */
249};
250
251/* jobflags */
252#define AIOCBLIST_DONE		0x01
253#define AIOCBLIST_BUFDONE	0x02
254#define AIOCBLIST_RUNDOWN	0x04
255#define AIOCBLIST_CHECKSYNC	0x08
256
257/*
258 * AIO process info
259 */
260#define AIOP_FREE	0x1			/* proc on free queue */
261
262struct aiothreadlist {
263	int aiothreadflags;			/* (c) AIO proc flags */
264	TAILQ_ENTRY(aiothreadlist) list;	/* (c) list of processes */
265	struct thread *aiothread;		/* (*) the AIO thread */
266};
267
268/*
269 * data-structure for lio signal management
270 */
271struct aioliojob {
272	int	lioj_flags;			/* (a) listio flags */
273	int	lioj_count;			/* (a) listio job count */
274	int	lioj_finished_count;		/* (a) listio finished-job count */
275	struct	sigevent lioj_signal;		/* (a) signal on all I/O done */
276	TAILQ_ENTRY(aioliojob) lioj_list;	/* (a) lio list */
277	struct  knlist klist;			/* (a) list of knotes */
278	ksiginfo_t lioj_ksi;			/* (a) Realtime signal info */
279};
280
281#define	LIOJ_SIGNAL		0x1	/* signal on all done (lio) */
282#define	LIOJ_SIGNAL_POSTED	0x2	/* signal has been posted */
283#define LIOJ_KEVENT_POSTED	0x4	/* kevent triggered */
284
285/*
286 * per process aio data structure
287 */
288struct kaioinfo {
289	struct mtx	kaio_mtx;	/* the lock to protect this struct */
290	int	kaio_flags;		/* (a) per process kaio flags */
291	int	kaio_maxactive_count;	/* (*) maximum number of AIOs */
292	int	kaio_active_count;	/* (c) number of currently used AIOs */
293	int	kaio_qallowed_count;	/* (*) maximum size of AIO queue */
294	int	kaio_count;		/* (a) size of AIO queue */
295	int	kaio_ballowed_count;	/* (*) maximum number of buffers */
296	int	kaio_buffer_count;	/* (a) number of physio buffers */
297	TAILQ_HEAD(,aiocblist) kaio_all;	/* (a) all AIOs in the process */
298	TAILQ_HEAD(,aiocblist) kaio_done;	/* (a) done queue for process */
299	TAILQ_HEAD(,aioliojob) kaio_liojoblist; /* (a) list of lio jobs */
300	TAILQ_HEAD(,aiocblist) kaio_jobqueue;	/* (a) job queue for process */
301	TAILQ_HEAD(,aiocblist) kaio_bufqueue;	/* (a) buffer job queue for process */
302	TAILQ_HEAD(,aiocblist) kaio_sockqueue;  /* (a) queue for aios waiting on sockets,
303						 *  NOT USED YET.
304						 */
305	TAILQ_HEAD(,aiocblist) kaio_syncqueue;	/* (a) queue for aio_fsync */
306	struct	task	kaio_task;	/* (*) task to kick aio threads */
307};
308
309#define AIO_LOCK(ki)		mtx_lock(&(ki)->kaio_mtx)
310#define AIO_UNLOCK(ki)		mtx_unlock(&(ki)->kaio_mtx)
311#define AIO_LOCK_ASSERT(ki, f)	mtx_assert(&(ki)->kaio_mtx, (f))
312#define AIO_MTX(ki)		(&(ki)->kaio_mtx)
313
314#define KAIO_RUNDOWN	0x1	/* process is being run down */
315#define KAIO_WAKEUP	0x2	/* wakeup process when there is a significant event */
316
317/*
318 * Operations used to interact with userland aio control blocks.
319 * Different ABIs provide their own operations.
320 */
321struct aiocb_ops {
322	int	(*copyin)(struct aiocb *ujob, struct aiocb *kjob);
323	long	(*fetch_status)(struct aiocb *ujob);
324	long	(*fetch_error)(struct aiocb *ujob);
325	int	(*store_status)(struct aiocb *ujob, long status);
326	int	(*store_error)(struct aiocb *ujob, long error);
327	int	(*store_kernelinfo)(struct aiocb *ujob, long jobref);
328	int	(*store_aiocb)(struct aiocb **ujobp, struct aiocb *ujob);
329};
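
/*
 * The native ABI uses the copyin/fuword/suword based helpers defined later
 * in this file (aiocb_ops and aiocb_ops_osigevent); a compat ABI such as
 * COMPAT_FREEBSD32 would supply an equivalent table whose copyin routine
 * converts its 32-bit aiocb layout into the native struct aiocb before the
 * common aio_aqueue() path sees it.
 */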
330
331static TAILQ_HEAD(,aiothreadlist) aio_freeproc;		/* (c) Idle daemons */
332static struct sema aio_newproc_sem;
333static struct mtx aio_job_mtx;
334static struct mtx aio_sock_mtx;
335static TAILQ_HEAD(,aiocblist) aio_jobs;			/* (c) Async job list */
336static struct unrhdr *aiod_unr;
337
338void		aio_init_aioinfo(struct proc *p);
339static int	aio_onceonly(void);
340static int	aio_free_entry(struct aiocblist *aiocbe);
341static void	aio_process_rw(struct aiocblist *aiocbe);
342static void	aio_process_sync(struct aiocblist *aiocbe);
343static void	aio_process_mlock(struct aiocblist *aiocbe);
344static int	aio_newproc(int *);
345int		aio_aqueue(struct thread *td, struct aiocb *job,
346			struct aioliojob *lio, int type, struct aiocb_ops *ops);
347static void	aio_physwakeup(struct buf *bp);
348static void	aio_proc_rundown(void *arg, struct proc *p);
349static void	aio_proc_rundown_exec(void *arg, struct proc *p, struct image_params *imgp);
350static int	aio_qphysio(struct proc *p, struct aiocblist *iocb);
351static void	biohelper(void *, int);
352static void	aio_daemon(void *param);
353static void	aio_swake_cb(struct socket *, struct sockbuf *);
354static int	aio_unload(void);
355static void	aio_bio_done_notify(struct proc *userp, struct aiocblist *aiocbe, int type);
356#define DONE_BUF	1
357#define DONE_QUEUE	2
358static int	aio_kick(struct proc *userp);
359static void	aio_kick_nowait(struct proc *userp);
360static void	aio_kick_helper(void *context, int pending);
361static int	filt_aioattach(struct knote *kn);
362static void	filt_aiodetach(struct knote *kn);
363static int	filt_aio(struct knote *kn, long hint);
364static int	filt_lioattach(struct knote *kn);
365static void	filt_liodetach(struct knote *kn);
366static int	filt_lio(struct knote *kn, long hint);
367
368/*
369 * Zones for:
370 * 	kaio	Per process async io info
371 *	aiop	async io thread data
372 *	aiocb	async io jobs
373 *	aiol	list io job pointer - internal to aio_suspend XXX
374 *	aiolio	list io jobs
375 */
376static uma_zone_t kaio_zone, aiop_zone, aiocb_zone, aiol_zone, aiolio_zone;
377
378/* kqueue filters for aio */
379static struct filterops aio_filtops = {
380	.f_isfd = 0,
381	.f_attach = filt_aioattach,
382	.f_detach = filt_aiodetach,
383	.f_event = filt_aio,
384};
385static struct filterops lio_filtops = {
386	.f_isfd = 0,
387	.f_attach = filt_lioattach,
388	.f_detach = filt_liodetach,
389	.f_event = filt_lio
390};
391
392static eventhandler_tag exit_tag, exec_tag;
393
394TASKQUEUE_DEFINE_THREAD(aiod_bio);
395
396/*
397 * Main operations function for use as a kernel module.
398 */
399static int
400aio_modload(struct module *module, int cmd, void *arg)
401{
402	int error = 0;
403
404	switch (cmd) {
405	case MOD_LOAD:
406		aio_onceonly();
407		break;
408	case MOD_UNLOAD:
409		error = aio_unload();
410		break;
411	case MOD_SHUTDOWN:
412		break;
413	default:
414		error = EINVAL;
415		break;
416	}
417	return (error);
418}
419
420static moduledata_t aio_mod = {
421	"aio",
422	&aio_modload,
423	NULL
424};
425
426static struct syscall_helper_data aio_syscalls[] = {
427	SYSCALL_INIT_HELPER(aio_cancel),
428	SYSCALL_INIT_HELPER(aio_error),
429	SYSCALL_INIT_HELPER(aio_fsync),
430	SYSCALL_INIT_HELPER(aio_mlock),
431	SYSCALL_INIT_HELPER(aio_read),
432	SYSCALL_INIT_HELPER(aio_return),
433	SYSCALL_INIT_HELPER(aio_suspend),
434	SYSCALL_INIT_HELPER(aio_waitcomplete),
435	SYSCALL_INIT_HELPER(aio_write),
436	SYSCALL_INIT_HELPER(lio_listio),
437	SYSCALL_INIT_HELPER(oaio_read),
438	SYSCALL_INIT_HELPER(oaio_write),
439	SYSCALL_INIT_HELPER(olio_listio),
440	SYSCALL_INIT_LAST
441};
442
443#ifdef COMPAT_FREEBSD32
444#include <sys/mount.h>
445#include <sys/socket.h>
446#include <compat/freebsd32/freebsd32.h>
447#include <compat/freebsd32/freebsd32_proto.h>
448#include <compat/freebsd32/freebsd32_signal.h>
449#include <compat/freebsd32/freebsd32_syscall.h>
450#include <compat/freebsd32/freebsd32_util.h>
451
452static struct syscall_helper_data aio32_syscalls[] = {
453	SYSCALL32_INIT_HELPER(freebsd32_aio_return),
454	SYSCALL32_INIT_HELPER(freebsd32_aio_suspend),
455	SYSCALL32_INIT_HELPER(freebsd32_aio_cancel),
456	SYSCALL32_INIT_HELPER(freebsd32_aio_error),
457	SYSCALL32_INIT_HELPER(freebsd32_aio_fsync),
458	SYSCALL32_INIT_HELPER(freebsd32_aio_mlock),
459	SYSCALL32_INIT_HELPER(freebsd32_aio_read),
460	SYSCALL32_INIT_HELPER(freebsd32_aio_write),
461	SYSCALL32_INIT_HELPER(freebsd32_aio_waitcomplete),
462	SYSCALL32_INIT_HELPER(freebsd32_lio_listio),
463	SYSCALL32_INIT_HELPER(freebsd32_oaio_read),
464	SYSCALL32_INIT_HELPER(freebsd32_oaio_write),
465	SYSCALL32_INIT_HELPER(freebsd32_olio_listio),
466	SYSCALL_INIT_LAST
467};
468#endif
469
470DECLARE_MODULE(aio, aio_mod,
471	SI_SUB_VFS, SI_ORDER_ANY);
472MODULE_VERSION(aio, 1);
473
474/*
475 * Startup initialization
476 */
477static int
478aio_onceonly(void)
479{
480	int error;
481
482	/* XXX: should probably just use so->callback */
483	aio_swake = &aio_swake_cb;
484	exit_tag = EVENTHANDLER_REGISTER(process_exit, aio_proc_rundown, NULL,
485	    EVENTHANDLER_PRI_ANY);
486	exec_tag = EVENTHANDLER_REGISTER(process_exec, aio_proc_rundown_exec, NULL,
487	    EVENTHANDLER_PRI_ANY);
488	kqueue_add_filteropts(EVFILT_AIO, &aio_filtops);
489	kqueue_add_filteropts(EVFILT_LIO, &lio_filtops);
490	TAILQ_INIT(&aio_freeproc);
491	sema_init(&aio_newproc_sem, 0, "aio_new_proc");
492	mtx_init(&aio_job_mtx, "aio_job", NULL, MTX_DEF);
493	mtx_init(&aio_sock_mtx, "aio_sock", NULL, MTX_DEF);
494	TAILQ_INIT(&aio_jobs);
495	aiod_unr = new_unrhdr(1, INT_MAX, NULL);
496	kaio_zone = uma_zcreate("AIO", sizeof(struct kaioinfo), NULL, NULL,
497	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
498	aiop_zone = uma_zcreate("AIOP", sizeof(struct aiothreadlist), NULL,
499	    NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
500	aiocb_zone = uma_zcreate("AIOCB", sizeof(struct aiocblist), NULL, NULL,
501	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
502	aiol_zone = uma_zcreate("AIOL", AIO_LISTIO_MAX*sizeof(intptr_t) , NULL,
503	    NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
504	aiolio_zone = uma_zcreate("AIOLIO", sizeof(struct aioliojob), NULL,
505	    NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
506	aiod_timeout = AIOD_TIMEOUT_DEFAULT;
507	aiod_lifetime = AIOD_LIFETIME_DEFAULT;
508	jobrefid = 1;
509	async_io_version = _POSIX_VERSION;
510	p31b_setcfg(CTL_P1003_1B_AIO_LISTIO_MAX, AIO_LISTIO_MAX);
511	p31b_setcfg(CTL_P1003_1B_AIO_MAX, MAX_AIO_QUEUE);
512	p31b_setcfg(CTL_P1003_1B_AIO_PRIO_DELTA_MAX, 0);
513
514	error = syscall_helper_register(aio_syscalls);
515	if (error)
516		return (error);
517#ifdef COMPAT_FREEBSD32
518	error = syscall32_helper_register(aio32_syscalls);
519	if (error)
520		return (error);
521#endif
522	return (0);
523}
524
525/*
526 * Callback for unload of AIO when used as a module.
527 */
528static int
529aio_unload(void)
530{
531	int error;
532
533	/*
534	 * XXX: no unloads by default, it's too dangerous.
535	 * Perhaps we could do it if we locked out callers and then
536	 * did an aio_proc_rundown() on each process.
537	 *
538	 * jhb: aio_proc_rundown() needs to run on curproc though,
539	 * so I don't think that would fly.
540	 */
541	if (!unloadable)
542		return (EOPNOTSUPP);
543
544#ifdef COMPAT_FREEBSD32
545	syscall32_helper_unregister(aio32_syscalls);
546#endif
547	syscall_helper_unregister(aio_syscalls);
548
549	error = kqueue_del_filteropts(EVFILT_AIO);
550	if (error)
551		return (error);
552	error = kqueue_del_filteropts(EVFILT_LIO);
553	if (error)
554		return (error);
555	async_io_version = 0;
556	aio_swake = NULL;
557	taskqueue_free(taskqueue_aiod_bio);
558	delete_unrhdr(aiod_unr);
559	uma_zdestroy(kaio_zone);
560	uma_zdestroy(aiop_zone);
561	uma_zdestroy(aiocb_zone);
562	uma_zdestroy(aiol_zone);
563	uma_zdestroy(aiolio_zone);
564	EVENTHANDLER_DEREGISTER(process_exit, exit_tag);
565	EVENTHANDLER_DEREGISTER(process_exec, exec_tag);
566	mtx_destroy(&aio_job_mtx);
567	mtx_destroy(&aio_sock_mtx);
568	sema_destroy(&aio_newproc_sem);
569	p31b_setcfg(CTL_P1003_1B_AIO_LISTIO_MAX, -1);
570	p31b_setcfg(CTL_P1003_1B_AIO_MAX, -1);
571	p31b_setcfg(CTL_P1003_1B_AIO_PRIO_DELTA_MAX, -1);
572	return (0);
573}
574
575/*
576 * Init the per-process aioinfo structure.  The aioinfo limits are set
577 * per-process for user limit (resource) management.
578 */
579void
580aio_init_aioinfo(struct proc *p)
581{
582	struct kaioinfo *ki;
583
584	ki = uma_zalloc(kaio_zone, M_WAITOK);
585	mtx_init(&ki->kaio_mtx, "aiomtx", NULL, MTX_DEF);
586	ki->kaio_flags = 0;
587	ki->kaio_maxactive_count = max_aio_per_proc;
588	ki->kaio_active_count = 0;
589	ki->kaio_qallowed_count = max_aio_queue_per_proc;
590	ki->kaio_count = 0;
591	ki->kaio_ballowed_count = max_buf_aio;
592	ki->kaio_buffer_count = 0;
593	TAILQ_INIT(&ki->kaio_all);
594	TAILQ_INIT(&ki->kaio_done);
595	TAILQ_INIT(&ki->kaio_jobqueue);
596	TAILQ_INIT(&ki->kaio_bufqueue);
597	TAILQ_INIT(&ki->kaio_liojoblist);
598	TAILQ_INIT(&ki->kaio_sockqueue);
599	TAILQ_INIT(&ki->kaio_syncqueue);
600	TASK_INIT(&ki->kaio_task, 0, aio_kick_helper, p);
601	PROC_LOCK(p);
602	if (p->p_aioinfo == NULL) {
603		p->p_aioinfo = ki;
604		PROC_UNLOCK(p);
605	} else {
606		PROC_UNLOCK(p);
607		mtx_destroy(&ki->kaio_mtx);
608		uma_zfree(kaio_zone, ki);
609	}
610
611	while (num_aio_procs < MIN(target_aio_procs, max_aio_procs))
612		aio_newproc(NULL);
613}
614
615static int
616aio_sendsig(struct proc *p, struct sigevent *sigev, ksiginfo_t *ksi)
617{
618	struct thread *td;
619	int error;
620
621	error = sigev_findtd(p, sigev, &td);
622	if (error)
623		return (error);
624	if (!KSI_ONQ(ksi)) {
625		ksiginfo_set_sigev(ksi, sigev);
626		ksi->ksi_code = SI_ASYNCIO;
627		ksi->ksi_flags |= KSI_EXT | KSI_INS;
628		tdsendsignal(p, td, ksi->ksi_signo, ksi);
629	}
630	PROC_UNLOCK(p);
631	return (error);
632}
633
634/*
635 * Free a job entry.  Wait for completion if it is currently active, but don't
636 * delay forever.  If we delay, we return a flag that says that we have to
637 * restart the queue scan.
638 */
639static int
640aio_free_entry(struct aiocblist *aiocbe)
641{
642	struct kaioinfo *ki;
643	struct aioliojob *lj;
644	struct proc *p;
645
646	p = aiocbe->userproc;
647	MPASS(curproc == p);
648	ki = p->p_aioinfo;
649	MPASS(ki != NULL);
650
651	AIO_LOCK_ASSERT(ki, MA_OWNED);
652	MPASS(aiocbe->jobstate == JOBST_JOBFINISHED);
653
654	atomic_subtract_int(&num_queue_count, 1);
655
656	ki->kaio_count--;
657	MPASS(ki->kaio_count >= 0);
658
659	TAILQ_REMOVE(&ki->kaio_done, aiocbe, plist);
660	TAILQ_REMOVE(&ki->kaio_all, aiocbe, allist);
661
662	lj = aiocbe->lio;
663	if (lj) {
664		lj->lioj_count--;
665		lj->lioj_finished_count--;
666
667		if (lj->lioj_count == 0) {
668			TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
669			/* lio is going away, we need to destroy any knotes */
670			knlist_delete(&lj->klist, curthread, 1);
671			PROC_LOCK(p);
672			sigqueue_take(&lj->lioj_ksi);
673			PROC_UNLOCK(p);
674			uma_zfree(aiolio_zone, lj);
675		}
676	}
677
678	/* aiocbe is going away, we need to destroy any knotes */
679	knlist_delete(&aiocbe->klist, curthread, 1);
680	PROC_LOCK(p);
681	sigqueue_take(&aiocbe->ksi);
682	PROC_UNLOCK(p);
683
684	MPASS(aiocbe->bp == NULL);
685	aiocbe->jobstate = JOBST_NULL;
686	AIO_UNLOCK(ki);
687
688	/*
689	 * The thread argument here is used to find the owning process
690	 * and is also passed to fo_close() which may pass it to various
691	 * places such as devsw close() routines.  Because of that, we
692	 * need a thread pointer from the process owning the job that is
693	 * persistent and won't disappear out from under us or move to
694	 * another process.
695	 *
696	 * Currently, all the callers of this function call it to remove
697	 * an aiocblist from the current process' job list either via a
698	 * syscall or due to the current process calling exit() or
699	 * execve().  Thus, we know that p == curproc.  We also know that
700	 * curthread can't exit since we are curthread.
701	 *
702	 * Therefore, we use curthread as the thread to pass to
703	 * knlist_delete().  This does mean that it is possible for the
704	 * thread pointer at close time to differ from the thread pointer
705	 * at open time, but this is already true of file descriptors in
706	 * a multithreaded process.
707	 */
708	if (aiocbe->fd_file)
709		fdrop(aiocbe->fd_file, curthread);
710	crfree(aiocbe->cred);
711	uma_zfree(aiocb_zone, aiocbe);
712	AIO_LOCK(ki);
713
714	return (0);
715}
716
717static void
718aio_proc_rundown_exec(void *arg, struct proc *p, struct image_params *imgp __unused)
719{
720   	aio_proc_rundown(arg, p);
721}
722
723/*
724 * Rundown the jobs for a given process.
725 */
726static void
727aio_proc_rundown(void *arg, struct proc *p)
728{
729	struct kaioinfo *ki;
730	struct aioliojob *lj;
731	struct aiocblist *cbe, *cbn;
732	struct file *fp;
733	struct socket *so;
734	int remove;
735
736	KASSERT(curthread->td_proc == p,
737	    ("%s: called on non-curproc", __func__));
738	ki = p->p_aioinfo;
739	if (ki == NULL)
740		return;
741
742	AIO_LOCK(ki);
743	ki->kaio_flags |= KAIO_RUNDOWN;
744
745restart:
746
747	/*
748	 * Try to cancel all pending requests. This code simulates
749	 * aio_cancel on all pending I/O requests.
750	 */
751	TAILQ_FOREACH_SAFE(cbe, &ki->kaio_jobqueue, plist, cbn) {
752		remove = 0;
753		mtx_lock(&aio_job_mtx);
754		if (cbe->jobstate == JOBST_JOBQGLOBAL) {
755			TAILQ_REMOVE(&aio_jobs, cbe, list);
756			remove = 1;
757		} else if (cbe->jobstate == JOBST_JOBQSOCK) {
758			fp = cbe->fd_file;
759			MPASS(fp->f_type == DTYPE_SOCKET);
760			so = fp->f_data;
761			TAILQ_REMOVE(&so->so_aiojobq, cbe, list);
762			remove = 1;
763		} else if (cbe->jobstate == JOBST_JOBQSYNC) {
764			TAILQ_REMOVE(&ki->kaio_syncqueue, cbe, list);
765			remove = 1;
766		}
767		mtx_unlock(&aio_job_mtx);
768
769		if (remove) {
770			cbe->jobstate = JOBST_JOBFINISHED;
771			cbe->uaiocb._aiocb_private.status = -1;
772			cbe->uaiocb._aiocb_private.error = ECANCELED;
773			TAILQ_REMOVE(&ki->kaio_jobqueue, cbe, plist);
774			aio_bio_done_notify(p, cbe, DONE_QUEUE);
775		}
776	}
777
778	/* Wait for all running I/O to be finished */
779	if (TAILQ_FIRST(&ki->kaio_bufqueue) ||
780	    TAILQ_FIRST(&ki->kaio_jobqueue)) {
781		ki->kaio_flags |= KAIO_WAKEUP;
782		msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO, "aioprn", hz);
783		goto restart;
784	}
785
786	/* Free all completed I/O requests. */
787	while ((cbe = TAILQ_FIRST(&ki->kaio_done)) != NULL)
788		aio_free_entry(cbe);
789
790	while ((lj = TAILQ_FIRST(&ki->kaio_liojoblist)) != NULL) {
791		if (lj->lioj_count == 0) {
792			TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
793			knlist_delete(&lj->klist, curthread, 1);
794			PROC_LOCK(p);
795			sigqueue_take(&lj->lioj_ksi);
796			PROC_UNLOCK(p);
797			uma_zfree(aiolio_zone, lj);
798		} else {
799			panic("LIO job not cleaned up: C:%d, FC:%d\n",
800			    lj->lioj_count, lj->lioj_finished_count);
801		}
802	}
803	AIO_UNLOCK(ki);
804	taskqueue_drain(taskqueue_aiod_bio, &ki->kaio_task);
805	mtx_destroy(&ki->kaio_mtx);
806	uma_zfree(kaio_zone, ki);
807	p->p_aioinfo = NULL;
808}
809
810/*
811 * Select a job to run (called by an AIO daemon).
812 */
813static struct aiocblist *
814aio_selectjob(struct aiothreadlist *aiop)
815{
816	struct aiocblist *aiocbe;
817	struct kaioinfo *ki;
818	struct proc *userp;
819
820	mtx_assert(&aio_job_mtx, MA_OWNED);
821	TAILQ_FOREACH(aiocbe, &aio_jobs, list) {
822		userp = aiocbe->userproc;
823		ki = userp->p_aioinfo;
824
825		if (ki->kaio_active_count < ki->kaio_maxactive_count) {
826			TAILQ_REMOVE(&aio_jobs, aiocbe, list);
827			/* Account for currently active jobs. */
828			ki->kaio_active_count++;
829			aiocbe->jobstate = JOBST_JOBRUNNING;
830			break;
831		}
832	}
833	return (aiocbe);
834}
835
836/*
837 *  Move all data to a permanent storage device; this code
838 *  simulates the fsync syscall.
839 */
840static int
841aio_fsync_vnode(struct thread *td, struct vnode *vp)
842{
843	struct mount *mp;
844	int error;
845
846	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
847		goto drop;
848	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
849	if (vp->v_object != NULL) {
850		VM_OBJECT_WLOCK(vp->v_object);
851		vm_object_page_clean(vp->v_object, 0, 0, 0);
852		VM_OBJECT_WUNLOCK(vp->v_object);
853	}
854	error = VOP_FSYNC(vp, MNT_WAIT, td);
855
856	VOP_UNLOCK(vp, 0);
857	vn_finished_write(mp);
858drop:
859	return (error);
860}
861
862/*
863 * The AIO processing activity for LIO_READ/LIO_WRITE.  This is the code that
864 * does the I/O request for the non-physio version of the operations.  The
865 * normal vn operations are used, and this code should work in all instances
866 * for every type of file, including pipes, sockets, fifos, and regular files.
867 *
868 * XXX I don't think it works well for sockets, pipes, and fifos.
869 */
870static void
871aio_process_rw(struct aiocblist *aiocbe)
872{
873	struct ucred *td_savedcred;
874	struct thread *td;
875	struct aiocb *cb;
876	struct file *fp;
877	struct socket *so;
878	struct uio auio;
879	struct iovec aiov;
880	int cnt;
881	int error;
882	int oublock_st, oublock_end;
883	int inblock_st, inblock_end;
884
885	KASSERT(aiocbe->uaiocb.aio_lio_opcode == LIO_READ ||
886	    aiocbe->uaiocb.aio_lio_opcode == LIO_WRITE,
887	    ("%s: opcode %d", __func__, aiocbe->uaiocb.aio_lio_opcode));
888
889	td = curthread;
890	td_savedcred = td->td_ucred;
891	td->td_ucred = aiocbe->cred;
892	cb = &aiocbe->uaiocb;
893	fp = aiocbe->fd_file;
894
895	aiov.iov_base = (void *)(uintptr_t)cb->aio_buf;
896	aiov.iov_len = cb->aio_nbytes;
897
898	auio.uio_iov = &aiov;
899	auio.uio_iovcnt = 1;
900	auio.uio_offset = cb->aio_offset;
901	auio.uio_resid = cb->aio_nbytes;
902	cnt = cb->aio_nbytes;
903	auio.uio_segflg = UIO_USERSPACE;
904	auio.uio_td = td;
905
906	inblock_st = td->td_ru.ru_inblock;
907	oublock_st = td->td_ru.ru_oublock;
908	/*
909	 * aio_aqueue() acquires a reference to the file that is
910	 * released in aio_free_entry().
911	 */
912	if (cb->aio_lio_opcode == LIO_READ) {
913		auio.uio_rw = UIO_READ;
914		if (auio.uio_resid == 0)
915			error = 0;
916		else
917			error = fo_read(fp, &auio, fp->f_cred, FOF_OFFSET, td);
918	} else {
919		if (fp->f_type == DTYPE_VNODE)
920			bwillwrite();
921		auio.uio_rw = UIO_WRITE;
922		error = fo_write(fp, &auio, fp->f_cred, FOF_OFFSET, td);
923	}
924	inblock_end = td->td_ru.ru_inblock;
925	oublock_end = td->td_ru.ru_oublock;
926
927	aiocbe->inputcharge = inblock_end - inblock_st;
928	aiocbe->outputcharge = oublock_end - oublock_st;
929
930	if ((error) && (auio.uio_resid != cnt)) {
931		if (error == ERESTART || error == EINTR || error == EWOULDBLOCK)
932			error = 0;
933		if ((error == EPIPE) && (cb->aio_lio_opcode == LIO_WRITE)) {
934			int sigpipe = 1;
935			if (fp->f_type == DTYPE_SOCKET) {
936				so = fp->f_data;
937				if (so->so_options & SO_NOSIGPIPE)
938					sigpipe = 0;
939			}
940			if (sigpipe) {
941				PROC_LOCK(aiocbe->userproc);
942				kern_psignal(aiocbe->userproc, SIGPIPE);
943				PROC_UNLOCK(aiocbe->userproc);
944			}
945		}
946	}
947
948	cnt -= auio.uio_resid;
949	cb->_aiocb_private.error = error;
950	cb->_aiocb_private.status = cnt;
951	td->td_ucred = td_savedcred;
952}
953
954static void
955aio_process_sync(struct aiocblist *aiocbe)
956{
957	struct thread *td = curthread;
958	struct ucred *td_savedcred = td->td_ucred;
959	struct aiocb *cb = &aiocbe->uaiocb;
960	struct file *fp = aiocbe->fd_file;
961	int error = 0;
962
963	KASSERT(aiocbe->uaiocb.aio_lio_opcode == LIO_SYNC,
964	    ("%s: opcode %d", __func__, aiocbe->uaiocb.aio_lio_opcode));
965
966	td->td_ucred = aiocbe->cred;
967	if (fp->f_vnode != NULL)
968		error = aio_fsync_vnode(td, fp->f_vnode);
969	cb->_aiocb_private.error = error;
970	cb->_aiocb_private.status = 0;
971	td->td_ucred = td_savedcred;
972}
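
/*
 * From userland this path is reached through aio_fsync(O_SYNC, &iocb); the
 * request is queued like any other job and its completion is collected with
 * aio_error()/aio_return() as usual.
 */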
973
974static void
975aio_process_mlock(struct aiocblist *aiocbe)
976{
977	struct aiocb *cb = &aiocbe->uaiocb;
978	int error;
979
980	KASSERT(aiocbe->uaiocb.aio_lio_opcode == LIO_MLOCK,
981	    ("%s: opcode %d", __func__, aiocbe->uaiocb.aio_lio_opcode));
982
983	error = vm_mlock(aiocbe->userproc, aiocbe->cred,
984	    __DEVOLATILE(void *, cb->aio_buf), cb->aio_nbytes);
985	cb->_aiocb_private.error = error;
986	cb->_aiocb_private.status = 0;
987}
988
989static void
990aio_bio_done_notify(struct proc *userp, struct aiocblist *aiocbe, int type)
991{
992	struct aioliojob *lj;
993	struct kaioinfo *ki;
994	struct aiocblist *scb, *scbn;
995	int lj_done;
996
997	ki = userp->p_aioinfo;
998	AIO_LOCK_ASSERT(ki, MA_OWNED);
999	lj = aiocbe->lio;
1000	lj_done = 0;
1001	if (lj) {
1002		lj->lioj_finished_count++;
1003		if (lj->lioj_count == lj->lioj_finished_count)
1004			lj_done = 1;
1005	}
1006	if (type == DONE_QUEUE) {
1007		aiocbe->jobflags |= AIOCBLIST_DONE;
1008	} else {
1009		aiocbe->jobflags |= AIOCBLIST_BUFDONE;
1010	}
1011	TAILQ_INSERT_TAIL(&ki->kaio_done, aiocbe, plist);
1012	aiocbe->jobstate = JOBST_JOBFINISHED;
1013
1014	if (ki->kaio_flags & KAIO_RUNDOWN)
1015		goto notification_done;
1016
1017	if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL ||
1018	    aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_THREAD_ID)
1019		aio_sendsig(userp, &aiocbe->uaiocb.aio_sigevent, &aiocbe->ksi);
1020
1021	KNOTE_LOCKED(&aiocbe->klist, 1);
1022
1023	if (lj_done) {
1024		if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) {
1025			lj->lioj_flags |= LIOJ_KEVENT_POSTED;
1026			KNOTE_LOCKED(&lj->klist, 1);
1027		}
1028		if ((lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED))
1029		    == LIOJ_SIGNAL
1030		    && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL ||
1031		        lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID)) {
1032			aio_sendsig(userp, &lj->lioj_signal, &lj->lioj_ksi);
1033			lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
1034		}
1035	}
1036
1037notification_done:
1038	if (aiocbe->jobflags & AIOCBLIST_CHECKSYNC) {
1039		TAILQ_FOREACH_SAFE(scb, &ki->kaio_syncqueue, list, scbn) {
1040			if (aiocbe->fd_file == scb->fd_file &&
1041			    aiocbe->seqno < scb->seqno) {
1042				if (--scb->pending == 0) {
1043					mtx_lock(&aio_job_mtx);
1044					scb->jobstate = JOBST_JOBQGLOBAL;
1045					TAILQ_REMOVE(&ki->kaio_syncqueue, scb, list);
1046					TAILQ_INSERT_TAIL(&aio_jobs, scb, list);
1047					aio_kick_nowait(userp);
1048					mtx_unlock(&aio_job_mtx);
1049				}
1050			}
1051		}
1052	}
1053	if (ki->kaio_flags & KAIO_WAKEUP) {
1054		ki->kaio_flags &= ~KAIO_WAKEUP;
1055		wakeup(&userp->p_aioinfo);
1056	}
1057}
1058
1059/*
1060 * The AIO daemon.  Most of the actual work is done in aio_process_*(),
1061 * but the setup (and address space management) is done in this routine.
1062 */
1063static void
1064aio_daemon(void *_id)
1065{
1066	struct aiocblist *aiocbe;
1067	struct aiothreadlist *aiop;
1068	struct kaioinfo *ki;
1069	struct proc *curcp, *mycp, *userp;
1070	struct vmspace *myvm, *tmpvm;
1071	struct thread *td = curthread;
1072	int id = (intptr_t)_id;
1073
1074	/*
1075	 * Local copies of curproc (mycp) and vmspace (myvm)
1076	 */
1077	mycp = td->td_proc;
1078	myvm = mycp->p_vmspace;
1079
1080	KASSERT(mycp->p_textvp == NULL, ("kthread has a textvp"));
1081
1082	/*
1083	 * Allocate and ready the aio control info.  There is one aiop structure
1084	 * per daemon.
1085	 */
1086	aiop = uma_zalloc(aiop_zone, M_WAITOK);
1087	aiop->aiothread = td;
1088	aiop->aiothreadflags = 0;
1089
1090	/* The daemon resides in its own pgrp. */
1091	sys_setsid(td, NULL);
1092
1093	/*
1094	 * Wake up the parent process.  (Parent sleeps to keep from blasting away
1095	 * and creating too many daemons.)
1096	 */
1097	sema_post(&aio_newproc_sem);
1098
1099	mtx_lock(&aio_job_mtx);
1100	for (;;) {
1101		/*
1102		 * curcp is the current daemon process context.
1103		 * userp is the current user process context.
1104		 */
1105		curcp = mycp;
1106
1107		/*
1108		 * Take daemon off of free queue
1109		 */
1110		if (aiop->aiothreadflags & AIOP_FREE) {
1111			TAILQ_REMOVE(&aio_freeproc, aiop, list);
1112			aiop->aiothreadflags &= ~AIOP_FREE;
1113		}
1114
1115		/*
1116		 * Check for jobs.
1117		 */
1118		while ((aiocbe = aio_selectjob(aiop)) != NULL) {
1119			mtx_unlock(&aio_job_mtx);
1120			userp = aiocbe->userproc;
1121
1122			/*
1123			 * Connect to process address space for user program.
1124			 */
1125			if (userp != curcp) {
1126				/*
1127				 * Save the current address space that we are
1128				 * connected to.
1129				 */
1130				tmpvm = mycp->p_vmspace;
1131
1132				/*
1133				 * Point to the new user address space, and
1134				 * refer to it.
1135				 */
1136				mycp->p_vmspace = userp->p_vmspace;
1137				atomic_add_int(&mycp->p_vmspace->vm_refcnt, 1);
1138
1139				/* Activate the new mapping. */
1140				pmap_activate(FIRST_THREAD_IN_PROC(mycp));
1141
1142				/*
1143				 * If the old address space wasn't the daemon's
1144				 * own address space, then we need to remove the
1145				 * daemon's reference from the other process
1146				 * that it was acting on behalf of.
1147				 */
1148				if (tmpvm != myvm) {
1149					vmspace_free(tmpvm);
1150				}
1151				curcp = userp;
1152			}
1153
1154			ki = userp->p_aioinfo;
1155
1156			/* Do the I/O function. */
1157			switch(aiocbe->uaiocb.aio_lio_opcode) {
1158			case LIO_READ:
1159			case LIO_WRITE:
1160				aio_process_rw(aiocbe);
1161				break;
1162			case LIO_SYNC:
1163				aio_process_sync(aiocbe);
1164				break;
1165			case LIO_MLOCK:
1166				aio_process_mlock(aiocbe);
1167				break;
1168			}
1169
1170			mtx_lock(&aio_job_mtx);
1171			/* Decrement the active job count. */
1172			ki->kaio_active_count--;
1173			mtx_unlock(&aio_job_mtx);
1174
1175			AIO_LOCK(ki);
1176			TAILQ_REMOVE(&ki->kaio_jobqueue, aiocbe, plist);
1177			aio_bio_done_notify(userp, aiocbe, DONE_QUEUE);
1178			AIO_UNLOCK(ki);
1179
1180			mtx_lock(&aio_job_mtx);
1181		}
1182
1183		/*
1184		 * Disconnect from user address space.
1185		 */
1186		if (curcp != mycp) {
1187
1188			mtx_unlock(&aio_job_mtx);
1189
1190			/* Get the user address space to disconnect from. */
1191			tmpvm = mycp->p_vmspace;
1192
1193			/* Get original address space for daemon. */
1194			mycp->p_vmspace = myvm;
1195
1196			/* Activate the daemon's address space. */
1197			pmap_activate(FIRST_THREAD_IN_PROC(mycp));
1198#ifdef DIAGNOSTIC
1199			if (tmpvm == myvm) {
1200				printf("AIOD: vmspace problem -- %d\n",
1201				    mycp->p_pid);
1202			}
1203#endif
1204			/* Remove our vmspace reference. */
1205			vmspace_free(tmpvm);
1206
1207			curcp = mycp;
1208
1209			mtx_lock(&aio_job_mtx);
1210			/*
1211			 * We have to restart to avoid a race; we only sleep
1212			 * if no job can be selected, which should imply that
1213			 * curcp == mycp.
1214			 */
1215			continue;
1216		}
1217
1218		mtx_assert(&aio_job_mtx, MA_OWNED);
1219
1220		TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);
1221		aiop->aiothreadflags |= AIOP_FREE;
1222
1223		/*
1224		 * If daemon is inactive for a long time, allow it to exit,
1225		 * thereby freeing resources.
1226		 */
1227		if (msleep(aiop->aiothread, &aio_job_mtx, PRIBIO, "aiordy",
1228		    aiod_lifetime)) {
1229			if (TAILQ_EMPTY(&aio_jobs)) {
1230				if ((aiop->aiothreadflags & AIOP_FREE) &&
1231				    (num_aio_procs > target_aio_procs)) {
1232					TAILQ_REMOVE(&aio_freeproc, aiop, list);
1233					num_aio_procs--;
1234					mtx_unlock(&aio_job_mtx);
1235					uma_zfree(aiop_zone, aiop);
1236					free_unr(aiod_unr, id);
1237#ifdef DIAGNOSTIC
1238					if (mycp->p_vmspace->vm_refcnt <= 1) {
1239						printf("AIOD: bad vm refcnt for"
1240						    " exiting daemon: %d\n",
1241						    mycp->p_vmspace->vm_refcnt);
1242					}
1243#endif
1244					kproc_exit(0);
1245				}
1246			}
1247		}
1248	}
1249	mtx_unlock(&aio_job_mtx);
1250	panic("shouldn't be here\n");
1251}
1252
1253/*
1254 * Create a new AIO daemon. This is mostly a kernel-thread fork routine. The
1255 * AIO daemon modifies its environment itself.
1256 */
1257static int
1258aio_newproc(int *start)
1259{
1260	int error;
1261	struct proc *p;
1262	int id;
1263
1264	id = alloc_unr(aiod_unr);
1265	error = kproc_create(aio_daemon, (void *)(intptr_t)id, &p,
1266		RFNOWAIT, 0, "aiod%d", id);
1267	if (error == 0) {
1268		/*
1269		 * Wait until daemon is started.
1270		 */
1271		sema_wait(&aio_newproc_sem);
1272		mtx_lock(&aio_job_mtx);
1273		num_aio_procs++;
1274		if (start != NULL)
1275			(*start)--;
1276		mtx_unlock(&aio_job_mtx);
1277	} else {
1278		free_unr(aiod_unr, id);
1279	}
1280	return (error);
1281}
1282
1283/*
1284 * Try the high-performance, low-overhead physio method for eligible
1285 * VCHR devices.  This method doesn't use an aio helper thread, and
1286 * thus has very low overhead.
1287 *
1288 * Assumes that the caller, aio_aqueue(), has incremented the file
1289 * structure's reference count, preventing its deallocation for the
1290 * duration of this call.
1291 */
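/*
 * Returns 0 if the request was queued as a physio transfer, -1 to direct
 * the caller to fall back to the slower threaded path, or a positive errno
 * if the physio setup itself failed.
 */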
1292static int
1293aio_qphysio(struct proc *p, struct aiocblist *aiocbe)
1294{
1295	struct aiocb *cb;
1296	struct file *fp;
1297	struct buf *bp;
1298	struct vnode *vp;
1299	struct cdevsw *csw;
1300	struct cdev *dev;
1301	struct kaioinfo *ki;
1302	struct aioliojob *lj;
1303	int error, ref;
1304
1305	cb = &aiocbe->uaiocb;
1306	fp = aiocbe->fd_file;
1307
1308	if (fp == NULL || fp->f_type != DTYPE_VNODE)
1309		return (-1);
1310
1311	vp = fp->f_vnode;
1312
1313	/*
1314	 * If it's not a disk, we don't want to return a positive error,
1315	 * because that would keep the aio code from falling through to try
1316	 * the threaded path when we're talking to a regular file.
1317	 */
1318	if (!vn_isdisk(vp, &error)) {
1319		if (error == ENOTBLK)
1320			return (-1);
1321		else
1322			return (error);
1323	}
1324
1325	if (vp->v_bufobj.bo_bsize == 0)
1326		return (-1);
1327
1328 	if (cb->aio_nbytes % vp->v_bufobj.bo_bsize)
1329		return (-1);
1330
1331	if (cb->aio_nbytes >
1332	    MAXPHYS - (((vm_offset_t) cb->aio_buf) & PAGE_MASK))
1333		return (-1);
1334
1335	ki = p->p_aioinfo;
1336	if (ki->kaio_buffer_count >= ki->kaio_ballowed_count)
1337		return (-1);
1338
1339	ref = 0;
1340	csw = devvn_refthread(vp, &dev, &ref);
1341	if (csw == NULL)
1342		return (ENXIO);
1343	if (cb->aio_nbytes > dev->si_iosize_max) {
1344		error = -1;
1345		goto unref;
1346	}
1347
1348	/* Create and build a buffer header for a transfer. */
1349	bp = (struct buf *)getpbuf(NULL);
1350	BUF_KERNPROC(bp);
1351
1352	AIO_LOCK(ki);
1353	ki->kaio_count++;
1354	ki->kaio_buffer_count++;
1355	lj = aiocbe->lio;
1356	if (lj)
1357		lj->lioj_count++;
1358	AIO_UNLOCK(ki);
1359
1360	/*
1361	 * Get a copy of the kva from the physical buffer.
1362	 */
1363	error = 0;
1364
1365	bp->b_bcount = cb->aio_nbytes;
1366	bp->b_bufsize = cb->aio_nbytes;
1367	bp->b_iodone = aio_physwakeup;
1368	bp->b_saveaddr = bp->b_data;
1369	bp->b_data = (void *)(uintptr_t)cb->aio_buf;
1370	bp->b_offset = cb->aio_offset;
1371	bp->b_iooffset = cb->aio_offset;
1372	bp->b_blkno = btodb(cb->aio_offset);
1373	bp->b_iocmd = cb->aio_lio_opcode == LIO_WRITE ? BIO_WRITE : BIO_READ;
1374
1375	/*
1376	 * Bring buffer into kernel space.
1377	 */
1378	if (vmapbuf(bp, (dev->si_flags & SI_UNMAPPED) == 0) < 0) {
1379		error = EFAULT;
1380		goto doerror;
1381	}
1382
1383	AIO_LOCK(ki);
1384	aiocbe->bp = bp;
1385	bp->b_caller1 = (void *)aiocbe;
1386	TAILQ_INSERT_TAIL(&ki->kaio_bufqueue, aiocbe, plist);
1387	TAILQ_INSERT_TAIL(&ki->kaio_all, aiocbe, allist);
1388	aiocbe->jobstate = JOBST_JOBQBUF;
1389	cb->_aiocb_private.status = cb->aio_nbytes;
1390	AIO_UNLOCK(ki);
1391
1392	atomic_add_int(&num_queue_count, 1);
1393	atomic_add_int(&num_buf_aio, 1);
1394
1395	bp->b_error = 0;
1396
1397	TASK_INIT(&aiocbe->biotask, 0, biohelper, aiocbe);
1398
1399	/* Perform transfer. */
1400	dev_strategy_csw(dev, csw, bp);
1401	dev_relthread(dev, ref);
1402	return (0);
1403
1404doerror:
1405	AIO_LOCK(ki);
1406	ki->kaio_count--;
1407	ki->kaio_buffer_count--;
1408	if (lj)
1409		lj->lioj_count--;
1410	aiocbe->bp = NULL;
1411	AIO_UNLOCK(ki);
1412	relpbuf(bp, NULL);
1413unref:
1414	dev_relthread(dev, ref);
1415	return (error);
1416}
1417
1418/*
1419 * Wake up aio requests that may be serviceable now.
1420 */
1421static void
1422aio_swake_cb(struct socket *so, struct sockbuf *sb)
1423{
1424	struct aiocblist *cb, *cbn;
1425	int opcode;
1426
1427	SOCKBUF_LOCK_ASSERT(sb);
1428	if (sb == &so->so_snd)
1429		opcode = LIO_WRITE;
1430	else
1431		opcode = LIO_READ;
1432
1433	sb->sb_flags &= ~SB_AIO;
1434	mtx_lock(&aio_job_mtx);
1435	TAILQ_FOREACH_SAFE(cb, &so->so_aiojobq, list, cbn) {
1436		if (opcode == cb->uaiocb.aio_lio_opcode) {
1437			if (cb->jobstate != JOBST_JOBQSOCK)
1438				panic("invalid queue value");
1439			/* XXX
1440			 * We don't have an actual socket backend yet,
1441			 * so we simply move the requests to the generic
1442			 * file I/O backend.
1443			 */
1444			TAILQ_REMOVE(&so->so_aiojobq, cb, list);
1445			TAILQ_INSERT_TAIL(&aio_jobs, cb, list);
1446			aio_kick_nowait(cb->userproc);
1447		}
1448	}
1449	mtx_unlock(&aio_job_mtx);
1450}
1451
1452static int
1453convert_old_sigevent(struct osigevent *osig, struct sigevent *nsig)
1454{
1455
1456	/*
1457	 * Only SIGEV_NONE, SIGEV_SIGNAL, and SIGEV_KEVENT are
1458	 * supported by AIO with the old sigevent structure.
1459	 */
1460	nsig->sigev_notify = osig->sigev_notify;
1461	switch (nsig->sigev_notify) {
1462	case SIGEV_NONE:
1463		break;
1464	case SIGEV_SIGNAL:
1465		nsig->sigev_signo = osig->__sigev_u.__sigev_signo;
1466		break;
1467	case SIGEV_KEVENT:
1468		nsig->sigev_notify_kqueue =
1469		    osig->__sigev_u.__sigev_notify_kqueue;
1470		nsig->sigev_value.sival_ptr = osig->sigev_value.sival_ptr;
1471		break;
1472	default:
1473		return (EINVAL);
1474	}
1475	return (0);
1476}
1477
1478static int
1479aiocb_copyin_old_sigevent(struct aiocb *ujob, struct aiocb *kjob)
1480{
1481	struct oaiocb *ojob;
1482	int error;
1483
1484	bzero(kjob, sizeof(struct aiocb));
1485	error = copyin(ujob, kjob, sizeof(struct oaiocb));
1486	if (error)
1487		return (error);
1488	ojob = (struct oaiocb *)kjob;
1489	return (convert_old_sigevent(&ojob->aio_sigevent, &kjob->aio_sigevent));
1490}
1491
1492static int
1493aiocb_copyin(struct aiocb *ujob, struct aiocb *kjob)
1494{
1495
1496	return (copyin(ujob, kjob, sizeof(struct aiocb)));
1497}
1498
1499static long
1500aiocb_fetch_status(struct aiocb *ujob)
1501{
1502
1503	return (fuword(&ujob->_aiocb_private.status));
1504}
1505
1506static long
1507aiocb_fetch_error(struct aiocb *ujob)
1508{
1509
1510	return (fuword(&ujob->_aiocb_private.error));
1511}
1512
1513static int
1514aiocb_store_status(struct aiocb *ujob, long status)
1515{
1516
1517	return (suword(&ujob->_aiocb_private.status, status));
1518}
1519
1520static int
1521aiocb_store_error(struct aiocb *ujob, long error)
1522{
1523
1524	return (suword(&ujob->_aiocb_private.error, error));
1525}
1526
1527static int
1528aiocb_store_kernelinfo(struct aiocb *ujob, long jobref)
1529{
1530
1531	return (suword(&ujob->_aiocb_private.kernelinfo, jobref));
1532}
1533
1534static int
1535aiocb_store_aiocb(struct aiocb **ujobp, struct aiocb *ujob)
1536{
1537
1538	return (suword(ujobp, (long)ujob));
1539}
1540
1541static struct aiocb_ops aiocb_ops = {
1542	.copyin = aiocb_copyin,
1543	.fetch_status = aiocb_fetch_status,
1544	.fetch_error = aiocb_fetch_error,
1545	.store_status = aiocb_store_status,
1546	.store_error = aiocb_store_error,
1547	.store_kernelinfo = aiocb_store_kernelinfo,
1548	.store_aiocb = aiocb_store_aiocb,
1549};
1550
1551static struct aiocb_ops aiocb_ops_osigevent = {
1552	.copyin = aiocb_copyin_old_sigevent,
1553	.fetch_status = aiocb_fetch_status,
1554	.fetch_error = aiocb_fetch_error,
1555	.store_status = aiocb_store_status,
1556	.store_error = aiocb_store_error,
1557	.store_kernelinfo = aiocb_store_kernelinfo,
1558	.store_aiocb = aiocb_store_aiocb,
1559};
1560
1561/*
1562 * Queue a new AIO request.  The choice between the threaded and the direct
1563 * physio (VCHR) technique is made in this code.
1564 */
1565int
1566aio_aqueue(struct thread *td, struct aiocb *job, struct aioliojob *lj,
1567	int type, struct aiocb_ops *ops)
1568{
1569	struct proc *p = td->td_proc;
1570	cap_rights_t rights;
1571	struct file *fp;
1572	struct socket *so;
1573	struct aiocblist *aiocbe, *cb;
1574	struct kaioinfo *ki;
1575	struct kevent kev;
1576	struct sockbuf *sb;
1577	int opcode;
1578	int error;
1579	int fd, kqfd;
1580	int jid;
1581	u_short evflags;
1582
1583	if (p->p_aioinfo == NULL)
1584		aio_init_aioinfo(p);
1585
1586	ki = p->p_aioinfo;
1587
1588	ops->store_status(job, -1);
1589	ops->store_error(job, 0);
1590	ops->store_kernelinfo(job, -1);
1591
1592	if (num_queue_count >= max_queue_count ||
1593	    ki->kaio_count >= ki->kaio_qallowed_count) {
1594		ops->store_error(job, EAGAIN);
1595		return (EAGAIN);
1596	}
1597
1598	aiocbe = uma_zalloc(aiocb_zone, M_WAITOK | M_ZERO);
1599	knlist_init_mtx(&aiocbe->klist, AIO_MTX(ki));
1600
1601	error = ops->copyin(job, &aiocbe->uaiocb);
1602	if (error) {
1603		ops->store_error(job, error);
1604		uma_zfree(aiocb_zone, aiocbe);
1605		return (error);
1606	}
1607
1608	/* XXX: aio_nbytes is later cast to signed types. */
1609	if (aiocbe->uaiocb.aio_nbytes > INT_MAX) {
1610		uma_zfree(aiocb_zone, aiocbe);
1611		return (EINVAL);
1612	}
1613
1614	if (aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_KEVENT &&
1615	    aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_SIGNAL &&
1616	    aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_THREAD_ID &&
1617	    aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_NONE) {
1618		ops->store_error(job, EINVAL);
1619		uma_zfree(aiocb_zone, aiocbe);
1620		return (EINVAL);
1621	}
1622
1623	if ((aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL ||
1624	     aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_THREAD_ID) &&
1625		!_SIG_VALID(aiocbe->uaiocb.aio_sigevent.sigev_signo)) {
1626		uma_zfree(aiocb_zone, aiocbe);
1627		return (EINVAL);
1628	}
1629
1630	ksiginfo_init(&aiocbe->ksi);
1631
1632	/* Save userspace address of the job info. */
1633	aiocbe->uuaiocb = job;
1634
1635	/* Get the opcode. */
1636	if (type != LIO_NOP)
1637		aiocbe->uaiocb.aio_lio_opcode = type;
1638	opcode = aiocbe->uaiocb.aio_lio_opcode;
1639
1640	/*
1641	 * Validate the opcode and fetch the file object for the specified
1642	 * file descriptor.
1643	 *
1644	 * XXXRW: Moved the opcode validation up here so that we don't
1645	 * retrieve a file descriptor without knowing what the capability
1646	 * should be.
1647	 */
1648	fd = aiocbe->uaiocb.aio_fildes;
1649	switch (opcode) {
1650	case LIO_WRITE:
1651		error = fget_write(td, fd,
1652		    cap_rights_init(&rights, CAP_PWRITE), &fp);
1653		break;
1654	case LIO_READ:
1655		error = fget_read(td, fd,
1656		    cap_rights_init(&rights, CAP_PREAD), &fp);
1657		break;
1658	case LIO_SYNC:
1659		error = fget(td, fd, cap_rights_init(&rights, CAP_FSYNC), &fp);
1660		break;
1661	case LIO_MLOCK:
1662		fp = NULL;
1663		break;
1664	case LIO_NOP:
1665		error = fget(td, fd, cap_rights_init(&rights), &fp);
1666		break;
1667	default:
1668		error = EINVAL;
1669	}
1670	if (error) {
1671		uma_zfree(aiocb_zone, aiocbe);
1672		ops->store_error(job, error);
1673		return (error);
1674	}
1675
1676	if (opcode == LIO_SYNC && fp->f_vnode == NULL) {
1677		error = EINVAL;
1678		goto aqueue_fail;
1679	}
1680
1681	if (opcode != LIO_SYNC && aiocbe->uaiocb.aio_offset == -1LL) {
1682		error = EINVAL;
1683		goto aqueue_fail;
1684	}
1685
1686	aiocbe->fd_file = fp;
1687
1688	mtx_lock(&aio_job_mtx);
1689	jid = jobrefid++;
1690	aiocbe->seqno = jobseqno++;
1691	mtx_unlock(&aio_job_mtx);
1692	error = ops->store_kernelinfo(job, jid);
1693	if (error) {
1694		error = EINVAL;
1695		goto aqueue_fail;
1696	}
1697	aiocbe->uaiocb._aiocb_private.kernelinfo = (void *)(intptr_t)jid;
1698
1699	if (opcode == LIO_NOP) {
1700		fdrop(fp, td);
1701		uma_zfree(aiocb_zone, aiocbe);
1702		return (0);
1703	}
1704
1705	if (aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_KEVENT)
1706		goto no_kqueue;
1707	evflags = aiocbe->uaiocb.aio_sigevent.sigev_notify_kevent_flags;
1708	if ((evflags & ~(EV_CLEAR | EV_DISPATCH | EV_ONESHOT)) != 0) {
1709		error = EINVAL;
1710		goto aqueue_fail;
1711	}
1712	kqfd = aiocbe->uaiocb.aio_sigevent.sigev_notify_kqueue;
1713	kev.ident = (uintptr_t)aiocbe->uuaiocb;
1714	kev.filter = EVFILT_AIO;
1715	kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1 | evflags;
1716	kev.data = (intptr_t)aiocbe;
1717	kev.udata = aiocbe->uaiocb.aio_sigevent.sigev_value.sival_ptr;
1718	error = kqfd_register(kqfd, &kev, td, 1);
1719aqueue_fail:
1720	if (error) {
1721		if (fp)
1722			fdrop(fp, td);
1723		uma_zfree(aiocb_zone, aiocbe);
1724		ops->store_error(job, error);
1725		goto done;
1726	}
1727no_kqueue:
1728
1729	ops->store_error(job, EINPROGRESS);
1730	aiocbe->uaiocb._aiocb_private.error = EINPROGRESS;
1731	aiocbe->userproc = p;
1732	aiocbe->cred = crhold(td->td_ucred);
1733	aiocbe->jobflags = 0;
1734	aiocbe->lio = lj;
1735
1736	if (opcode == LIO_SYNC)
1737		goto queueit;
1738
1739	if (fp && fp->f_type == DTYPE_SOCKET) {
1740		/*
1741		 * Alternate queueing for socket ops: Reach down into the
1742		 * descriptor to get the socket data.  Then check to see if the
1743		 * socket is ready to be read or written (based on the requested
1744		 * operation).
1745		 *
1746		 * If it is not ready for I/O, then queue the aiocbe on the
1747		 * socket, and set the flags so we get a call when sbnotify()
1748		 * happens.
1749		 *
1750		 * Note that if the opcode is neither LIO_WRITE nor LIO_READ,
1751		 * we lock and unlock the snd sockbuf for no reason.
1752		 */
1753		so = fp->f_data;
1754		sb = (opcode == LIO_READ) ? &so->so_rcv : &so->so_snd;
1755		SOCKBUF_LOCK(sb);
1756		if (((opcode == LIO_READ) && (!soreadable(so))) || ((opcode ==
1757		    LIO_WRITE) && (!sowriteable(so)))) {
1758			sb->sb_flags |= SB_AIO;
1759
1760			mtx_lock(&aio_job_mtx);
1761			TAILQ_INSERT_TAIL(&so->so_aiojobq, aiocbe, list);
1762			mtx_unlock(&aio_job_mtx);
1763
1764			AIO_LOCK(ki);
1765			TAILQ_INSERT_TAIL(&ki->kaio_all, aiocbe, allist);
1766			TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, aiocbe, plist);
1767			aiocbe->jobstate = JOBST_JOBQSOCK;
1768			ki->kaio_count++;
1769			if (lj)
1770				lj->lioj_count++;
1771			AIO_UNLOCK(ki);
1772			SOCKBUF_UNLOCK(sb);
1773			atomic_add_int(&num_queue_count, 1);
1774			error = 0;
1775			goto done;
1776		}
1777		SOCKBUF_UNLOCK(sb);
1778	}
1779
1780	if ((error = aio_qphysio(p, aiocbe)) == 0)
1781		goto done;
1782#if 0
1783	if (error > 0) {
1784		aiocbe->uaiocb._aiocb_private.error = error;
1785		ops->store_error(job, error);
1786		goto done;
1787	}
1788#endif
1789queueit:
1790	/* No buffer for daemon I/O. */
1791	aiocbe->bp = NULL;
1792	atomic_add_int(&num_queue_count, 1);
1793
1794	AIO_LOCK(ki);
1795	ki->kaio_count++;
1796	if (lj)
1797		lj->lioj_count++;
1798	TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, aiocbe, plist);
1799	TAILQ_INSERT_TAIL(&ki->kaio_all, aiocbe, allist);
1800	if (opcode == LIO_SYNC) {
1801		TAILQ_FOREACH(cb, &ki->kaio_jobqueue, plist) {
1802			if (cb->fd_file == aiocbe->fd_file &&
1803			    cb->uaiocb.aio_lio_opcode != LIO_SYNC &&
1804			    cb->seqno < aiocbe->seqno) {
1805				cb->jobflags |= AIOCBLIST_CHECKSYNC;
1806				aiocbe->pending++;
1807			}
1808		}
1809		TAILQ_FOREACH(cb, &ki->kaio_bufqueue, plist) {
1810			if (cb->fd_file == aiocbe->fd_file &&
1811			    cb->uaiocb.aio_lio_opcode != LIO_SYNC &&
1812			    cb->seqno < aiocbe->seqno) {
1813				cb->jobflags |= AIOCBLIST_CHECKSYNC;
1814				aiocbe->pending++;
1815			}
1816		}
1817		if (aiocbe->pending != 0) {
1818			TAILQ_INSERT_TAIL(&ki->kaio_syncqueue, aiocbe, list);
1819			aiocbe->jobstate = JOBST_JOBQSYNC;
1820			AIO_UNLOCK(ki);
1821			goto done;
1822		}
1823	}
1824	mtx_lock(&aio_job_mtx);
1825	TAILQ_INSERT_TAIL(&aio_jobs, aiocbe, list);
1826	aiocbe->jobstate = JOBST_JOBQGLOBAL;
1827	aio_kick_nowait(p);
1828	mtx_unlock(&aio_job_mtx);
1829	AIO_UNLOCK(ki);
1830	error = 0;
1831done:
1832	return (error);
1833}
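
/*
 * The per-opcode system calls are thin wrappers around aio_aqueue().  A
 * sketch of such a wrapper (using the types defined in this file):
 *
 *	static int
 *	kern_aio_read(struct thread *td, struct aiocb *ujob,
 *	    struct aiocb_ops *ops)
 *	{
 *
 *		return (aio_aqueue(td, ujob, NULL, LIO_READ, ops));
 *	}
 *
 * The native sys_aio_read() entry point passes uap->aiocbp and &aiocb_ops,
 * while a compat entry point passes its own ops table.
 */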
1834
1835static void
1836aio_kick_nowait(struct proc *userp)
1837{
1838	struct kaioinfo *ki = userp->p_aioinfo;
1839	struct aiothreadlist *aiop;
1840
1841	mtx_assert(&aio_job_mtx, MA_OWNED);
1842	if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) {
1843		TAILQ_REMOVE(&aio_freeproc, aiop, list);
1844		aiop->aiothreadflags &= ~AIOP_FREE;
1845		wakeup(aiop->aiothread);
1846	} else if (((num_aio_resv_start + num_aio_procs) < max_aio_procs) &&
1847	    ((ki->kaio_active_count + num_aio_resv_start) <
1848	    ki->kaio_maxactive_count)) {
1849		taskqueue_enqueue(taskqueue_aiod_bio, &ki->kaio_task);
1850	}
1851}
1852
1853static int
1854aio_kick(struct proc *userp)
1855{
1856	struct kaioinfo *ki = userp->p_aioinfo;
1857	struct aiothreadlist *aiop;
1858	int error, ret = 0;
1859
1860	mtx_assert(&aio_job_mtx, MA_OWNED);
1861retryproc:
1862	if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) {
1863		TAILQ_REMOVE(&aio_freeproc, aiop, list);
1864		aiop->aiothreadflags &= ~AIOP_FREE;
1865		wakeup(aiop->aiothread);
1866	} else if (((num_aio_resv_start + num_aio_procs) < max_aio_procs) &&
1867	    ((ki->kaio_active_count + num_aio_resv_start) <
1868	    ki->kaio_maxactive_count)) {
1869		num_aio_resv_start++;
1870		mtx_unlock(&aio_job_mtx);
1871		error = aio_newproc(&num_aio_resv_start);
1872		mtx_lock(&aio_job_mtx);
1873		if (error) {
1874			num_aio_resv_start--;
1875			goto retryproc;
1876		}
1877	} else {
1878		ret = -1;
1879	}
1880	return (ret);
1881}
1882
1883static void
1884aio_kick_helper(void *context, int pending)
1885{
1886	struct proc *userp = context;
1887
1888	mtx_lock(&aio_job_mtx);
1889	while (--pending >= 0) {
1890		if (aio_kick(userp))
1891			break;
1892	}
1893	mtx_unlock(&aio_job_mtx);
1894}
1895
1896/*
1897 * Support the aio_return system call.  As a side effect, kernel resources are
1898 * released.
1899 */
1900static int
1901kern_aio_return(struct thread *td, struct aiocb *uaiocb, struct aiocb_ops *ops)
1902{
1903	struct proc *p = td->td_proc;
1904	struct aiocblist *cb;
1905	struct kaioinfo *ki;
1906	int status, error;
1907
1908	ki = p->p_aioinfo;
1909	if (ki == NULL)
1910		return (EINVAL);
1911	AIO_LOCK(ki);
1912	TAILQ_FOREACH(cb, &ki->kaio_done, plist) {
1913		if (cb->uuaiocb == uaiocb)
1914			break;
1915	}
1916	if (cb != NULL) {
1917		MPASS(cb->jobstate == JOBST_JOBFINISHED);
1918		status = cb->uaiocb._aiocb_private.status;
1919		error = cb->uaiocb._aiocb_private.error;
1920		td->td_retval[0] = status;
1921		if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
1922			td->td_ru.ru_oublock += cb->outputcharge;
1923			cb->outputcharge = 0;
1924		} else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
1925			td->td_ru.ru_inblock += cb->inputcharge;
1926			cb->inputcharge = 0;
1927		}
1928		aio_free_entry(cb);
1929		AIO_UNLOCK(ki);
1930		ops->store_error(uaiocb, error);
1931		ops->store_status(uaiocb, status);
1932	} else {
1933		error = EINVAL;
1934		AIO_UNLOCK(ki);
1935	}
1936	return (error);
1937}
1938
1939int
1940sys_aio_return(struct thread *td, struct aio_return_args *uap)
1941{
1942
1943	return (kern_aio_return(td, uap->aiocbp, &aiocb_ops));
1944}
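/*
 * Illustrative userland usage of the completion path above (a minimal
 * sketch, not part of the kernel sources; "fd" and full error handling
 * are assumed):
 *
 *	struct aiocb cb;
 *	char buf[512];
 *
 *	memset(&cb, 0, sizeof(cb));
 *	cb.aio_fildes = fd;
 *	cb.aio_buf = buf;
 *	cb.aio_nbytes = sizeof(buf);
 *	cb.aio_offset = 0;
 *	if (aio_read(&cb) == -1)
 *		err(1, "aio_read");
 *	while (aio_error(&cb) == EINPROGRESS)
 *		;			    (poll; see aio_suspend() below)
 *	ssize_t n = aio_return(&cb);	    (reaps the job and releases the
 *					     kernel resources; call it once)
 */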
1945
1946/*
1947 * Allow a process to wake up when any of the I/O requests are completed.
1948 */
1949static int
1950kern_aio_suspend(struct thread *td, int njoblist, struct aiocb **ujoblist,
1951    struct timespec *ts)
1952{
1953	struct proc *p = td->td_proc;
1954	struct timeval atv;
1955	struct kaioinfo *ki;
1956	struct aiocblist *cb, *cbfirst;
1957	int error, i, timo;
1958
1959	timo = 0;
1960	if (ts) {
1961		if (ts->tv_nsec < 0 || ts->tv_nsec >= 1000000000)
1962			return (EINVAL);
1963
1964		TIMESPEC_TO_TIMEVAL(&atv, ts);
1965		if (itimerfix(&atv))
1966			return (EINVAL);
1967		timo = tvtohz(&atv);
1968	}
1969
1970	ki = p->p_aioinfo;
1971	if (ki == NULL)
1972		return (EAGAIN);
1973
1974	if (njoblist == 0)
1975		return (0);
1976
1977	AIO_LOCK(ki);
1978	for (;;) {
1979		cbfirst = NULL;
1980		error = 0;
1981		TAILQ_FOREACH(cb, &ki->kaio_all, allist) {
1982			for (i = 0; i < njoblist; i++) {
1983				if (cb->uuaiocb == ujoblist[i]) {
1984					if (cbfirst == NULL)
1985						cbfirst = cb;
1986					if (cb->jobstate == JOBST_JOBFINISHED)
1987						goto RETURN;
1988				}
1989			}
1990		}
1991		/* All tasks were finished. */
1992		if (cbfirst == NULL)
1993			break;
1994
1995		ki->kaio_flags |= KAIO_WAKEUP;
1996		error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH,
1997		    "aiospn", timo);
1998		if (error == ERESTART)
1999			error = EINTR;
2000		if (error)
2001			break;
2002	}
2003RETURN:
2004	AIO_UNLOCK(ki);
2005	return (error);
2006}
2007
2008int
2009sys_aio_suspend(struct thread *td, struct aio_suspend_args *uap)
2010{
2011	struct timespec ts, *tsp;
2012	struct aiocb **ujoblist;
2013	int error;
2014
2015	if (uap->nent < 0 || uap->nent > AIO_LISTIO_MAX)
2016		return (EINVAL);
2017
2018	if (uap->timeout) {
2019		/* Get timespec struct. */
2020		if ((error = copyin(uap->timeout, &ts, sizeof(ts))) != 0)
2021			return (error);
2022		tsp = &ts;
2023	} else
2024		tsp = NULL;
2025
2026	ujoblist = uma_zalloc(aiol_zone, M_WAITOK);
2027	error = copyin(uap->aiocbp, ujoblist, uap->nent * sizeof(ujoblist[0]));
2028	if (error == 0)
2029		error = kern_aio_suspend(td, uap->nent, ujoblist, tsp);
2030	uma_zfree(aiol_zone, ujoblist);
2031	return (error);
2032}
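/*
 * Illustrative userland usage (a sketch; "cb" is an outstanding request
 * queued with aio_read() or aio_write()):
 *
 *	const struct aiocb *list[1] = { &cb };
 *	struct timespec ts = { 1, 0 };		    (one-second timeout)
 *
 *	if (aio_suspend(list, 1, &ts) == -1) {
 *		if (errno == EAGAIN)
 *			... timed out, request still in progress ...
 *		else if (errno == EINTR)
 *			... interrupted by a signal ...
 *	} else {
 *		... at least one listed request has completed; reap it
 *		    with aio_return() ...
 *	}
 */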
2033
2034/*
2035 * aio_cancel cancels any non-physio aio operations not currently in
2036 * progress.
2037 */
2038int
2039sys_aio_cancel(struct thread *td, struct aio_cancel_args *uap)
2040{
2041	struct proc *p = td->td_proc;
2042	struct kaioinfo *ki;
2043	struct aiocblist *cbe, *cbn;
2044	struct file *fp;
2045	struct socket *so;
2046	int error;
2047	int remove;
2048	int cancelled = 0;
2049	int notcancelled = 0;
2050	struct vnode *vp;
2051
2052	/* Lookup file object. */
2053	error = fget(td, uap->fd, NULL, &fp);
2054	if (error)
2055		return (error);
2056
2057	ki = p->p_aioinfo;
2058	if (ki == NULL)
2059		goto done;
2060
2061	if (fp->f_type == DTYPE_VNODE) {
2062		vp = fp->f_vnode;
2063		if (vn_isdisk(vp, &error)) {
2064			fdrop(fp, td);
2065			td->td_retval[0] = AIO_NOTCANCELED;
2066			return (0);
2067		}
2068	}
2069
2070	AIO_LOCK(ki);
2071	TAILQ_FOREACH_SAFE(cbe, &ki->kaio_jobqueue, plist, cbn) {
2072		if ((uap->fd == cbe->uaiocb.aio_fildes) &&
2073		    ((uap->aiocbp == NULL) ||
2074		     (uap->aiocbp == cbe->uuaiocb))) {
2075			remove = 0;
2076
2077			mtx_lock(&aio_job_mtx);
2078			if (cbe->jobstate == JOBST_JOBQGLOBAL) {
2079				TAILQ_REMOVE(&aio_jobs, cbe, list);
2080				remove = 1;
2081			} else if (cbe->jobstate == JOBST_JOBQSOCK) {
2082				MPASS(fp->f_type == DTYPE_SOCKET);
2083				so = fp->f_data;
2084				TAILQ_REMOVE(&so->so_aiojobq, cbe, list);
2085				remove = 1;
2086			} else if (cbe->jobstate == JOBST_JOBQSYNC) {
2087				TAILQ_REMOVE(&ki->kaio_syncqueue, cbe, list);
2088				remove = 1;
2089			}
2090			mtx_unlock(&aio_job_mtx);
2091
2092			if (remove) {
2093				TAILQ_REMOVE(&ki->kaio_jobqueue, cbe, plist);
2094				cbe->uaiocb._aiocb_private.status = -1;
2095				cbe->uaiocb._aiocb_private.error = ECANCELED;
2096				aio_bio_done_notify(p, cbe, DONE_QUEUE);
2097				cancelled++;
2098			} else {
2099				notcancelled++;
2100			}
2101			if (uap->aiocbp != NULL)
2102				break;
2103		}
2104	}
2105	AIO_UNLOCK(ki);
2106
2107done:
2108	fdrop(fp, td);
2109
2110	if (uap->aiocbp != NULL) {
2111		if (cancelled) {
2112			td->td_retval[0] = AIO_CANCELED;
2113			return (0);
2114		}
2115	}
2116
2117	if (notcancelled) {
2118		td->td_retval[0] = AIO_NOTCANCELED;
2119		return (0);
2120	}
2121
2122	if (cancelled) {
2123		td->td_retval[0] = AIO_CANCELED;
2124		return (0);
2125	}
2126
2127	td->td_retval[0] = AIO_ALLDONE;
2128
2129	return (0);
2130}
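/*
 * Illustrative userland usage (a sketch; "fd" and "cb" are assumed):
 *
 *	switch (aio_cancel(fd, &cb)) {
 *	case AIO_CANCELED:
 *		... dequeued; aio_error() now reports ECANCELED ...
 *	case AIO_NOTCANCELED:
 *		... still in progress (e.g. raw-device physio above);
 *		    poll aio_error() until it stops returning EINPROGRESS ...
 *	case AIO_ALLDONE:
 *		... already completed before the call ...
 *	}
 */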
2131
2132/*
2133 * aio_error is implemented at the kernel level for compatibility purposes
2134 * only.  For a user mode async implementation, it would be best to do it in
2135 * a userland subroutine.
2136 */
2137static int
2138kern_aio_error(struct thread *td, struct aiocb *aiocbp, struct aiocb_ops *ops)
2139{
2140	struct proc *p = td->td_proc;
2141	struct aiocblist *cb;
2142	struct kaioinfo *ki;
2143	int status;
2144
2145	ki = p->p_aioinfo;
2146	if (ki == NULL) {
2147		td->td_retval[0] = EINVAL;
2148		return (0);
2149	}
2150
2151	AIO_LOCK(ki);
2152	TAILQ_FOREACH(cb, &ki->kaio_all, allist) {
2153		if (cb->uuaiocb == aiocbp) {
2154			if (cb->jobstate == JOBST_JOBFINISHED)
2155				td->td_retval[0] =
2156					cb->uaiocb._aiocb_private.error;
2157			else
2158				td->td_retval[0] = EINPROGRESS;
2159			AIO_UNLOCK(ki);
2160			return (0);
2161		}
2162	}
2163	AIO_UNLOCK(ki);
2164
2165	/*
2166	 * Hack for failure of aio_aqueue.
2167	 */
2168	status = ops->fetch_status(aiocbp);
2169	if (status == -1) {
2170		td->td_retval[0] = ops->fetch_error(aiocbp);
2171		return (0);
2172	}
2173
2174	td->td_retval[0] = EINVAL;
2175	return (0);
2176}
2177
2178int
2179sys_aio_error(struct thread *td, struct aio_error_args *uap)
2180{
2181
2182	return (kern_aio_error(td, uap->aiocbp, &aiocb_ops));
2183}
2184
2185/* syscall - asynchronous read from a file (REALTIME) */
2186int
2187sys_oaio_read(struct thread *td, struct oaio_read_args *uap)
2188{
2189
2190	return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ,
2191	    &aiocb_ops_osigevent));
2192}
2193
2194int
2195sys_aio_read(struct thread *td, struct aio_read_args *uap)
2196{
2197
2198	return (aio_aqueue(td, uap->aiocbp, NULL, LIO_READ, &aiocb_ops));
2199}
2200
2201/* syscall - asynchronous write to a file (REALTIME) */
2202int
2203sys_oaio_write(struct thread *td, struct oaio_write_args *uap)
2204{
2205
2206	return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE,
2207	    &aiocb_ops_osigevent));
2208}
2209
2210int
2211sys_aio_write(struct thread *td, struct aio_write_args *uap)
2212{
2213
2214	return (aio_aqueue(td, uap->aiocbp, NULL, LIO_WRITE, &aiocb_ops));
2215}
2216
2217int
2218sys_aio_mlock(struct thread *td, struct aio_mlock_args *uap)
2219{
2220
2221	return (aio_aqueue(td, uap->aiocbp, NULL, LIO_MLOCK, &aiocb_ops));
2222}
2223
2224static int
2225kern_lio_listio(struct thread *td, int mode, struct aiocb * const *uacb_list,
2226    struct aiocb **acb_list, int nent, struct sigevent *sig,
2227    struct aiocb_ops *ops)
2228{
2229	struct proc *p = td->td_proc;
2230	struct aiocb *iocb;
2231	struct kaioinfo *ki;
2232	struct aioliojob *lj;
2233	struct kevent kev;
2234	int error;
2235	int nerror;
2236	int i;
2237
2238	if ((mode != LIO_NOWAIT) && (mode != LIO_WAIT))
2239		return (EINVAL);
2240
2241	if (nent < 0 || nent > AIO_LISTIO_MAX)
2242		return (EINVAL);
2243
2244	if (p->p_aioinfo == NULL)
2245		aio_init_aioinfo(p);
2246
2247	ki = p->p_aioinfo;
2248
2249	lj = uma_zalloc(aiolio_zone, M_WAITOK);
2250	lj->lioj_flags = 0;
2251	lj->lioj_count = 0;
2252	lj->lioj_finished_count = 0;
2253	knlist_init_mtx(&lj->klist, AIO_MTX(ki));
2254	ksiginfo_init(&lj->lioj_ksi);
2255
2256	/*
2257	 * Setup signal.
2258	 */
2259	if (sig && (mode == LIO_NOWAIT)) {
2260		bcopy(sig, &lj->lioj_signal, sizeof(lj->lioj_signal));
2261		if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) {
2262			/* Assume only new style KEVENT */
2263			kev.filter = EVFILT_LIO;
2264			kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1;
2265			kev.ident = (uintptr_t)uacb_list; /* something unique */
2266			kev.data = (intptr_t)lj;
2267			/* pass user defined sigval data */
2268			kev.udata = lj->lioj_signal.sigev_value.sival_ptr;
2269			error = kqfd_register(
2270			    lj->lioj_signal.sigev_notify_kqueue, &kev, td, 1);
2271			if (error) {
2272				uma_zfree(aiolio_zone, lj);
2273				return (error);
2274			}
2275		} else if (lj->lioj_signal.sigev_notify == SIGEV_NONE) {
2276			;
2277		} else if (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL ||
2278			   lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID) {
2279				if (!_SIG_VALID(lj->lioj_signal.sigev_signo)) {
2280					uma_zfree(aiolio_zone, lj);
2281					return (EINVAL);
2282				}
2283				lj->lioj_flags |= LIOJ_SIGNAL;
2284		} else {
2285			uma_zfree(aiolio_zone, lj);
2286			return (EINVAL);
2287		}
2288	}
2289
2290	AIO_LOCK(ki);
2291	TAILQ_INSERT_TAIL(&ki->kaio_liojoblist, lj, lioj_list);
2292	/*
2293	 * Add an extra aiocb count to keep the lio from being freed
2294	 * by other threads doing aio_waitcomplete or aio_return, and
2295	 * to prevent the event from being sent until we have queued
2296	 * all tasks.
2297	 */
2298	lj->lioj_count = 1;
2299	AIO_UNLOCK(ki);
2300
2301	/*
2302	 * Get pointers to the list of I/O requests.
2303	 */
2304	nerror = 0;
2305	for (i = 0; i < nent; i++) {
2306		iocb = acb_list[i];
2307		if (iocb != NULL) {
2308			error = aio_aqueue(td, iocb, lj, LIO_NOP, ops);
2309			if (error != 0)
2310				nerror++;
2311		}
2312	}
2313
2314	error = 0;
2315	AIO_LOCK(ki);
2316	if (mode == LIO_WAIT) {
2317		while (lj->lioj_count - 1 != lj->lioj_finished_count) {
2318			ki->kaio_flags |= KAIO_WAKEUP;
2319			error = msleep(&p->p_aioinfo, AIO_MTX(ki),
2320			    PRIBIO | PCATCH, "aiospn", 0);
2321			if (error == ERESTART)
2322				error = EINTR;
2323			if (error)
2324				break;
2325		}
2326	} else {
2327		if (lj->lioj_count - 1 == lj->lioj_finished_count) {
2328			if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) {
2329				lj->lioj_flags |= LIOJ_KEVENT_POSTED;
2330				KNOTE_LOCKED(&lj->klist, 1);
2331			}
2332			if ((lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED))
2333			    == LIOJ_SIGNAL
2334			    && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL ||
2335			    lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID)) {
2336				aio_sendsig(p, &lj->lioj_signal,
2337					    &lj->lioj_ksi);
2338				lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
2339			}
2340		}
2341	}
2342	lj->lioj_count--;
2343	if (lj->lioj_count == 0) {
2344		TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
2345		knlist_delete(&lj->klist, curthread, 1);
2346		PROC_LOCK(p);
2347		sigqueue_take(&lj->lioj_ksi);
2348		PROC_UNLOCK(p);
2349		AIO_UNLOCK(ki);
2350		uma_zfree(aiolio_zone, lj);
2351	} else
2352		AIO_UNLOCK(ki);
2353
2354	if (nerror)
2355		return (EIO);
2356	return (error);
2357}
2358
2359/* syscall - list directed I/O (REALTIME) */
2360int
2361sys_olio_listio(struct thread *td, struct olio_listio_args *uap)
2362{
2363	struct aiocb **acb_list;
2364	struct sigevent *sigp, sig;
2365	struct osigevent osig;
2366	int error, nent;
2367
2368	if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
2369		return (EINVAL);
2370
2371	nent = uap->nent;
2372	if (nent < 0 || nent > AIO_LISTIO_MAX)
2373		return (EINVAL);
2374
2375	if (uap->sig && (uap->mode == LIO_NOWAIT)) {
2376		error = copyin(uap->sig, &osig, sizeof(osig));
2377		if (error)
2378			return (error);
2379		error = convert_old_sigevent(&osig, &sig);
2380		if (error)
2381			return (error);
2382		sigp = &sig;
2383	} else
2384		sigp = NULL;
2385
2386	acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK);
2387	error = copyin(uap->acb_list, acb_list, nent * sizeof(acb_list[0]));
2388	if (error == 0)
2389		error = kern_lio_listio(td, uap->mode,
2390		    (struct aiocb * const *)uap->acb_list, acb_list, nent, sigp,
2391		    &aiocb_ops_osigevent);
2392	free(acb_list, M_LIO);
2393	return (error);
2394}
2395
2396/* syscall - list directed I/O (REALTIME) */
2397int
2398sys_lio_listio(struct thread *td, struct lio_listio_args *uap)
2399{
2400	struct aiocb **acb_list;
2401	struct sigevent *sigp, sig;
2402	int error, nent;
2403
2404	if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
2405		return (EINVAL);
2406
2407	nent = uap->nent;
2408	if (nent < 0 || nent > AIO_LISTIO_MAX)
2409		return (EINVAL);
2410
2411	if (uap->sig && (uap->mode == LIO_NOWAIT)) {
2412		error = copyin(uap->sig, &sig, sizeof(sig));
2413		if (error)
2414			return (error);
2415		sigp = &sig;
2416	} else
2417		sigp = NULL;
2418
2419	acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK);
2420	error = copyin(uap->acb_list, acb_list, nent * sizeof(acb_list[0]));
2421	if (error == 0)
2422		error = kern_lio_listio(td, uap->mode, uap->acb_list, acb_list,
2423		    nent, sigp, &aiocb_ops);
2424	free(acb_list, M_LIO);
2425	return (error);
2426}
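/*
 * Illustrative userland usage (a sketch; the control blocks are assumed
 * to be filled in as for aio_read()/aio_write()):
 *
 *	struct aiocb rd, wr;
 *	struct aiocb *list[2] = { &rd, &wr };
 *
 *	rd.aio_lio_opcode = LIO_READ;
 *	wr.aio_lio_opcode = LIO_WRITE;
 *	if (lio_listio(LIO_WAIT, list, 2, NULL) == -1 && errno == EIO)
 *		... one or more requests could not be queued; check each
 *		    entry with aio_error()/aio_return() ...
 *	(with LIO_NOWAIT and a sigevent, completion of the whole list is
 *	 signaled instead of waited for)
 */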
2427
2428/*
2429 * Called from the interrupt thread for physio; we should return as quickly
2430 * as possible, so we schedule a biohelper task.
2431 */
2432static void
2433aio_physwakeup(struct buf *bp)
2434{
2435	struct aiocblist *aiocbe;
2436
2437	aiocbe = (struct aiocblist *)bp->b_caller1;
2438	taskqueue_enqueue(taskqueue_aiod_bio, &aiocbe->biotask);
2439}
2440
2441/*
2442 * Task routine to perform heavy tasks, process wakeup, and signals.
2443 * Task routine to perform heavy work: completion accounting, wakeups, and signals.
2444static void
2445biohelper(void *context, int pending)
2446{
2447	struct aiocblist *aiocbe = context;
2448	struct buf *bp;
2449	struct proc *userp;
2450	struct kaioinfo *ki;
2451	int nblks;
2452
2453	bp = aiocbe->bp;
2454	userp = aiocbe->userproc;
2455	ki = userp->p_aioinfo;
2456	AIO_LOCK(ki);
2457	aiocbe->uaiocb._aiocb_private.status -= bp->b_resid;
2458	aiocbe->uaiocb._aiocb_private.error = 0;
2459	if (bp->b_ioflags & BIO_ERROR)
2460		aiocbe->uaiocb._aiocb_private.error = bp->b_error;
2461	nblks = btodb(aiocbe->uaiocb.aio_nbytes);
2462	if (aiocbe->uaiocb.aio_lio_opcode == LIO_WRITE)
2463		aiocbe->outputcharge += nblks;
2464	else
2465		aiocbe->inputcharge += nblks;
2466	aiocbe->bp = NULL;
2467	TAILQ_REMOVE(&userp->p_aioinfo->kaio_bufqueue, aiocbe, plist);
2468	ki->kaio_buffer_count--;
2469	aio_bio_done_notify(userp, aiocbe, DONE_BUF);
2470	AIO_UNLOCK(ki);
2471
2472	/* Release mapping into kernel space. */
2473	vunmapbuf(bp);
2474	relpbuf(bp, NULL);
2475	atomic_subtract_int(&num_buf_aio, 1);
2476}
2477
2478/* syscall - wait for the next completion of an aio request */
2479static int
2480kern_aio_waitcomplete(struct thread *td, struct aiocb **aiocbp,
2481    struct timespec *ts, struct aiocb_ops *ops)
2482{
2483	struct proc *p = td->td_proc;
2484	struct timeval atv;
2485	struct kaioinfo *ki;
2486	struct aiocblist *cb;
2487	struct aiocb *uuaiocb;
2488	int error, status, timo;
2489
2490	ops->store_aiocb(aiocbp, NULL);
2491
2492	timo = 0;
2493	if (ts) {
2494		if ((ts->tv_nsec < 0) || (ts->tv_nsec >= 1000000000))
2495			return (EINVAL);
2496
2497		TIMESPEC_TO_TIMEVAL(&atv, ts);
2498		if (itimerfix(&atv))
2499			return (EINVAL);
2500		timo = tvtohz(&atv);
2501	}
2502
2503	if (p->p_aioinfo == NULL)
2504		aio_init_aioinfo(p);
2505	ki = p->p_aioinfo;
2506
2507	error = 0;
2508	cb = NULL;
2509	AIO_LOCK(ki);
2510	while ((cb = TAILQ_FIRST(&ki->kaio_done)) == NULL) {
2511		ki->kaio_flags |= KAIO_WAKEUP;
2512		error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH,
2513		    "aiowc", timo);
2514		if (timo && error == ERESTART)
2515			error = EINTR;
2516		if (error)
2517			break;
2518	}
2519
2520	if (cb != NULL) {
2521		MPASS(cb->jobstate == JOBST_JOBFINISHED);
2522		uuaiocb = cb->uuaiocb;
2523		status = cb->uaiocb._aiocb_private.status;
2524		error = cb->uaiocb._aiocb_private.error;
2525		td->td_retval[0] = status;
2526		if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
2527			td->td_ru.ru_oublock += cb->outputcharge;
2528			cb->outputcharge = 0;
2529		} else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
2530			td->td_ru.ru_inblock += cb->inputcharge;
2531			cb->inputcharge = 0;
2532		}
2533		aio_free_entry(cb);
2534		AIO_UNLOCK(ki);
2535		ops->store_aiocb(aiocbp, uuaiocb);
2536		ops->store_error(uuaiocb, error);
2537		ops->store_status(uuaiocb, status);
2538	} else
2539		AIO_UNLOCK(ki);
2540
2541	return (error);
2542}
2543
2544int
2545sys_aio_waitcomplete(struct thread *td, struct aio_waitcomplete_args *uap)
2546{
2547	struct timespec ts, *tsp;
2548	int error;
2549
2550	if (uap->timeout) {
2551		/* Get timespec struct. */
2552		error = copyin(uap->timeout, &ts, sizeof(ts));
2553		if (error)
2554			return (error);
2555		tsp = &ts;
2556	} else
2557		tsp = NULL;
2558
2559	return (kern_aio_waitcomplete(td, uap->aiocbp, tsp, &aiocb_ops));
2560}
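/*
 * Illustrative userland usage of this FreeBSD extension (a sketch):
 *
 *	struct aiocb *done;
 *	ssize_t n;
 *
 *	n = aio_waitcomplete(&done, NULL);	    (NULL: block until some
 *						     request completes)
 *	... "done" points at the completed control block and "n" is its
 *	    return value; the job has already been reaped here, so
 *	    aio_return() must not be called on it ...
 */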
2561
2562static int
2563kern_aio_fsync(struct thread *td, int op, struct aiocb *aiocbp,
2564    struct aiocb_ops *ops)
2565{
2566	struct proc *p = td->td_proc;
2567	struct kaioinfo *ki;
2568
2569	if (op != O_SYNC) /* XXX lack of O_DSYNC */
2570		return (EINVAL);
2571	ki = p->p_aioinfo;
2572	if (ki == NULL)
2573		aio_init_aioinfo(p);
2574	return (aio_aqueue(td, aiocbp, NULL, LIO_SYNC, ops));
2575}
2576
2577int
2578sys_aio_fsync(struct thread *td, struct aio_fsync_args *uap)
2579{
2580
2581	return (kern_aio_fsync(td, uap->op, uap->aiocbp, &aiocb_ops));
2582}
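/*
 * Illustrative userland usage (a sketch; "cb.aio_fildes" identifies the
 * file to be synced):
 *
 *	if (aio_fsync(O_SYNC, &cb) == -1)
 *		err(1, "aio_fsync");
 *	... completion is reported through aio_error()/aio_return() like
 *	    any other request; O_DSYNC is rejected by this implementation
 *	    (see the XXX above) ...
 */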
2583
2584/* kqueue attach function */
2585static int
2586filt_aioattach(struct knote *kn)
2587{
2588	struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_sdata;
2589
2590	/*
2591	 * The aiocbe pointer must be validated before using it, so
2592	 * registration is restricted to the kernel; the user cannot
2593	 * set EV_FLAG1.
2594	 */
2595	if ((kn->kn_flags & EV_FLAG1) == 0)
2596		return (EPERM);
2597	kn->kn_ptr.p_aio = aiocbe;
2598	kn->kn_flags &= ~EV_FLAG1;
2599
2600	knlist_add(&aiocbe->klist, kn, 0);
2601
2602	return (0);
2603}
2604
2605/* kqueue detach function */
2606static void
2607filt_aiodetach(struct knote *kn)
2608{
2609	struct knlist *knl;
2610
2611	knl = &kn->kn_ptr.p_aio->klist;
2612	knl->kl_lock(knl->kl_lockarg);
2613	if (!knlist_empty(knl))
2614		knlist_remove(knl, kn, 1);
2615	knl->kl_unlock(knl->kl_lockarg);
2616}
2617
2618/* kqueue filter function */
2619/*ARGSUSED*/
2620static int
2621filt_aio(struct knote *kn, long hint)
2622{
2623	struct aiocblist *aiocbe = kn->kn_ptr.p_aio;
2624
2625	kn->kn_data = aiocbe->uaiocb._aiocb_private.error;
2626	if (aiocbe->jobstate != JOBST_JOBFINISHED)
2627		return (0);
2628	kn->kn_flags |= EV_EOF;
2629	return (1);
2630}
2631
2632/* kqueue attach function */
2633static int
2634filt_lioattach(struct knote *kn)
2635{
2636	struct aioliojob * lj = (struct aioliojob *)kn->kn_sdata;
2637
2638	/*
2639	 * The aioliojob pointer must be validated before using it, so
2640	 * registration is restricted to the kernel; the user cannot
2641	 * set EV_FLAG1.
2642	 */
2643	if ((kn->kn_flags & EV_FLAG1) == 0)
2644		return (EPERM);
2645	kn->kn_ptr.p_lio = lj;
2646	kn->kn_flags &= ~EV_FLAG1;
2647
2648	knlist_add(&lj->klist, kn, 0);
2649
2650	return (0);
2651}
2652
2653/* kqueue detach function */
2654static void
2655filt_liodetach(struct knote *kn)
2656{
2657	struct knlist *knl;
2658
2659	knl = &kn->kn_ptr.p_lio->klist;
2660	knl->kl_lock(knl->kl_lockarg);
2661	if (!knlist_empty(knl))
2662		knlist_remove(knl, kn, 1);
2663	knl->kl_unlock(knl->kl_lockarg);
2664}
2665
2666/* kqueue filter function */
2667/*ARGSUSED*/
2668static int
2669filt_lio(struct knote *kn, long hint)
2670{
2671	struct aioliojob * lj = kn->kn_ptr.p_lio;
2672
2673	return (lj->lioj_flags & LIOJ_KEVENT_POSTED);
2674}
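/*
 * Illustrative userland usage of kevent-based completion notification
 * (a sketch; "kq" is a kqueue descriptor and error handling is omitted):
 *
 *	cb.aio_sigevent.sigev_notify = SIGEV_KEVENT;
 *	cb.aio_sigevent.sigev_notify_kqueue = kq;
 *	cb.aio_sigevent.sigev_value.sival_ptr = &cb;	(returned in udata)
 *	aio_read(&cb);
 *
 *	struct kevent ev;
 *	kevent(kq, NULL, 0, &ev, 1, NULL);
 *	... ev.filter == EVFILT_AIO and ev.ident holds the address of the
 *	    completed aiocb, so aio_return((struct aiocb *)ev.ident) reaps
 *	    it; lio_listio() posts EVFILT_LIO the same way when the whole
 *	    list completes ...
 */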
2675
2676#ifdef COMPAT_FREEBSD32
2677
2678struct __aiocb_private32 {
2679	int32_t	status;
2680	int32_t	error;
2681	uint32_t kernelinfo;
2682};
2683
2684typedef struct oaiocb32 {
2685	int	aio_fildes;		/* File descriptor */
2686	uint64_t aio_offset __packed;	/* File offset for I/O */
2687	uint32_t aio_buf;		/* I/O buffer in process space */
2688	uint32_t aio_nbytes;		/* Number of bytes for I/O */
2689	struct	osigevent32 aio_sigevent; /* Signal to deliver */
2690	int	aio_lio_opcode;		/* LIO opcode */
2691	int	aio_reqprio;		/* Request priority -- ignored */
2692	struct	__aiocb_private32 _aiocb_private;
2693} oaiocb32_t;
2694
2695typedef struct aiocb32 {
2696	int32_t	aio_fildes;		/* File descriptor */
2697	uint64_t aio_offset __packed;	/* File offset for I/O */
2698	uint32_t aio_buf;		/* I/O buffer in process space */
2699	uint32_t aio_nbytes;		/* Number of bytes for I/O */
2700	int	__spare__[2];
2701	uint32_t __spare2__;
2702	int	aio_lio_opcode;		/* LIO opcode */
2703	int	aio_reqprio;		/* Request priority -- ignored */
2704	struct __aiocb_private32 _aiocb_private;
2705	struct sigevent32 aio_sigevent;	/* Signal to deliver */
2706} aiocb32_t;
2707
2708static int
2709convert_old_sigevent32(struct osigevent32 *osig, struct sigevent *nsig)
2710{
2711
2712	/*
2713	 * Only SIGEV_NONE, SIGEV_SIGNAL, and SIGEV_KEVENT are
2714	 * supported by AIO with the old sigevent structure.
2715	 */
2716	CP(*osig, *nsig, sigev_notify);
2717	switch (nsig->sigev_notify) {
2718	case SIGEV_NONE:
2719		break;
2720	case SIGEV_SIGNAL:
2721		nsig->sigev_signo = osig->__sigev_u.__sigev_signo;
2722		break;
2723	case SIGEV_KEVENT:
2724		nsig->sigev_notify_kqueue =
2725		    osig->__sigev_u.__sigev_notify_kqueue;
2726		PTRIN_CP(*osig, *nsig, sigev_value.sival_ptr);
2727		break;
2728	default:
2729		return (EINVAL);
2730	}
2731	return (0);
2732}
2733
2734static int
2735aiocb32_copyin_old_sigevent(struct aiocb *ujob, struct aiocb *kjob)
2736{
2737	struct oaiocb32 job32;
2738	int error;
2739
2740	bzero(kjob, sizeof(struct aiocb));
2741	error = copyin(ujob, &job32, sizeof(job32));
2742	if (error)
2743		return (error);
2744
2745	CP(job32, *kjob, aio_fildes);
2746	CP(job32, *kjob, aio_offset);
2747	PTRIN_CP(job32, *kjob, aio_buf);
2748	CP(job32, *kjob, aio_nbytes);
2749	CP(job32, *kjob, aio_lio_opcode);
2750	CP(job32, *kjob, aio_reqprio);
2751	CP(job32, *kjob, _aiocb_private.status);
2752	CP(job32, *kjob, _aiocb_private.error);
2753	PTRIN_CP(job32, *kjob, _aiocb_private.kernelinfo);
2754	return (convert_old_sigevent32(&job32.aio_sigevent,
2755	    &kjob->aio_sigevent));
2756}
2757
2758static int
2759aiocb32_copyin(struct aiocb *ujob, struct aiocb *kjob)
2760{
2761	struct aiocb32 job32;
2762	int error;
2763
2764	error = copyin(ujob, &job32, sizeof(job32));
2765	if (error)
2766		return (error);
2767	CP(job32, *kjob, aio_fildes);
2768	CP(job32, *kjob, aio_offset);
2769	PTRIN_CP(job32, *kjob, aio_buf);
2770	CP(job32, *kjob, aio_nbytes);
2771	CP(job32, *kjob, aio_lio_opcode);
2772	CP(job32, *kjob, aio_reqprio);
2773	CP(job32, *kjob, _aiocb_private.status);
2774	CP(job32, *kjob, _aiocb_private.error);
2775	PTRIN_CP(job32, *kjob, _aiocb_private.kernelinfo);
2776	return (convert_sigevent32(&job32.aio_sigevent, &kjob->aio_sigevent));
2777}
2778
2779static long
2780aiocb32_fetch_status(struct aiocb *ujob)
2781{
2782	struct aiocb32 *ujob32;
2783
2784	ujob32 = (struct aiocb32 *)ujob;
2785	return (fuword32(&ujob32->_aiocb_private.status));
2786}
2787
2788static long
2789aiocb32_fetch_error(struct aiocb *ujob)
2790{
2791	struct aiocb32 *ujob32;
2792
2793	ujob32 = (struct aiocb32 *)ujob;
2794	return (fuword32(&ujob32->_aiocb_private.error));
2795}
2796
2797static int
2798aiocb32_store_status(struct aiocb *ujob, long status)
2799{
2800	struct aiocb32 *ujob32;
2801
2802	ujob32 = (struct aiocb32 *)ujob;
2803	return (suword32(&ujob32->_aiocb_private.status, status));
2804}
2805
2806static int
2807aiocb32_store_error(struct aiocb *ujob, long error)
2808{
2809	struct aiocb32 *ujob32;
2810
2811	ujob32 = (struct aiocb32 *)ujob;
2812	return (suword32(&ujob32->_aiocb_private.error, error));
2813}
2814
2815static int
2816aiocb32_store_kernelinfo(struct aiocb *ujob, long jobref)
2817{
2818	struct aiocb32 *ujob32;
2819
2820	ujob32 = (struct aiocb32 *)ujob;
2821	return (suword32(&ujob32->_aiocb_private.kernelinfo, jobref));
2822}
2823
2824static int
2825aiocb32_store_aiocb(struct aiocb **ujobp, struct aiocb *ujob)
2826{
2827
2828	return (suword32(ujobp, (long)ujob));
2829}
2830
2831static struct aiocb_ops aiocb32_ops = {
2832	.copyin = aiocb32_copyin,
2833	.fetch_status = aiocb32_fetch_status,
2834	.fetch_error = aiocb32_fetch_error,
2835	.store_status = aiocb32_store_status,
2836	.store_error = aiocb32_store_error,
2837	.store_kernelinfo = aiocb32_store_kernelinfo,
2838	.store_aiocb = aiocb32_store_aiocb,
2839};
2840
2841static struct aiocb_ops aiocb32_ops_osigevent = {
2842	.copyin = aiocb32_copyin_old_sigevent,
2843	.fetch_status = aiocb32_fetch_status,
2844	.fetch_error = aiocb32_fetch_error,
2845	.store_status = aiocb32_store_status,
2846	.store_error = aiocb32_store_error,
2847	.store_kernelinfo = aiocb32_store_kernelinfo,
2848	.store_aiocb = aiocb32_store_aiocb,
2849};
2850
2851int
2852freebsd32_aio_return(struct thread *td, struct freebsd32_aio_return_args *uap)
2853{
2854
2855	return (kern_aio_return(td, (struct aiocb *)uap->aiocbp, &aiocb32_ops));
2856}
2857
2858int
2859freebsd32_aio_suspend(struct thread *td, struct freebsd32_aio_suspend_args *uap)
2860{
2861	struct timespec32 ts32;
2862	struct timespec ts, *tsp;
2863	struct aiocb **ujoblist;
2864	uint32_t *ujoblist32;
2865	int error, i;
2866
2867	if (uap->nent < 0 || uap->nent > AIO_LISTIO_MAX)
2868		return (EINVAL);
2869
2870	if (uap->timeout) {
2871		/* Get timespec struct. */
2872		if ((error = copyin(uap->timeout, &ts32, sizeof(ts32))) != 0)
2873			return (error);
2874		CP(ts32, ts, tv_sec);
2875		CP(ts32, ts, tv_nsec);
2876		tsp = &ts;
2877	} else
2878		tsp = NULL;
2879
2880	ujoblist = uma_zalloc(aiol_zone, M_WAITOK);
2881	ujoblist32 = (uint32_t *)ujoblist;
2882	error = copyin(uap->aiocbp, ujoblist32, uap->nent *
2883	    sizeof(ujoblist32[0]));
2884	if (error == 0) {
2885		for (i = uap->nent - 1; i >= 0; i--)
2886			ujoblist[i] = PTRIN(ujoblist32[i]);
2887
2888		error = kern_aio_suspend(td, uap->nent, ujoblist, tsp);
2889	}
2890	uma_zfree(aiol_zone, ujoblist);
2891	return (error);
2892}
2893
2894int
2895freebsd32_aio_cancel(struct thread *td, struct freebsd32_aio_cancel_args *uap)
2896{
2897
2898	return (sys_aio_cancel(td, (struct aio_cancel_args *)uap));
2899}
2900
2901int
2902freebsd32_aio_error(struct thread *td, struct freebsd32_aio_error_args *uap)
2903{
2904
2905	return (kern_aio_error(td, (struct aiocb *)uap->aiocbp, &aiocb32_ops));
2906}
2907
2908int
2909freebsd32_oaio_read(struct thread *td, struct freebsd32_oaio_read_args *uap)
2910{
2911
2912	return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ,
2913	    &aiocb32_ops_osigevent));
2914}
2915
2916int
2917freebsd32_aio_read(struct thread *td, struct freebsd32_aio_read_args *uap)
2918{
2919
2920	return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ,
2921	    &aiocb32_ops));
2922}
2923
2924int
2925freebsd32_oaio_write(struct thread *td, struct freebsd32_oaio_write_args *uap)
2926{
2927
2928	return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE,
2929	    &aiocb32_ops_osigevent));
2930}
2931
2932int
2933freebsd32_aio_write(struct thread *td, struct freebsd32_aio_write_args *uap)
2934{
2935
2936	return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE,
2937	    &aiocb32_ops));
2938}
2939
2940int
2941freebsd32_aio_mlock(struct thread *td, struct freebsd32_aio_mlock_args *uap)
2942{
2943
2944	return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_MLOCK,
2945	    &aiocb32_ops));
2946}
2947
2948int
2949freebsd32_aio_waitcomplete(struct thread *td,
2950    struct freebsd32_aio_waitcomplete_args *uap)
2951{
2952	struct timespec32 ts32;
2953	struct timespec ts, *tsp;
2954	int error;
2955
2956	if (uap->timeout) {
2957		/* Get timespec struct. */
2958		error = copyin(uap->timeout, &ts32, sizeof(ts32));
2959		if (error)
2960			return (error);
2961		CP(ts32, ts, tv_sec);
2962		CP(ts32, ts, tv_nsec);
2963		tsp = &ts;
2964	} else
2965		tsp = NULL;
2966
2967	return (kern_aio_waitcomplete(td, (struct aiocb **)uap->aiocbp, tsp,
2968	    &aiocb32_ops));
2969}
2970
2971int
2972freebsd32_aio_fsync(struct thread *td, struct freebsd32_aio_fsync_args *uap)
2973{
2974
2975	return (kern_aio_fsync(td, uap->op, (struct aiocb *)uap->aiocbp,
2976	    &aiocb32_ops));
2977}
2978
2979int
2980freebsd32_olio_listio(struct thread *td, struct freebsd32_olio_listio_args *uap)
2981{
2982	struct aiocb **acb_list;
2983	struct sigevent *sigp, sig;
2984	struct osigevent32 osig;
2985	uint32_t *acb_list32;
2986	int error, i, nent;
2987
2988	if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
2989		return (EINVAL);
2990
2991	nent = uap->nent;
2992	if (nent < 0 || nent > AIO_LISTIO_MAX)
2993		return (EINVAL);
2994
2995	if (uap->sig && (uap->mode == LIO_NOWAIT)) {
2996		error = copyin(uap->sig, &osig, sizeof(osig));
2997		if (error)
2998			return (error);
2999		error = convert_old_sigevent32(&osig, &sig);
3000		if (error)
3001			return (error);
3002		sigp = &sig;
3003	} else
3004		sigp = NULL;
3005
3006	acb_list32 = malloc(sizeof(uint32_t) * nent, M_LIO, M_WAITOK);
3007	error = copyin(uap->acb_list, acb_list32, nent * sizeof(uint32_t));
3008	if (error) {
3009		free(acb_list32, M_LIO);
3010		return (error);
3011	}
3012	acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK);
3013	for (i = 0; i < nent; i++)
3014		acb_list[i] = PTRIN(acb_list32[i]);
3015	free(acb_list32, M_LIO);
3016
3017	error = kern_lio_listio(td, uap->mode,
3018	    (struct aiocb * const *)uap->acb_list, acb_list, nent, sigp,
3019	    &aiocb32_ops_osigevent);
3020	free(acb_list, M_LIO);
3021	return (error);
3022}
3023
3024int
3025freebsd32_lio_listio(struct thread *td, struct freebsd32_lio_listio_args *uap)
3026{
3027	struct aiocb **acb_list;
3028	struct sigevent *sigp, sig;
3029	struct sigevent32 sig32;
3030	uint32_t *acb_list32;
3031	int error, i, nent;
3032
3033	if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
3034		return (EINVAL);
3035
3036	nent = uap->nent;
3037	if (nent < 0 || nent > AIO_LISTIO_MAX)
3038		return (EINVAL);
3039
3040	if (uap->sig && (uap->mode == LIO_NOWAIT)) {
3041		error = copyin(uap->sig, &sig32, sizeof(sig32));
3042		if (error)
3043			return (error);
3044		error = convert_sigevent32(&sig32, &sig);
3045		if (error)
3046			return (error);
3047		sigp = &sig;
3048	} else
3049		sigp = NULL;
3050
3051	acb_list32 = malloc(sizeof(uint32_t) * nent, M_LIO, M_WAITOK);
3052	error = copyin(uap->acb_list, acb_list32, nent * sizeof(uint32_t));
3053	if (error) {
3054		free(acb_list32, M_LIO);
3055		return (error);
3056	}
3057	acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK);
3058	for (i = 0; i < nent; i++)
3059		acb_list[i] = PTRIN(acb_list32[i]);
3060	free(acb_list32, M_LIO);
3061
3062	error = kern_lio_listio(td, uap->mode,
3063	    (struct aiocb * const *)uap->acb_list, acb_list, nent, sigp,
3064	    &aiocb32_ops);
3065	free(acb_list, M_LIO);
3066	return (error);
3067}
3068
3069#endif
3070