1/*-
2 * Copyright (c) 1997 John S. Dyson.  All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 *    notice, this list of conditions and the following disclaimer.
9 * 2. John S. Dyson's name may not be used to endorse or promote products
10 *    derived from this software without specific prior written permission.
11 *
12 * DISCLAIMER:  This code isn't warranted to do anything useful.  Anything
13 * bad that happens because of using this software isn't the responsibility
14 * of the author.  This software is distributed AS-IS.
15 */
16
17/*
18 * This file contains support for the POSIX 1003.1B AIO/LIO facility.
19 */
20
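/*
 * A minimal userland sketch of the interface implemented here, for
 * orientation only and built against <aio.h> ("fd" is a hypothetical
 * descriptor opened by the caller): the application fills in a struct
 * aiocb, queues it with aio_read() or aio_write(), polls for completion
 * with aio_error(), and reaps the final status with aio_return(), which
 * also releases the kernel bookkeeping allocated below.
 *
 *	struct aiocb cb = { 0 };
 *	char buf[512];
 *
 *	cb.aio_fildes = fd;
 *	cb.aio_buf = buf;
 *	cb.aio_nbytes = sizeof(buf);
 *	cb.aio_offset = 0;
 *	if (aio_read(&cb) == -1)
 *		err(1, "aio_read");
 *	while (aio_error(&cb) == EINPROGRESS)
 *		usleep(1000);
 *	printf("read %zd bytes\n", aio_return(&cb));
 */
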
21#include <sys/cdefs.h>
22__FBSDID("$FreeBSD: releng/11.0/sys/kern/vfs_aio.c 303787 2016-08-05 22:23:04Z jhb $");
23
24#include "opt_compat.h"
25
26#include <sys/param.h>
27#include <sys/systm.h>
28#include <sys/malloc.h>
29#include <sys/bio.h>
30#include <sys/buf.h>
31#include <sys/capsicum.h>
32#include <sys/eventhandler.h>
33#include <sys/sysproto.h>
34#include <sys/filedesc.h>
35#include <sys/kernel.h>
36#include <sys/module.h>
37#include <sys/kthread.h>
38#include <sys/fcntl.h>
39#include <sys/file.h>
40#include <sys/limits.h>
41#include <sys/lock.h>
42#include <sys/mutex.h>
43#include <sys/unistd.h>
44#include <sys/posix4.h>
45#include <sys/proc.h>
46#include <sys/resourcevar.h>
47#include <sys/signalvar.h>
48#include <sys/protosw.h>
49#include <sys/rwlock.h>
50#include <sys/sema.h>
51#include <sys/socket.h>
52#include <sys/socketvar.h>
53#include <sys/syscall.h>
54#include <sys/sysent.h>
55#include <sys/sysctl.h>
56#include <sys/syslog.h>
57#include <sys/sx.h>
58#include <sys/taskqueue.h>
59#include <sys/vnode.h>
60#include <sys/conf.h>
61#include <sys/event.h>
62#include <sys/mount.h>
63#include <geom/geom.h>
64
65#include <machine/atomic.h>
66
67#include <vm/vm.h>
68#include <vm/vm_page.h>
69#include <vm/vm_extern.h>
70#include <vm/pmap.h>
71#include <vm/vm_map.h>
72#include <vm/vm_object.h>
73#include <vm/uma.h>
74#include <sys/aio.h>
75
76/*
77 * Counter for allocating reference ids to new jobs.  Wrapped to 1 on
78 * overflow. (XXX will be removed soon.)
79 */
80static u_long jobrefid;
81
82/*
83 * Job sequence number counter, used to order aio_fsync against earlier I/O.
84 */
85static uint64_t jobseqno;
86
87#ifndef MAX_AIO_PER_PROC
88#define MAX_AIO_PER_PROC	32
89#endif
90
91#ifndef MAX_AIO_QUEUE_PER_PROC
92#define MAX_AIO_QUEUE_PER_PROC	256 /* Bigger than AIO_LISTIO_MAX */
93#endif
94
95#ifndef MAX_AIO_QUEUE
96#define	MAX_AIO_QUEUE		1024 /* Bigger than AIO_LISTIO_MAX */
97#endif
98
99#ifndef MAX_BUF_AIO
100#define MAX_BUF_AIO		16
101#endif
102
103FEATURE(aio, "Asynchronous I/O");
104
105static MALLOC_DEFINE(M_LIO, "lio", "listio aio control block list");
106
107static SYSCTL_NODE(_vfs, OID_AUTO, aio, CTLFLAG_RW, 0,
108    "Async IO management");
109
110static int enable_aio_unsafe = 0;
111SYSCTL_INT(_vfs_aio, OID_AUTO, enable_unsafe, CTLFLAG_RW, &enable_aio_unsafe, 0,
112    "Permit asynchronous IO on all file types, not just known-safe types");
113
114static unsigned int unsafe_warningcnt = 1;
115SYSCTL_UINT(_vfs_aio, OID_AUTO, unsafe_warningcnt, CTLFLAG_RW,
116    &unsafe_warningcnt, 0,
117    "Warnings that will be triggered upon failed IO requests on unsafe files");
118
119static int max_aio_procs = MAX_AIO_PROCS;
120SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_procs, CTLFLAG_RW, &max_aio_procs, 0,
121    "Maximum number of kernel processes to use for handling async IO");
122
123static int num_aio_procs = 0;
124SYSCTL_INT(_vfs_aio, OID_AUTO, num_aio_procs, CTLFLAG_RD, &num_aio_procs, 0,
125    "Number of presently active kernel processes for async IO");
126
127/*
128 * The code will adjust the actual number of AIO processes towards this
129 * number when it gets a chance.
130 */
131static int target_aio_procs = TARGET_AIO_PROCS;
132SYSCTL_INT(_vfs_aio, OID_AUTO, target_aio_procs, CTLFLAG_RW, &target_aio_procs,
133    0,
134    "Preferred number of ready kernel processes for async IO");
135
136static int max_queue_count = MAX_AIO_QUEUE;
137SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue, CTLFLAG_RW, &max_queue_count, 0,
138    "Maximum number of aio requests to queue, globally");
139
140static int num_queue_count = 0;
141SYSCTL_INT(_vfs_aio, OID_AUTO, num_queue_count, CTLFLAG_RD, &num_queue_count, 0,
142    "Number of queued aio requests");
143
144static int num_buf_aio = 0;
145SYSCTL_INT(_vfs_aio, OID_AUTO, num_buf_aio, CTLFLAG_RD, &num_buf_aio, 0,
146    "Number of aio requests presently handled by the buf subsystem");
147
148/* Number of async I/O processes in the process of being started */
149/* XXX This should be local to aio_aqueue() */
150static int num_aio_resv_start = 0;
151
152static int aiod_lifetime;
153SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_lifetime, CTLFLAG_RW, &aiod_lifetime, 0,
154    "Maximum lifetime for idle aiod");
155
156static int max_aio_per_proc = MAX_AIO_PER_PROC;
157SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_per_proc, CTLFLAG_RW, &max_aio_per_proc,
158    0,
159    "Maximum active aio requests per process (stored in the process)");
160
161static int max_aio_queue_per_proc = MAX_AIO_QUEUE_PER_PROC;
162SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue_per_proc, CTLFLAG_RW,
163    &max_aio_queue_per_proc, 0,
164    "Maximum queued aio requests per process (stored in the process)");
165
166static int max_buf_aio = MAX_BUF_AIO;
167SYSCTL_INT(_vfs_aio, OID_AUTO, max_buf_aio, CTLFLAG_RW, &max_buf_aio, 0,
168    "Maximum buf aio requests per process (stored in the process)");
169
170#ifdef COMPAT_FREEBSD6
171typedef struct oaiocb {
172	int	aio_fildes;		/* File descriptor */
173	off_t	aio_offset;		/* File offset for I/O */
174	volatile void *aio_buf;         /* I/O buffer in process space */
175	size_t	aio_nbytes;		/* Number of bytes for I/O */
176	struct	osigevent aio_sigevent;	/* Signal to deliver */
177	int	aio_lio_opcode;		/* LIO opcode */
178	int	aio_reqprio;		/* Request priority -- ignored */
179	struct	__aiocb_private	_aiocb_private;
180} oaiocb_t;
181#endif
182
183/*
184 * Below is a key of locks used to protect each member of struct kaiocb
185 * aioliojob and kaioinfo and any backends.
186 *
187 * * - need not be protected
188 * a - locked by kaioinfo lock
189 * b - locked by the backend lock; the backend lock may be null in some
190 *     cases (for example, BIO requests), in which case the proc lock is
191 *     reused instead.
192 * c - locked by aio_job_mtx, the lock for the generic file I/O backend.
193 */
194
195/*
196 * If the routine that services an AIO request blocks while running in an
197 * AIO kernel process it can starve other I/O requests.  BIO requests
198 * queued via aio_qphysio() complete in GEOM and do not use AIO kernel
199 * processes at all.  Socket I/O requests use a separate pool of
200 * kprocs and also force non-blocking I/O.  Other file I/O requests
201 * use the generic fo_read/fo_write operations which can block.  The
202 * fsync and mlock operations can also block while executing.  Ideally
203 * none of these requests would block while executing.
204 *
205 * Note that the service routines cannot toggle O_NONBLOCK in the file
206 * structure directly while handling a request due to races with
207 * userland threads.
208 */
209
210/* jobflags */
211#define	KAIOCB_QUEUEING		0x01
212#define	KAIOCB_CANCELLED	0x02
213#define	KAIOCB_CANCELLING	0x04
214#define	KAIOCB_CHECKSYNC	0x08
215#define	KAIOCB_CLEARED		0x10
216#define	KAIOCB_FINISHED		0x20
217
218/*
219 * AIO process info
220 */
221#define AIOP_FREE	0x1			/* proc on free queue */
222
223struct aioproc {
224	int	aioprocflags;			/* (c) AIO proc flags */
225	TAILQ_ENTRY(aioproc) list;		/* (c) list of processes */
226	struct	proc *aioproc;			/* (*) the AIO proc */
227};
228
229/*
230 * data-structure for lio signal management
231 */
232struct aioliojob {
233	int	lioj_flags;			/* (a) listio flags */
234	int	lioj_count;			/* (a) count of jobs in this lio */
235	int	lioj_finished_count;		/* (a) count of finished jobs */
236	struct	sigevent lioj_signal;		/* (a) signal on all I/O done */
237	TAILQ_ENTRY(aioliojob) lioj_list;	/* (a) lio list */
238	struct	knlist klist;			/* (a) list of knotes */
239	ksiginfo_t lioj_ksi;			/* (a) Realtime signal info */
240};
241
242#define	LIOJ_SIGNAL		0x1	/* signal on all done (lio) */
243#define	LIOJ_SIGNAL_POSTED	0x2	/* signal has been posted */
244#define LIOJ_KEVENT_POSTED	0x4	/* kevent triggered */
245
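/*
 * A hedged userland sketch of the list I/O interface these structures back
 * ("cb0" and "cb1" are hypothetical control blocks whose aio_lio_opcode
 * selects LIO_READ, LIO_WRITE, or LIO_NOP): lio_listio() submits the batch
 * and, with LIO_NOWAIT plus a sigevent, requests a single notification once
 * every job in the batch has finished.
 *
 *	struct aiocb *iocbs[2] = { &cb0, &cb1 };
 *	struct sigevent sig;
 *
 *	memset(&sig, 0, sizeof(sig));
 *	sig.sigev_notify = SIGEV_SIGNAL;
 *	sig.sigev_signo = SIGUSR1;
 *	lio_listio(LIO_NOWAIT, iocbs, 2, &sig);
 */
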
246/*
247 * per process aio data structure
248 */
249struct kaioinfo {
250	struct	mtx kaio_mtx;		/* the lock to protect this struct */
251	int	kaio_flags;		/* (a) per process kaio flags */
252	int	kaio_maxactive_count;	/* (*) maximum number of AIOs */
253	int	kaio_active_count;	/* (c) number of currently used AIOs */
254	int	kaio_qallowed_count;	/* (*) maximum size of AIO queue */
255	int	kaio_count;		/* (a) size of AIO queue */
256	int	kaio_ballowed_count;	/* (*) maximum number of buffers */
257	int	kaio_buffer_count;	/* (a) number of physio buffers */
258	TAILQ_HEAD(,kaiocb) kaio_all;	/* (a) all AIOs in a process */
259	TAILQ_HEAD(,kaiocb) kaio_done;	/* (a) done queue for process */
260	TAILQ_HEAD(,aioliojob) kaio_liojoblist; /* (a) list of lio jobs */
261	TAILQ_HEAD(,kaiocb) kaio_jobqueue;	/* (a) job queue for process */
262	TAILQ_HEAD(,kaiocb) kaio_syncqueue;	/* (a) queue for aio_fsync */
263	TAILQ_HEAD(,kaiocb) kaio_syncready;  /* (a) second q for aio_fsync */
264	struct	task kaio_task;		/* (*) task to kick aio processes */
265	struct	task kaio_sync_task;	/* (*) task to schedule fsync jobs */
266};
267
268#define AIO_LOCK(ki)		mtx_lock(&(ki)->kaio_mtx)
269#define AIO_UNLOCK(ki)		mtx_unlock(&(ki)->kaio_mtx)
270#define AIO_LOCK_ASSERT(ki, f)	mtx_assert(&(ki)->kaio_mtx, (f))
271#define AIO_MTX(ki)		(&(ki)->kaio_mtx)
272
273#define KAIO_RUNDOWN	0x1	/* process is being run down */
274#define KAIO_WAKEUP	0x2	/* wakeup process when AIO completes */
275
276/*
277 * Operations used to interact with userland aio control blocks.
278 * Different ABIs provide their own operations.
279 */
280struct aiocb_ops {
281	int	(*copyin)(struct aiocb *ujob, struct aiocb *kjob);
282	long	(*fetch_status)(struct aiocb *ujob);
283	long	(*fetch_error)(struct aiocb *ujob);
284	int	(*store_status)(struct aiocb *ujob, long status);
285	int	(*store_error)(struct aiocb *ujob, long error);
286	int	(*store_kernelinfo)(struct aiocb *ujob, long jobref);
287	int	(*store_aiocb)(struct aiocb **ujobp, struct aiocb *ujob);
288};
289
290static TAILQ_HEAD(,aioproc) aio_freeproc;		/* (c) Idle daemons */
291static struct sema aio_newproc_sem;
292static struct mtx aio_job_mtx;
293static TAILQ_HEAD(,kaiocb) aio_jobs;			/* (c) Async job list */
294static struct unrhdr *aiod_unr;
295
296void		aio_init_aioinfo(struct proc *p);
297static int	aio_onceonly(void);
298static int	aio_free_entry(struct kaiocb *job);
299static void	aio_process_rw(struct kaiocb *job);
300static void	aio_process_sync(struct kaiocb *job);
301static void	aio_process_mlock(struct kaiocb *job);
302static void	aio_schedule_fsync(void *context, int pending);
303static int	aio_newproc(int *);
304int		aio_aqueue(struct thread *td, struct aiocb *ujob,
305		    struct aioliojob *lio, int type, struct aiocb_ops *ops);
306static int	aio_queue_file(struct file *fp, struct kaiocb *job);
307static void	aio_physwakeup(struct bio *bp);
308static void	aio_proc_rundown(void *arg, struct proc *p);
309static void	aio_proc_rundown_exec(void *arg, struct proc *p,
310		    struct image_params *imgp);
311static int	aio_qphysio(struct proc *p, struct kaiocb *job);
312static void	aio_daemon(void *param);
313static void	aio_bio_done_notify(struct proc *userp, struct kaiocb *job);
314static bool	aio_clear_cancel_function_locked(struct kaiocb *job);
315static int	aio_kick(struct proc *userp);
316static void	aio_kick_nowait(struct proc *userp);
317static void	aio_kick_helper(void *context, int pending);
318static int	filt_aioattach(struct knote *kn);
319static void	filt_aiodetach(struct knote *kn);
320static int	filt_aio(struct knote *kn, long hint);
321static int	filt_lioattach(struct knote *kn);
322static void	filt_liodetach(struct knote *kn);
323static int	filt_lio(struct knote *kn, long hint);
324
325/*
326 * Zones for:
327 * 	kaio	Per process async io info
328 *	aiop	async io process data
329 *	aiocb	async io jobs
330 *	aiol	list io job pointer - internal to aio_suspend XXX
331 *	aiolio	list io jobs
332 */
333static uma_zone_t kaio_zone, aiop_zone, aiocb_zone, aiol_zone, aiolio_zone;
334
335/* kqueue filters for aio */
336static struct filterops aio_filtops = {
337	.f_isfd = 0,
338	.f_attach = filt_aioattach,
339	.f_detach = filt_aiodetach,
340	.f_event = filt_aio,
341};
342static struct filterops lio_filtops = {
343	.f_isfd = 0,
344	.f_attach = filt_lioattach,
345	.f_detach = filt_liodetach,
346	.f_event = filt_lio
347};
348
349static eventhandler_tag exit_tag, exec_tag;
350
351TASKQUEUE_DEFINE_THREAD(aiod_kick);
352
353/*
354 * Main operations function for use as a kernel module.
355 */
356static int
357aio_modload(struct module *module, int cmd, void *arg)
358{
359	int error = 0;
360
361	switch (cmd) {
362	case MOD_LOAD:
363		aio_onceonly();
364		break;
365	case MOD_SHUTDOWN:
366		break;
367	default:
368		error = EOPNOTSUPP;
369		break;
370	}
371	return (error);
372}
373
374static moduledata_t aio_mod = {
375	"aio",
376	&aio_modload,
377	NULL
378};
379
380DECLARE_MODULE(aio, aio_mod, SI_SUB_VFS, SI_ORDER_ANY);
381MODULE_VERSION(aio, 1);
382
383/*
384 * Startup initialization
385 */
386static int
387aio_onceonly(void)
388{
389
390	exit_tag = EVENTHANDLER_REGISTER(process_exit, aio_proc_rundown, NULL,
391	    EVENTHANDLER_PRI_ANY);
392	exec_tag = EVENTHANDLER_REGISTER(process_exec, aio_proc_rundown_exec,
393	    NULL, EVENTHANDLER_PRI_ANY);
394	kqueue_add_filteropts(EVFILT_AIO, &aio_filtops);
395	kqueue_add_filteropts(EVFILT_LIO, &lio_filtops);
396	TAILQ_INIT(&aio_freeproc);
397	sema_init(&aio_newproc_sem, 0, "aio_new_proc");
398	mtx_init(&aio_job_mtx, "aio_job", NULL, MTX_DEF);
399	TAILQ_INIT(&aio_jobs);
400	aiod_unr = new_unrhdr(1, INT_MAX, NULL);
401	kaio_zone = uma_zcreate("AIO", sizeof(struct kaioinfo), NULL, NULL,
402	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
403	aiop_zone = uma_zcreate("AIOP", sizeof(struct aioproc), NULL,
404	    NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
405	aiocb_zone = uma_zcreate("AIOCB", sizeof(struct kaiocb), NULL, NULL,
406	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
407	aiol_zone = uma_zcreate("AIOL", AIO_LISTIO_MAX * sizeof(intptr_t), NULL,
408	    NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
409	aiolio_zone = uma_zcreate("AIOLIO", sizeof(struct aioliojob), NULL,
410	    NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
411	aiod_lifetime = AIOD_LIFETIME_DEFAULT;
412	jobrefid = 1;
413	p31b_setcfg(CTL_P1003_1B_ASYNCHRONOUS_IO, _POSIX_ASYNCHRONOUS_IO);
414	p31b_setcfg(CTL_P1003_1B_AIO_LISTIO_MAX, AIO_LISTIO_MAX);
415	p31b_setcfg(CTL_P1003_1B_AIO_MAX, MAX_AIO_QUEUE);
416	p31b_setcfg(CTL_P1003_1B_AIO_PRIO_DELTA_MAX, 0);
417
418	return (0);
419}
420
421/*
422 * Init the per-process aioinfo structure.  The aioinfo limits are set
423 * per-process for user limit (resource) management.
424 */
425void
426aio_init_aioinfo(struct proc *p)
427{
428	struct kaioinfo *ki;
429
430	ki = uma_zalloc(kaio_zone, M_WAITOK);
431	mtx_init(&ki->kaio_mtx, "aiomtx", NULL, MTX_DEF | MTX_NEW);
432	ki->kaio_flags = 0;
433	ki->kaio_maxactive_count = max_aio_per_proc;
434	ki->kaio_active_count = 0;
435	ki->kaio_qallowed_count = max_aio_queue_per_proc;
436	ki->kaio_count = 0;
437	ki->kaio_ballowed_count = max_buf_aio;
438	ki->kaio_buffer_count = 0;
439	TAILQ_INIT(&ki->kaio_all);
440	TAILQ_INIT(&ki->kaio_done);
441	TAILQ_INIT(&ki->kaio_jobqueue);
442	TAILQ_INIT(&ki->kaio_liojoblist);
443	TAILQ_INIT(&ki->kaio_syncqueue);
444	TAILQ_INIT(&ki->kaio_syncready);
445	TASK_INIT(&ki->kaio_task, 0, aio_kick_helper, p);
446	TASK_INIT(&ki->kaio_sync_task, 0, aio_schedule_fsync, ki);
447	PROC_LOCK(p);
448	if (p->p_aioinfo == NULL) {
449		p->p_aioinfo = ki;
450		PROC_UNLOCK(p);
451	} else {
452		PROC_UNLOCK(p);
453		mtx_destroy(&ki->kaio_mtx);
454		uma_zfree(kaio_zone, ki);
455	}
456
457	while (num_aio_procs < MIN(target_aio_procs, max_aio_procs))
458		aio_newproc(NULL);
459}
460
461static int
462aio_sendsig(struct proc *p, struct sigevent *sigev, ksiginfo_t *ksi)
463{
464	struct thread *td;
465	int error;
466
467	error = sigev_findtd(p, sigev, &td);
468	if (error)
469		return (error);
470	if (!KSI_ONQ(ksi)) {
471		ksiginfo_set_sigev(ksi, sigev);
472		ksi->ksi_code = SI_ASYNCIO;
473		ksi->ksi_flags |= KSI_EXT | KSI_INS;
474		tdsendsignal(p, td, ksi->ksi_signo, ksi);
475	}
476	PROC_UNLOCK(p);
477	return (error);
478}
479
480/*
481 * Free a completed job entry.  The job must already have finished; the
482 * caller holds the kaioinfo lock, which is dropped and reacquired while
483 * the file reference, credentials, and the entry itself are released.
484 */
485static int
486aio_free_entry(struct kaiocb *job)
487{
488	struct kaioinfo *ki;
489	struct aioliojob *lj;
490	struct proc *p;
491
492	p = job->userproc;
493	MPASS(curproc == p);
494	ki = p->p_aioinfo;
495	MPASS(ki != NULL);
496
497	AIO_LOCK_ASSERT(ki, MA_OWNED);
498	MPASS(job->jobflags & KAIOCB_FINISHED);
499
500	atomic_subtract_int(&num_queue_count, 1);
501
502	ki->kaio_count--;
503	MPASS(ki->kaio_count >= 0);
504
505	TAILQ_REMOVE(&ki->kaio_done, job, plist);
506	TAILQ_REMOVE(&ki->kaio_all, job, allist);
507
508	lj = job->lio;
509	if (lj) {
510		lj->lioj_count--;
511		lj->lioj_finished_count--;
512
513		if (lj->lioj_count == 0) {
514			TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
515			/* lio is going away, we need to destroy any knotes */
516			knlist_delete(&lj->klist, curthread, 1);
517			PROC_LOCK(p);
518			sigqueue_take(&lj->lioj_ksi);
519			PROC_UNLOCK(p);
520			uma_zfree(aiolio_zone, lj);
521		}
522	}
523
524	/* job is going away, we need to destroy any knotes */
525	knlist_delete(&job->klist, curthread, 1);
526	PROC_LOCK(p);
527	sigqueue_take(&job->ksi);
528	PROC_UNLOCK(p);
529
530	AIO_UNLOCK(ki);
531
532	/*
533	 * The thread argument here is used to find the owning process
534	 * and is also passed to fo_close() which may pass it to various
535	 * places such as devsw close() routines.  Because of that, we
536	 * need a thread pointer from the process owning the job that is
537	 * persistent and won't disappear out from under us or move to
538	 * another process.
539	 *
540	 * Currently, all the callers of this function call it to remove
541	 * a kaiocb from the current process' job list either via a
542	 * syscall or due to the current process calling exit() or
543	 * execve().  Thus, we know that p == curproc.  We also know that
544	 * curthread can't exit since we are curthread.
545	 *
546	 * Therefore, we use curthread as the thread to pass to
547	 * knlist_delete().  This does mean that it is possible for the
548	 * thread pointer at close time to differ from the thread pointer
549	 * at open time, but this is already true of file descriptors in
550	 * a multithreaded process.
551	 */
552	if (job->fd_file)
553		fdrop(job->fd_file, curthread);
554	crfree(job->cred);
555	uma_zfree(aiocb_zone, job);
556	AIO_LOCK(ki);
557
558	return (0);
559}
560
561static void
562aio_proc_rundown_exec(void *arg, struct proc *p,
563    struct image_params *imgp __unused)
564{
565	aio_proc_rundown(arg, p);
566}
567
568static int
569aio_cancel_job(struct proc *p, struct kaioinfo *ki, struct kaiocb *job)
570{
571	aio_cancel_fn_t *func;
572	int cancelled;
573
574	AIO_LOCK_ASSERT(ki, MA_OWNED);
575	if (job->jobflags & (KAIOCB_CANCELLED | KAIOCB_FINISHED))
576		return (0);
577	MPASS((job->jobflags & KAIOCB_CANCELLING) == 0);
578	job->jobflags |= KAIOCB_CANCELLED;
579
580	func = job->cancel_fn;
581
582	/*
583	 * If there is no cancel routine, just leave the job marked as
584	 * cancelled.  The job should be in active use by a caller who
585	 * should complete it normally or when it fails to install a
586	 * cancel routine.
587	 */
588	if (func == NULL)
589		return (0);
590
591	/*
592	 * Set the CANCELLING flag so that aio_complete() will defer
593	 * completions of this job.  This prevents the job from being
594	 * freed out from under the cancel callback.  After the
595	 * callback any deferred completion (whether from the callback
596	 * or any other source) will be completed.
597	 */
598	job->jobflags |= KAIOCB_CANCELLING;
599	AIO_UNLOCK(ki);
600	func(job);
601	AIO_LOCK(ki);
602	job->jobflags &= ~KAIOCB_CANCELLING;
603	if (job->jobflags & KAIOCB_FINISHED) {
604		cancelled = job->uaiocb._aiocb_private.error == ECANCELED;
605		TAILQ_REMOVE(&ki->kaio_jobqueue, job, plist);
606		aio_bio_done_notify(p, job);
607	} else {
608		/*
609		 * The cancel callback might have scheduled an
610		 * operation to cancel this request, but it is
611		 * only counted as cancelled if the request is
612		 * cancelled when the callback returns.
613		 */
614		cancelled = 0;
615	}
616	return (cancelled);
617}
618
619/*
620 * Rundown the jobs for a given process.
621 */
622static void
623aio_proc_rundown(void *arg, struct proc *p)
624{
625	struct kaioinfo *ki;
626	struct aioliojob *lj;
627	struct kaiocb *job, *jobn;
628
629	KASSERT(curthread->td_proc == p,
630	    ("%s: called on non-curproc", __func__));
631	ki = p->p_aioinfo;
632	if (ki == NULL)
633		return;
634
635	AIO_LOCK(ki);
636	ki->kaio_flags |= KAIO_RUNDOWN;
637
638restart:
639
640	/*
641	 * Try to cancel all pending requests. This code simulates
642	 * aio_cancel on all pending I/O requests.
643	 */
644	TAILQ_FOREACH_SAFE(job, &ki->kaio_jobqueue, plist, jobn) {
645		aio_cancel_job(p, ki, job);
646	}
647
648	/* Wait for all running I/O to be finished */
649	if (TAILQ_FIRST(&ki->kaio_jobqueue) || ki->kaio_active_count != 0) {
650		ki->kaio_flags |= KAIO_WAKEUP;
651		msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO, "aioprn", hz);
652		goto restart;
653	}
654
655	/* Free all completed I/O requests. */
656	while ((job = TAILQ_FIRST(&ki->kaio_done)) != NULL)
657		aio_free_entry(job);
658
659	while ((lj = TAILQ_FIRST(&ki->kaio_liojoblist)) != NULL) {
660		if (lj->lioj_count == 0) {
661			TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
662			knlist_delete(&lj->klist, curthread, 1);
663			PROC_LOCK(p);
664			sigqueue_take(&lj->lioj_ksi);
665			PROC_UNLOCK(p);
666			uma_zfree(aiolio_zone, lj);
667		} else {
668			panic("LIO job not cleaned up: C:%d, FC:%d\n",
669			    lj->lioj_count, lj->lioj_finished_count);
670		}
671	}
672	AIO_UNLOCK(ki);
673	taskqueue_drain(taskqueue_aiod_kick, &ki->kaio_task);
674	taskqueue_drain(taskqueue_aiod_kick, &ki->kaio_sync_task);
675	mtx_destroy(&ki->kaio_mtx);
676	uma_zfree(kaio_zone, ki);
677	p->p_aioinfo = NULL;
678}
679
680/*
681 * Select a job to run (called by an AIO daemon).
682 */
683static struct kaiocb *
684aio_selectjob(struct aioproc *aiop)
685{
686	struct kaiocb *job;
687	struct kaioinfo *ki;
688	struct proc *userp;
689
690	mtx_assert(&aio_job_mtx, MA_OWNED);
691restart:
692	TAILQ_FOREACH(job, &aio_jobs, list) {
693		userp = job->userproc;
694		ki = userp->p_aioinfo;
695
696		if (ki->kaio_active_count < ki->kaio_maxactive_count) {
697			TAILQ_REMOVE(&aio_jobs, job, list);
698			if (!aio_clear_cancel_function(job))
699				goto restart;
700
701			/* Account for currently active jobs. */
702			ki->kaio_active_count++;
703			break;
704		}
705	}
706	return (job);
707}
708
709/*
710 * Move all data to a permanent storage device.  This code
711 * simulates the fsync syscall.
712 */
713static int
714aio_fsync_vnode(struct thread *td, struct vnode *vp)
715{
716	struct mount *mp;
717	int error;
718
719	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
720		goto drop;
721	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
722	if (vp->v_object != NULL) {
723		VM_OBJECT_WLOCK(vp->v_object);
724		vm_object_page_clean(vp->v_object, 0, 0, 0);
725		VM_OBJECT_WUNLOCK(vp->v_object);
726	}
727	error = VOP_FSYNC(vp, MNT_WAIT, td);
728
729	VOP_UNLOCK(vp, 0);
730	vn_finished_write(mp);
731drop:
732	return (error);
733}
734
735/*
736 * The AIO processing activity for LIO_READ/LIO_WRITE.  This is the code that
737 * does the I/O request for the non-physio version of the operations.  The
738 * normal vn operations are used, and this code should work in all instances
739 * for every type of file, including pipes, sockets, fifos, and regular files.
740 *
741 * XXX I don't think it works well for sockets, pipes, and fifos.
742 */
743static void
744aio_process_rw(struct kaiocb *job)
745{
746	struct ucred *td_savedcred;
747	struct thread *td;
748	struct aiocb *cb;
749	struct file *fp;
750	struct uio auio;
751	struct iovec aiov;
752	ssize_t cnt;
753	long msgsnd_st, msgsnd_end;
754	long msgrcv_st, msgrcv_end;
755	long oublock_st, oublock_end;
756	long inblock_st, inblock_end;
757	int error;
758
759	KASSERT(job->uaiocb.aio_lio_opcode == LIO_READ ||
760	    job->uaiocb.aio_lio_opcode == LIO_WRITE,
761	    ("%s: opcode %d", __func__, job->uaiocb.aio_lio_opcode));
762
763	aio_switch_vmspace(job);
764	td = curthread;
765	td_savedcred = td->td_ucred;
766	td->td_ucred = job->cred;
767	cb = &job->uaiocb;
768	fp = job->fd_file;
769
770	aiov.iov_base = (void *)(uintptr_t)cb->aio_buf;
771	aiov.iov_len = cb->aio_nbytes;
772
773	auio.uio_iov = &aiov;
774	auio.uio_iovcnt = 1;
775	auio.uio_offset = cb->aio_offset;
776	auio.uio_resid = cb->aio_nbytes;
777	cnt = cb->aio_nbytes;
778	auio.uio_segflg = UIO_USERSPACE;
779	auio.uio_td = td;
780
781	msgrcv_st = td->td_ru.ru_msgrcv;
782	msgsnd_st = td->td_ru.ru_msgsnd;
783	inblock_st = td->td_ru.ru_inblock;
784	oublock_st = td->td_ru.ru_oublock;
785
786	/*
787	 * aio_aqueue() acquires a reference to the file that is
788	 * released in aio_free_entry().
789	 */
790	if (cb->aio_lio_opcode == LIO_READ) {
791		auio.uio_rw = UIO_READ;
792		if (auio.uio_resid == 0)
793			error = 0;
794		else
795			error = fo_read(fp, &auio, fp->f_cred, FOF_OFFSET, td);
796	} else {
797		if (fp->f_type == DTYPE_VNODE)
798			bwillwrite();
799		auio.uio_rw = UIO_WRITE;
800		error = fo_write(fp, &auio, fp->f_cred, FOF_OFFSET, td);
801	}
802	msgrcv_end = td->td_ru.ru_msgrcv;
803	msgsnd_end = td->td_ru.ru_msgsnd;
804	inblock_end = td->td_ru.ru_inblock;
805	oublock_end = td->td_ru.ru_oublock;
806
807	job->msgrcv = msgrcv_end - msgrcv_st;
808	job->msgsnd = msgsnd_end - msgsnd_st;
809	job->inblock = inblock_end - inblock_st;
810	job->outblock = oublock_end - oublock_st;
811
812	if ((error) && (auio.uio_resid != cnt)) {
813		if (error == ERESTART || error == EINTR || error == EWOULDBLOCK)
814			error = 0;
815		if ((error == EPIPE) && (cb->aio_lio_opcode == LIO_WRITE)) {
816			PROC_LOCK(job->userproc);
817			kern_psignal(job->userproc, SIGPIPE);
818			PROC_UNLOCK(job->userproc);
819		}
820	}
821
822	cnt -= auio.uio_resid;
823	td->td_ucred = td_savedcred;
824	if (error)
825		aio_complete(job, -1, error);
826	else
827		aio_complete(job, cnt, 0);
828}
829
830static void
831aio_process_sync(struct kaiocb *job)
832{
833	struct thread *td = curthread;
834	struct ucred *td_savedcred = td->td_ucred;
835	struct file *fp = job->fd_file;
836	int error = 0;
837
838	KASSERT(job->uaiocb.aio_lio_opcode == LIO_SYNC,
839	    ("%s: opcode %d", __func__, job->uaiocb.aio_lio_opcode));
840
841	td->td_ucred = job->cred;
842	if (fp->f_vnode != NULL)
843		error = aio_fsync_vnode(td, fp->f_vnode);
844	td->td_ucred = td_savedcred;
845	if (error)
846		aio_complete(job, -1, error);
847	else
848		aio_complete(job, 0, 0);
849}
850
851static void
852aio_process_mlock(struct kaiocb *job)
853{
854	struct aiocb *cb = &job->uaiocb;
855	int error;
856
857	KASSERT(job->uaiocb.aio_lio_opcode == LIO_MLOCK,
858	    ("%s: opcode %d", __func__, job->uaiocb.aio_lio_opcode));
859
860	aio_switch_vmspace(job);
861	error = vm_mlock(job->userproc, job->cred,
862	    __DEVOLATILE(void *, cb->aio_buf), cb->aio_nbytes);
863	if (error)
864		aio_complete(job, -1, error);
865	else
866		aio_complete(job, 0, 0);
867}
868
869static void
870aio_bio_done_notify(struct proc *userp, struct kaiocb *job)
871{
872	struct aioliojob *lj;
873	struct kaioinfo *ki;
874	struct kaiocb *sjob, *sjobn;
875	int lj_done;
876	bool schedule_fsync;
877
878	ki = userp->p_aioinfo;
879	AIO_LOCK_ASSERT(ki, MA_OWNED);
880	lj = job->lio;
881	lj_done = 0;
882	if (lj) {
883		lj->lioj_finished_count++;
884		if (lj->lioj_count == lj->lioj_finished_count)
885			lj_done = 1;
886	}
887	TAILQ_INSERT_TAIL(&ki->kaio_done, job, plist);
888	MPASS(job->jobflags & KAIOCB_FINISHED);
889
890	if (ki->kaio_flags & KAIO_RUNDOWN)
891		goto notification_done;
892
893	if (job->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL ||
894	    job->uaiocb.aio_sigevent.sigev_notify == SIGEV_THREAD_ID)
895		aio_sendsig(userp, &job->uaiocb.aio_sigevent, &job->ksi);
896
897	KNOTE_LOCKED(&job->klist, 1);
898
899	if (lj_done) {
900		if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) {
901			lj->lioj_flags |= LIOJ_KEVENT_POSTED;
902			KNOTE_LOCKED(&lj->klist, 1);
903		}
904		if ((lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED))
905		    == LIOJ_SIGNAL
906		    && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL ||
907		        lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID)) {
908			aio_sendsig(userp, &lj->lioj_signal, &lj->lioj_ksi);
909			lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
910		}
911	}
912
913notification_done:
914	if (job->jobflags & KAIOCB_CHECKSYNC) {
915		schedule_fsync = false;
916		TAILQ_FOREACH_SAFE(sjob, &ki->kaio_syncqueue, list, sjobn) {
917			if (job->fd_file != sjob->fd_file ||
918			    job->seqno >= sjob->seqno)
919				continue;
920			if (--sjob->pending > 0)
921				continue;
922			TAILQ_REMOVE(&ki->kaio_syncqueue, sjob, list);
923			if (!aio_clear_cancel_function_locked(sjob))
924				continue;
925			TAILQ_INSERT_TAIL(&ki->kaio_syncready, sjob, list);
926			schedule_fsync = true;
927		}
928		if (schedule_fsync)
929			taskqueue_enqueue(taskqueue_aiod_kick,
930			    &ki->kaio_sync_task);
931	}
932	if (ki->kaio_flags & KAIO_WAKEUP) {
933		ki->kaio_flags &= ~KAIO_WAKEUP;
934		wakeup(&userp->p_aioinfo);
935	}
936}
937
938static void
939aio_schedule_fsync(void *context, int pending)
940{
941	struct kaioinfo *ki;
942	struct kaiocb *job;
943
944	ki = context;
945	AIO_LOCK(ki);
946	while (!TAILQ_EMPTY(&ki->kaio_syncready)) {
947		job = TAILQ_FIRST(&ki->kaio_syncready);
948		TAILQ_REMOVE(&ki->kaio_syncready, job, list);
949		AIO_UNLOCK(ki);
950		aio_schedule(job, aio_process_sync);
951		AIO_LOCK(ki);
952	}
953	AIO_UNLOCK(ki);
954}
955
956bool
957aio_cancel_cleared(struct kaiocb *job)
958{
959	struct kaioinfo *ki;
960
961	/*
962	 * The caller should hold the same queue lock that was held when
963	 * aio_clear_cancel_function() was called and set this flag,
964	 * ensuring this check sees an up-to-date value.  However,
965	 * there is no way to assert that.
966	 */
967	ki = job->userproc->p_aioinfo;
968	return ((job->jobflags & KAIOCB_CLEARED) != 0);
969}
970
971static bool
972aio_clear_cancel_function_locked(struct kaiocb *job)
973{
974
975	AIO_LOCK_ASSERT(job->userproc->p_aioinfo, MA_OWNED);
976	MPASS(job->cancel_fn != NULL);
977	if (job->jobflags & KAIOCB_CANCELLING) {
978		job->jobflags |= KAIOCB_CLEARED;
979		return (false);
980	}
981	job->cancel_fn = NULL;
982	return (true);
983}
984
985bool
986aio_clear_cancel_function(struct kaiocb *job)
987{
988	struct kaioinfo *ki;
989	bool ret;
990
991	ki = job->userproc->p_aioinfo;
992	AIO_LOCK(ki);
993	ret = aio_clear_cancel_function_locked(job);
994	AIO_UNLOCK(ki);
995	return (ret);
996}
997
998static bool
999aio_set_cancel_function_locked(struct kaiocb *job, aio_cancel_fn_t *func)
1000{
1001
1002	AIO_LOCK_ASSERT(job->userproc->p_aioinfo, MA_OWNED);
1003	if (job->jobflags & KAIOCB_CANCELLED)
1004		return (false);
1005	job->cancel_fn = func;
1006	return (true);
1007}
1008
1009bool
1010aio_set_cancel_function(struct kaiocb *job, aio_cancel_fn_t *func)
1011{
1012	struct kaioinfo *ki;
1013	bool ret;
1014
1015	ki = job->userproc->p_aioinfo;
1016	AIO_LOCK(ki);
1017	ret = aio_set_cancel_function_locked(job, func);
1018	AIO_UNLOCK(ki);
1019	return (ret);
1020}
1021
1022void
1023aio_complete(struct kaiocb *job, long status, int error)
1024{
1025	struct kaioinfo *ki;
1026	struct proc *userp;
1027
1028	job->uaiocb._aiocb_private.error = error;
1029	job->uaiocb._aiocb_private.status = status;
1030
1031	userp = job->userproc;
1032	ki = userp->p_aioinfo;
1033
1034	AIO_LOCK(ki);
1035	KASSERT(!(job->jobflags & KAIOCB_FINISHED),
1036	    ("duplicate aio_complete"));
1037	job->jobflags |= KAIOCB_FINISHED;
1038	if ((job->jobflags & (KAIOCB_QUEUEING | KAIOCB_CANCELLING)) == 0) {
1039		TAILQ_REMOVE(&ki->kaio_jobqueue, job, plist);
1040		aio_bio_done_notify(userp, job);
1041	}
1042	AIO_UNLOCK(ki);
1043}
1044
1045void
1046aio_cancel(struct kaiocb *job)
1047{
1048
1049	aio_complete(job, -1, ECANCELED);
1050}
1051
1052void
1053aio_switch_vmspace(struct kaiocb *job)
1054{
1055
1056	vmspace_switch_aio(job->userproc->p_vmspace);
1057}
1058
1059/*
1060 * The AIO daemon.  Most of the actual work is done in aio_process_*,
1061 * but the setup (and address space management) is done in this routine.
1062 */
1063static void
1064aio_daemon(void *_id)
1065{
1066	struct kaiocb *job;
1067	struct aioproc *aiop;
1068	struct kaioinfo *ki;
1069	struct proc *p;
1070	struct vmspace *myvm;
1071	struct thread *td = curthread;
1072	int id = (intptr_t)_id;
1073
1074	/*
1075	 * Grab an extra reference on the daemon's vmspace so that it
1076	 * doesn't get freed by jobs that switch to a different
1077	 * vmspace.
1078	 */
1079	p = td->td_proc;
1080	myvm = vmspace_acquire_ref(p);
1081
1082	KASSERT(p->p_textvp == NULL, ("kthread has a textvp"));
1083
1084	/*
1085	 * Allocate and ready the aio control info.  There is one aiop structure
1086	 * per daemon.
1087	 */
1088	aiop = uma_zalloc(aiop_zone, M_WAITOK);
1089	aiop->aioproc = p;
1090	aiop->aioprocflags = 0;
1091
1092	/*
1093	 * Wakeup parent process.  (Parent sleeps to keep from blasting away
1094	 * and creating too many daemons.)
1095	 */
1096	sema_post(&aio_newproc_sem);
1097
1098	mtx_lock(&aio_job_mtx);
1099	for (;;) {
1100		/*
1101		 * Take daemon off of free queue
1102		 */
1103		if (aiop->aioprocflags & AIOP_FREE) {
1104			TAILQ_REMOVE(&aio_freeproc, aiop, list);
1105			aiop->aioprocflags &= ~AIOP_FREE;
1106		}
1107
1108		/*
1109		 * Check for jobs.
1110		 */
1111		while ((job = aio_selectjob(aiop)) != NULL) {
1112			mtx_unlock(&aio_job_mtx);
1113
1114			ki = job->userproc->p_aioinfo;
1115			job->handle_fn(job);
1116
1117			mtx_lock(&aio_job_mtx);
1118			/* Decrement the active job count. */
1119			ki->kaio_active_count--;
1120		}
1121
1122		/*
1123		 * Disconnect from user address space.
1124		 */
1125		if (p->p_vmspace != myvm) {
1126			mtx_unlock(&aio_job_mtx);
1127			vmspace_switch_aio(myvm);
1128			mtx_lock(&aio_job_mtx);
1129			/*
1130			 * We have to restart to avoid a race; we only sleep
1131			 * if no job can be selected.
1132			 */
1133			continue;
1134		}
1135
1136		mtx_assert(&aio_job_mtx, MA_OWNED);
1137
1138		TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);
1139		aiop->aioprocflags |= AIOP_FREE;
1140
1141		/*
1142		 * If daemon is inactive for a long time, allow it to exit,
1143		 * thereby freeing resources.
1144		 */
1145		if (msleep(p, &aio_job_mtx, PRIBIO, "aiordy",
1146		    aiod_lifetime) == EWOULDBLOCK && TAILQ_EMPTY(&aio_jobs) &&
1147		    (aiop->aioprocflags & AIOP_FREE) &&
1148		    num_aio_procs > target_aio_procs)
1149			break;
1150	}
1151	TAILQ_REMOVE(&aio_freeproc, aiop, list);
1152	num_aio_procs--;
1153	mtx_unlock(&aio_job_mtx);
1154	uma_zfree(aiop_zone, aiop);
1155	free_unr(aiod_unr, id);
1156	vmspace_free(myvm);
1157
1158	KASSERT(p->p_vmspace == myvm,
1159	    ("AIOD: bad vmspace for exiting daemon"));
1160	KASSERT(myvm->vm_refcnt > 1,
1161	    ("AIOD: bad vm refcnt for exiting daemon: %d", myvm->vm_refcnt));
1162	kproc_exit(0);
1163}
1164
1165/*
1166 * Create a new AIO daemon. This is mostly a kernel-thread fork routine. The
1167 * AIO daemon modifies its environment itself.
1168 */
1169static int
1170aio_newproc(int *start)
1171{
1172	int error;
1173	struct proc *p;
1174	int id;
1175
1176	id = alloc_unr(aiod_unr);
1177	error = kproc_create(aio_daemon, (void *)(intptr_t)id, &p,
1178		RFNOWAIT, 0, "aiod%d", id);
1179	if (error == 0) {
1180		/*
1181		 * Wait until daemon is started.
1182		 */
1183		sema_wait(&aio_newproc_sem);
1184		mtx_lock(&aio_job_mtx);
1185		num_aio_procs++;
1186		if (start != NULL)
1187			(*start)--;
1188		mtx_unlock(&aio_job_mtx);
1189	} else {
1190		free_unr(aiod_unr, id);
1191	}
1192	return (error);
1193}
1194
1195/*
1196 * Try the high-performance, low-overhead physio method for eligible
1197 * VCHR devices.  This method doesn't use an aio helper thread, and
1198 * thus has very low overhead.
1199 *
1200 * Assumes that the caller, aio_aqueue(), has incremented the file
1201 * structure's reference count, preventing its deallocation for the
1202 * duration of this call.
1203 */
1204static int
1205aio_qphysio(struct proc *p, struct kaiocb *job)
1206{
1207	struct aiocb *cb;
1208	struct file *fp;
1209	struct bio *bp;
1210	struct buf *pbuf;
1211	struct vnode *vp;
1212	struct cdevsw *csw;
1213	struct cdev *dev;
1214	struct kaioinfo *ki;
1215	int error, ref, poff;
1216	vm_prot_t prot;
1217
1218	cb = &job->uaiocb;
1219	fp = job->fd_file;
1220
1221	if (fp == NULL || fp->f_type != DTYPE_VNODE)
1222		return (-1);
1223
1224	vp = fp->f_vnode;
1225	if (vp->v_type != VCHR)
1226		return (-1);
1227	if (vp->v_bufobj.bo_bsize == 0)
1228		return (-1);
1229	if (cb->aio_nbytes % vp->v_bufobj.bo_bsize)
1230		return (-1);
1231
1232	ref = 0;
1233	csw = devvn_refthread(vp, &dev, &ref);
1234	if (csw == NULL)
1235		return (ENXIO);
1236
1237	if ((csw->d_flags & D_DISK) == 0) {
1238		error = -1;
1239		goto unref;
1240	}
1241	if (cb->aio_nbytes > dev->si_iosize_max) {
1242		error = -1;
1243		goto unref;
1244	}
1245
1246	ki = p->p_aioinfo;
1247	poff = (vm_offset_t)cb->aio_buf & PAGE_MASK;
1248	if ((dev->si_flags & SI_UNMAPPED) && unmapped_buf_allowed) {
1249		if (cb->aio_nbytes > MAXPHYS) {
1250			error = -1;
1251			goto unref;
1252		}
1253
1254		pbuf = NULL;
1255	} else {
1256		if (cb->aio_nbytes > MAXPHYS - poff) {
1257			error = -1;
1258			goto unref;
1259		}
1260		if (ki->kaio_buffer_count >= ki->kaio_ballowed_count) {
1261			error = -1;
1262			goto unref;
1263		}
1264
1265		job->pbuf = pbuf = (struct buf *)getpbuf(NULL);
1266		BUF_KERNPROC(pbuf);
1267		AIO_LOCK(ki);
1268		ki->kaio_buffer_count++;
1269		AIO_UNLOCK(ki);
1270	}
1271	job->bp = bp = g_alloc_bio();
1272
1273	bp->bio_length = cb->aio_nbytes;
1274	bp->bio_bcount = cb->aio_nbytes;
1275	bp->bio_done = aio_physwakeup;
1276	bp->bio_data = (void *)(uintptr_t)cb->aio_buf;
1277	bp->bio_offset = cb->aio_offset;
1278	bp->bio_cmd = cb->aio_lio_opcode == LIO_WRITE ? BIO_WRITE : BIO_READ;
1279	bp->bio_dev = dev;
1280	bp->bio_caller1 = (void *)job;
1281
1282	prot = VM_PROT_READ;
1283	if (cb->aio_lio_opcode == LIO_READ)
1284		prot |= VM_PROT_WRITE;	/* Less backwards than it looks */
1285	job->npages = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map,
1286	    (vm_offset_t)bp->bio_data, bp->bio_length, prot, job->pages,
1287	    nitems(job->pages));
1288	if (job->npages < 0) {
1289		error = EFAULT;
1290		goto doerror;
1291	}
1292	if (pbuf != NULL) {
1293		pmap_qenter((vm_offset_t)pbuf->b_data,
1294		    job->pages, job->npages);
1295		bp->bio_data = pbuf->b_data + poff;
1296		atomic_add_int(&num_buf_aio, 1);
1297	} else {
1298		bp->bio_ma = job->pages;
1299		bp->bio_ma_n = job->npages;
1300		bp->bio_ma_offset = poff;
1301		bp->bio_data = unmapped_buf;
1302		bp->bio_flags |= BIO_UNMAPPED;
1303	}
1304
1305	/* Perform transfer. */
1306	csw->d_strategy(bp);
1307	dev_relthread(dev, ref);
1308	return (0);
1309
1310doerror:
1311	if (pbuf != NULL) {
1312		AIO_LOCK(ki);
1313		ki->kaio_buffer_count--;
1314		AIO_UNLOCK(ki);
1315		relpbuf(pbuf, NULL);
1316		job->pbuf = NULL;
1317	}
1318	g_destroy_bio(bp);
1319	job->bp = NULL;
1320unref:
1321	dev_relthread(dev, ref);
1322	return (error);
1323}
1324
1325#ifdef COMPAT_FREEBSD6
1326static int
1327convert_old_sigevent(struct osigevent *osig, struct sigevent *nsig)
1328{
1329
1330	/*
1331	 * Only SIGEV_NONE, SIGEV_SIGNAL, and SIGEV_KEVENT are
1332	 * supported by AIO with the old sigevent structure.
1333	 */
1334	nsig->sigev_notify = osig->sigev_notify;
1335	switch (nsig->sigev_notify) {
1336	case SIGEV_NONE:
1337		break;
1338	case SIGEV_SIGNAL:
1339		nsig->sigev_signo = osig->__sigev_u.__sigev_signo;
1340		break;
1341	case SIGEV_KEVENT:
1342		nsig->sigev_notify_kqueue =
1343		    osig->__sigev_u.__sigev_notify_kqueue;
1344		nsig->sigev_value.sival_ptr = osig->sigev_value.sival_ptr;
1345		break;
1346	default:
1347		return (EINVAL);
1348	}
1349	return (0);
1350}
1351
1352static int
1353aiocb_copyin_old_sigevent(struct aiocb *ujob, struct aiocb *kjob)
1354{
1355	struct oaiocb *ojob;
1356	int error;
1357
1358	bzero(kjob, sizeof(struct aiocb));
1359	error = copyin(ujob, kjob, sizeof(struct oaiocb));
1360	if (error)
1361		return (error);
1362	ojob = (struct oaiocb *)kjob;
1363	return (convert_old_sigevent(&ojob->aio_sigevent, &kjob->aio_sigevent));
1364}
1365#endif
1366
1367static int
1368aiocb_copyin(struct aiocb *ujob, struct aiocb *kjob)
1369{
1370
1371	return (copyin(ujob, kjob, sizeof(struct aiocb)));
1372}
1373
1374static long
1375aiocb_fetch_status(struct aiocb *ujob)
1376{
1377
1378	return (fuword(&ujob->_aiocb_private.status));
1379}
1380
1381static long
1382aiocb_fetch_error(struct aiocb *ujob)
1383{
1384
1385	return (fuword(&ujob->_aiocb_private.error));
1386}
1387
1388static int
1389aiocb_store_status(struct aiocb *ujob, long status)
1390{
1391
1392	return (suword(&ujob->_aiocb_private.status, status));
1393}
1394
1395static int
1396aiocb_store_error(struct aiocb *ujob, long error)
1397{
1398
1399	return (suword(&ujob->_aiocb_private.error, error));
1400}
1401
1402static int
1403aiocb_store_kernelinfo(struct aiocb *ujob, long jobref)
1404{
1405
1406	return (suword(&ujob->_aiocb_private.kernelinfo, jobref));
1407}
1408
1409static int
1410aiocb_store_aiocb(struct aiocb **ujobp, struct aiocb *ujob)
1411{
1412
1413	return (suword(ujobp, (long)ujob));
1414}
1415
1416static struct aiocb_ops aiocb_ops = {
1417	.copyin = aiocb_copyin,
1418	.fetch_status = aiocb_fetch_status,
1419	.fetch_error = aiocb_fetch_error,
1420	.store_status = aiocb_store_status,
1421	.store_error = aiocb_store_error,
1422	.store_kernelinfo = aiocb_store_kernelinfo,
1423	.store_aiocb = aiocb_store_aiocb,
1424};
1425
1426#ifdef COMPAT_FREEBSD6
1427static struct aiocb_ops aiocb_ops_osigevent = {
1428	.copyin = aiocb_copyin_old_sigevent,
1429	.fetch_status = aiocb_fetch_status,
1430	.fetch_error = aiocb_fetch_error,
1431	.store_status = aiocb_store_status,
1432	.store_error = aiocb_store_error,
1433	.store_kernelinfo = aiocb_store_kernelinfo,
1434	.store_aiocb = aiocb_store_aiocb,
1435};
1436#endif
1437
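/*
 * A hedged sketch of the SIGEV_KEVENT notification path handled by
 * aio_aqueue() below ("kq", "fd", and "buf" are hypothetical objects
 * created by the caller): userland points the control block at a kqueue
 * and later reaps the completion as an EVFILT_AIO event whose ident is
 * the aiocb pointer and whose udata is the supplied sival_ptr.
 *
 *	struct aiocb cb = { 0 };
 *	struct kevent ev;
 *
 *	cb.aio_fildes = fd;
 *	cb.aio_buf = buf;
 *	cb.aio_nbytes = sizeof(buf);
 *	cb.aio_sigevent.sigev_notify = SIGEV_KEVENT;
 *	cb.aio_sigevent.sigev_notify_kqueue = kq;
 *	cb.aio_sigevent.sigev_value.sival_ptr = &cb;
 *	aio_read(&cb);
 *	kevent(kq, NULL, 0, &ev, 1, NULL);
 *	aio_return((struct aiocb *)ev.ident);
 */
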
1438/*
1439 * Queue a new AIO request.  The choice between the threaded path and the
1440 * direct physio VCHR path is made here.
1441 */
1442int
1443aio_aqueue(struct thread *td, struct aiocb *ujob, struct aioliojob *lj,
1444	int type, struct aiocb_ops *ops)
1445{
1446	struct proc *p = td->td_proc;
1447	cap_rights_t rights;
1448	struct file *fp;
1449	struct kaiocb *job;
1450	struct kaioinfo *ki;
1451	struct kevent kev;
1452	int opcode;
1453	int error;
1454	int fd, kqfd;
1455	int jid;
1456	u_short evflags;
1457
1458	if (p->p_aioinfo == NULL)
1459		aio_init_aioinfo(p);
1460
1461	ki = p->p_aioinfo;
1462
1463	ops->store_status(ujob, -1);
1464	ops->store_error(ujob, 0);
1465	ops->store_kernelinfo(ujob, -1);
1466
1467	if (num_queue_count >= max_queue_count ||
1468	    ki->kaio_count >= ki->kaio_qallowed_count) {
1469		ops->store_error(ujob, EAGAIN);
1470		return (EAGAIN);
1471	}
1472
1473	job = uma_zalloc(aiocb_zone, M_WAITOK | M_ZERO);
1474	knlist_init_mtx(&job->klist, AIO_MTX(ki));
1475
1476	error = ops->copyin(ujob, &job->uaiocb);
1477	if (error) {
1478		ops->store_error(ujob, error);
1479		uma_zfree(aiocb_zone, job);
1480		return (error);
1481	}
1482
1483	if (job->uaiocb.aio_nbytes > IOSIZE_MAX) {
1484		uma_zfree(aiocb_zone, job);
1485		return (EINVAL);
1486	}
1487
1488	if (job->uaiocb.aio_sigevent.sigev_notify != SIGEV_KEVENT &&
1489	    job->uaiocb.aio_sigevent.sigev_notify != SIGEV_SIGNAL &&
1490	    job->uaiocb.aio_sigevent.sigev_notify != SIGEV_THREAD_ID &&
1491	    job->uaiocb.aio_sigevent.sigev_notify != SIGEV_NONE) {
1492		ops->store_error(ujob, EINVAL);
1493		uma_zfree(aiocb_zone, job);
1494		return (EINVAL);
1495	}
1496
1497	if ((job->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL ||
1498	     job->uaiocb.aio_sigevent.sigev_notify == SIGEV_THREAD_ID) &&
1499		!_SIG_VALID(job->uaiocb.aio_sigevent.sigev_signo)) {
1500		uma_zfree(aiocb_zone, job);
1501		return (EINVAL);
1502	}
1503
1504	ksiginfo_init(&job->ksi);
1505
1506	/* Save userspace address of the job info. */
1507	job->ujob = ujob;
1508
1509	/* Get the opcode. */
1510	if (type != LIO_NOP)
1511		job->uaiocb.aio_lio_opcode = type;
1512	opcode = job->uaiocb.aio_lio_opcode;
1513
1514	/*
1515	 * Validate the opcode and fetch the file object for the specified
1516	 * file descriptor.
1517	 *
1518	 * XXXRW: Moved the opcode validation up here so that we don't
1519	 * retrieve a file descriptor without knowing what the capability
1520	 * should be.
1521	 */
1522	fd = job->uaiocb.aio_fildes;
1523	switch (opcode) {
1524	case LIO_WRITE:
1525		error = fget_write(td, fd,
1526		    cap_rights_init(&rights, CAP_PWRITE), &fp);
1527		break;
1528	case LIO_READ:
1529		error = fget_read(td, fd,
1530		    cap_rights_init(&rights, CAP_PREAD), &fp);
1531		break;
1532	case LIO_SYNC:
1533		error = fget(td, fd, cap_rights_init(&rights, CAP_FSYNC), &fp);
1534		break;
1535	case LIO_MLOCK:
1536		fp = NULL;
1537		break;
1538	case LIO_NOP:
1539		error = fget(td, fd, cap_rights_init(&rights), &fp);
1540		break;
1541	default:
1542		error = EINVAL;
1543	}
1544	if (error) {
1545		uma_zfree(aiocb_zone, job);
1546		ops->store_error(ujob, error);
1547		return (error);
1548	}
1549
1550	if (opcode == LIO_SYNC && fp->f_vnode == NULL) {
1551		error = EINVAL;
1552		goto aqueue_fail;
1553	}
1554
1555	if (opcode != LIO_SYNC && job->uaiocb.aio_offset == -1LL) {
1556		error = EINVAL;
1557		goto aqueue_fail;
1558	}
1559
1560	job->fd_file = fp;
1561
1562	mtx_lock(&aio_job_mtx);
1563	jid = jobrefid++;
1564	job->seqno = jobseqno++;
1565	mtx_unlock(&aio_job_mtx);
1566	error = ops->store_kernelinfo(ujob, jid);
1567	if (error) {
1568		error = EINVAL;
1569		goto aqueue_fail;
1570	}
1571	job->uaiocb._aiocb_private.kernelinfo = (void *)(intptr_t)jid;
1572
1573	if (opcode == LIO_NOP) {
1574		fdrop(fp, td);
1575		uma_zfree(aiocb_zone, job);
1576		return (0);
1577	}
1578
1579	if (job->uaiocb.aio_sigevent.sigev_notify != SIGEV_KEVENT)
1580		goto no_kqueue;
1581	evflags = job->uaiocb.aio_sigevent.sigev_notify_kevent_flags;
1582	if ((evflags & ~(EV_CLEAR | EV_DISPATCH | EV_ONESHOT)) != 0) {
1583		error = EINVAL;
1584		goto aqueue_fail;
1585	}
1586	kqfd = job->uaiocb.aio_sigevent.sigev_notify_kqueue;
1587	kev.ident = (uintptr_t)job->ujob;
1588	kev.filter = EVFILT_AIO;
1589	kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1 | evflags;
1590	kev.data = (intptr_t)job;
1591	kev.udata = job->uaiocb.aio_sigevent.sigev_value.sival_ptr;
1592	error = kqfd_register(kqfd, &kev, td, 1);
1593	if (error)
1594		goto aqueue_fail;
1595
1596no_kqueue:
1597
1598	ops->store_error(ujob, EINPROGRESS);
1599	job->uaiocb._aiocb_private.error = EINPROGRESS;
1600	job->userproc = p;
1601	job->cred = crhold(td->td_ucred);
1602	job->jobflags = KAIOCB_QUEUEING;
1603	job->lio = lj;
1604
1605	if (opcode == LIO_MLOCK) {
1606		aio_schedule(job, aio_process_mlock);
1607		error = 0;
1608	} else if (fp->f_ops->fo_aio_queue == NULL)
1609		error = aio_queue_file(fp, job);
1610	else
1611		error = fo_aio_queue(fp, job);
1612	if (error)
1613		goto aqueue_fail;
1614
1615	AIO_LOCK(ki);
1616	job->jobflags &= ~KAIOCB_QUEUEING;
1617	TAILQ_INSERT_TAIL(&ki->kaio_all, job, allist);
1618	ki->kaio_count++;
1619	if (lj)
1620		lj->lioj_count++;
1621	atomic_add_int(&num_queue_count, 1);
1622	if (job->jobflags & KAIOCB_FINISHED) {
1623		/*
1624		 * The queue callback completed the request synchronously.
1625		 * The bulk of the completion is deferred in that case
1626		 * until this point.
1627		 */
1628		aio_bio_done_notify(p, job);
1629	} else
1630		TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, job, plist);
1631	AIO_UNLOCK(ki);
1632	return (0);
1633
1634aqueue_fail:
1635	knlist_delete(&job->klist, curthread, 0);
1636	if (fp)
1637		fdrop(fp, td);
1638	uma_zfree(aiocb_zone, job);
1639	ops->store_error(ujob, error);
1640	return (error);
1641}
1642
1643static void
1644aio_cancel_daemon_job(struct kaiocb *job)
1645{
1646
1647	mtx_lock(&aio_job_mtx);
1648	if (!aio_cancel_cleared(job))
1649		TAILQ_REMOVE(&aio_jobs, job, list);
1650	mtx_unlock(&aio_job_mtx);
1651	aio_cancel(job);
1652}
1653
1654void
1655aio_schedule(struct kaiocb *job, aio_handle_fn_t *func)
1656{
1657
1658	mtx_lock(&aio_job_mtx);
1659	if (!aio_set_cancel_function(job, aio_cancel_daemon_job)) {
1660		mtx_unlock(&aio_job_mtx);
1661		aio_cancel(job);
1662		return;
1663	}
1664	job->handle_fn = func;
1665	TAILQ_INSERT_TAIL(&aio_jobs, job, list);
1666	aio_kick_nowait(job->userproc);
1667	mtx_unlock(&aio_job_mtx);
1668}
1669
1670static void
1671aio_cancel_sync(struct kaiocb *job)
1672{
1673	struct kaioinfo *ki;
1674
1675	ki = job->userproc->p_aioinfo;
1676	AIO_LOCK(ki);
1677	if (!aio_cancel_cleared(job))
1678		TAILQ_REMOVE(&ki->kaio_syncqueue, job, list);
1679	AIO_UNLOCK(ki);
1680	aio_cancel(job);
1681}
1682
1683int
1684aio_queue_file(struct file *fp, struct kaiocb *job)
1685{
1686	struct aioliojob *lj;
1687	struct kaioinfo *ki;
1688	struct kaiocb *job2;
1689	struct vnode *vp;
1690	struct mount *mp;
1691	int error, opcode;
1692	bool safe;
1693
1694	lj = job->lio;
1695	ki = job->userproc->p_aioinfo;
1696	opcode = job->uaiocb.aio_lio_opcode;
1697	if (opcode == LIO_SYNC)
1698		goto queueit;
1699
1700	if ((error = aio_qphysio(job->userproc, job)) == 0)
1701		goto done;
1702#if 0
1703	/*
1704	 * XXX: This means qphysio() failed with EFAULT.  The current
1705	 * behavior is to retry the operation via fo_read/fo_write.
1706	 * Wouldn't it be better to just complete the request with an
1707	 * error here?
1708	 */
1709	if (error > 0)
1710		goto done;
1711#endif
1712queueit:
1713	safe = false;
1714	if (fp->f_type == DTYPE_VNODE) {
1715		vp = fp->f_vnode;
1716		if (vp->v_type == VREG || vp->v_type == VDIR) {
1717			mp = fp->f_vnode->v_mount;
1718			if (mp == NULL || (mp->mnt_flag & MNT_LOCAL) != 0)
1719				safe = true;
1720		}
1721	}
1722	if (!(safe || enable_aio_unsafe)) {
1723		counted_warning(&unsafe_warningcnt,
1724		    "is attempting to use unsafe AIO requests");
1725		return (EOPNOTSUPP);
1726	}
1727
1728	if (opcode == LIO_SYNC) {
1729		AIO_LOCK(ki);
1730		TAILQ_FOREACH(job2, &ki->kaio_jobqueue, plist) {
1731			if (job2->fd_file == job->fd_file &&
1732			    job2->uaiocb.aio_lio_opcode != LIO_SYNC &&
1733			    job2->seqno < job->seqno) {
1734				job2->jobflags |= KAIOCB_CHECKSYNC;
1735				job->pending++;
1736			}
1737		}
1738		if (job->pending != 0) {
1739			if (!aio_set_cancel_function_locked(job,
1740				aio_cancel_sync)) {
1741				AIO_UNLOCK(ki);
1742				aio_cancel(job);
1743				return (0);
1744			}
1745			TAILQ_INSERT_TAIL(&ki->kaio_syncqueue, job, list);
1746			AIO_UNLOCK(ki);
1747			return (0);
1748		}
1749		AIO_UNLOCK(ki);
1750	}
1751
1752	switch (opcode) {
1753	case LIO_READ:
1754	case LIO_WRITE:
1755		aio_schedule(job, aio_process_rw);
1756		error = 0;
1757		break;
1758	case LIO_SYNC:
1759		aio_schedule(job, aio_process_sync);
1760		error = 0;
1761		break;
1762	default:
1763		error = EINVAL;
1764	}
1765done:
1766	return (error);
1767}
1768
1769static void
1770aio_kick_nowait(struct proc *userp)
1771{
1772	struct kaioinfo *ki = userp->p_aioinfo;
1773	struct aioproc *aiop;
1774
1775	mtx_assert(&aio_job_mtx, MA_OWNED);
1776	if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) {
1777		TAILQ_REMOVE(&aio_freeproc, aiop, list);
1778		aiop->aioprocflags &= ~AIOP_FREE;
1779		wakeup(aiop->aioproc);
1780	} else if (num_aio_resv_start + num_aio_procs < max_aio_procs &&
1781	    ki->kaio_active_count + num_aio_resv_start <
1782	    ki->kaio_maxactive_count) {
1783		taskqueue_enqueue(taskqueue_aiod_kick, &ki->kaio_task);
1784	}
1785}
1786
1787static int
1788aio_kick(struct proc *userp)
1789{
1790	struct kaioinfo *ki = userp->p_aioinfo;
1791	struct aioproc *aiop;
1792	int error, ret = 0;
1793
1794	mtx_assert(&aio_job_mtx, MA_OWNED);
1795retryproc:
1796	if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) {
1797		TAILQ_REMOVE(&aio_freeproc, aiop, list);
1798		aiop->aioprocflags &= ~AIOP_FREE;
1799		wakeup(aiop->aioproc);
1800	} else if (num_aio_resv_start + num_aio_procs < max_aio_procs &&
1801	    ki->kaio_active_count + num_aio_resv_start <
1802	    ki->kaio_maxactive_count) {
1803		num_aio_resv_start++;
1804		mtx_unlock(&aio_job_mtx);
1805		error = aio_newproc(&num_aio_resv_start);
1806		mtx_lock(&aio_job_mtx);
1807		if (error) {
1808			num_aio_resv_start--;
1809			goto retryproc;
1810		}
1811	} else {
1812		ret = -1;
1813	}
1814	return (ret);
1815}
1816
1817static void
1818aio_kick_helper(void *context, int pending)
1819{
1820	struct proc *userp = context;
1821
1822	mtx_lock(&aio_job_mtx);
1823	while (--pending >= 0) {
1824		if (aio_kick(userp))
1825			break;
1826	}
1827	mtx_unlock(&aio_job_mtx);
1828}
1829
1830/*
1831 * Support the aio_return system call; as a side effect, kernel resources
1832 * are released.
1833 */
1834static int
1835kern_aio_return(struct thread *td, struct aiocb *ujob, struct aiocb_ops *ops)
1836{
1837	struct proc *p = td->td_proc;
1838	struct kaiocb *job;
1839	struct kaioinfo *ki;
1840	long status, error;
1841
1842	ki = p->p_aioinfo;
1843	if (ki == NULL)
1844		return (EINVAL);
1845	AIO_LOCK(ki);
1846	TAILQ_FOREACH(job, &ki->kaio_done, plist) {
1847		if (job->ujob == ujob)
1848			break;
1849	}
1850	if (job != NULL) {
1851		MPASS(job->jobflags & KAIOCB_FINISHED);
1852		status = job->uaiocb._aiocb_private.status;
1853		error = job->uaiocb._aiocb_private.error;
1854		td->td_retval[0] = status;
1855		td->td_ru.ru_oublock += job->outblock;
1856		td->td_ru.ru_inblock += job->inblock;
1857		td->td_ru.ru_msgsnd += job->msgsnd;
1858		td->td_ru.ru_msgrcv += job->msgrcv;
1859		aio_free_entry(job);
1860		AIO_UNLOCK(ki);
1861		ops->store_error(ujob, error);
1862		ops->store_status(ujob, status);
1863	} else {
1864		error = EINVAL;
1865		AIO_UNLOCK(ki);
1866	}
1867	return (error);
1868}
1869
1870int
1871sys_aio_return(struct thread *td, struct aio_return_args *uap)
1872{
1873
1874	return (kern_aio_return(td, uap->aiocbp, &aiocb_ops));
1875}
1876
1877/*
1878 * Allow a process to wake up when any of the I/O requests are completed.
1879 */
1880static int
1881kern_aio_suspend(struct thread *td, int njoblist, struct aiocb **ujoblist,
1882    struct timespec *ts)
1883{
1884	struct proc *p = td->td_proc;
1885	struct timeval atv;
1886	struct kaioinfo *ki;
1887	struct kaiocb *firstjob, *job;
1888	int error, i, timo;
1889
1890	timo = 0;
1891	if (ts) {
1892		if (ts->tv_nsec < 0 || ts->tv_nsec >= 1000000000)
1893			return (EINVAL);
1894
1895		TIMESPEC_TO_TIMEVAL(&atv, ts);
1896		if (itimerfix(&atv))
1897			return (EINVAL);
1898		timo = tvtohz(&atv);
1899	}
1900
1901	ki = p->p_aioinfo;
1902	if (ki == NULL)
1903		return (EAGAIN);
1904
1905	if (njoblist == 0)
1906		return (0);
1907
1908	AIO_LOCK(ki);
1909	for (;;) {
1910		firstjob = NULL;
1911		error = 0;
1912		TAILQ_FOREACH(job, &ki->kaio_all, allist) {
1913			for (i = 0; i < njoblist; i++) {
1914				if (job->ujob == ujoblist[i]) {
1915					if (firstjob == NULL)
1916						firstjob = job;
1917					if (job->jobflags & KAIOCB_FINISHED)
1918						goto RETURN;
1919				}
1920			}
1921		}
1922		/* All tasks were finished. */
1923		if (firstjob == NULL)
1924			break;
1925
1926		ki->kaio_flags |= KAIO_WAKEUP;
1927		error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH,
1928		    "aiospn", timo);
1929		if (error == ERESTART)
1930			error = EINTR;
1931		if (error)
1932			break;
1933	}
1934RETURN:
1935	AIO_UNLOCK(ki);
1936	return (error);
1937}
1938
1939int
1940sys_aio_suspend(struct thread *td, struct aio_suspend_args *uap)
1941{
1942	struct timespec ts, *tsp;
1943	struct aiocb **ujoblist;
1944	int error;
1945
1946	if (uap->nent < 0 || uap->nent > AIO_LISTIO_MAX)
1947		return (EINVAL);
1948
1949	if (uap->timeout) {
1950		/* Get timespec struct. */
1951		if ((error = copyin(uap->timeout, &ts, sizeof(ts))) != 0)
1952			return (error);
1953		tsp = &ts;
1954	} else
1955		tsp = NULL;
1956
1957	ujoblist = uma_zalloc(aiol_zone, M_WAITOK);
1958	error = copyin(uap->aiocbp, ujoblist, uap->nent * sizeof(ujoblist[0]));
1959	if (error == 0)
1960		error = kern_aio_suspend(td, uap->nent, ujoblist, tsp);
1961	uma_zfree(aiol_zone, ujoblist);
1962	return (error);
1963}
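
/*
 * Illustrative userland sketch (not part of the kernel source): aio_suspend(2)
 * blocks until at least one of the listed requests has completed or the
 * timeout expires, mirroring the msleep() loop in kern_aio_suspend() above.
 * Both aiocbs are assumed to have been queued already.
 *
 *	#include <aio.h>
 *	#include <errno.h>
 *	#include <time.h>
 *
 *	int
 *	wait_for_one(const struct aiocb *a, const struct aiocb *b)
 *	{
 *		const struct aiocb *list[2] = { a, b };
 *		struct timespec to = { .tv_sec = 5, .tv_nsec = 0 };
 *
 *		if (aio_suspend(list, 2, &to) == -1)
 *			return (errno);	// EAGAIN on timeout, EINTR on signal
 *		return (0);		// at least one listed request finished
 *	}
 */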
1964
1965/*
1966 * aio_cancel cancels any non-physio aio operations not currently in
1967 * progress.
1968 */
1969int
1970sys_aio_cancel(struct thread *td, struct aio_cancel_args *uap)
1971{
1972	struct proc *p = td->td_proc;
1973	struct kaioinfo *ki;
1974	struct kaiocb *job, *jobn;
1975	struct file *fp;
1976	cap_rights_t rights;
1977	int error;
1978	int cancelled = 0;
1979	int notcancelled = 0;
1980	struct vnode *vp;
1981
1982	/* Lookup file object. */
1983	error = fget(td, uap->fd, cap_rights_init(&rights), &fp);
1984	if (error)
1985		return (error);
1986
1987	ki = p->p_aioinfo;
1988	if (ki == NULL)
1989		goto done;
1990
1991	if (fp->f_type == DTYPE_VNODE) {
1992		vp = fp->f_vnode;
1993		if (vn_isdisk(vp, &error)) {
1994			fdrop(fp, td);
1995			td->td_retval[0] = AIO_NOTCANCELED;
1996			return (0);
1997		}
1998	}
1999
2000	AIO_LOCK(ki);
2001	TAILQ_FOREACH_SAFE(job, &ki->kaio_jobqueue, plist, jobn) {
2002		if ((uap->fd == job->uaiocb.aio_fildes) &&
2003		    ((uap->aiocbp == NULL) ||
2004		     (uap->aiocbp == job->ujob))) {
2005			if (aio_cancel_job(p, ki, job)) {
2006				cancelled++;
2007			} else {
2008				notcancelled++;
2009			}
2010			if (uap->aiocbp != NULL)
2011				break;
2012		}
2013	}
2014	AIO_UNLOCK(ki);
2015
2016done:
2017	fdrop(fp, td);
2018
2019	if (uap->aiocbp != NULL) {
2020		if (cancelled) {
2021			td->td_retval[0] = AIO_CANCELED;
2022			return (0);
2023		}
2024	}
2025
2026	if (notcancelled) {
2027		td->td_retval[0] = AIO_NOTCANCELED;
2028		return (0);
2029	}
2030
2031	if (cancelled) {
2032		td->td_retval[0] = AIO_CANCELED;
2033		return (0);
2034	}
2035
2036	td->td_retval[0] = AIO_ALLDONE;
2037
2038	return (0);
2039}
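
/*
 * Illustrative userland sketch (not part of the kernel source): the return
 * value of aio_cancel(2) distinguishes the three cases handled above.  "fd"
 * and "cb" are assumed to describe an already-queued request; passing NULL
 * for the aiocb asks to cancel every pending request on the descriptor.
 *
 *	#include <aio.h>
 *	#include <stdio.h>
 *
 *	void
 *	try_cancel(int fd, struct aiocb *cb)
 *	{
 *		switch (aio_cancel(fd, cb)) {
 *		case AIO_CANCELED:
 *			printf("request(s) cancelled\n");
 *			break;
 *		case AIO_NOTCANCELED:
 *			printf("still in progress; poll aio_error()\n");
 *			break;
 *		case AIO_ALLDONE:
 *			printf("already completed; reap with aio_return()\n");
 *			break;
 *		}
 *	}
 */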
2040
2041/*
2042 * aio_error is implemented at the kernel level for compatibility purposes
2043 * only.  For a user-mode async implementation, it would be best done in
2044 * a userland subroutine.
2045 */
2046static int
2047kern_aio_error(struct thread *td, struct aiocb *ujob, struct aiocb_ops *ops)
2048{
2049	struct proc *p = td->td_proc;
2050	struct kaiocb *job;
2051	struct kaioinfo *ki;
2052	int status;
2053
2054	ki = p->p_aioinfo;
2055	if (ki == NULL) {
2056		td->td_retval[0] = EINVAL;
2057		return (0);
2058	}
2059
2060	AIO_LOCK(ki);
2061	TAILQ_FOREACH(job, &ki->kaio_all, allist) {
2062		if (job->ujob == ujob) {
2063			if (job->jobflags & KAIOCB_FINISHED)
2064				td->td_retval[0] =
2065					job->uaiocb._aiocb_private.error;
2066			else
2067				td->td_retval[0] = EINPROGRESS;
2068			AIO_UNLOCK(ki);
2069			return (0);
2070		}
2071	}
2072	AIO_UNLOCK(ki);
2073
2074	/*
2075	 * Hack for failure of aio_aqueue.
2076	 */
2077	status = ops->fetch_status(ujob);
2078	if (status == -1) {
2079		td->td_retval[0] = ops->fetch_error(ujob);
2080		return (0);
2081	}
2082
2083	td->td_retval[0] = EINVAL;
2084	return (0);
2085}
2086
2087int
2088sys_aio_error(struct thread *td, struct aio_error_args *uap)
2089{
2090
2091	return (kern_aio_error(td, uap->aiocbp, &aiocb_ops));
2092}
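
/*
 * Illustrative userland sketch (not part of the kernel source): aio_error(2)
 * is the polling half of the interface; it returns EINPROGRESS while a
 * request is pending, 0 on success, or the error code of the failed request.
 * "cb" is assumed to have been queued already.
 *
 *	#include <aio.h>
 *	#include <errno.h>
 *	#include <unistd.h>
 *
 *	int
 *	spin_until_done(struct aiocb *cb)
 *	{
 *		int err;
 *
 *		while ((err = aio_error(cb)) == EINPROGRESS)
 *			usleep(1000);	// busy-wait sketch; prefer aio_suspend()
 *		return (err);		// 0 or the request's errno value
 *	}
 */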
2093
2094/* syscall - asynchronous read from a file (REALTIME) */
2095#ifdef COMPAT_FREEBSD6
2096int
2097freebsd6_aio_read(struct thread *td, struct freebsd6_aio_read_args *uap)
2098{
2099
2100	return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ,
2101	    &aiocb_ops_osigevent));
2102}
2103#endif
2104
2105int
2106sys_aio_read(struct thread *td, struct aio_read_args *uap)
2107{
2108
2109	return (aio_aqueue(td, uap->aiocbp, NULL, LIO_READ, &aiocb_ops));
2110}
2111
2112/* syscall - asynchronous write to a file (REALTIME) */
2113#ifdef COMPAT_FREEBSD6
2114int
2115freebsd6_aio_write(struct thread *td, struct freebsd6_aio_write_args *uap)
2116{
2117
2118	return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE,
2119	    &aiocb_ops_osigevent));
2120}
2121#endif
2122
2123int
2124sys_aio_write(struct thread *td, struct aio_write_args *uap)
2125{
2126
2127	return (aio_aqueue(td, uap->aiocbp, NULL, LIO_WRITE, &aiocb_ops));
2128}
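
/*
 * Illustrative userland sketch (not part of the kernel source): a complete
 * aio_write(2) round trip against an already-open descriptor "fd".  The
 * request is queued through aio_aqueue() above, waited on, and then reaped.
 *
 *	#include <aio.h>
 *	#include <errno.h>
 *	#include <string.h>
 *
 *	ssize_t
 *	write_async(int fd, void *buf, size_t len, off_t off)
 *	{
 *		struct aiocb cb;
 *		const struct aiocb *list[1] = { &cb };
 *
 *		memset(&cb, 0, sizeof(cb));
 *		cb.aio_fildes = fd;
 *		cb.aio_buf = buf;
 *		cb.aio_nbytes = len;
 *		cb.aio_offset = off;
 *		if (aio_write(&cb) == -1)
 *			return (-1);		// e.g. EAGAIN when the queues are full
 *		while (aio_suspend(list, 1, NULL) == -1 && errno == EINTR)
 *			;			// restart if interrupted by a signal
 *		return (aio_return(&cb));	// bytes written, or -1 with errno set
 *	}
 */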
2129
2130int
2131sys_aio_mlock(struct thread *td, struct aio_mlock_args *uap)
2132{
2133
2134	return (aio_aqueue(td, uap->aiocbp, NULL, LIO_MLOCK, &aiocb_ops));
2135}
2136
2137static int
2138kern_lio_listio(struct thread *td, int mode, struct aiocb * const *uacb_list,
2139    struct aiocb **acb_list, int nent, struct sigevent *sig,
2140    struct aiocb_ops *ops)
2141{
2142	struct proc *p = td->td_proc;
2143	struct aiocb *job;
2144	struct kaioinfo *ki;
2145	struct aioliojob *lj;
2146	struct kevent kev;
2147	int error;
2148	int nerror;
2149	int i;
2150
2151	if ((mode != LIO_NOWAIT) && (mode != LIO_WAIT))
2152		return (EINVAL);
2153
2154	if (nent < 0 || nent > AIO_LISTIO_MAX)
2155		return (EINVAL);
2156
2157	if (p->p_aioinfo == NULL)
2158		aio_init_aioinfo(p);
2159
2160	ki = p->p_aioinfo;
2161
2162	lj = uma_zalloc(aiolio_zone, M_WAITOK);
2163	lj->lioj_flags = 0;
2164	lj->lioj_count = 0;
2165	lj->lioj_finished_count = 0;
2166	knlist_init_mtx(&lj->klist, AIO_MTX(ki));
2167	ksiginfo_init(&lj->lioj_ksi);
2168
2169	/*
2170	 * Set up the completion notification.
2171	 */
2172	if (sig && (mode == LIO_NOWAIT)) {
2173		bcopy(sig, &lj->lioj_signal, sizeof(lj->lioj_signal));
2174		if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) {
2175			/* Assume only new style KEVENT */
2176			kev.filter = EVFILT_LIO;
2177			kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1;
2178			kev.ident = (uintptr_t)uacb_list; /* something unique */
2179			kev.data = (intptr_t)lj;
2180			/* pass user defined sigval data */
2181			kev.udata = lj->lioj_signal.sigev_value.sival_ptr;
2182			error = kqfd_register(
2183			    lj->lioj_signal.sigev_notify_kqueue, &kev, td, 1);
2184			if (error) {
2185				uma_zfree(aiolio_zone, lj);
2186				return (error);
2187			}
2188		} else if (lj->lioj_signal.sigev_notify == SIGEV_NONE) {
2189			;
2190		} else if (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL ||
2191			   lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID) {
2192				if (!_SIG_VALID(lj->lioj_signal.sigev_signo)) {
2193					uma_zfree(aiolio_zone, lj);
2194					return EINVAL;
2195				}
2196				lj->lioj_flags |= LIOJ_SIGNAL;
2197		} else {
2198			uma_zfree(aiolio_zone, lj);
2199			return EINVAL;
2200		}
2201	}
2202
2203	AIO_LOCK(ki);
2204	TAILQ_INSERT_TAIL(&ki->kaio_liojoblist, lj, lioj_list);
2205	/*
2206	 * Take an extra reference on the lio so that it cannot be freed
2207	 * by other threads doing aio_waitcomplete or aio_return,
2208	 * and so that the completion event is not delivered until all
2209	 * of the requests have been queued.
2210	 */
2211	lj->lioj_count = 1;
2212	AIO_UNLOCK(ki);
2213
2214	/*
2215	 * Queue each request in the supplied list.
2216	 */
2217	nerror = 0;
2218	for (i = 0; i < nent; i++) {
2219		job = acb_list[i];
2220		if (job != NULL) {
2221			error = aio_aqueue(td, job, lj, LIO_NOP, ops);
2222			if (error != 0)
2223				nerror++;
2224		}
2225	}
2226
2227	error = 0;
2228	AIO_LOCK(ki);
2229	if (mode == LIO_WAIT) {
2230		while (lj->lioj_count - 1 != lj->lioj_finished_count) {
2231			ki->kaio_flags |= KAIO_WAKEUP;
2232			error = msleep(&p->p_aioinfo, AIO_MTX(ki),
2233			    PRIBIO | PCATCH, "aiospn", 0);
2234			if (error == ERESTART)
2235				error = EINTR;
2236			if (error)
2237				break;
2238		}
2239	} else {
2240		if (lj->lioj_count - 1 == lj->lioj_finished_count) {
2241			if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) {
2242				lj->lioj_flags |= LIOJ_KEVENT_POSTED;
2243				KNOTE_LOCKED(&lj->klist, 1);
2244			}
2245			if ((lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED))
2246			    == LIOJ_SIGNAL
2247			    && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL ||
2248			    lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID)) {
2249				aio_sendsig(p, &lj->lioj_signal,
2250					    &lj->lioj_ksi);
2251				lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
2252			}
2253		}
2254	}
2255	lj->lioj_count--;
2256	if (lj->lioj_count == 0) {
2257		TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
2258		knlist_delete(&lj->klist, curthread, 1);
2259		PROC_LOCK(p);
2260		sigqueue_take(&lj->lioj_ksi);
2261		PROC_UNLOCK(p);
2262		AIO_UNLOCK(ki);
2263		uma_zfree(aiolio_zone, lj);
2264	} else
2265		AIO_UNLOCK(ki);
2266
2267	if (nerror)
2268		return (EIO);
2269	return (error);
2270}
2271
2272/* syscall - list directed I/O (REALTIME) */
2273#ifdef COMPAT_FREEBSD6
2274int
2275freebsd6_lio_listio(struct thread *td, struct freebsd6_lio_listio_args *uap)
2276{
2277	struct aiocb **acb_list;
2278	struct sigevent *sigp, sig;
2279	struct osigevent osig;
2280	int error, nent;
2281
2282	if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
2283		return (EINVAL);
2284
2285	nent = uap->nent;
2286	if (nent < 0 || nent > AIO_LISTIO_MAX)
2287		return (EINVAL);
2288
2289	if (uap->sig && (uap->mode == LIO_NOWAIT)) {
2290		error = copyin(uap->sig, &osig, sizeof(osig));
2291		if (error)
2292			return (error);
2293		error = convert_old_sigevent(&osig, &sig);
2294		if (error)
2295			return (error);
2296		sigp = &sig;
2297	} else
2298		sigp = NULL;
2299
2300	acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK);
2301	error = copyin(uap->acb_list, acb_list, nent * sizeof(acb_list[0]));
2302	if (error == 0)
2303		error = kern_lio_listio(td, uap->mode,
2304		    (struct aiocb * const *)uap->acb_list, acb_list, nent, sigp,
2305		    &aiocb_ops_osigevent);
2306	free(acb_list, M_LIO);
2307	return (error);
2308}
2309#endif
2310
2311/* syscall - list directed I/O (REALTIME) */
2312int
2313sys_lio_listio(struct thread *td, struct lio_listio_args *uap)
2314{
2315	struct aiocb **acb_list;
2316	struct sigevent *sigp, sig;
2317	int error, nent;
2318
2319	if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
2320		return (EINVAL);
2321
2322	nent = uap->nent;
2323	if (nent < 0 || nent > AIO_LISTIO_MAX)
2324		return (EINVAL);
2325
2326	if (uap->sig && (uap->mode == LIO_NOWAIT)) {
2327		error = copyin(uap->sig, &sig, sizeof(sig));
2328		if (error)
2329			return (error);
2330		sigp = &sig;
2331	} else
2332		sigp = NULL;
2333
2334	acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK);
2335	error = copyin(uap->acb_list, acb_list, nent * sizeof(acb_list[0]));
2336	if (error == 0)
2337		error = kern_lio_listio(td, uap->mode, uap->acb_list, acb_list,
2338		    nent, sigp, &aiocb_ops);
2339	free(acb_list, M_LIO);
2340	return (error);
2341}
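
/*
 * Illustrative userland sketch (not part of the kernel source): lio_listio(2)
 * with LIO_WAIT submits a batch and blocks in kern_lio_listio() above until
 * every element has finished.  Both aiocbs are assumed to be fully
 * initialized (descriptor, buffer, length, offset) by the caller.
 *
 *	#include <aio.h>
 *
 *	int
 *	submit_pair(struct aiocb *rd, struct aiocb *wr)
 *	{
 *		struct aiocb *list[2] = { rd, wr };
 *
 *		rd->aio_lio_opcode = LIO_READ;
 *		wr->aio_lio_opcode = LIO_WRITE;
 *		return (lio_listio(LIO_WAIT, list, 2, NULL));	// 0 on success
 *	}
 */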
2342
2343static void
2344aio_physwakeup(struct bio *bp)
2345{
2346	struct kaiocb *job = (struct kaiocb *)bp->bio_caller1;
2347	struct proc *userp;
2348	struct kaioinfo *ki;
2349	size_t nbytes;
2350	int error, nblks;
2351
2352	/* Release mapping into kernel space. */
2353	userp = job->userproc;
2354	ki = userp->p_aioinfo;
2355	if (job->pbuf) {
2356		pmap_qremove((vm_offset_t)job->pbuf->b_data, job->npages);
2357		relpbuf(job->pbuf, NULL);
2358		job->pbuf = NULL;
2359		atomic_subtract_int(&num_buf_aio, 1);
2360		AIO_LOCK(ki);
2361		ki->kaio_buffer_count--;
2362		AIO_UNLOCK(ki);
2363	}
2364	vm_page_unhold_pages(job->pages, job->npages);
2365
2366	bp = job->bp;
2367	job->bp = NULL;
2368	nbytes = job->uaiocb.aio_nbytes - bp->bio_resid;
2369	error = 0;
2370	if (bp->bio_flags & BIO_ERROR)
2371		error = bp->bio_error;
2372	nblks = btodb(nbytes);
2373	if (job->uaiocb.aio_lio_opcode == LIO_WRITE)
2374		job->outblock += nblks;
2375	else
2376		job->inblock += nblks;
2377
2378	if (error)
2379		aio_complete(job, -1, error);
2380	else
2381		aio_complete(job, nbytes, 0);
2382
2383	g_destroy_bio(bp);
2384}
2385
2386/* syscall - wait for the next completion of an aio request */
2387static int
2388kern_aio_waitcomplete(struct thread *td, struct aiocb **ujobp,
2389    struct timespec *ts, struct aiocb_ops *ops)
2390{
2391	struct proc *p = td->td_proc;
2392	struct timeval atv;
2393	struct kaioinfo *ki;
2394	struct kaiocb *job;
2395	struct aiocb *ujob;
2396	long error, status;
2397	int timo;
2398
2399	ops->store_aiocb(ujobp, NULL);
2400
2401	if (ts == NULL) {
2402		timo = 0;
2403	} else if (ts->tv_sec == 0 && ts->tv_nsec == 0) {
2404		timo = -1;
2405	} else {
2406		if ((ts->tv_nsec < 0) || (ts->tv_nsec >= 1000000000))
2407			return (EINVAL);
2408
2409		TIMESPEC_TO_TIMEVAL(&atv, ts);
2410		if (itimerfix(&atv))
2411			return (EINVAL);
2412		timo = tvtohz(&atv);
2413	}
2414
2415	if (p->p_aioinfo == NULL)
2416		aio_init_aioinfo(p);
2417	ki = p->p_aioinfo;
2418
2419	error = 0;
2420	job = NULL;
2421	AIO_LOCK(ki);
2422	while ((job = TAILQ_FIRST(&ki->kaio_done)) == NULL) {
2423		if (timo == -1) {
2424			error = EWOULDBLOCK;
2425			break;
2426		}
2427		ki->kaio_flags |= KAIO_WAKEUP;
2428		error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH,
2429		    "aiowc", timo);
2430		if (timo && error == ERESTART)
2431			error = EINTR;
2432		if (error)
2433			break;
2434	}
2435
2436	if (job != NULL) {
2437		MPASS(job->jobflags & KAIOCB_FINISHED);
2438		ujob = job->ujob;
2439		status = job->uaiocb._aiocb_private.status;
2440		error = job->uaiocb._aiocb_private.error;
2441		td->td_retval[0] = status;
2442		td->td_ru.ru_oublock += job->outblock;
2443		td->td_ru.ru_inblock += job->inblock;
2444		td->td_ru.ru_msgsnd += job->msgsnd;
2445		td->td_ru.ru_msgrcv += job->msgrcv;
2446		aio_free_entry(job);
2447		AIO_UNLOCK(ki);
2448		ops->store_aiocb(ujobp, ujob);
2449		ops->store_error(ujob, error);
2450		ops->store_status(ujob, status);
2451	} else
2452		AIO_UNLOCK(ki);
2453
2454	return (error);
2455}
2456
2457int
2458sys_aio_waitcomplete(struct thread *td, struct aio_waitcomplete_args *uap)
2459{
2460	struct timespec ts, *tsp;
2461	int error;
2462
2463	if (uap->timeout) {
2464		/* Get timespec struct. */
2465		error = copyin(uap->timeout, &ts, sizeof(ts));
2466		if (error)
2467			return (error);
2468		tsp = &ts;
2469	} else
2470		tsp = NULL;
2471
2472	return (kern_aio_waitcomplete(td, uap->aiocbp, tsp, &aiocb_ops));
2473}
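
/*
 * Illustrative userland sketch (not part of the kernel source): the
 * FreeBSD-specific aio_waitcomplete(2) blocks for the next completion
 * belonging to the process and reaps it in one call, so no aiocb list has
 * to be polled.  Requests are assumed to have been queued beforehand.
 *
 *	#include <aio.h>
 *	#include <stddef.h>
 *
 *	ssize_t
 *	reap_next(struct aiocb **cbp)
 *	{
 *		// Block indefinitely; a zero timespec would mean "poll".
 *		return (aio_waitcomplete(cbp, NULL));
 *	}
 */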
2474
2475static int
2476kern_aio_fsync(struct thread *td, int op, struct aiocb *ujob,
2477    struct aiocb_ops *ops)
2478{
2479	struct proc *p = td->td_proc;
2480	struct kaioinfo *ki;
2481
2482	if (op != O_SYNC) /* XXX lack of O_DSYNC */
2483		return (EINVAL);
2484	ki = p->p_aioinfo;
2485	if (ki == NULL)
2486		aio_init_aioinfo(p);
2487	return (aio_aqueue(td, ujob, NULL, LIO_SYNC, ops));
2488}
2489
2490int
2491sys_aio_fsync(struct thread *td, struct aio_fsync_args *uap)
2492{
2493
2494	return (kern_aio_fsync(td, uap->op, uap->aiocbp, &aiocb_ops));
2495}
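
/*
 * Illustrative userland sketch (not part of the kernel source): aio_fsync(2)
 * queues an asynchronous fsync of everything previously queued for the
 * descriptor; only O_SYNC is accepted, as enforced in kern_aio_fsync()
 * above.  "fd" is assumed to be an open, writable descriptor.
 *
 *	#include <aio.h>
 *	#include <fcntl.h>
 *	#include <string.h>
 *
 *	int
 *	sync_async(int fd, struct aiocb *cb)
 *	{
 *		memset(cb, 0, sizeof(*cb));
 *		cb->aio_fildes = fd;
 *		return (aio_fsync(O_SYNC, cb));	// completion via aio_error()/aio_return()
 *	}
 */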
2496
2497/* kqueue attach function */
2498static int
2499filt_aioattach(struct knote *kn)
2500{
2501	struct kaiocb *job = (struct kaiocb *)kn->kn_sdata;
2502
2503	/*
2504	 * The job pointer must be validated before using it, so
2505	 * registration is restricted to the kernel; the user cannot
2506	 * set EV_FLAG1.
2507	 */
2508	if ((kn->kn_flags & EV_FLAG1) == 0)
2509		return (EPERM);
2510	kn->kn_ptr.p_aio = job;
2511	kn->kn_flags &= ~EV_FLAG1;
2512
2513	knlist_add(&job->klist, kn, 0);
2514
2515	return (0);
2516}
2517
2518/* kqueue detach function */
2519static void
2520filt_aiodetach(struct knote *kn)
2521{
2522	struct knlist *knl;
2523
2524	knl = &kn->kn_ptr.p_aio->klist;
2525	knl->kl_lock(knl->kl_lockarg);
2526	if (!knlist_empty(knl))
2527		knlist_remove(knl, kn, 1);
2528	knl->kl_unlock(knl->kl_lockarg);
2529}
2530
2531/* kqueue filter function */
2532/*ARGSUSED*/
2533static int
2534filt_aio(struct knote *kn, long hint)
2535{
2536	struct kaiocb *job = kn->kn_ptr.p_aio;
2537
2538	kn->kn_data = job->uaiocb._aiocb_private.error;
2539	if (!(job->jobflags & KAIOCB_FINISHED))
2540		return (0);
2541	kn->kn_flags |= EV_EOF;
2542	return (1);
2543}
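
/*
 * Illustrative userland sketch (not part of the kernel source): with
 * SIGEV_KEVENT the completion is delivered through the filter above; the
 * returned kevent's ident is the user aiocb pointer, and EV_FLAG1 is set by
 * the kernel, never by the caller.  "kq" is assumed to be a kqueue(2)
 * descriptor and "cb" a zeroed aiocb with descriptor, buffer, and length
 * already filled in.
 *
 *	#include <sys/types.h>
 *	#include <sys/event.h>
 *	#include <aio.h>
 *
 *	ssize_t
 *	read_via_kqueue(int kq, struct aiocb *cb)
 *	{
 *		struct kevent ev;
 *
 *		cb->aio_sigevent.sigev_notify = SIGEV_KEVENT;
 *		cb->aio_sigevent.sigev_notify_kqueue = kq;
 *		if (aio_read(cb) == -1)
 *			return (-1);
 *		if (kevent(kq, NULL, 0, &ev, 1, NULL) == -1)	// wait for EVFILT_AIO
 *			return (-1);
 *		return (aio_return((struct aiocb *)ev.ident));
 *	}
 */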
2544
2545/* kqueue attach function */
2546static int
2547filt_lioattach(struct knote *kn)
2548{
2549	struct aioliojob * lj = (struct aioliojob *)kn->kn_sdata;
2550
2551	/*
2552	 * The aioliojob pointer must be validated before using it, so
2553	 * registration is restricted to the kernel; the user cannot
2554	 * set EV_FLAG1.
2555	 */
2556	if ((kn->kn_flags & EV_FLAG1) == 0)
2557		return (EPERM);
2558	kn->kn_ptr.p_lio = lj;
2559	kn->kn_flags &= ~EV_FLAG1;
2560
2561	knlist_add(&lj->klist, kn, 0);
2562
2563	return (0);
2564}
2565
2566/* kqueue detach function */
2567static void
2568filt_liodetach(struct knote *kn)
2569{
2570	struct knlist *knl;
2571
2572	knl = &kn->kn_ptr.p_lio->klist;
2573	knl->kl_lock(knl->kl_lockarg);
2574	if (!knlist_empty(knl))
2575		knlist_remove(knl, kn, 1);
2576	knl->kl_unlock(knl->kl_lockarg);
2577}
2578
2579/* kqueue filter function */
2580/*ARGSUSED*/
2581static int
2582filt_lio(struct knote *kn, long hint)
2583{
2584	struct aioliojob * lj = kn->kn_ptr.p_lio;
2585
2586	return (lj->lioj_flags & LIOJ_KEVENT_POSTED);
2587}
2588
2589#ifdef COMPAT_FREEBSD32
2590#include <sys/mount.h>
2591#include <sys/socket.h>
2592#include <compat/freebsd32/freebsd32.h>
2593#include <compat/freebsd32/freebsd32_proto.h>
2594#include <compat/freebsd32/freebsd32_signal.h>
2595#include <compat/freebsd32/freebsd32_syscall.h>
2596#include <compat/freebsd32/freebsd32_util.h>
2597
2598struct __aiocb_private32 {
2599	int32_t	status;
2600	int32_t	error;
2601	uint32_t kernelinfo;
2602};
2603
2604#ifdef COMPAT_FREEBSD6
2605typedef struct oaiocb32 {
2606	int	aio_fildes;		/* File descriptor */
2607	uint64_t aio_offset __packed;	/* File offset for I/O */
2608	uint32_t aio_buf;		/* I/O buffer in process space */
2609	uint32_t aio_nbytes;		/* Number of bytes for I/O */
2610	struct	osigevent32 aio_sigevent; /* Signal to deliver */
2611	int	aio_lio_opcode;		/* LIO opcode */
2612	int	aio_reqprio;		/* Request priority -- ignored */
2613	struct	__aiocb_private32 _aiocb_private;
2614} oaiocb32_t;
2615#endif
2616
2617typedef struct aiocb32 {
2618	int32_t	aio_fildes;		/* File descriptor */
2619	uint64_t aio_offset __packed;	/* File offset for I/O */
2620	uint32_t aio_buf;		/* I/O buffer in process space */
2621	uint32_t aio_nbytes;		/* Number of bytes for I/O */
2622	int	__spare__[2];
2623	uint32_t __spare2__;
2624	int	aio_lio_opcode;		/* LIO opcode */
2625	int	aio_reqprio;		/* Request priority -- ignored */
2626	struct	__aiocb_private32 _aiocb_private;
2627	struct	sigevent32 aio_sigevent;	/* Signal to deliver */
2628} aiocb32_t;
2629
2630#ifdef COMPAT_FREEBSD6
2631static int
2632convert_old_sigevent32(struct osigevent32 *osig, struct sigevent *nsig)
2633{
2634
2635	/*
2636	 * Only SIGEV_NONE, SIGEV_SIGNAL, and SIGEV_KEVENT are
2637	 * supported by AIO with the old sigevent structure.
2638	 */
2639	CP(*osig, *nsig, sigev_notify);
2640	switch (nsig->sigev_notify) {
2641	case SIGEV_NONE:
2642		break;
2643	case SIGEV_SIGNAL:
2644		nsig->sigev_signo = osig->__sigev_u.__sigev_signo;
2645		break;
2646	case SIGEV_KEVENT:
2647		nsig->sigev_notify_kqueue =
2648		    osig->__sigev_u.__sigev_notify_kqueue;
2649		PTRIN_CP(*osig, *nsig, sigev_value.sival_ptr);
2650		break;
2651	default:
2652		return (EINVAL);
2653	}
2654	return (0);
2655}
2656
2657static int
2658aiocb32_copyin_old_sigevent(struct aiocb *ujob, struct aiocb *kjob)
2659{
2660	struct oaiocb32 job32;
2661	int error;
2662
2663	bzero(kjob, sizeof(struct aiocb));
2664	error = copyin(ujob, &job32, sizeof(job32));
2665	if (error)
2666		return (error);
2667
2668	CP(job32, *kjob, aio_fildes);
2669	CP(job32, *kjob, aio_offset);
2670	PTRIN_CP(job32, *kjob, aio_buf);
2671	CP(job32, *kjob, aio_nbytes);
2672	CP(job32, *kjob, aio_lio_opcode);
2673	CP(job32, *kjob, aio_reqprio);
2674	CP(job32, *kjob, _aiocb_private.status);
2675	CP(job32, *kjob, _aiocb_private.error);
2676	PTRIN_CP(job32, *kjob, _aiocb_private.kernelinfo);
2677	return (convert_old_sigevent32(&job32.aio_sigevent,
2678	    &kjob->aio_sigevent));
2679}
2680#endif
2681
2682static int
2683aiocb32_copyin(struct aiocb *ujob, struct aiocb *kjob)
2684{
2685	struct aiocb32 job32;
2686	int error;
2687
2688	error = copyin(ujob, &job32, sizeof(job32));
2689	if (error)
2690		return (error);
2691	CP(job32, *kjob, aio_fildes);
2692	CP(job32, *kjob, aio_offset);
2693	PTRIN_CP(job32, *kjob, aio_buf);
2694	CP(job32, *kjob, aio_nbytes);
2695	CP(job32, *kjob, aio_lio_opcode);
2696	CP(job32, *kjob, aio_reqprio);
2697	CP(job32, *kjob, _aiocb_private.status);
2698	CP(job32, *kjob, _aiocb_private.error);
2699	PTRIN_CP(job32, *kjob, _aiocb_private.kernelinfo);
2700	return (convert_sigevent32(&job32.aio_sigevent, &kjob->aio_sigevent));
2701}
2702
2703static long
2704aiocb32_fetch_status(struct aiocb *ujob)
2705{
2706	struct aiocb32 *ujob32;
2707
2708	ujob32 = (struct aiocb32 *)ujob;
2709	return (fuword32(&ujob32->_aiocb_private.status));
2710}
2711
2712static long
2713aiocb32_fetch_error(struct aiocb *ujob)
2714{
2715	struct aiocb32 *ujob32;
2716
2717	ujob32 = (struct aiocb32 *)ujob;
2718	return (fuword32(&ujob32->_aiocb_private.error));
2719}
2720
2721static int
2722aiocb32_store_status(struct aiocb *ujob, long status)
2723{
2724	struct aiocb32 *ujob32;
2725
2726	ujob32 = (struct aiocb32 *)ujob;
2727	return (suword32(&ujob32->_aiocb_private.status, status));
2728}
2729
2730static int
2731aiocb32_store_error(struct aiocb *ujob, long error)
2732{
2733	struct aiocb32 *ujob32;
2734
2735	ujob32 = (struct aiocb32 *)ujob;
2736	return (suword32(&ujob32->_aiocb_private.error, error));
2737}
2738
2739static int
2740aiocb32_store_kernelinfo(struct aiocb *ujob, long jobref)
2741{
2742	struct aiocb32 *ujob32;
2743
2744	ujob32 = (struct aiocb32 *)ujob;
2745	return (suword32(&ujob32->_aiocb_private.kernelinfo, jobref));
2746}
2747
2748static int
2749aiocb32_store_aiocb(struct aiocb **ujobp, struct aiocb *ujob)
2750{
2751
2752	return (suword32(ujobp, (long)ujob));
2753}
2754
2755static struct aiocb_ops aiocb32_ops = {
2756	.copyin = aiocb32_copyin,
2757	.fetch_status = aiocb32_fetch_status,
2758	.fetch_error = aiocb32_fetch_error,
2759	.store_status = aiocb32_store_status,
2760	.store_error = aiocb32_store_error,
2761	.store_kernelinfo = aiocb32_store_kernelinfo,
2762	.store_aiocb = aiocb32_store_aiocb,
2763};
2764
2765#ifdef COMPAT_FREEBSD6
2766static struct aiocb_ops aiocb32_ops_osigevent = {
2767	.copyin = aiocb32_copyin_old_sigevent,
2768	.fetch_status = aiocb32_fetch_status,
2769	.fetch_error = aiocb32_fetch_error,
2770	.store_status = aiocb32_store_status,
2771	.store_error = aiocb32_store_error,
2772	.store_kernelinfo = aiocb32_store_kernelinfo,
2773	.store_aiocb = aiocb32_store_aiocb,
2774};
2775#endif
2776
2777int
2778freebsd32_aio_return(struct thread *td, struct freebsd32_aio_return_args *uap)
2779{
2780
2781	return (kern_aio_return(td, (struct aiocb *)uap->aiocbp, &aiocb32_ops));
2782}
2783
2784int
2785freebsd32_aio_suspend(struct thread *td, struct freebsd32_aio_suspend_args *uap)
2786{
2787	struct timespec32 ts32;
2788	struct timespec ts, *tsp;
2789	struct aiocb **ujoblist;
2790	uint32_t *ujoblist32;
2791	int error, i;
2792
2793	if (uap->nent < 0 || uap->nent > AIO_LISTIO_MAX)
2794		return (EINVAL);
2795
2796	if (uap->timeout) {
2797		/* Get timespec struct. */
2798		if ((error = copyin(uap->timeout, &ts32, sizeof(ts32))) != 0)
2799			return (error);
2800		CP(ts32, ts, tv_sec);
2801		CP(ts32, ts, tv_nsec);
2802		tsp = &ts;
2803	} else
2804		tsp = NULL;
2805
2806	ujoblist = uma_zalloc(aiol_zone, M_WAITOK);
2807	ujoblist32 = (uint32_t *)ujoblist;
2808	error = copyin(uap->aiocbp, ujoblist32, uap->nent *
2809	    sizeof(ujoblist32[0]));
2810	if (error == 0) {
2811		for (i = uap->nent - 1; i >= 0; i--)
2812			ujoblist[i] = PTRIN(ujoblist32[i]);
2813
2814		error = kern_aio_suspend(td, uap->nent, ujoblist, tsp);
2815	}
2816	uma_zfree(aiol_zone, ujoblist);
2817	return (error);
2818}
2819
2820int
2821freebsd32_aio_error(struct thread *td, struct freebsd32_aio_error_args *uap)
2822{
2823
2824	return (kern_aio_error(td, (struct aiocb *)uap->aiocbp, &aiocb32_ops));
2825}
2826
2827#ifdef COMPAT_FREEBSD6
2828int
2829freebsd6_freebsd32_aio_read(struct thread *td,
2830    struct freebsd6_freebsd32_aio_read_args *uap)
2831{
2832
2833	return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ,
2834	    &aiocb32_ops_osigevent));
2835}
2836#endif
2837
2838int
2839freebsd32_aio_read(struct thread *td, struct freebsd32_aio_read_args *uap)
2840{
2841
2842	return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ,
2843	    &aiocb32_ops));
2844}
2845
2846#ifdef COMPAT_FREEBSD6
2847int
2848freebsd6_freebsd32_aio_write(struct thread *td,
2849    struct freebsd6_freebsd32_aio_write_args *uap)
2850{
2851
2852	return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE,
2853	    &aiocb32_ops_osigevent));
2854}
2855#endif
2856
2857int
2858freebsd32_aio_write(struct thread *td, struct freebsd32_aio_write_args *uap)
2859{
2860
2861	return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE,
2862	    &aiocb32_ops));
2863}
2864
2865int
2866freebsd32_aio_mlock(struct thread *td, struct freebsd32_aio_mlock_args *uap)
2867{
2868
2869	return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_MLOCK,
2870	    &aiocb32_ops));
2871}
2872
2873int
2874freebsd32_aio_waitcomplete(struct thread *td,
2875    struct freebsd32_aio_waitcomplete_args *uap)
2876{
2877	struct timespec32 ts32;
2878	struct timespec ts, *tsp;
2879	int error;
2880
2881	if (uap->timeout) {
2882		/* Get timespec struct. */
2883		error = copyin(uap->timeout, &ts32, sizeof(ts32));
2884		if (error)
2885			return (error);
2886		CP(ts32, ts, tv_sec);
2887		CP(ts32, ts, tv_nsec);
2888		tsp = &ts;
2889	} else
2890		tsp = NULL;
2891
2892	return (kern_aio_waitcomplete(td, (struct aiocb **)uap->aiocbp, tsp,
2893	    &aiocb32_ops));
2894}
2895
2896int
2897freebsd32_aio_fsync(struct thread *td, struct freebsd32_aio_fsync_args *uap)
2898{
2899
2900	return (kern_aio_fsync(td, uap->op, (struct aiocb *)uap->aiocbp,
2901	    &aiocb32_ops));
2902}
2903
2904#ifdef COMPAT_FREEBSD6
2905int
2906freebsd6_freebsd32_lio_listio(struct thread *td,
2907    struct freebsd6_freebsd32_lio_listio_args *uap)
2908{
2909	struct aiocb **acb_list;
2910	struct sigevent *sigp, sig;
2911	struct osigevent32 osig;
2912	uint32_t *acb_list32;
2913	int error, i, nent;
2914
2915	if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
2916		return (EINVAL);
2917
2918	nent = uap->nent;
2919	if (nent < 0 || nent > AIO_LISTIO_MAX)
2920		return (EINVAL);
2921
2922	if (uap->sig && (uap->mode == LIO_NOWAIT)) {
2923		error = copyin(uap->sig, &osig, sizeof(osig));
2924		if (error)
2925			return (error);
2926		error = convert_old_sigevent32(&osig, &sig);
2927		if (error)
2928			return (error);
2929		sigp = &sig;
2930	} else
2931		sigp = NULL;
2932
2933	acb_list32 = malloc(sizeof(uint32_t) * nent, M_LIO, M_WAITOK);
2934	error = copyin(uap->acb_list, acb_list32, nent * sizeof(uint32_t));
2935	if (error) {
2936		free(acb_list32, M_LIO);
2937		return (error);
2938	}
2939	acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK);
2940	for (i = 0; i < nent; i++)
2941		acb_list[i] = PTRIN(acb_list32[i]);
2942	free(acb_list32, M_LIO);
2943
2944	error = kern_lio_listio(td, uap->mode,
2945	    (struct aiocb * const *)uap->acb_list, acb_list, nent, sigp,
2946	    &aiocb32_ops_osigevent);
2947	free(acb_list, M_LIO);
2948	return (error);
2949}
2950#endif
2951
2952int
2953freebsd32_lio_listio(struct thread *td, struct freebsd32_lio_listio_args *uap)
2954{
2955	struct aiocb **acb_list;
2956	struct sigevent *sigp, sig;
2957	struct sigevent32 sig32;
2958	uint32_t *acb_list32;
2959	int error, i, nent;
2960
2961	if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
2962		return (EINVAL);
2963
2964	nent = uap->nent;
2965	if (nent < 0 || nent > AIO_LISTIO_MAX)
2966		return (EINVAL);
2967
2968	if (uap->sig && (uap->mode == LIO_NOWAIT)) {
2969		error = copyin(uap->sig, &sig32, sizeof(sig32));
2970		if (error)
2971			return (error);
2972		error = convert_sigevent32(&sig32, &sig);
2973		if (error)
2974			return (error);
2975		sigp = &sig;
2976	} else
2977		sigp = NULL;
2978
2979	acb_list32 = malloc(sizeof(uint32_t) * nent, M_LIO, M_WAITOK);
2980	error = copyin(uap->acb_list, acb_list32, nent * sizeof(uint32_t));
2981	if (error) {
2982		free(acb_list32, M_LIO);
2983		return (error);
2984	}
2985	acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK);
2986	for (i = 0; i < nent; i++)
2987		acb_list[i] = PTRIN(acb_list32[i]);
2988	free(acb_list32, M_LIO);
2989
2990	error = kern_lio_listio(td, uap->mode,
2991	    (struct aiocb * const *)uap->acb_list, acb_list, nent, sigp,
2992	    &aiocb32_ops);
2993	free(acb_list, M_LIO);
2994	return (error);
2995}
2996
2997#endif
2998