/*-
 * Copyright (c) 1997 John S. Dyson.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. John S. Dyson's name may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * DISCLAIMER:  This code isn't warranted to do anything useful.  Anything
 * bad that happens because of using this software isn't the responsibility
 * of the author.  This software is distributed AS-IS.
 */

/*
 * This file contains support for the POSIX 1003.1B AIO/LIO facility.
 */
2026670Sdyson
21116182Sobrien#include <sys/cdefs.h>
22116182Sobrien__FBSDID("$FreeBSD: stable/11/sys/kern/vfs_aio.c 367449 2020-11-07 16:20:37Z jhb $");
23116182Sobrien
24185878Sjhb#include "opt_compat.h"
25185878Sjhb
2626670Sdyson#include <sys/param.h>
2726670Sdyson#include <sys/systm.h>
2891140Stanimura#include <sys/malloc.h>
2960041Sphk#include <sys/bio.h>
3044272Sbde#include <sys/buf.h>
31263233Srwatson#include <sys/capsicum.h>
32112564Sjhb#include <sys/eventhandler.h>
3326670Sdyson#include <sys/sysproto.h>
3426670Sdyson#include <sys/filedesc.h>
3526670Sdyson#include <sys/kernel.h>
36129882Sphk#include <sys/module.h>
3774015Salc#include <sys/kthread.h>
3826670Sdyson#include <sys/fcntl.h>
3926670Sdyson#include <sys/file.h>
40114216Skan#include <sys/limits.h>
4131250Sbde#include <sys/lock.h>
4267365Sjhb#include <sys/mutex.h>
4326670Sdyson#include <sys/unistd.h>
44164184Strhodes#include <sys/posix4.h>
4526670Sdyson#include <sys/proc.h>
4638402Sbde#include <sys/resourcevar.h>
4726670Sdyson#include <sys/signalvar.h>
48314334Skib#include <sys/syscallsubr.h>
4955943Sjasone#include <sys/protosw.h>
50248084Sattilio#include <sys/rwlock.h>
51154669Sdavidxu#include <sys/sema.h>
52154669Sdavidxu#include <sys/socket.h>
5355943Sjasone#include <sys/socketvar.h>
5488633Salfred#include <sys/syscall.h>
5588633Salfred#include <sys/sysent.h>
5630240Sdyson#include <sys/sysctl.h>
57303460Sjhb#include <sys/syslog.h>
5893183Sbde#include <sys/sx.h>
59154669Sdavidxu#include <sys/taskqueue.h>
6031443Sdyson#include <sys/vnode.h>
6131443Sdyson#include <sys/conf.h>
6259288Sjlemon#include <sys/event.h>
63157037Sdavidxu#include <sys/mount.h>
64281860Smav#include <geom/geom.h>
6526670Sdyson
66154669Sdavidxu#include <machine/atomic.h>
67154669Sdavidxu
6826670Sdyson#include <vm/vm.h>
69281860Smav#include <vm/vm_page.h>
7026670Sdyson#include <vm/vm_extern.h>
7127221Sdyson#include <vm/pmap.h>
7227221Sdyson#include <vm/vm_map.h>
73157037Sdavidxu#include <vm/vm_object.h>
7492751Sjeff#include <vm/uma.h>
7526670Sdyson#include <sys/aio.h>
7626670Sdyson
7791690Seivind/*
7891690Seivind * Counter for allocating reference ids to new jobs.  Wrapped to 1 on
79157037Sdavidxu * overflow. (XXX will be removed soon.)
8091690Seivind */
81157037Sdavidxustatic u_long jobrefid;
8227221Sdyson
83157037Sdavidxu/*
84157037Sdavidxu * Counter for aio_fsync.
85157037Sdavidxu */
86157037Sdavidxustatic uint64_t jobseqno;
8727221Sdyson
8831456Sdyson#ifndef MAX_AIO_PER_PROC
8927221Sdyson#define MAX_AIO_PER_PROC	32
9031456Sdyson#endif
9131456Sdyson
9231456Sdyson#ifndef MAX_AIO_QUEUE_PER_PROC
93326322Sasomers#define MAX_AIO_QUEUE_PER_PROC	256
9431456Sdyson#endif
9531456Sdyson
9631456Sdyson#ifndef MAX_AIO_QUEUE
97326322Sasomers#define MAX_AIO_QUEUE		1024 /* Bigger than MAX_AIO_QUEUE_PER_PROC */
9831456Sdyson#endif
9927221Sdyson
10031456Sdyson#ifndef MAX_BUF_AIO
10155943Sjasone#define MAX_BUF_AIO		16
10231456Sdyson#endif
10331456Sdyson
104175870SrwatsonFEATURE(aio, "Asynchronous I/O");
105326322SasomersSYSCTL_DECL(_p1003_1b);
106175870Srwatson
107185878Sjhbstatic MALLOC_DEFINE(M_LIO, "lio", "listio aio control block list");
108326322Sasomersstatic MALLOC_DEFINE(M_AIOS, "aios", "aio_suspend aio control block list");
109185878Sjhb
110294851Sjhbstatic SYSCTL_NODE(_vfs, OID_AUTO, aio, CTLFLAG_RW, 0,
111294851Sjhb    "Async IO management");
11291690Seivind
113296277Sjhbstatic int enable_aio_unsafe = 0;
114296277SjhbSYSCTL_INT(_vfs_aio, OID_AUTO, enable_unsafe, CTLFLAG_RW, &enable_aio_unsafe, 0,
115296277Sjhb    "Permit asynchronous IO on all file types, not just known-safe types");
116296277Sjhb
117303460Sjhbstatic unsigned int unsafe_warningcnt = 1;
118303460SjhbSYSCTL_UINT(_vfs_aio, OID_AUTO, unsafe_warningcnt, CTLFLAG_RW,
119303460Sjhb    &unsafe_warningcnt, 0,
120303460Sjhb    "Warnings that will be triggered upon failed IO requests on unsafe files");
121303460Sjhb
12233181Seivindstatic int max_aio_procs = MAX_AIO_PROCS;
123294851SjhbSYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_procs, CTLFLAG_RW, &max_aio_procs, 0,
124294851Sjhb    "Maximum number of kernel processes to use for handling async IO ");
12591690Seivind
12633181Seivindstatic int num_aio_procs = 0;
127294851SjhbSYSCTL_INT(_vfs_aio, OID_AUTO, num_aio_procs, CTLFLAG_RD, &num_aio_procs, 0,
128294851Sjhb    "Number of presently active kernel processes for async IO");
12991690Seivind
13091690Seivind/*
13191690Seivind * The code will adjust the actual number of AIO processes towards this
13291690Seivind * number when it gets a chance.
13391690Seivind */
13433181Seivindstatic int target_aio_procs = TARGET_AIO_PROCS;
13591690SeivindSYSCTL_INT(_vfs_aio, OID_AUTO, target_aio_procs, CTLFLAG_RW, &target_aio_procs,
136294851Sjhb    0,
137294851Sjhb    "Preferred number of ready kernel processes for async IO");
13891690Seivind
13933181Seivindstatic int max_queue_count = MAX_AIO_QUEUE;
14091690SeivindSYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue, CTLFLAG_RW, &max_queue_count, 0,
14191690Seivind    "Maximum number of aio requests to queue, globally");
14291690Seivind
14333181Seivindstatic int num_queue_count = 0;
14491690SeivindSYSCTL_INT(_vfs_aio, OID_AUTO, num_queue_count, CTLFLAG_RD, &num_queue_count, 0,
14591690Seivind    "Number of queued aio requests");
14691690Seivind
14733181Seivindstatic int num_buf_aio = 0;
14891690SeivindSYSCTL_INT(_vfs_aio, OID_AUTO, num_buf_aio, CTLFLAG_RD, &num_buf_aio, 0,
14991690Seivind    "Number of aio requests presently handled by the buf subsystem");
15091690Seivind
151328575Sjhbstatic int num_unmapped_aio = 0;
152328575SjhbSYSCTL_INT(_vfs_aio, OID_AUTO, num_unmapped_aio, CTLFLAG_RD, &num_unmapped_aio,
153328575Sjhb    0,
154328575Sjhb    "Number of aio requests presently handled by unmapped I/O buffers");
155328575Sjhb
156294482Sjhb/* Number of async I/O processes in the process of being started */
157154698Sdavidxu/* XXX This should be local to aio_aqueue() */
15833181Seivindstatic int num_aio_resv_start = 0;
15991690Seivind
16033181Seivindstatic int aiod_lifetime;
16191690SeivindSYSCTL_INT(_vfs_aio, OID_AUTO, aiod_lifetime, CTLFLAG_RW, &aiod_lifetime, 0,
16291690Seivind    "Maximum lifetime for idle aiod");
16391690Seivind
16455943Sjasonestatic int max_aio_per_proc = MAX_AIO_PER_PROC;
16591690SeivindSYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_per_proc, CTLFLAG_RW, &max_aio_per_proc,
166294851Sjhb    0,
167328582Sjhb    "Maximum active aio requests per process");
16891690Seivind
16955943Sjasonestatic int max_aio_queue_per_proc = MAX_AIO_QUEUE_PER_PROC;
17091690SeivindSYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue_per_proc, CTLFLAG_RW,
17191690Seivind    &max_aio_queue_per_proc, 0,
172328582Sjhb    "Maximum queued aio requests per process");
17391690Seivind
17433181Seivindstatic int max_buf_aio = MAX_BUF_AIO;
17591690SeivindSYSCTL_INT(_vfs_aio, OID_AUTO, max_buf_aio, CTLFLAG_RW, &max_buf_aio, 0,
176328582Sjhb    "Maximum buf aio requests per process");
17730240Sdyson
178326322Sasomers/*
179326322Sasomers * Though redundant with vfs.aio.max_aio_queue_per_proc, POSIX requires
180326322Sasomers * sysconf(3) to support AIO_LISTIO_MAX, and we implement that with
181326322Sasomers * vfs.aio.aio_listio_max.
182326322Sasomers */
183326322SasomersSYSCTL_INT(_p1003_1b, CTL_P1003_1B_AIO_LISTIO_MAX, aio_listio_max,
184326322Sasomers    CTLFLAG_RD | CTLFLAG_CAPRD, &max_aio_queue_per_proc,
185326322Sasomers    0, "Maximum aio requests for a single lio_listio call");
186326322Sasomers
187296572Sjhb#ifdef COMPAT_FREEBSD6
/*
 * Old (FreeBSD 6 era) userland AIO control block layout, kept only for
 * COMPAT_FREEBSD6 syscall translation.  Mirrors struct aiocb but uses
 * the legacy struct osigevent for signal delivery.
 */
typedef struct oaiocb {
	int	aio_fildes;		/* File descriptor */
	off_t	aio_offset;		/* File offset for I/O */
	volatile void *aio_buf;         /* I/O buffer in process space */
	size_t	aio_nbytes;		/* Number of bytes for I/O */
	struct	osigevent aio_sigevent;	/* Signal to deliver */
	int	aio_lio_opcode;		/* LIO opcode */
	int	aio_reqprio;		/* Request priority -- ignored */
	struct	__aiocb_private	_aiocb_private;
} oaiocb_t;
198296572Sjhb#endif
199151867Sdavidxu
200154765Sdavidxu/*
201295331Sjhb * Below is a key of locks used to protect each member of struct kaiocb
202154765Sdavidxu * aioliojob and kaioinfo and any backends.
203154765Sdavidxu *
204154765Sdavidxu * * - need not protected
205158373Sdavidxu * a - locked by kaioinfo lock
206154765Sdavidxu * b - locked by backend lock, the backend lock can be null in some cases,
207154765Sdavidxu *     for example, BIO belongs to this type, in this case, proc lock is
208154765Sdavidxu *     reused.
209154765Sdavidxu * c - locked by aio_job_mtx, the lock for the generic file I/O backend.
210154765Sdavidxu */
211154765Sdavidxu
212154765Sdavidxu/*
213296277Sjhb * If the routine that services an AIO request blocks while running in an
214296277Sjhb * AIO kernel process it can starve other I/O requests.  BIO requests
215345393Sasomers * queued via aio_qbio() complete asynchronously and do not use AIO kernel
216296277Sjhb * processes at all.  Socket I/O requests use a separate pool of
217296277Sjhb * kprocs and also force non-blocking I/O.  Other file I/O requests
218296277Sjhb * use the generic fo_read/fo_write operations which can block.  The
219296277Sjhb * fsync and mlock operations can also block while executing.  Ideally
220296277Sjhb * none of these requests would block while executing.
221296277Sjhb *
222296277Sjhb * Note that the service routines cannot toggle O_NONBLOCK in the file
223296277Sjhb * structure directly while handling a request due to races with
224296277Sjhb * userland threads.
225154765Sdavidxu */
226154765Sdavidxu
22788970Salc/* jobflags */
228296277Sjhb#define	KAIOCB_QUEUEING		0x01
229296277Sjhb#define	KAIOCB_CANCELLED	0x02
230296277Sjhb#define	KAIOCB_CANCELLING	0x04
231295331Sjhb#define	KAIOCB_CHECKSYNC	0x08
232296277Sjhb#define	KAIOCB_CLEARED		0x10
233296277Sjhb#define	KAIOCB_FINISHED		0x20
23488970Salc
23526670Sdyson/*
23627221Sdyson * AIO process info
23727221Sdyson */
23831456Sdyson#define AIOP_FREE	0x1			/* proc on free queue */
23931456Sdyson
/* Per-daemon state; lock key letters are documented in the key above. */
struct aioproc {
	int	aioprocflags;			/* (c) AIO proc flags */
	TAILQ_ENTRY(aioproc) list;		/* (c) list of processes */
	struct	proc *aioproc;			/* (*) the AIO proc */
};
24527221Sdyson
24631456Sdyson/*
24731456Sdyson * data-structure for lio signal management
24831456Sdyson */
/* State for one lio_listio() submission; freed when lioj_count drops to 0. */
struct aioliojob {
	int	lioj_flags;			/* (a) listio flags */
	int	lioj_count;			/* (a) jobs outstanding in this lio */
	int	lioj_finished_count;		/* (a) jobs completed in this lio */
	struct	sigevent lioj_signal;		/* (a) signal on all I/O done */
	TAILQ_ENTRY(aioliojob) lioj_list;	/* (a) lio list */
	struct	knlist klist;			/* (a) list of knotes */
	ksiginfo_t lioj_ksi;			/* (a) Realtime signal info */
};
258154669Sdavidxu
25955943Sjasone#define	LIOJ_SIGNAL		0x1	/* signal on all done (lio) */
26031456Sdyson#define	LIOJ_SIGNAL_POSTED	0x2	/* signal has been posted */
261151260Sambrisko#define LIOJ_KEVENT_POSTED	0x4	/* kevent triggered */
26231456Sdyson
26331456Sdyson/*
26431456Sdyson * per process aio data structure
26531456Sdyson */
struct kaioinfo {
	struct	mtx kaio_mtx;		/* the lock to protect this struct */
	int	kaio_flags;		/* (a) per process kaio flags */
	int	kaio_active_count;	/* (c) number of currently used AIOs */
	int	kaio_count;		/* (a) size of AIO queue */
	int	kaio_buffer_count;	/* (a) number of bio buffers */
	TAILQ_HEAD(,kaiocb) kaio_all;	/* (a) all AIOs in a process */
	TAILQ_HEAD(,kaiocb) kaio_done;	/* (a) done queue for process */
	TAILQ_HEAD(,aioliojob) kaio_liojoblist; /* (a) list of lio jobs */
	TAILQ_HEAD(,kaiocb) kaio_jobqueue;	/* (a) job queue for process */
	TAILQ_HEAD(,kaiocb) kaio_syncqueue;	/* (a) queue for aio_fsync */
	TAILQ_HEAD(,kaiocb) kaio_syncready;  /* (a) second q for aio_fsync */
	struct	task kaio_task;		/* (*) task to kick aio processes */
	struct	task kaio_sync_task;	/* (*) task to schedule fsync jobs */
};
28127221Sdyson
282158373Sdavidxu#define AIO_LOCK(ki)		mtx_lock(&(ki)->kaio_mtx)
283158373Sdavidxu#define AIO_UNLOCK(ki)		mtx_unlock(&(ki)->kaio_mtx)
284158373Sdavidxu#define AIO_LOCK_ASSERT(ki, f)	mtx_assert(&(ki)->kaio_mtx, (f))
285158373Sdavidxu#define AIO_MTX(ki)		(&(ki)->kaio_mtx)
286158373Sdavidxu
28755943Sjasone#define KAIO_RUNDOWN	0x1	/* process is being run down */
288294851Sjhb#define KAIO_WAKEUP	0x2	/* wakeup process when AIO completes */
28931443Sdyson
290185878Sjhb/*
291185878Sjhb * Operations used to interact with userland aio control blocks.
292185878Sjhb * Different ABIs provide their own operations.
293185878Sjhb */
struct aiocb_ops {
	int	(*copyin)(struct aiocb *ujob, struct aiocb *kjob);	/* copy user aiocb into kernel copy */
	long	(*fetch_status)(struct aiocb *ujob);	/* read _aiocb_private.status from userland */
	long	(*fetch_error)(struct aiocb *ujob);	/* read _aiocb_private.error from userland */
	int	(*store_status)(struct aiocb *ujob, long status);	/* write completion status back */
	int	(*store_error)(struct aiocb *ujob, long error);		/* write error code back */
	int	(*store_kernelinfo)(struct aiocb *ujob, long jobref);	/* stash kernel job reference */
	int	(*store_aiocb)(struct aiocb **ujobp, struct aiocb *ujob);	/* write aiocb pointer (aio_waitcomplete) */
};
303185878Sjhb
304294482Sjhbstatic TAILQ_HEAD(,aioproc) aio_freeproc;		/* (c) Idle daemons */
305154669Sdavidxustatic struct sema aio_newproc_sem;
306154669Sdavidxustatic struct mtx aio_job_mtx;
307295331Sjhbstatic TAILQ_HEAD(,kaiocb) aio_jobs;			/* (c) Async job list */
308154669Sdavidxustatic struct unrhdr *aiod_unr;
30927221Sdyson
310163379Snetchildvoid		aio_init_aioinfo(struct proc *p);
311205326Skibstatic int	aio_onceonly(void);
312295331Sjhbstatic int	aio_free_entry(struct kaiocb *job);
313295331Sjhbstatic void	aio_process_rw(struct kaiocb *job);
314295331Sjhbstatic void	aio_process_sync(struct kaiocb *job);
315295331Sjhbstatic void	aio_process_mlock(struct kaiocb *job);
316296277Sjhbstatic void	aio_schedule_fsync(void *context, int pending);
317154669Sdavidxustatic int	aio_newproc(int *);
318295331Sjhbint		aio_aqueue(struct thread *td, struct aiocb *ujob,
319294851Sjhb		    struct aioliojob *lio, int type, struct aiocb_ops *ops);
320296277Sjhbstatic int	aio_queue_file(struct file *fp, struct kaiocb *job);
321345393Sasomersstatic void	aio_biowakeup(struct bio *bp);
322112564Sjhbstatic void	aio_proc_rundown(void *arg, struct proc *p);
323294851Sjhbstatic void	aio_proc_rundown_exec(void *arg, struct proc *p,
324294851Sjhb		    struct image_params *imgp);
325345393Sasomersstatic int	aio_qbio(struct proc *p, struct kaiocb *job);
326154669Sdavidxustatic void	aio_daemon(void *param);
327296277Sjhbstatic void	aio_bio_done_notify(struct proc *userp, struct kaiocb *job);
328303787Sjhbstatic bool	aio_clear_cancel_function_locked(struct kaiocb *job);
329157073Sdavidxustatic int	aio_kick(struct proc *userp);
330157037Sdavidxustatic void	aio_kick_nowait(struct proc *userp);
331157037Sdavidxustatic void	aio_kick_helper(void *context, int pending);
33288633Salfredstatic int	filt_aioattach(struct knote *kn);
33388633Salfredstatic void	filt_aiodetach(struct knote *kn);
33488633Salfredstatic int	filt_aio(struct knote *kn, long hint);
335151260Sambriskostatic int	filt_lioattach(struct knote *kn);
336151260Sambriskostatic void	filt_liodetach(struct knote *kn);
337151260Sambriskostatic int	filt_lio(struct knote *kn, long hint);
33827221Sdyson
33991690Seivind/*
34091690Seivind * Zones for:
34191690Seivind * 	kaio	Per process async io info
342294482Sjhb *	aiop	async io process data
34391690Seivind *	aiocb	async io jobs
34491690Seivind *	aiolio	list io jobs
34591690Seivind */
346326322Sasomersstatic uma_zone_t kaio_zone, aiop_zone, aiocb_zone, aiolio_zone;
34730240Sdyson
34891690Seivind/* kqueue filters for aio */
/* EVFILT_AIO: per-job completion notification (not fd-based). */
static struct filterops aio_filtops = {
	.f_isfd = 0,
	.f_attach = filt_aioattach,
	.f_detach = filt_aiodetach,
	.f_event = filt_aio,
};
/* EVFILT_LIO: lio_listio() batch completion notification. */
static struct filterops lio_filtops = {
	.f_isfd = 0,
	.f_attach = filt_lioattach,
	.f_detach = filt_liodetach,
	.f_event = filt_lio
};
36188633Salfred
362112564Sjhbstatic eventhandler_tag exit_tag, exec_tag;
363112564Sjhb
364294036SjhbTASKQUEUE_DEFINE_THREAD(aiod_kick);
365154669Sdavidxu
36691690Seivind/*
36791690Seivind * Main operations function for use as a kernel module.
36891690Seivind */
36988633Salfredstatic int
37088633Salfredaio_modload(struct module *module, int cmd, void *arg)
37188633Salfred{
37288633Salfred	int error = 0;
37388633Salfred
37488633Salfred	switch (cmd) {
37588633Salfred	case MOD_LOAD:
37688633Salfred		aio_onceonly();
37788633Salfred		break;
37888633Salfred	case MOD_SHUTDOWN:
37988633Salfred		break;
38088633Salfred	default:
381296277Sjhb		error = EOPNOTSUPP;
38288633Salfred		break;
38388633Salfred	}
38488633Salfred	return (error);
38588633Salfred}
38688633Salfred
/* Module glue: register "aio" with the kernel module system. */
static moduledata_t aio_mod = {
	"aio",
	&aio_modload,
	NULL
};

DECLARE_MODULE(aio, aio_mod, SI_SUB_VFS, SI_ORDER_ANY);
MODULE_VERSION(aio, 1);
39588633Salfred
39627221Sdyson/*
39727221Sdyson * Startup initialization
39827221Sdyson */
static int
aio_onceonly(void)
{

	/* Tear down per-process AIO state on exit and exec. */
	exit_tag = EVENTHANDLER_REGISTER(process_exit, aio_proc_rundown, NULL,
	    EVENTHANDLER_PRI_ANY);
	exec_tag = EVENTHANDLER_REGISTER(process_exec, aio_proc_rundown_exec,
	    NULL, EVENTHANDLER_PRI_ANY);
	/* Register the kqueue filters for AIO and LIO completion events. */
	kqueue_add_filteropts(EVFILT_AIO, &aio_filtops);
	kqueue_add_filteropts(EVFILT_LIO, &lio_filtops);
	/* Global daemon/job bookkeeping. */
	TAILQ_INIT(&aio_freeproc);
	sema_init(&aio_newproc_sem, 0, "aio_new_proc");
	mtx_init(&aio_job_mtx, "aio_job", NULL, MTX_DEF);
	TAILQ_INIT(&aio_jobs);
	/* Unit numbers for naming aiod kernel processes. */
	aiod_unr = new_unrhdr(1, INT_MAX, NULL);
	/* UMA zones for the four allocation types used by this subsystem. */
	kaio_zone = uma_zcreate("AIO", sizeof(struct kaioinfo), NULL, NULL,
	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
	aiop_zone = uma_zcreate("AIOP", sizeof(struct aioproc), NULL,
	    NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
	aiocb_zone = uma_zcreate("AIOCB", sizeof(struct kaiocb), NULL, NULL,
	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
	aiolio_zone = uma_zcreate("AIOLIO", sizeof(struct aioliojob), NULL,
	    NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
	aiod_lifetime = AIOD_LIFETIME_DEFAULT;
	jobrefid = 1;
	/* Advertise POSIX 1003.1B AIO configuration via sysconf(3). */
	p31b_setcfg(CTL_P1003_1B_ASYNCHRONOUS_IO, _POSIX_ASYNCHRONOUS_IO);
	p31b_setcfg(CTL_P1003_1B_AIO_MAX, MAX_AIO_QUEUE);
	p31b_setcfg(CTL_P1003_1B_AIO_PRIO_DELTA_MAX, 0);

	return (0);
}
43027221Sdyson
43191690Seivind/*
43255943Sjasone * Init the per-process aioinfo structure.  The aioinfo limits are set
43355943Sjasone * per-process for user limit (resource) management.
43427221Sdyson */
void
aio_init_aioinfo(struct proc *p)
{
	struct kaioinfo *ki;

	/* Allocate and fully initialize a candidate kaioinfo first ... */
	ki = uma_zalloc(kaio_zone, M_WAITOK);
	mtx_init(&ki->kaio_mtx, "aiomtx", NULL, MTX_DEF | MTX_NEW);
	ki->kaio_flags = 0;
	ki->kaio_active_count = 0;
	ki->kaio_count = 0;
	ki->kaio_buffer_count = 0;
	TAILQ_INIT(&ki->kaio_all);
	TAILQ_INIT(&ki->kaio_done);
	TAILQ_INIT(&ki->kaio_jobqueue);
	TAILQ_INIT(&ki->kaio_liojoblist);
	TAILQ_INIT(&ki->kaio_syncqueue);
	TAILQ_INIT(&ki->kaio_syncready);
	TASK_INIT(&ki->kaio_task, 0, aio_kick_helper, p);
	TASK_INIT(&ki->kaio_sync_task, 0, aio_schedule_fsync, ki);
	/*
	 * ... then install it under the proc lock.  If another thread raced
	 * us and already installed one, discard ours.
	 */
	PROC_LOCK(p);
	if (p->p_aioinfo == NULL) {
		p->p_aioinfo = ki;
		PROC_UNLOCK(p);
	} else {
		PROC_UNLOCK(p);
		mtx_destroy(&ki->kaio_mtx);
		uma_zfree(kaio_zone, ki);
	}

	/* Spin up daemons until the preferred (capped) count is reached. */
	while (num_aio_procs < MIN(target_aio_procs, max_aio_procs))
		aio_newproc(NULL);
}
46727221Sdyson
/*
 * Deliver the completion signal described by sigev to the proper thread
 * of process p.  Returns 0 on success or an errno from sigev_findtd().
 * NOTE(review): this function calls PROC_UNLOCK(p) without ever locking
 * it, so the caller apparently enters with the proc lock held and this
 * function returns with it dropped — confirm at call sites.
 */
static int
aio_sendsig(struct proc *p, struct sigevent *sigev, ksiginfo_t *ksi)
{
	struct thread *td;
	int error;

	error = sigev_findtd(p, sigev, &td);
	if (error)
		return (error);
	/* Only queue if this ksiginfo isn't already pending. */
	if (!KSI_ONQ(ksi)) {
		ksiginfo_set_sigev(ksi, sigev);
		ksi->ksi_code = SI_ASYNCIO;
		ksi->ksi_flags |= KSI_EXT | KSI_INS;
		tdsendsignal(p, td, ksi->ksi_signo, ksi);
	}
	PROC_UNLOCK(p);
	return (error);
}
486151994Sdavidxu
48727221Sdyson/*
48855943Sjasone * Free a job entry.  Wait for completion if it is currently active, but don't
48955943Sjasone * delay forever.  If we delay, we return a flag that says that we have to
49055943Sjasone * restart the queue scan.
49127221Sdyson */
/*
 * Free a finished job: unlink it from the done/all lists, release its
 * lio (and the lio itself once its last job is freed), its knotes,
 * pending signal, file reference, credential, and finally the kaiocb.
 * Caller must hold the kaioinfo lock; it is dropped and reacquired
 * around the teardown of the knotes and file reference.
 */
static int
aio_free_entry(struct kaiocb *job)
{
	struct kaioinfo *ki;
	struct aioliojob *lj;
	struct proc *p;

	p = job->userproc;
	MPASS(curproc == p);
	ki = p->p_aioinfo;
	MPASS(ki != NULL);

	AIO_LOCK_ASSERT(ki, MA_OWNED);
	MPASS(job->jobflags & KAIOCB_FINISHED);

	atomic_subtract_int(&num_queue_count, 1);

	ki->kaio_count--;
	MPASS(ki->kaio_count >= 0);

	TAILQ_REMOVE(&ki->kaio_done, job, plist);
	TAILQ_REMOVE(&ki->kaio_all, job, allist);

	lj = job->lio;
	if (lj) {
		lj->lioj_count--;
		lj->lioj_finished_count--;

		/* Last job of the lio: destroy the lio itself as well. */
		if (lj->lioj_count == 0) {
			TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
			/* lio is going away, we need to destroy any knotes */
			knlist_delete(&lj->klist, curthread, 1);
			PROC_LOCK(p);
			sigqueue_take(&lj->lioj_ksi);
			PROC_UNLOCK(p);
			uma_zfree(aiolio_zone, lj);
		}
	}

	/* job is going away, we need to destroy any knotes */
	knlist_delete(&job->klist, curthread, 1);
	PROC_LOCK(p);
	sigqueue_take(&job->ksi);
	PROC_UNLOCK(p);

	AIO_UNLOCK(ki);

	/*
	 * The thread argument here is used to find the owning process
	 * and is also passed to fo_close() which may pass it to various
	 * places such as devsw close() routines.  Because of that, we
	 * need a thread pointer from the process owning the job that is
	 * persistent and won't disappear out from under us or move to
	 * another process.
	 *
	 * Currently, all the callers of this function call it to remove
	 * a kaiocb from the current process' job list either via a
	 * syscall or due to the current process calling exit() or
	 * execve().  Thus, we know that p == curproc.  We also know that
	 * curthread can't exit since we are curthread.
	 *
	 * Therefore, we use curthread as the thread to pass to
	 * knlist_delete().  This does mean that it is possible for the
	 * thread pointer at close time to differ from the thread pointer
	 * at open time, but this is already true of file descriptors in
	 * a multithreaded process.
	 */
	if (job->fd_file)
		fdrop(job->fd_file, curthread);
	crfree(job->cred);
	uma_zfree(aiocb_zone, job);
	AIO_LOCK(ki);

	return (0);
}
56727221Sdyson
/* On exec, tear down the process's AIO state exactly as on exit. */
static void
aio_proc_rundown_exec(void *arg, struct proc *p,
    struct image_params *imgp __unused)
{
   	aio_proc_rundown(arg, p);
}
574161302Snetchild
/*
 * Attempt to cancel one job.  Returns 1 if the job was cancelled by the
 * time its cancel callback returned, 0 otherwise (already finished,
 * already being cancelled, no cancel routine installed, or cancellation
 * merely scheduled).  Caller holds the kaioinfo lock; it is dropped
 * around the callback invocation.
 */
static int
aio_cancel_job(struct proc *p, struct kaioinfo *ki, struct kaiocb *job)
{
	aio_cancel_fn_t *func;
	int cancelled;

	AIO_LOCK_ASSERT(ki, MA_OWNED);
	if (job->jobflags & (KAIOCB_CANCELLED | KAIOCB_FINISHED))
		return (0);
	MPASS((job->jobflags & KAIOCB_CANCELLING) == 0);
	job->jobflags |= KAIOCB_CANCELLED;

	func = job->cancel_fn;

	/*
	 * If there is no cancel routine, just leave the job marked as
	 * cancelled.  The job should be in active use by a caller who
	 * should complete it normally or when it fails to install a
	 * cancel routine.
	 */
	if (func == NULL)
		return (0);

	/*
	 * Set the CANCELLING flag so that aio_complete() will defer
	 * completions of this job.  This prevents the job from being
	 * freed out from under the cancel callback.  After the
	 * callback any deferred completion (whether from the callback
	 * or any other source) will be completed.
	 */
	job->jobflags |= KAIOCB_CANCELLING;
	AIO_UNLOCK(ki);
	func(job);
	AIO_LOCK(ki);
	job->jobflags &= ~KAIOCB_CANCELLING;
	if (job->jobflags & KAIOCB_FINISHED) {
		cancelled = job->uaiocb._aiocb_private.error == ECANCELED;
		TAILQ_REMOVE(&ki->kaio_jobqueue, job, plist);
		aio_bio_done_notify(p, job);
	} else {
		/*
		 * The cancel callback might have scheduled an
		 * operation to cancel this request, but it is
		 * only counted as cancelled if the request is
		 * cancelled when the callback returns.
		 */
		cancelled = 0;
	}
	return (cancelled);
}
625296277Sjhb
62627221Sdyson/*
627133660Sjmg * Rundown the jobs for a given process.
62827221Sdyson */
static void
aio_proc_rundown(void *arg, struct proc *p)
{
	struct kaioinfo *ki;
	struct aioliojob *lj;
	struct kaiocb *job, *jobn;

	KASSERT(curthread->td_proc == p,
	    ("%s: called on non-curproc", __func__));
	ki = p->p_aioinfo;
	if (ki == NULL)
		return;

	AIO_LOCK(ki);
	/* Mark the process as going away so new completions know. */
	ki->kaio_flags |= KAIO_RUNDOWN;

restart:

	/*
	 * Try to cancel all pending requests. This code simulates
	 * aio_cancel on all pending I/O requests.
	 */
	TAILQ_FOREACH_SAFE(job, &ki->kaio_jobqueue, plist, jobn) {
		aio_cancel_job(p, ki, job);
	}

	/* Wait for all running I/O to be finished */
	if (TAILQ_FIRST(&ki->kaio_jobqueue) || ki->kaio_active_count != 0) {
		ki->kaio_flags |= KAIO_WAKEUP;
		/* Sleep with a 1-tick timeout, then re-cancel and re-check. */
		msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO, "aioprn", hz);
		goto restart;
	}

	/* Free all completed I/O requests. */
	while ((job = TAILQ_FIRST(&ki->kaio_done)) != NULL)
		aio_free_entry(job);

	/* All jobs are gone, so every remaining lio must be empty. */
	while ((lj = TAILQ_FIRST(&ki->kaio_liojoblist)) != NULL) {
		if (lj->lioj_count == 0) {
			TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
			knlist_delete(&lj->klist, curthread, 1);
			PROC_LOCK(p);
			sigqueue_take(&lj->lioj_ksi);
			PROC_UNLOCK(p);
			uma_zfree(aiolio_zone, lj);
		} else {
			panic("LIO job not cleaned up: C:%d, FC:%d\n",
			    lj->lioj_count, lj->lioj_finished_count);
		}
	}
	AIO_UNLOCK(ki);
	/* Make sure no kick/fsync tasks still reference ki before freeing. */
	taskqueue_drain(taskqueue_aiod_kick, &ki->kaio_task);
	taskqueue_drain(taskqueue_aiod_kick, &ki->kaio_sync_task);
	mtx_destroy(&ki->kaio_mtx);
	uma_zfree(kaio_zone, ki);
	p->p_aioinfo = NULL;
}
68627221Sdyson
68727221Sdyson/*
68855943Sjasone * Select a job to run (called by an AIO daemon).
68927221Sdyson */
/*
 * Select a job to run (called by an AIO daemon): the first queued job
 * whose owning process is below its per-process active-job limit.
 * Returns NULL if no eligible job exists.  Called with aio_job_mtx held.
 */
static struct kaiocb *
aio_selectjob(struct aioproc *aiop)
{
	struct kaiocb *job;
	struct kaioinfo *ki;
	struct proc *userp;

	mtx_assert(&aio_job_mtx, MA_OWNED);
restart:
	TAILQ_FOREACH(job, &aio_jobs, list) {
		userp = job->userproc;
		ki = userp->p_aioinfo;

		if (ki->kaio_active_count < max_aio_per_proc) {
			TAILQ_REMOVE(&aio_jobs, job, list);
			/* Lost a cancellation race; rescan from the start. */
			if (!aio_clear_cancel_function(job))
				goto restart;

			/* Account for currently active jobs. */
			ki->kaio_active_count++;
			break;
		}
	}
	return (job);
}
71527221Sdyson
71627221Sdyson/*
717294851Sjhb * Move all data to a permanent storage device.  This code
718294851Sjhb * simulates the fsync syscall.
719157037Sdavidxu */
/*
 * Flush a vnode to stable storage on behalf of aio_fsync(): clean any
 * backing VM object pages, then VOP_FSYNC with MNT_WAIT.  Returns 0 or
 * an errno (e.g. from vn_start_write, which may be interrupted).
 */
static int
aio_fsync_vnode(struct thread *td, struct vnode *vp)
{
	struct mount *mp;
	int error;

	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
		goto drop;
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	/* Push dirty VM pages into the buffer cache before the fsync. */
	if (vp->v_object != NULL) {
		VM_OBJECT_WLOCK(vp->v_object);
		vm_object_page_clean(vp->v_object, 0, 0, 0);
		VM_OBJECT_WUNLOCK(vp->v_object);
	}
	error = VOP_FSYNC(vp, MNT_WAIT, td);

	VOP_UNLOCK(vp, 0);
	vn_finished_write(mp);
drop:
	return (error);
}
741157037Sdavidxu
742157037Sdavidxu/*
743251522Sglebius * The AIO processing activity for LIO_READ/LIO_WRITE.  This is the code that
744345393Sasomers * does the I/O request for the non-bio version of the operations.  The normal
745345393Sasomers * vn operations are used, and this code should work in all instances for every
746345393Sasomers * type of file, including pipes, sockets, fifos, and regular files.
747154669Sdavidxu *
748154765Sdavidxu * XXX I don't think it works well for socket, pipe, and fifo.
74927221Sdyson */
75073559Salcstatic void
751295331Sjhbaio_process_rw(struct kaiocb *job)
75231443Sdyson{
753106574Srwatson	struct ucred *td_savedcred;
75483366Sjulian	struct thread *td;
75527221Sdyson	struct aiocb *cb;
75627221Sdyson	struct file *fp;
75727221Sdyson	struct uio auio;
75827221Sdyson	struct iovec aiov;
759297167Sjhb	ssize_t cnt;
760302074Sjhb	long msgsnd_st, msgsnd_end;
761302074Sjhb	long msgrcv_st, msgrcv_end;
762302074Sjhb	long oublock_st, oublock_end;
763302074Sjhb	long inblock_st, inblock_end;
76427221Sdyson	int error;
76527221Sdyson
766295331Sjhb	KASSERT(job->uaiocb.aio_lio_opcode == LIO_READ ||
767295331Sjhb	    job->uaiocb.aio_lio_opcode == LIO_WRITE,
768295331Sjhb	    ("%s: opcode %d", __func__, job->uaiocb.aio_lio_opcode));
769251522Sglebius
770296277Sjhb	aio_switch_vmspace(job);
77183366Sjulian	td = curthread;
772106574Srwatson	td_savedcred = td->td_ucred;
773295331Sjhb	td->td_ucred = job->cred;
774295331Sjhb	cb = &job->uaiocb;
775295331Sjhb	fp = job->fd_file;
77627221Sdyson
77787556Salc	aiov.iov_base = (void *)(uintptr_t)cb->aio_buf;
77827221Sdyson	aiov.iov_len = cb->aio_nbytes;
77927221Sdyson
78027221Sdyson	auio.uio_iov = &aiov;
78127221Sdyson	auio.uio_iovcnt = 1;
78293745Salc	auio.uio_offset = cb->aio_offset;
78327221Sdyson	auio.uio_resid = cb->aio_nbytes;
78427221Sdyson	cnt = cb->aio_nbytes;
78527221Sdyson	auio.uio_segflg = UIO_USERSPACE;
78683366Sjulian	auio.uio_td = td;
78727221Sdyson
788302074Sjhb	msgrcv_st = td->td_ru.ru_msgrcv;
789302074Sjhb	msgsnd_st = td->td_ru.ru_msgsnd;
790170174Sjeff	inblock_st = td->td_ru.ru_inblock;
791170174Sjeff	oublock_st = td->td_ru.ru_oublock;
792302074Sjhb
79368883Sdillon	/*
794154698Sdavidxu	 * aio_aqueue() acquires a reference to the file that is
79593745Salc	 * released in aio_free_entry().
79668883Sdillon	 */
79727221Sdyson	if (cb->aio_lio_opcode == LIO_READ) {
79827221Sdyson		auio.uio_rw = UIO_READ;
799171901Skib		if (auio.uio_resid == 0)
800171901Skib			error = 0;
801171901Skib		else
802171901Skib			error = fo_read(fp, &auio, fp->f_cred, FOF_OFFSET, td);
80327221Sdyson	} else {
804154893Sdavidxu		if (fp->f_type == DTYPE_VNODE)
805154893Sdavidxu			bwillwrite();
80627221Sdyson		auio.uio_rw = UIO_WRITE;
80783366Sjulian		error = fo_write(fp, &auio, fp->f_cred, FOF_OFFSET, td);
80827221Sdyson	}
809302074Sjhb	msgrcv_end = td->td_ru.ru_msgrcv;
810302074Sjhb	msgsnd_end = td->td_ru.ru_msgsnd;
811170174Sjeff	inblock_end = td->td_ru.ru_inblock;
812170174Sjeff	oublock_end = td->td_ru.ru_oublock;
81327221Sdyson
814302074Sjhb	job->msgrcv = msgrcv_end - msgrcv_st;
815302074Sjhb	job->msgsnd = msgsnd_end - msgsnd_st;
816302074Sjhb	job->inblock = inblock_end - inblock_st;
817302074Sjhb	job->outblock = oublock_end - oublock_st;
81831443Sdyson
81955943Sjasone	if ((error) && (auio.uio_resid != cnt)) {
82055943Sjasone		if (error == ERESTART || error == EINTR || error == EWOULDBLOCK)
82155943Sjasone			error = 0;
82273929Sjhb		if ((error == EPIPE) && (cb->aio_lio_opcode == LIO_WRITE)) {
823296277Sjhb			PROC_LOCK(job->userproc);
824296277Sjhb			kern_psignal(job->userproc, SIGPIPE);
825296277Sjhb			PROC_UNLOCK(job->userproc);
82673929Sjhb		}
82727221Sdyson	}
82827221Sdyson
82927221Sdyson	cnt -= auio.uio_resid;
830106574Srwatson	td->td_ucred = td_savedcred;
831300331Sjhb	if (error)
832300331Sjhb		aio_complete(job, -1, error);
833300331Sjhb	else
834300331Sjhb		aio_complete(job, cnt, 0);
83527221Sdyson}
83627221Sdyson
837151260Sambriskostatic void
838295331Sjhbaio_process_sync(struct kaiocb *job)
839251522Sglebius{
840251522Sglebius	struct thread *td = curthread;
841251522Sglebius	struct ucred *td_savedcred = td->td_ucred;
842295331Sjhb	struct file *fp = job->fd_file;
843251522Sglebius	int error = 0;
844251522Sglebius
845295331Sjhb	KASSERT(job->uaiocb.aio_lio_opcode == LIO_SYNC,
846295331Sjhb	    ("%s: opcode %d", __func__, job->uaiocb.aio_lio_opcode));
847251522Sglebius
848295331Sjhb	td->td_ucred = job->cred;
849251522Sglebius	if (fp->f_vnode != NULL)
850251522Sglebius		error = aio_fsync_vnode(td, fp->f_vnode);
851251522Sglebius	td->td_ucred = td_savedcred;
852300331Sjhb	if (error)
853300331Sjhb		aio_complete(job, -1, error);
854300331Sjhb	else
855300331Sjhb		aio_complete(job, 0, 0);
856251522Sglebius}
857251522Sglebius
858251522Sglebiusstatic void
859295331Sjhbaio_process_mlock(struct kaiocb *job)
860251526Sglebius{
861295331Sjhb	struct aiocb *cb = &job->uaiocb;
862251526Sglebius	int error;
863251526Sglebius
864295331Sjhb	KASSERT(job->uaiocb.aio_lio_opcode == LIO_MLOCK,
865295331Sjhb	    ("%s: opcode %d", __func__, job->uaiocb.aio_lio_opcode));
866251526Sglebius
867296277Sjhb	aio_switch_vmspace(job);
868314334Skib	error = kern_mlock(job->userproc, job->cred,
869314334Skib	    __DEVOLATILE(uintptr_t, cb->aio_buf), cb->aio_nbytes);
870314334Skib	aio_complete(job, error != 0 ? -1 : 0, error);
871251526Sglebius}
872251526Sglebius
873251526Sglebiusstatic void
874296277Sjhbaio_bio_done_notify(struct proc *userp, struct kaiocb *job)
875154669Sdavidxu{
876154669Sdavidxu	struct aioliojob *lj;
877154669Sdavidxu	struct kaioinfo *ki;
878295331Sjhb	struct kaiocb *sjob, *sjobn;
879151260Sambrisko	int lj_done;
880296277Sjhb	bool schedule_fsync;
881151260Sambrisko
882151260Sambrisko	ki = userp->p_aioinfo;
883158373Sdavidxu	AIO_LOCK_ASSERT(ki, MA_OWNED);
884295331Sjhb	lj = job->lio;
885151260Sambrisko	lj_done = 0;
886151260Sambrisko	if (lj) {
887154669Sdavidxu		lj->lioj_finished_count++;
888154669Sdavidxu		if (lj->lioj_count == lj->lioj_finished_count)
889151260Sambrisko			lj_done = 1;
890151260Sambrisko	}
891295331Sjhb	TAILQ_INSERT_TAIL(&ki->kaio_done, job, plist);
892296277Sjhb	MPASS(job->jobflags & KAIOCB_FINISHED);
893156024Sdavidxu
894156024Sdavidxu	if (ki->kaio_flags & KAIO_RUNDOWN)
895156024Sdavidxu		goto notification_done;
896156024Sdavidxu
897295331Sjhb	if (job->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL ||
898295331Sjhb	    job->uaiocb.aio_sigevent.sigev_notify == SIGEV_THREAD_ID)
899295331Sjhb		aio_sendsig(userp, &job->uaiocb.aio_sigevent, &job->ksi);
900151260Sambrisko
901295331Sjhb	KNOTE_LOCKED(&job->klist, 1);
902154669Sdavidxu
903154669Sdavidxu	if (lj_done) {
904154669Sdavidxu		if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) {
905154669Sdavidxu			lj->lioj_flags |= LIOJ_KEVENT_POSTED;
906154669Sdavidxu			KNOTE_LOCKED(&lj->klist, 1);
907151260Sambrisko		}
908154669Sdavidxu		if ((lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED))
909154669Sdavidxu		    == LIOJ_SIGNAL
910154669Sdavidxu		    && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL ||
911154669Sdavidxu		        lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID)) {
912154669Sdavidxu			aio_sendsig(userp, &lj->lioj_signal, &lj->lioj_ksi);
913154669Sdavidxu			lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
914151260Sambrisko		}
915151260Sambrisko	}
916156024Sdavidxu
917156024Sdavidxunotification_done:
918295331Sjhb	if (job->jobflags & KAIOCB_CHECKSYNC) {
919296277Sjhb		schedule_fsync = false;
920295331Sjhb		TAILQ_FOREACH_SAFE(sjob, &ki->kaio_syncqueue, list, sjobn) {
921303787Sjhb			if (job->fd_file != sjob->fd_file ||
922303787Sjhb			    job->seqno >= sjob->seqno)
923303787Sjhb				continue;
924303787Sjhb			if (--sjob->pending > 0)
925303787Sjhb				continue;
926303787Sjhb			TAILQ_REMOVE(&ki->kaio_syncqueue, sjob, list);
927303787Sjhb			if (!aio_clear_cancel_function_locked(sjob))
928303787Sjhb				continue;
929303787Sjhb			TAILQ_INSERT_TAIL(&ki->kaio_syncready, sjob, list);
930303787Sjhb			schedule_fsync = true;
931157037Sdavidxu		}
932296277Sjhb		if (schedule_fsync)
933296277Sjhb			taskqueue_enqueue(taskqueue_aiod_kick,
934296277Sjhb			    &ki->kaio_sync_task);
935157037Sdavidxu	}
936156024Sdavidxu	if (ki->kaio_flags & KAIO_WAKEUP) {
937154669Sdavidxu		ki->kaio_flags &= ~KAIO_WAKEUP;
938154669Sdavidxu		wakeup(&userp->p_aioinfo);
939151260Sambrisko	}
940151260Sambrisko}
941154669Sdavidxu
942294344Sjhbstatic void
943296277Sjhbaio_schedule_fsync(void *context, int pending)
944296277Sjhb{
945296277Sjhb	struct kaioinfo *ki;
946296277Sjhb	struct kaiocb *job;
947296277Sjhb
948296277Sjhb	ki = context;
949296277Sjhb	AIO_LOCK(ki);
950296277Sjhb	while (!TAILQ_EMPTY(&ki->kaio_syncready)) {
951296277Sjhb		job = TAILQ_FIRST(&ki->kaio_syncready);
952296277Sjhb		TAILQ_REMOVE(&ki->kaio_syncready, job, list);
953296277Sjhb		AIO_UNLOCK(ki);
954296277Sjhb		aio_schedule(job, aio_process_sync);
955296277Sjhb		AIO_LOCK(ki);
956296277Sjhb	}
957296277Sjhb	AIO_UNLOCK(ki);
958296277Sjhb}
959296277Sjhb
960296277Sjhbbool
961296277Sjhbaio_cancel_cleared(struct kaiocb *job)
962296277Sjhb{
963296277Sjhb	struct kaioinfo *ki;
964296277Sjhb
965296277Sjhb	/*
966296277Sjhb	 * The caller should hold the same queue lock held when
967296277Sjhb	 * aio_clear_cancel_function() was called and set this flag
968296277Sjhb	 * ensuring this check sees an up-to-date value.  However,
969296277Sjhb	 * there is no way to assert that.
970296277Sjhb	 */
971296277Sjhb	ki = job->userproc->p_aioinfo;
972296277Sjhb	return ((job->jobflags & KAIOCB_CLEARED) != 0);
973296277Sjhb}
974296277Sjhb
975303787Sjhbstatic bool
976303787Sjhbaio_clear_cancel_function_locked(struct kaiocb *job)
977296277Sjhb{
978296277Sjhb
979303787Sjhb	AIO_LOCK_ASSERT(job->userproc->p_aioinfo, MA_OWNED);
980296277Sjhb	MPASS(job->cancel_fn != NULL);
981296277Sjhb	if (job->jobflags & KAIOCB_CANCELLING) {
982296277Sjhb		job->jobflags |= KAIOCB_CLEARED;
983296277Sjhb		return (false);
984296277Sjhb	}
985296277Sjhb	job->cancel_fn = NULL;
986296277Sjhb	return (true);
987296277Sjhb}
988296277Sjhb
989296277Sjhbbool
990303787Sjhbaio_clear_cancel_function(struct kaiocb *job)
991296277Sjhb{
992296277Sjhb	struct kaioinfo *ki;
993303787Sjhb	bool ret;
994296277Sjhb
995296277Sjhb	ki = job->userproc->p_aioinfo;
996296277Sjhb	AIO_LOCK(ki);
997303787Sjhb	ret = aio_clear_cancel_function_locked(job);
998303787Sjhb	AIO_UNLOCK(ki);
999303787Sjhb	return (ret);
1000303787Sjhb}
1001303787Sjhb
1002303787Sjhbstatic bool
1003303787Sjhbaio_set_cancel_function_locked(struct kaiocb *job, aio_cancel_fn_t *func)
1004303787Sjhb{
1005303787Sjhb
1006303787Sjhb	AIO_LOCK_ASSERT(job->userproc->p_aioinfo, MA_OWNED);
1007303787Sjhb	if (job->jobflags & KAIOCB_CANCELLED)
1008296277Sjhb		return (false);
1009296277Sjhb	job->cancel_fn = func;
1010296277Sjhb	return (true);
1011296277Sjhb}
1012296277Sjhb
1013303787Sjhbbool
1014303787Sjhbaio_set_cancel_function(struct kaiocb *job, aio_cancel_fn_t *func)
1015303787Sjhb{
1016303787Sjhb	struct kaioinfo *ki;
1017303787Sjhb	bool ret;
1018303787Sjhb
1019303787Sjhb	ki = job->userproc->p_aioinfo;
1020303787Sjhb	AIO_LOCK(ki);
1021303787Sjhb	ret = aio_set_cancel_function_locked(job, func);
1022303787Sjhb	AIO_UNLOCK(ki);
1023303787Sjhb	return (ret);
1024303787Sjhb}
1025303787Sjhb
1026296277Sjhbvoid
1027296277Sjhbaio_complete(struct kaiocb *job, long status, int error)
1028296277Sjhb{
1029296277Sjhb	struct kaioinfo *ki;
1030296277Sjhb	struct proc *userp;
1031296277Sjhb
1032296277Sjhb	job->uaiocb._aiocb_private.error = error;
1033296277Sjhb	job->uaiocb._aiocb_private.status = status;
1034296277Sjhb
1035296277Sjhb	userp = job->userproc;
1036296277Sjhb	ki = userp->p_aioinfo;
1037296277Sjhb
1038296277Sjhb	AIO_LOCK(ki);
1039296277Sjhb	KASSERT(!(job->jobflags & KAIOCB_FINISHED),
1040296277Sjhb	    ("duplicate aio_complete"));
1041296277Sjhb	job->jobflags |= KAIOCB_FINISHED;
1042296277Sjhb	if ((job->jobflags & (KAIOCB_QUEUEING | KAIOCB_CANCELLING)) == 0) {
1043296277Sjhb		TAILQ_REMOVE(&ki->kaio_jobqueue, job, plist);
1044296277Sjhb		aio_bio_done_notify(userp, job);
1045296277Sjhb	}
1046296277Sjhb	AIO_UNLOCK(ki);
1047296277Sjhb}
1048296277Sjhb
1049296277Sjhbvoid
1050296277Sjhbaio_cancel(struct kaiocb *job)
1051296277Sjhb{
1052296277Sjhb
1053296277Sjhb	aio_complete(job, -1, ECANCELED);
1054296277Sjhb}
1055296277Sjhb
1056296277Sjhbvoid
1057295331Sjhbaio_switch_vmspace(struct kaiocb *job)
1058294344Sjhb{
1059294344Sjhb
1060295331Sjhb	vmspace_switch_aio(job->userproc->p_vmspace);
1061294344Sjhb}
1062294344Sjhb
106327221Sdyson/*
1064251522Sglebius * The AIO daemon, most of the actual work is done in aio_process_*,
106531456Sdyson * but the setup (and address space mgmt) is done in this routine.
106627221Sdyson */
106727221Sdysonstatic void
1068154669Sdavidxuaio_daemon(void *_id)
106927221Sdyson{
1070295331Sjhb	struct kaiocb *job;
1071294482Sjhb	struct aioproc *aiop;
107255943Sjasone	struct kaioinfo *ki;
1073296277Sjhb	struct proc *p;
1074294344Sjhb	struct vmspace *myvm;
107583366Sjulian	struct thread *td = curthread;
1076154669Sdavidxu	int id = (intptr_t)_id;
107727221Sdyson
107827221Sdyson	/*
1079294344Sjhb	 * Grab an extra reference on the daemon's vmspace so that it
1080294344Sjhb	 * doesn't get freed by jobs that switch to a different
1081294344Sjhb	 * vmspace.
108227221Sdyson	 */
1083294344Sjhb	p = td->td_proc;
1084294344Sjhb	myvm = vmspace_acquire_ref(p);
108527221Sdyson
1086294344Sjhb	KASSERT(p->p_textvp == NULL, ("kthread has a textvp"));
108731443Sdyson
108827221Sdyson	/*
108955943Sjasone	 * Allocate and ready the aio control info.  There is one aiop structure
109055943Sjasone	 * per daemon.
109131443Sdyson	 */
1092111119Simp	aiop = uma_zalloc(aiop_zone, M_WAITOK);
1093294482Sjhb	aiop->aioproc = p;
1094294482Sjhb	aiop->aioprocflags = 0;
109531443Sdyson
109631443Sdyson	/*
109731443Sdyson	 * Wakeup parent process.  (Parent sleeps to keep from blasting away
109883366Sjulian	 * and creating too many daemons.)
109931443Sdyson	 */
1100154669Sdavidxu	sema_post(&aio_newproc_sem);
110131443Sdyson
1102154669Sdavidxu	mtx_lock(&aio_job_mtx);
110355943Sjasone	for (;;) {
110431443Sdyson		/*
110531443Sdyson		 * Take daemon off of free queue
110631443Sdyson		 */
1107294482Sjhb		if (aiop->aioprocflags & AIOP_FREE) {
110827221Sdyson			TAILQ_REMOVE(&aio_freeproc, aiop, list);
1109294482Sjhb			aiop->aioprocflags &= ~AIOP_FREE;
111027221Sdyson		}
111127221Sdyson
111231443Sdyson		/*
111355943Sjasone		 * Check for jobs.
111431443Sdyson		 */
1115295331Sjhb		while ((job = aio_selectjob(aiop)) != NULL) {
1116154669Sdavidxu			mtx_unlock(&aio_job_mtx);
111727221Sdyson
1118296277Sjhb			ki = job->userproc->p_aioinfo;
1119296277Sjhb			job->handle_fn(job);
1120133660Sjmg
1121154671Sdavidxu			mtx_lock(&aio_job_mtx);
1122154671Sdavidxu			/* Decrement the active job count. */
1123154671Sdavidxu			ki->kaio_active_count--;
112427221Sdyson		}
112527221Sdyson
112631443Sdyson		/*
112755943Sjasone		 * Disconnect from user address space.
112831443Sdyson		 */
1129294344Sjhb		if (p->p_vmspace != myvm) {
1130154669Sdavidxu			mtx_unlock(&aio_job_mtx);
1131294344Sjhb			vmspace_switch_aio(myvm);
1132154669Sdavidxu			mtx_lock(&aio_job_mtx);
1133154669Sdavidxu			/*
1134154669Sdavidxu			 * We have to restart to avoid race, we only sleep if
1135294344Sjhb			 * no job can be selected.
1136154669Sdavidxu			 */
1137154669Sdavidxu			continue;
113827221Sdyson		}
113931443Sdyson
1140154669Sdavidxu		mtx_assert(&aio_job_mtx, MA_OWNED);
1141154669Sdavidxu
114231443Sdyson		TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);
1143294482Sjhb		aiop->aioprocflags |= AIOP_FREE;
114431443Sdyson
114531443Sdyson		/*
114655943Sjasone		 * If daemon is inactive for a long time, allow it to exit,
114755943Sjasone		 * thereby freeing resources.
114831443Sdyson		 */
1149294482Sjhb		if (msleep(p, &aio_job_mtx, PRIBIO, "aiordy",
1150294344Sjhb		    aiod_lifetime) == EWOULDBLOCK && TAILQ_EMPTY(&aio_jobs) &&
1151294482Sjhb		    (aiop->aioprocflags & AIOP_FREE) &&
1152294344Sjhb		    num_aio_procs > target_aio_procs)
1153294344Sjhb			break;
115427221Sdyson	}
1155294344Sjhb	TAILQ_REMOVE(&aio_freeproc, aiop, list);
1156294344Sjhb	num_aio_procs--;
1157154669Sdavidxu	mtx_unlock(&aio_job_mtx);
1158294344Sjhb	uma_zfree(aiop_zone, aiop);
1159294344Sjhb	free_unr(aiod_unr, id);
1160294344Sjhb	vmspace_free(myvm);
1161294344Sjhb
1162294344Sjhb	KASSERT(p->p_vmspace == myvm,
1163294344Sjhb	    ("AIOD: bad vmspace for exiting daemon"));
1164294344Sjhb	KASSERT(myvm->vm_refcnt > 1,
1165294344Sjhb	    ("AIOD: bad vm refcnt for exiting daemon: %d", myvm->vm_refcnt));
1166294344Sjhb	kproc_exit(0);
116727221Sdyson}
116827221Sdyson
116927221Sdyson/*
1170154669Sdavidxu * Create a new AIO daemon. This is mostly a kernel-thread fork routine. The
117155943Sjasone * AIO daemon modifies its environment itself.
117227221Sdyson */
117327221Sdysonstatic int
1174154669Sdavidxuaio_newproc(int *start)
117531443Sdyson{
117627221Sdyson	int error;
117774015Salc	struct proc *p;
1178154669Sdavidxu	int id;
117927221Sdyson
1180154669Sdavidxu	id = alloc_unr(aiod_unr);
1181172836Sjulian	error = kproc_create(aio_daemon, (void *)(intptr_t)id, &p,
1182154669Sdavidxu		RFNOWAIT, 0, "aiod%d", id);
1183154669Sdavidxu	if (error == 0) {
1184154669Sdavidxu		/*
1185154669Sdavidxu		 * Wait until daemon is started.
1186154669Sdavidxu		 */
1187154669Sdavidxu		sema_wait(&aio_newproc_sem);
1188154669Sdavidxu		mtx_lock(&aio_job_mtx);
1189154669Sdavidxu		num_aio_procs++;
1190154669Sdavidxu		if (start != NULL)
1191154740Sdavidxu			(*start)--;
1192154669Sdavidxu		mtx_unlock(&aio_job_mtx);
1193154669Sdavidxu	} else {
1194154669Sdavidxu		free_unr(aiod_unr, id);
1195154669Sdavidxu	}
1196109177Salfred	return (error);
119727221Sdyson}
119827221Sdyson
119927221Sdyson/*
1200345393Sasomers * Try the high-performance, low-overhead bio method for eligible
120173559Salc * VCHR devices.  This method doesn't use an aio helper thread, and
1202133660Sjmg * thus has very low overhead.
120373559Salc *
1204154698Sdavidxu * Assumes that the caller, aio_aqueue(), has incremented the file
120573559Salc * structure's reference count, preventing its deallocation for the
1206133660Sjmg * duration of this call.
120731443Sdyson */
120873559Salcstatic int
1209345393Sasomersaio_qbio(struct proc *p, struct kaiocb *job)
121031443Sdyson{
121131443Sdyson	struct aiocb *cb;
121231443Sdyson	struct file *fp;
1213281860Smav	struct bio *bp;
1214281860Smav	struct buf *pbuf;
121531443Sdyson	struct vnode *vp;
1216248794Skib	struct cdevsw *csw;
1217248794Skib	struct cdev *dev;
121831443Sdyson	struct kaioinfo *ki;
1219297464Sjhb	int error, ref, poff;
1220281860Smav	vm_prot_t prot;
122131443Sdyson
1222295331Sjhb	cb = &job->uaiocb;
1223295331Sjhb	fp = job->fd_file;
122431443Sdyson
1225328581Sjhb	if (!(cb->aio_lio_opcode == LIO_WRITE ||
1226328581Sjhb	    cb->aio_lio_opcode == LIO_READ))
1227328581Sjhb		return (-1);
1228251526Sglebius	if (fp == NULL || fp->f_type != DTYPE_VNODE)
122952969Sphk		return (-1);
123031443Sdyson
1231116678Sphk	vp = fp->f_vnode;
1232281860Smav	if (vp->v_type != VCHR)
1233281860Smav		return (-1);
1234155887Sdavidxu	if (vp->v_bufobj.bo_bsize == 0)
1235155887Sdavidxu		return (-1);
1236281860Smav	if (cb->aio_nbytes % vp->v_bufobj.bo_bsize)
123752969Sphk		return (-1);
123831443Sdyson
1239248794Skib	ref = 0;
1240248794Skib	csw = devvn_refthread(vp, &dev, &ref);
1241248794Skib	if (csw == NULL)
1242248794Skib		return (ENXIO);
1243281860Smav
1244281860Smav	if ((csw->d_flags & D_DISK) == 0) {
1245281860Smav		error = -1;
1246281860Smav		goto unref;
1247281860Smav	}
1248248794Skib	if (cb->aio_nbytes > dev->si_iosize_max) {
1249248794Skib		error = -1;
1250248794Skib		goto unref;
1251248794Skib	}
1252248794Skib
1253281860Smav	ki = p->p_aioinfo;
1254281860Smav	poff = (vm_offset_t)cb->aio_buf & PAGE_MASK;
1255297464Sjhb	if ((dev->si_flags & SI_UNMAPPED) && unmapped_buf_allowed) {
1256281860Smav		if (cb->aio_nbytes > MAXPHYS) {
1257281860Smav			error = -1;
1258281860Smav			goto unref;
1259281860Smav		}
1260297464Sjhb
1261297464Sjhb		pbuf = NULL;
1262281860Smav	} else {
1263281860Smav		if (cb->aio_nbytes > MAXPHYS - poff) {
1264281860Smav			error = -1;
1265281860Smav			goto unref;
1266281860Smav		}
1267328582Sjhb		if (ki->kaio_buffer_count >= max_buf_aio) {
1268328581Sjhb			error = EAGAIN;
1269281860Smav			goto unref;
1270281860Smav		}
1271297464Sjhb
1272295331Sjhb		job->pbuf = pbuf = (struct buf *)getpbuf(NULL);
1273281860Smav		BUF_KERNPROC(pbuf);
1274297464Sjhb		AIO_LOCK(ki);
1275281860Smav		ki->kaio_buffer_count++;
1276297464Sjhb		AIO_UNLOCK(ki);
1277297464Sjhb	}
1278297464Sjhb	job->bp = bp = g_alloc_bio();
127931443Sdyson
1280281860Smav	bp->bio_length = cb->aio_nbytes;
1281281860Smav	bp->bio_bcount = cb->aio_nbytes;
1282345393Sasomers	bp->bio_done = aio_biowakeup;
1283281860Smav	bp->bio_offset = cb->aio_offset;
1284281860Smav	bp->bio_cmd = cb->aio_lio_opcode == LIO_WRITE ? BIO_WRITE : BIO_READ;
1285281860Smav	bp->bio_dev = dev;
1286295331Sjhb	bp->bio_caller1 = (void *)job;
128731443Sdyson
1288281860Smav	prot = VM_PROT_READ;
1289281860Smav	if (cb->aio_lio_opcode == LIO_READ)
1290281860Smav		prot |= VM_PROT_WRITE;	/* Less backwards than it looks */
1291297464Sjhb	job->npages = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map,
1292367449Sjhb	    (vm_offset_t)cb->aio_buf, bp->bio_length, prot, job->pages,
1293297464Sjhb	    nitems(job->pages));
1294297464Sjhb	if (job->npages < 0) {
1295109572Sdillon		error = EFAULT;
1296109572Sdillon		goto doerror;
1297109572Sdillon	}
1298297464Sjhb	if (pbuf != NULL) {
1299281860Smav		pmap_qenter((vm_offset_t)pbuf->b_data,
1300295331Sjhb		    job->pages, job->npages);
1301281860Smav		bp->bio_data = pbuf->b_data + poff;
1302297464Sjhb		atomic_add_int(&num_buf_aio, 1);
1303281860Smav	} else {
1304295331Sjhb		bp->bio_ma = job->pages;
1305295331Sjhb		bp->bio_ma_n = job->npages;
1306281860Smav		bp->bio_ma_offset = poff;
1307281860Smav		bp->bio_data = unmapped_buf;
1308281860Smav		bp->bio_flags |= BIO_UNMAPPED;
1309328575Sjhb		atomic_add_int(&num_unmapped_aio, 1);
1310281860Smav	}
131131443Sdyson
131255943Sjasone	/* Perform transfer. */
1313281860Smav	csw->d_strategy(bp);
1314248794Skib	dev_relthread(dev, ref);
1315109177Salfred	return (0);
131631443Sdyson
131731443Sdysondoerror:
1318297464Sjhb	if (pbuf != NULL) {
1319297464Sjhb		AIO_LOCK(ki);
1320281860Smav		ki->kaio_buffer_count--;
1321297464Sjhb		AIO_UNLOCK(ki);
1322281860Smav		relpbuf(pbuf, NULL);
1323295331Sjhb		job->pbuf = NULL;
1324281860Smav	}
1325281860Smav	g_destroy_bio(bp);
1326295331Sjhb	job->bp = NULL;
1327248794Skibunref:
1328248794Skib	dev_relthread(dev, ref);
1329109177Salfred	return (error);
133031443Sdyson}
133131443Sdyson
1332296572Sjhb#ifdef COMPAT_FREEBSD6
1333185878Sjhbstatic int
1334185878Sjhbconvert_old_sigevent(struct osigevent *osig, struct sigevent *nsig)
1335185878Sjhb{
1336185878Sjhb
1337185878Sjhb	/*
1338185878Sjhb	 * Only SIGEV_NONE, SIGEV_SIGNAL, and SIGEV_KEVENT are
1339185878Sjhb	 * supported by AIO with the old sigevent structure.
1340185878Sjhb	 */
1341185878Sjhb	nsig->sigev_notify = osig->sigev_notify;
1342185878Sjhb	switch (nsig->sigev_notify) {
1343185878Sjhb	case SIGEV_NONE:
1344185878Sjhb		break;
1345185878Sjhb	case SIGEV_SIGNAL:
1346185878Sjhb		nsig->sigev_signo = osig->__sigev_u.__sigev_signo;
1347185878Sjhb		break;
1348185878Sjhb	case SIGEV_KEVENT:
1349185878Sjhb		nsig->sigev_notify_kqueue =
1350185878Sjhb		    osig->__sigev_u.__sigev_notify_kqueue;
1351185878Sjhb		nsig->sigev_value.sival_ptr = osig->sigev_value.sival_ptr;
1352185878Sjhb		break;
1353185878Sjhb	default:
1354185878Sjhb		return (EINVAL);
1355185878Sjhb	}
1356185878Sjhb	return (0);
1357185878Sjhb}
1358185878Sjhb
1359185878Sjhbstatic int
1360185878Sjhbaiocb_copyin_old_sigevent(struct aiocb *ujob, struct aiocb *kjob)
1361185878Sjhb{
1362185878Sjhb	struct oaiocb *ojob;
1363185878Sjhb	int error;
1364185878Sjhb
1365185878Sjhb	bzero(kjob, sizeof(struct aiocb));
1366185878Sjhb	error = copyin(ujob, kjob, sizeof(struct oaiocb));
1367185878Sjhb	if (error)
1368185878Sjhb		return (error);
1369185878Sjhb	ojob = (struct oaiocb *)kjob;
1370185878Sjhb	return (convert_old_sigevent(&ojob->aio_sigevent, &kjob->aio_sigevent));
1371185878Sjhb}
1372296572Sjhb#endif
1373185878Sjhb
1374185878Sjhbstatic int
1375185878Sjhbaiocb_copyin(struct aiocb *ujob, struct aiocb *kjob)
1376185878Sjhb{
1377185878Sjhb
1378185878Sjhb	return (copyin(ujob, kjob, sizeof(struct aiocb)));
1379185878Sjhb}
1380185878Sjhb
1381185878Sjhbstatic long
1382185878Sjhbaiocb_fetch_status(struct aiocb *ujob)
1383185878Sjhb{
1384185878Sjhb
1385185878Sjhb	return (fuword(&ujob->_aiocb_private.status));
1386185878Sjhb}
1387185878Sjhb
1388185878Sjhbstatic long
1389185878Sjhbaiocb_fetch_error(struct aiocb *ujob)
1390185878Sjhb{
1391185878Sjhb
1392185878Sjhb	return (fuword(&ujob->_aiocb_private.error));
1393185878Sjhb}
1394185878Sjhb
1395185878Sjhbstatic int
1396185878Sjhbaiocb_store_status(struct aiocb *ujob, long status)
1397185878Sjhb{
1398185878Sjhb
1399185878Sjhb	return (suword(&ujob->_aiocb_private.status, status));
1400185878Sjhb}
1401185878Sjhb
1402185878Sjhbstatic int
1403185878Sjhbaiocb_store_error(struct aiocb *ujob, long error)
1404185878Sjhb{
1405185878Sjhb
1406185878Sjhb	return (suword(&ujob->_aiocb_private.error, error));
1407185878Sjhb}
1408185878Sjhb
1409185878Sjhbstatic int
1410185878Sjhbaiocb_store_kernelinfo(struct aiocb *ujob, long jobref)
1411185878Sjhb{
1412185878Sjhb
1413185878Sjhb	return (suword(&ujob->_aiocb_private.kernelinfo, jobref));
1414185878Sjhb}
1415185878Sjhb
1416185878Sjhbstatic int
1417185878Sjhbaiocb_store_aiocb(struct aiocb **ujobp, struct aiocb *ujob)
1418185878Sjhb{
1419185878Sjhb
1420185878Sjhb	return (suword(ujobp, (long)ujob));
1421185878Sjhb}
1422185878Sjhb
1423185878Sjhbstatic struct aiocb_ops aiocb_ops = {
1424185878Sjhb	.copyin = aiocb_copyin,
1425185878Sjhb	.fetch_status = aiocb_fetch_status,
1426185878Sjhb	.fetch_error = aiocb_fetch_error,
1427185878Sjhb	.store_status = aiocb_store_status,
1428185878Sjhb	.store_error = aiocb_store_error,
1429185878Sjhb	.store_kernelinfo = aiocb_store_kernelinfo,
1430185878Sjhb	.store_aiocb = aiocb_store_aiocb,
1431185878Sjhb};
1432185878Sjhb
1433296572Sjhb#ifdef COMPAT_FREEBSD6
1434185878Sjhbstatic struct aiocb_ops aiocb_ops_osigevent = {
1435185878Sjhb	.copyin = aiocb_copyin_old_sigevent,
1436185878Sjhb	.fetch_status = aiocb_fetch_status,
1437185878Sjhb	.fetch_error = aiocb_fetch_error,
1438185878Sjhb	.store_status = aiocb_store_status,
1439185878Sjhb	.store_error = aiocb_store_error,
1440185878Sjhb	.store_kernelinfo = aiocb_store_kernelinfo,
1441185878Sjhb	.store_aiocb = aiocb_store_aiocb,
1442185878Sjhb};
1443296572Sjhb#endif
1444185878Sjhb
144555943Sjasone/*
1446345393Sasomers * Queue a new AIO request.  Choosing either the threaded or direct bio VCHR
144755943Sjasone * technique is done in this code.
144855943Sjasone */
1449163379Snetchildint
1450295331Sjhbaio_aqueue(struct thread *td, struct aiocb *ujob, struct aioliojob *lj,
1451304738Skib    int type, struct aiocb_ops *ops)
145231443Sdyson{
145383366Sjulian	struct proc *p = td->td_proc;
1454255219Spjd	cap_rights_t rights;
145527221Sdyson	struct file *fp;
1456296277Sjhb	struct kaiocb *job;
145727221Sdyson	struct kaioinfo *ki;
145869002Salc	struct kevent kev;
1459154669Sdavidxu	int opcode;
1460154669Sdavidxu	int error;
1461162594Sjmg	int fd, kqfd;
1462154669Sdavidxu	int jid;
1463230857Sdavidxu	u_short evflags;
146427221Sdyson
1465154698Sdavidxu	if (p->p_aioinfo == NULL)
1466154698Sdavidxu		aio_init_aioinfo(p);
1467154698Sdavidxu
1468154669Sdavidxu	ki = p->p_aioinfo;
1469154669Sdavidxu
1470295331Sjhb	ops->store_status(ujob, -1);
1471295331Sjhb	ops->store_error(ujob, 0);
1472295331Sjhb	ops->store_kernelinfo(ujob, -1);
1473154698Sdavidxu
1474154698Sdavidxu	if (num_queue_count >= max_queue_count ||
1475328582Sjhb	    ki->kaio_count >= max_aio_queue_per_proc) {
1476295331Sjhb		ops->store_error(ujob, EAGAIN);
1477154698Sdavidxu		return (EAGAIN);
1478154698Sdavidxu	}
1479154698Sdavidxu
1480295331Sjhb	job = uma_zalloc(aiocb_zone, M_WAITOK | M_ZERO);
1481295331Sjhb	knlist_init_mtx(&job->klist, AIO_MTX(ki));
148231443Sdyson
1483295331Sjhb	error = ops->copyin(ujob, &job->uaiocb);
148427221Sdyson	if (error) {
1485295331Sjhb		ops->store_error(ujob, error);
1486295331Sjhb		uma_zfree(aiocb_zone, job);
1487109177Salfred		return (error);
148827221Sdyson	}
1489154706Sdavidxu
1490297167Sjhb	if (job->uaiocb.aio_nbytes > IOSIZE_MAX) {
1491295331Sjhb		uma_zfree(aiocb_zone, job);
1492230583Sglebius		return (EINVAL);
1493230583Sglebius	}
1494230583Sglebius
1495295331Sjhb	if (job->uaiocb.aio_sigevent.sigev_notify != SIGEV_KEVENT &&
1496295331Sjhb	    job->uaiocb.aio_sigevent.sigev_notify != SIGEV_SIGNAL &&
1497295331Sjhb	    job->uaiocb.aio_sigevent.sigev_notify != SIGEV_THREAD_ID &&
1498295331Sjhb	    job->uaiocb.aio_sigevent.sigev_notify != SIGEV_NONE) {
1499295331Sjhb		ops->store_error(ujob, EINVAL);
1500295331Sjhb		uma_zfree(aiocb_zone, job);
1501154706Sdavidxu		return (EINVAL);
1502154706Sdavidxu	}
1503185878Sjhb
1504295331Sjhb	if ((job->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL ||
1505295331Sjhb	     job->uaiocb.aio_sigevent.sigev_notify == SIGEV_THREAD_ID) &&
1506295331Sjhb		!_SIG_VALID(job->uaiocb.aio_sigevent.sigev_signo)) {
1507295331Sjhb		uma_zfree(aiocb_zone, job);
1508109177Salfred		return (EINVAL);
150975682Salfred	}
151027221Sdyson
1511295331Sjhb	ksiginfo_init(&job->ksi);
1512151994Sdavidxu
151355943Sjasone	/* Save userspace address of the job info. */
1514295331Sjhb	job->ujob = ujob;
151531473Sdyson
151655943Sjasone	/* Get the opcode. */
151755943Sjasone	if (type != LIO_NOP)
1518295331Sjhb		job->uaiocb.aio_lio_opcode = type;
1519295331Sjhb	opcode = job->uaiocb.aio_lio_opcode;
152027221Sdyson
1521224778Srwatson	/*
1522224778Srwatson	 * Validate the opcode and fetch the file object for the specified
1523224778Srwatson	 * file descriptor.
1524224778Srwatson	 *
1525224778Srwatson	 * XXXRW: Moved the opcode validation up here so that we don't
1526224778Srwatson	 * retrieve a file descriptor without knowing what the capabiltity
1527224778Srwatson	 * should be.
1528224778Srwatson	 */
1529295331Sjhb	fd = job->uaiocb.aio_fildes;
1530152208Sjhb	switch (opcode) {
1531152208Sjhb	case LIO_WRITE:
1532255219Spjd		error = fget_write(td, fd,
1533255219Spjd		    cap_rights_init(&rights, CAP_PWRITE), &fp);
1534152208Sjhb		break;
1535152208Sjhb	case LIO_READ:
1536255219Spjd		error = fget_read(td, fd,
1537255219Spjd		    cap_rights_init(&rights, CAP_PREAD), &fp);
1538152208Sjhb		break;
1539224778Srwatson	case LIO_SYNC:
1540255219Spjd		error = fget(td, fd, cap_rights_init(&rights, CAP_FSYNC), &fp);
1541224778Srwatson		break;
1542251526Sglebius	case LIO_MLOCK:
1543251526Sglebius		fp = NULL;
1544251526Sglebius		break;
1545224778Srwatson	case LIO_NOP:
1546255219Spjd		error = fget(td, fd, cap_rights_init(&rights), &fp);
1547224778Srwatson		break;
1548152208Sjhb	default:
1549224778Srwatson		error = EINVAL;
155027221Sdyson	}
1551152208Sjhb	if (error) {
1552295331Sjhb		uma_zfree(aiocb_zone, job);
1553295331Sjhb		ops->store_error(ujob, error);
1554154073Sjhb		return (error);
155527221Sdyson	}
155627221Sdyson
1557157037Sdavidxu	if (opcode == LIO_SYNC && fp->f_vnode == NULL) {
1558157037Sdavidxu		error = EINVAL;
1559157037Sdavidxu		goto aqueue_fail;
1560157037Sdavidxu	}
1561157037Sdavidxu
1562320356Skib	if ((opcode == LIO_READ || opcode == LIO_WRITE) &&
1563320356Skib	    job->uaiocb.aio_offset < 0 &&
1564320356Skib	    (fp->f_vnode == NULL || fp->f_vnode->v_type != VCHR)) {
156594035Salc		error = EINVAL;
156694035Salc		goto aqueue_fail;
156727221Sdyson	}
1568154669Sdavidxu
1569295331Sjhb	job->fd_file = fp;
1570157037Sdavidxu
1571154669Sdavidxu	mtx_lock(&aio_job_mtx);
1572157037Sdavidxu	jid = jobrefid++;
1573295331Sjhb	job->seqno = jobseqno++;
1574154669Sdavidxu	mtx_unlock(&aio_job_mtx);
1575295331Sjhb	error = ops->store_kernelinfo(ujob, jid);
1576154669Sdavidxu	if (error) {
1577154669Sdavidxu		error = EINVAL;
1578154669Sdavidxu		goto aqueue_fail;
1579154669Sdavidxu	}
1580295331Sjhb	job->uaiocb._aiocb_private.kernelinfo = (void *)(intptr_t)jid;
1581154669Sdavidxu
158227221Sdyson	if (opcode == LIO_NOP) {
158393483Salc		fdrop(fp, td);
1584295331Sjhb		uma_zfree(aiocb_zone, job);
1585109177Salfred		return (0);
158627221Sdyson	}
158727221Sdyson
1588295331Sjhb	if (job->uaiocb.aio_sigevent.sigev_notify != SIGEV_KEVENT)
1589146963Salc		goto no_kqueue;
1590295331Sjhb	evflags = job->uaiocb.aio_sigevent.sigev_notify_kevent_flags;
1591230857Sdavidxu	if ((evflags & ~(EV_CLEAR | EV_DISPATCH | EV_ONESHOT)) != 0) {
1592230857Sdavidxu		error = EINVAL;
1593230857Sdavidxu		goto aqueue_fail;
1594230857Sdavidxu	}
1595295331Sjhb	kqfd = job->uaiocb.aio_sigevent.sigev_notify_kqueue;
1596341076Smarkj	memset(&kev, 0, sizeof(kev));
1597295331Sjhb	kev.ident = (uintptr_t)job->ujob;
159869002Salc	kev.filter = EVFILT_AIO;
1599230857Sdavidxu	kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1 | evflags;
1600295331Sjhb	kev.data = (intptr_t)job;
1601295331Sjhb	kev.udata = job->uaiocb.aio_sigevent.sigev_value.sival_ptr;
1602162594Sjmg	error = kqfd_register(kqfd, &kev, td, 1);
1603296277Sjhb	if (error)
1604296277Sjhb		goto aqueue_fail;
1605296277Sjhb
160659288Sjlemonno_kqueue:
160759288Sjlemon
1608295331Sjhb	ops->store_error(ujob, EINPROGRESS);
1609295331Sjhb	job->uaiocb._aiocb_private.error = EINPROGRESS;
1610295331Sjhb	job->userproc = p;
1611295331Sjhb	job->cred = crhold(td->td_ucred);
1612296277Sjhb	job->jobflags = KAIOCB_QUEUEING;
1613295331Sjhb	job->lio = lj;
161431443Sdyson
1615296277Sjhb	if (opcode == LIO_MLOCK) {
1616296277Sjhb		aio_schedule(job, aio_process_mlock);
1617296277Sjhb		error = 0;
1618296277Sjhb	} else if (fp->f_ops->fo_aio_queue == NULL)
1619296277Sjhb		error = aio_queue_file(fp, job);
1620296277Sjhb	else
1621296277Sjhb		error = fo_aio_queue(fp, job);
1622296277Sjhb	if (error)
1623296277Sjhb		goto aqueue_fail;
1624157037Sdavidxu
1625296277Sjhb	AIO_LOCK(ki);
1626296277Sjhb	job->jobflags &= ~KAIOCB_QUEUEING;
1627296277Sjhb	TAILQ_INSERT_TAIL(&ki->kaio_all, job, allist);
1628296277Sjhb	ki->kaio_count++;
1629296277Sjhb	if (lj)
1630296277Sjhb		lj->lioj_count++;
1631296277Sjhb	atomic_add_int(&num_queue_count, 1);
1632296277Sjhb	if (job->jobflags & KAIOCB_FINISHED) {
163355943Sjasone		/*
1634296277Sjhb		 * The queue callback completed the request synchronously.
1635296277Sjhb		 * The bulk of the completion is deferred in that case
1636296277Sjhb		 * until this point.
163755943Sjasone		 */
1638296277Sjhb		aio_bio_done_notify(p, job);
1639296277Sjhb	} else
1640296277Sjhb		TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, job, plist);
1641296277Sjhb	AIO_UNLOCK(ki);
1642296277Sjhb	return (0);
1643154765Sdavidxu
1644296277Sjhbaqueue_fail:
1645296277Sjhb	knlist_delete(&job->klist, curthread, 0);
1646296277Sjhb	if (fp)
1647296277Sjhb		fdrop(fp, td);
1648296277Sjhb	uma_zfree(aiocb_zone, job);
1649296277Sjhb	ops->store_error(ujob, error);
1650296277Sjhb	return (error);
1651296277Sjhb}
1652154669Sdavidxu
/*
 * Cancel callback for jobs waiting on the shared aio_jobs daemon queue:
 * unlink the job from the queue (unless the cancel request was already
 * cleared by a daemon picking the job up) and complete it as cancelled.
 */
static void
aio_cancel_daemon_job(struct kaiocb *job)
{

	mtx_lock(&aio_job_mtx);
	if (!aio_cancel_cleared(job))
		TAILQ_REMOVE(&aio_jobs, job, list);
	mtx_unlock(&aio_job_mtx);
	aio_cancel(job);
}
1663296277Sjhb
/*
 * Queue a job for servicing by the aio daemon pool.  'func' is the
 * handler a daemon thread will invoke to perform the request.  If a
 * cancel request has already arrived for this job, it is completed as
 * cancelled instead of being queued.
 */
void
aio_schedule(struct kaiocb *job, aio_handle_fn_t *func)
{

	mtx_lock(&aio_job_mtx);
	if (!aio_set_cancel_function(job, aio_cancel_daemon_job)) {
		/* Cancellation already requested; do not queue. */
		mtx_unlock(&aio_job_mtx);
		aio_cancel(job);
		return;
	}
	job->handle_fn = func;
	TAILQ_INSERT_TAIL(&aio_jobs, job, list);
	aio_kick_nowait(job->userproc);
	mtx_unlock(&aio_job_mtx);
}
167955943Sjasone
/*
 * Cancel callback for LIO_SYNC jobs parked on the per-process
 * kaio_syncqueue waiting for earlier I/O to drain: unlink the job
 * (unless the cancel request was already cleared) and complete it as
 * cancelled.
 */
static void
aio_cancel_sync(struct kaiocb *job)
{
	struct kaioinfo *ki;

	ki = job->userproc->p_aioinfo;
	AIO_LOCK(ki);
	if (!aio_cancel_cleared(job))
		TAILQ_REMOVE(&ki->kaio_syncqueue, job, list);
	AIO_UNLOCK(ki);
	aio_cancel(job);
}
1692296277Sjhb
1693296277Sjhbint
1694296277Sjhbaio_queue_file(struct file *fp, struct kaiocb *job)
1695296277Sjhb{
1696296277Sjhb	struct aioliojob *lj;
1697296277Sjhb	struct kaioinfo *ki;
1698296277Sjhb	struct kaiocb *job2;
1699303434Skib	struct vnode *vp;
1700303434Skib	struct mount *mp;
1701328581Sjhb	int error;
1702303434Skib	bool safe;
1703296277Sjhb
1704296277Sjhb	lj = job->lio;
1705296277Sjhb	ki = job->userproc->p_aioinfo;
1706345393Sasomers	error = aio_qbio(job->userproc, job);
1707328581Sjhb	if (error >= 0)
1708328581Sjhb		return (error);
1709303434Skib	safe = false;
1710303434Skib	if (fp->f_type == DTYPE_VNODE) {
1711303434Skib		vp = fp->f_vnode;
1712303434Skib		if (vp->v_type == VREG || vp->v_type == VDIR) {
1713303434Skib			mp = fp->f_vnode->v_mount;
1714303434Skib			if (mp == NULL || (mp->mnt_flag & MNT_LOCAL) != 0)
1715303434Skib				safe = true;
1716303434Skib		}
1717303434Skib	}
1718303460Sjhb	if (!(safe || enable_aio_unsafe)) {
1719303460Sjhb		counted_warning(&unsafe_warningcnt,
1720303460Sjhb		    "is attempting to use unsafe AIO requests");
1721296277Sjhb		return (EOPNOTSUPP);
1722303460Sjhb	}
172331456Sdyson
1724328580Sjhb	switch (job->uaiocb.aio_lio_opcode) {
1725328580Sjhb	case LIO_READ:
1726328580Sjhb	case LIO_WRITE:
1727328580Sjhb		aio_schedule(job, aio_process_rw);
1728328580Sjhb		error = 0;
1729328580Sjhb		break;
1730328580Sjhb	case LIO_SYNC:
1731296277Sjhb		AIO_LOCK(ki);
1732295331Sjhb		TAILQ_FOREACH(job2, &ki->kaio_jobqueue, plist) {
1733295331Sjhb			if (job2->fd_file == job->fd_file &&
1734295331Sjhb			    job2->uaiocb.aio_lio_opcode != LIO_SYNC &&
1735295331Sjhb			    job2->seqno < job->seqno) {
1736295331Sjhb				job2->jobflags |= KAIOCB_CHECKSYNC;
1737295331Sjhb				job->pending++;
1738157073Sdavidxu			}
1739157073Sdavidxu		}
1740296277Sjhb		if (job->pending != 0) {
1741303787Sjhb			if (!aio_set_cancel_function_locked(job,
1742303787Sjhb				aio_cancel_sync)) {
1743296277Sjhb				AIO_UNLOCK(ki);
1744296277Sjhb				aio_cancel(job);
1745296277Sjhb				return (0);
1746157073Sdavidxu			}
1747295331Sjhb			TAILQ_INSERT_TAIL(&ki->kaio_syncqueue, job, list);
1748158373Sdavidxu			AIO_UNLOCK(ki);
1749296277Sjhb			return (0);
1750157073Sdavidxu		}
1751296277Sjhb		AIO_UNLOCK(ki);
1752296277Sjhb		aio_schedule(job, aio_process_sync);
1753296277Sjhb		error = 0;
1754296277Sjhb		break;
1755296277Sjhb	default:
1756296277Sjhb		error = EINVAL;
1757296277Sjhb	}
1758157037Sdavidxu	return (error);
1759157037Sdavidxu}
176027221Sdyson
/*
 * Nudge the daemon pool without sleeping: wake an idle aio daemon if
 * one exists; otherwise, if the per-process and system limits allow
 * another daemon, defer its creation to the taskqueue (aio_newproc()
 * may sleep, which is not permitted here).
 * Called with aio_job_mtx held.
 */
static void
aio_kick_nowait(struct proc *userp)
{
	struct kaioinfo *ki = userp->p_aioinfo;
	struct aioproc *aiop;

	mtx_assert(&aio_job_mtx, MA_OWNED);
	if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) {
		/* Reuse an idle daemon. */
		TAILQ_REMOVE(&aio_freeproc, aiop, list);
		aiop->aioprocflags &= ~AIOP_FREE;
		wakeup(aiop->aioproc);
	} else if (num_aio_resv_start + num_aio_procs < max_aio_procs &&
	    ki->kaio_active_count + num_aio_resv_start < max_aio_per_proc) {
		taskqueue_enqueue(taskqueue_aiod_kick, &ki->kaio_task);
	}
}
1777157037Sdavidxu
/*
 * Like aio_kick_nowait(), but may create a new aio daemon
 * synchronously.  Returns 0 if a daemon was woken or started, or -1
 * when no more daemons can be woken or created (limits reached), so
 * the caller should stop kicking.
 * Called with aio_job_mtx held; the lock is dropped and reacquired
 * around aio_newproc(), which can sleep.
 */
static int
aio_kick(struct proc *userp)
{
	struct kaioinfo *ki = userp->p_aioinfo;
	struct aioproc *aiop;
	int error, ret = 0;

	mtx_assert(&aio_job_mtx, MA_OWNED);
retryproc:
	if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) {
		/* Reuse an idle daemon. */
		TAILQ_REMOVE(&aio_freeproc, aiop, list);
		aiop->aioprocflags &= ~AIOP_FREE;
		wakeup(aiop->aioproc);
	} else if (num_aio_resv_start + num_aio_procs < max_aio_procs &&
	    ki->kaio_active_count + num_aio_resv_start < max_aio_per_proc) {
		/* Reserve a slot, then create the daemon unlocked. */
		num_aio_resv_start++;
		mtx_unlock(&aio_job_mtx);
		error = aio_newproc(&num_aio_resv_start);
		mtx_lock(&aio_job_mtx);
		if (error) {
			num_aio_resv_start--;
			/* Creation failed; an idle daemon may exist now. */
			goto retryproc;
		}
	} else {
		ret = -1;
	}
	return (ret);
}
1806157037Sdavidxu
1807157037Sdavidxustatic void
1808157037Sdavidxuaio_kick_helper(void *context, int pending)
1809157037Sdavidxu{
1810157037Sdavidxu	struct proc *userp = context;
1811157037Sdavidxu
1812157037Sdavidxu	mtx_lock(&aio_job_mtx);
1813157073Sdavidxu	while (--pending >= 0) {
1814157073Sdavidxu		if (aio_kick(userp))
1815157073Sdavidxu			break;
1816157073Sdavidxu	}
1817154669Sdavidxu	mtx_unlock(&aio_job_mtx);
181827221Sdyson}
181927221Sdyson
182031443Sdyson/*
182155943Sjasone * Support the aio_return system call, as a side-effect, kernel resources are
182255943Sjasone * released.
182327221Sdyson */
/*
 * Common code for aio_return(2): look up the completed request by its
 * userspace aiocb pointer, report its status/error back to userland
 * via 'ops', charge the job's accumulated I/O counts to the calling
 * thread, and release the kernel request.  Fails with EINVAL if the
 * request is unknown or has not finished yet (only finished jobs are
 * on kaio_done).
 */
static int
kern_aio_return(struct thread *td, struct aiocb *ujob, struct aiocb_ops *ops)
{
	struct proc *p = td->td_proc;
	struct kaiocb *job;
	struct kaioinfo *ki;
	long status, error;

	ki = p->p_aioinfo;
	if (ki == NULL)
		return (EINVAL);
	AIO_LOCK(ki);
	TAILQ_FOREACH(job, &ki->kaio_done, plist) {
		if (job->ujob == ujob)
			break;
	}
	if (job != NULL) {
		MPASS(job->jobflags & KAIOCB_FINISHED);
		status = job->uaiocb._aiocb_private.status;
		error = job->uaiocb._aiocb_private.error;
		td->td_retval[0] = status;
		/* Credit the job's resource usage to the caller. */
		td->td_ru.ru_oublock += job->outblock;
		td->td_ru.ru_inblock += job->inblock;
		td->td_ru.ru_msgsnd += job->msgsnd;
		td->td_ru.ru_msgrcv += job->msgrcv;
		aio_free_entry(job);
		AIO_UNLOCK(ki);
		ops->store_error(ujob, error);
		ops->store_status(ujob, status);
	} else {
		error = EINVAL;
		AIO_UNLOCK(ki);
	}
	return (error);
}
185927221Sdyson
/* syscall - retrieve the return status of an aio request (REALTIME) */
int
sys_aio_return(struct thread *td, struct aio_return_args *uap)
{

	return (kern_aio_return(td, uap->aiocbp, &aiocb_ops));
}
1866185878Sjhb
186727221Sdyson/*
186855943Sjasone * Allow a process to wakeup when any of the I/O requests are completed.
186927221Sdyson */
/*
 * Common code for aio_suspend(2): sleep until at least one of the
 * 'njoblist' requests named in 'ujoblist' has finished, or the
 * optional timeout 'ts' expires (ts == NULL means wait forever).
 * Returns 0 once a listed request is finished (or none of the listed
 * requests are still known to the kernel), EINTR/EWOULDBLOCK on
 * interrupted or expired sleeps, EAGAIN if the process has no AIO
 * state at all.
 */
static int
kern_aio_suspend(struct thread *td, int njoblist, struct aiocb **ujoblist,
    struct timespec *ts)
{
	struct proc *p = td->td_proc;
	struct timeval atv;
	struct kaioinfo *ki;
	struct kaiocb *firstjob, *job;
	int error, i, timo;

	/* timo == 0 sleeps without a timeout. */
	timo = 0;
	if (ts) {
		if (ts->tv_nsec < 0 || ts->tv_nsec >= 1000000000)
			return (EINVAL);

		TIMESPEC_TO_TIMEVAL(&atv, ts);
		if (itimerfix(&atv))
			return (EINVAL);
		timo = tvtohz(&atv);
	}

	ki = p->p_aioinfo;
	if (ki == NULL)
		return (EAGAIN);

	if (njoblist == 0)
		return (0);

	AIO_LOCK(ki);
	for (;;) {
		firstjob = NULL;
		error = 0;
		/* Scan every live job for a match against the user list. */
		TAILQ_FOREACH(job, &ki->kaio_all, allist) {
			for (i = 0; i < njoblist; i++) {
				if (job->ujob == ujoblist[i]) {
					if (firstjob == NULL)
						firstjob = job;
					if (job->jobflags & KAIOCB_FINISHED)
						goto RETURN;
				}
			}
		}
		/* All tasks were finished. */
		if (firstjob == NULL)
			break;

		/* Listed jobs still in flight; sleep for a completion. */
		ki->kaio_flags |= KAIO_WAKEUP;
		error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH,
		    "aiospn", timo);
		if (error == ERESTART)
			error = EINTR;
		if (error)
			break;
	}
RETURN:
	AIO_UNLOCK(ki);
	return (error);
}
1928185878Sjhb
/*
 * syscall - suspend until one of the listed aio requests completes
 * (REALTIME).  Copies in the optional timeout and the aiocb pointer
 * array, then defers to kern_aio_suspend().
 */
int
sys_aio_suspend(struct thread *td, struct aio_suspend_args *uap)
{
	struct timespec ts, *tsp;
	struct aiocb **ujoblist;
	int error;

	if (uap->nent < 0 || uap->nent > max_aio_queue_per_proc)
		return (EINVAL);

	if (uap->timeout) {
		/* Get timespec struct. */
		if ((error = copyin(uap->timeout, &ts, sizeof(ts))) != 0)
			return (error);
		tsp = &ts;
	} else
		tsp = NULL;

	ujoblist = malloc(uap->nent * sizeof(ujoblist[0]), M_AIOS, M_WAITOK);
	error = copyin(uap->aiocbp, ujoblist, uap->nent * sizeof(ujoblist[0]));
	if (error == 0)
		error = kern_aio_suspend(td, uap->nent, ujoblist, tsp);
	free(ujoblist, M_AIOS);
	return (error);
}
195427221Sdyson
195527221Sdyson/*
1956345393Sasomers * aio_cancel cancels any non-bio aio operations not currently in progress.
195726670Sdyson */
/*
 * syscall - cancel outstanding aio requests on a file descriptor,
 * optionally restricted to the single request named by uap->aiocbp.
 * Requests against disk-backed vnodes use the bio path and cannot be
 * cancelled, so AIO_NOTCANCELED is returned for them immediately.
 * td_retval[0] reports AIO_CANCELED, AIO_NOTCANCELED, or AIO_ALLDONE
 * per POSIX.
 */
int
sys_aio_cancel(struct thread *td, struct aio_cancel_args *uap)
{
	struct proc *p = td->td_proc;
	struct kaioinfo *ki;
	struct kaiocb *job, *jobn;
	struct file *fp;
	cap_rights_t rights;
	int error;
	int cancelled = 0;
	int notcancelled = 0;
	struct vnode *vp;

	/* Lookup file object. */
	error = fget(td, uap->fd, cap_rights_init(&rights), &fp);
	if (error)
		return (error);

	ki = p->p_aioinfo;
	if (ki == NULL)
		goto done;

	if (fp->f_type == DTYPE_VNODE) {
		vp = fp->f_vnode;
		if (vn_isdisk(vp, &error)) {
			/* Bio-based requests are not cancellable. */
			fdrop(fp, td);
			td->td_retval[0] = AIO_NOTCANCELED;
			return (0);
		}
	}

	AIO_LOCK(ki);
	/* SAFE variant: aio_cancel_job() may unlink the current entry. */
	TAILQ_FOREACH_SAFE(job, &ki->kaio_jobqueue, plist, jobn) {
		if ((uap->fd == job->uaiocb.aio_fildes) &&
		    ((uap->aiocbp == NULL) ||
		     (uap->aiocbp == job->ujob))) {
			if (aio_cancel_job(p, ki, job)) {
				cancelled++;
			} else {
				notcancelled++;
			}
			if (uap->aiocbp != NULL)
				break;
		}
	}
	AIO_UNLOCK(ki);

done:
	fdrop(fp, td);

	if (uap->aiocbp != NULL) {
		if (cancelled) {
			td->td_retval[0] = AIO_CANCELED;
			return (0);
		}
	}

	if (notcancelled) {
		td->td_retval[0] = AIO_NOTCANCELED;
		return (0);
	}

	if (cancelled) {
		td->td_retval[0] = AIO_CANCELED;
		return (0);
	}

	/* No matching requests were in flight. */
	td->td_retval[0] = AIO_ALLDONE;

	return (0);
}
202926670Sdyson
203026670Sdyson/*
2031167232Srwatson * aio_error is implemented in the kernel level for compatibility purposes
2032167232Srwatson * only.  For a user mode async implementation, it would be best to do it in
2033167232Srwatson * a userland subroutine.
203426670Sdyson */
2035185878Sjhbstatic int
2036295331Sjhbkern_aio_error(struct thread *td, struct aiocb *ujob, struct aiocb_ops *ops)
203731443Sdyson{
203883366Sjulian	struct proc *p = td->td_proc;
2039295331Sjhb	struct kaiocb *job;
204027221Sdyson	struct kaioinfo *ki;
2041154669Sdavidxu	int status;
204226670Sdyson
204327221Sdyson	ki = p->p_aioinfo;
2044154669Sdavidxu	if (ki == NULL) {
2045154669Sdavidxu		td->td_retval[0] = EINVAL;
2046154669Sdavidxu		return (0);
2047154669Sdavidxu	}
204827221Sdyson
2049158373Sdavidxu	AIO_LOCK(ki);
2050295331Sjhb	TAILQ_FOREACH(job, &ki->kaio_all, allist) {
2051295331Sjhb		if (job->ujob == ujob) {
2052296277Sjhb			if (job->jobflags & KAIOCB_FINISHED)
2053154669Sdavidxu				td->td_retval[0] =
2054295331Sjhb					job->uaiocb._aiocb_private.error;
2055154669Sdavidxu			else
2056154669Sdavidxu				td->td_retval[0] = EINPROGRESS;
2057158373Sdavidxu			AIO_UNLOCK(ki);
2058109177Salfred			return (0);
205927221Sdyson		}
206026670Sdyson	}
2061158373Sdavidxu	AIO_UNLOCK(ki);
206231456Sdyson
206327221Sdyson	/*
2064154698Sdavidxu	 * Hack for failure of aio_aqueue.
206527221Sdyson	 */
2066295331Sjhb	status = ops->fetch_status(ujob);
2067154669Sdavidxu	if (status == -1) {
2068295331Sjhb		td->td_retval[0] = ops->fetch_error(ujob);
2069154669Sdavidxu		return (0);
2070154669Sdavidxu	}
2071154669Sdavidxu
2072154669Sdavidxu	td->td_retval[0] = EINVAL;
2073154669Sdavidxu	return (0);
207426670Sdyson}
207526670Sdyson
/* syscall - retrieve the error status of an aio request (REALTIME) */
int
sys_aio_error(struct thread *td, struct aio_error_args *uap)
{

	return (kern_aio_error(td, uap->aiocbp, &aiocb_ops));
}
2082185878Sjhb
208391690Seivind/* syscall - asynchronous read from a file (REALTIME) */
#ifdef COMPAT_FREEBSD6
/* Old-sigevent (osigevent) compatibility entry point for FreeBSD 6. */
int
freebsd6_aio_read(struct thread *td, struct freebsd6_aio_read_args *uap)
{

	return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ,
	    &aiocb_ops_osigevent));
}
#endif
2093151867Sdavidxu
/* Queue an asynchronous read via the common aio_aqueue() path. */
int
sys_aio_read(struct thread *td, struct aio_read_args *uap)
{

	return (aio_aqueue(td, uap->aiocbp, NULL, LIO_READ, &aiocb_ops));
}
210026670Sdyson
210191690Seivind/* syscall - asynchronous write to a file (REALTIME) */
#ifdef COMPAT_FREEBSD6
/* Old-sigevent (osigevent) compatibility entry point for FreeBSD 6. */
int
freebsd6_aio_write(struct thread *td, struct freebsd6_aio_write_args *uap)
{

	return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE,
	    &aiocb_ops_osigevent));
}
#endif
2111151867Sdavidxu
/* Queue an asynchronous write via the common aio_aqueue() path. */
int
sys_aio_write(struct thread *td, struct aio_write_args *uap)
{

	return (aio_aqueue(td, uap->aiocbp, NULL, LIO_WRITE, &aiocb_ops));
}
211826670Sdyson
/* syscall - asynchronously wire the pages backing a buffer (aio_mlock) */
int
sys_aio_mlock(struct thread *td, struct aio_mlock_args *uap)
{

	return (aio_aqueue(td, uap->aiocbp, NULL, LIO_MLOCK, &aiocb_ops));
}
2125251526Sglebius
/*
 * Common code for lio_listio(2): queue up to 'nent' requests from
 * 'acb_list' as a single job set.  In LIO_WAIT mode, sleep until all
 * requests in the set have finished; in LIO_NOWAIT mode, arrange for
 * 'sig' (signal, thread-id signal, or kevent) to be delivered when the
 * set completes.  Returns EIO if any request failed to queue, EAGAIN
 * if any was refused due to resource limits, otherwise 0 or a sleep
 * error.
 */
static int
kern_lio_listio(struct thread *td, int mode, struct aiocb * const *uacb_list,
    struct aiocb **acb_list, int nent, struct sigevent *sig,
    struct aiocb_ops *ops)
{
	struct proc *p = td->td_proc;
	struct aiocb *job;
	struct kaioinfo *ki;
	struct aioliojob *lj;
	struct kevent kev;
	int error;
	int nagain, nerror;
	int i;

	if ((mode != LIO_NOWAIT) && (mode != LIO_WAIT))
		return (EINVAL);

	if (nent < 0 || nent > max_aio_queue_per_proc)
		return (EINVAL);

	if (p->p_aioinfo == NULL)
		aio_init_aioinfo(p);

	ki = p->p_aioinfo;

	lj = uma_zalloc(aiolio_zone, M_WAITOK);
	lj->lioj_flags = 0;
	lj->lioj_count = 0;
	lj->lioj_finished_count = 0;
	knlist_init_mtx(&lj->klist, AIO_MTX(ki));
	ksiginfo_init(&lj->lioj_ksi);

	/*
	 * Setup signal.
	 */
	if (sig && (mode == LIO_NOWAIT)) {
		bcopy(sig, &lj->lioj_signal, sizeof(lj->lioj_signal));
		if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) {
			/* Assume only new style KEVENT */
			memset(&kev, 0, sizeof(kev));
			kev.filter = EVFILT_LIO;
			kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1;
			kev.ident = (uintptr_t)uacb_list; /* something unique */
			kev.data = (intptr_t)lj;
			/* pass user defined sigval data */
			kev.udata = lj->lioj_signal.sigev_value.sival_ptr;
			error = kqfd_register(
			    lj->lioj_signal.sigev_notify_kqueue, &kev, td, 1);
			if (error) {
				uma_zfree(aiolio_zone, lj);
				return (error);
			}
		} else if (lj->lioj_signal.sigev_notify == SIGEV_NONE) {
			;
		} else if (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL ||
			   lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID) {
				if (!_SIG_VALID(lj->lioj_signal.sigev_signo)) {
					uma_zfree(aiolio_zone, lj);
					return EINVAL;
				}
				lj->lioj_flags |= LIOJ_SIGNAL;
		} else {
			uma_zfree(aiolio_zone, lj);
			return EINVAL;
		}
	}

	AIO_LOCK(ki);
	TAILQ_INSERT_TAIL(&ki->kaio_liojoblist, lj, lioj_list);
	/*
	 * Add extra aiocb count to avoid the lio to be freed
	 * by other threads doing aio_waitcomplete or aio_return,
	 * and prevent event from being sent until we have queued
	 * all tasks.
	 */
	lj->lioj_count = 1;
	AIO_UNLOCK(ki);

	/*
	 * Get pointers to the list of I/O requests.
	 */
	nagain = 0;
	nerror = 0;
	for (i = 0; i < nent; i++) {
		job = acb_list[i];
		if (job != NULL) {
			/* Each queued request bumps lioj_count. */
			error = aio_aqueue(td, job, lj, LIO_NOP, ops);
			if (error == EAGAIN)
				nagain++;
			else if (error != 0)
				nerror++;
		}
	}

	error = 0;
	AIO_LOCK(ki);
	if (mode == LIO_WAIT) {
		/* Sleep until all queued requests have finished. */
		while (lj->lioj_count - 1 != lj->lioj_finished_count) {
			ki->kaio_flags |= KAIO_WAKEUP;
			error = msleep(&p->p_aioinfo, AIO_MTX(ki),
			    PRIBIO | PCATCH, "aiospn", 0);
			if (error == ERESTART)
				error = EINTR;
			if (error)
				break;
		}
	} else {
		/*
		 * Everything may already have completed while we were
		 * queueing; if so, deliver the notification ourselves.
		 */
		if (lj->lioj_count - 1 == lj->lioj_finished_count) {
			if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) {
				lj->lioj_flags |= LIOJ_KEVENT_POSTED;
				KNOTE_LOCKED(&lj->klist, 1);
			}
			if ((lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED))
			    == LIOJ_SIGNAL
			    && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL ||
			    lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID)) {
				aio_sendsig(p, &lj->lioj_signal,
					    &lj->lioj_ksi);
				lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
			}
		}
	}
	/* Drop our extra reference; free the set if we held the last one. */
	lj->lioj_count--;
	if (lj->lioj_count == 0) {
		TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
		knlist_delete(&lj->klist, curthread, 1);
		PROC_LOCK(p);
		sigqueue_take(&lj->lioj_ksi);
		PROC_UNLOCK(p);
		AIO_UNLOCK(ki);
		uma_zfree(aiolio_zone, lj);
	} else
		AIO_UNLOCK(ki);

	if (nerror)
		return (EIO);
	else if (nagain)
		return (EAGAIN);
	else
		return (error);
}
226731443Sdyson
2268185878Sjhb/* syscall - list directed I/O (REALTIME) */
#ifdef COMPAT_FREEBSD6
/*
 * FreeBSD 6 compatibility entry point: converts the old osigevent to
 * the current sigevent before deferring to kern_lio_listio().
 */
int
freebsd6_lio_listio(struct thread *td, struct freebsd6_lio_listio_args *uap)
{
	struct aiocb **acb_list;
	struct sigevent *sigp, sig;
	struct osigevent osig;
	int error, nent;

	if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
		return (EINVAL);

	nent = uap->nent;
	if (nent < 0 || nent > max_aio_queue_per_proc)
		return (EINVAL);

	if (uap->sig && (uap->mode == LIO_NOWAIT)) {
		error = copyin(uap->sig, &osig, sizeof(osig));
		if (error)
			return (error);
		error = convert_old_sigevent(&osig, &sig);
		if (error)
			return (error);
		sigp = &sig;
	} else
		sigp = NULL;

	acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK);
	error = copyin(uap->acb_list, acb_list, nent * sizeof(acb_list[0]));
	if (error == 0)
		error = kern_lio_listio(td, uap->mode,
		    (struct aiocb * const *)uap->acb_list, acb_list, nent, sigp,
		    &aiocb_ops_osigevent);
	free(acb_list, M_LIO);
	return (error);
}
#endif
2306185878Sjhb
2307185878Sjhb/* syscall - list directed I/O (REALTIME) */
/*
 * syscall - list directed I/O (REALTIME).  Copies in the optional
 * sigevent and the aiocb pointer array, then defers to
 * kern_lio_listio().
 */
int
sys_lio_listio(struct thread *td, struct lio_listio_args *uap)
{
	struct aiocb **acb_list;
	struct sigevent *sigp, sig;
	int error, nent;

	if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
		return (EINVAL);

	nent = uap->nent;
	if (nent < 0 || nent > max_aio_queue_per_proc)
		return (EINVAL);

	if (uap->sig && (uap->mode == LIO_NOWAIT)) {
		error = copyin(uap->sig, &sig, sizeof(sig));
		if (error)
			return (error);
		sigp = &sig;
	} else
		sigp = NULL;

	acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK);
	error = copyin(uap->acb_list, acb_list, nent * sizeof(acb_list[0]));
	if (error == 0)
		error = kern_lio_listio(td, uap->mode, uap->acb_list, acb_list,
		    nent, sigp, &aiocb_ops);
	free(acb_list, M_LIO);
	return (error);
}
2338185878Sjhb
/*
 * Completion callback for bio-based aio requests: tear down the
 * kernel mapping / page wiring set up when the bio was issued, fold
 * the transfer's block counts into the job's accounting, and complete
 * the job with the transferred byte count or the bio's error.
 */
static void
aio_biowakeup(struct bio *bp)
{
	struct kaiocb *job = (struct kaiocb *)bp->bio_caller1;
	struct proc *userp;
	struct kaioinfo *ki;
	size_t nbytes;
	int error, nblks;

	/* Release mapping into kernel space. */
	userp = job->userproc;
	ki = userp->p_aioinfo;
	if (job->pbuf) {
		/* Mapped transfer: unmap, release the pbuf, drop counts. */
		pmap_qremove((vm_offset_t)job->pbuf->b_data, job->npages);
		relpbuf(job->pbuf, NULL);
		job->pbuf = NULL;
		atomic_subtract_int(&num_buf_aio, 1);
		AIO_LOCK(ki);
		ki->kaio_buffer_count--;
		AIO_UNLOCK(ki);
	} else
		atomic_subtract_int(&num_unmapped_aio, 1);
	vm_page_unhold_pages(job->pages, job->npages);

	bp = job->bp;
	job->bp = NULL;
	/* Bytes actually transferred = requested minus residual. */
	nbytes = job->uaiocb.aio_nbytes - bp->bio_resid;
	error = 0;
	if (bp->bio_flags & BIO_ERROR)
		error = bp->bio_error;
	nblks = btodb(nbytes);
	if (job->uaiocb.aio_lio_opcode == LIO_WRITE)
		job->outblock += nblks;
	else
		job->inblock += nblks;

	if (error)
		aio_complete(job, -1, error);
	else
		aio_complete(job, nbytes, 0);

	g_destroy_bio(bp);
}
238255943Sjasone
238391690Seivind/* syscall - wait for the next completion of an aio request */
/*
 * Common code for aio_waitcomplete(2): wait for any of the process's
 * aio requests to finish, then report its status and release it, as
 * aio_return(2) would.  'ts' == NULL waits forever; a zero timespec
 * polls (EWOULDBLOCK if nothing is done); otherwise it bounds the
 * sleep.  On success the completed request's userspace aiocb pointer
 * is stored through 'ujobp'.
 */
static int
kern_aio_waitcomplete(struct thread *td, struct aiocb **ujobp,
    struct timespec *ts, struct aiocb_ops *ops)
{
	struct proc *p = td->td_proc;
	struct timeval atv;
	struct kaioinfo *ki;
	struct kaiocb *job;
	struct aiocb *ujob;
	long error, status;
	int timo;

	ops->store_aiocb(ujobp, NULL);

	/* timo: 0 = wait forever, -1 = poll, >0 = bounded sleep (ticks). */
	if (ts == NULL) {
		timo = 0;
	} else if (ts->tv_sec == 0 && ts->tv_nsec == 0) {
		timo = -1;
	} else {
		if ((ts->tv_nsec < 0) || (ts->tv_nsec >= 1000000000))
			return (EINVAL);

		TIMESPEC_TO_TIMEVAL(&atv, ts);
		if (itimerfix(&atv))
			return (EINVAL);
		timo = tvtohz(&atv);
	}

	if (p->p_aioinfo == NULL)
		aio_init_aioinfo(p);
	ki = p->p_aioinfo;

	error = 0;
	job = NULL;
	AIO_LOCK(ki);
	while ((job = TAILQ_FIRST(&ki->kaio_done)) == NULL) {
		if (timo == -1) {
			/* Polling mode and nothing is finished. */
			error = EWOULDBLOCK;
			break;
		}
		ki->kaio_flags |= KAIO_WAKEUP;
		error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH,
		    "aiowc", timo);
		if (timo && error == ERESTART)
			error = EINTR;
		if (error)
			break;
	}

	if (job != NULL) {
		MPASS(job->jobflags & KAIOCB_FINISHED);
		ujob = job->ujob;
		status = job->uaiocb._aiocb_private.status;
		error = job->uaiocb._aiocb_private.error;
		td->td_retval[0] = status;
		/* Credit the job's resource usage to the caller. */
		td->td_ru.ru_oublock += job->outblock;
		td->td_ru.ru_inblock += job->inblock;
		td->td_ru.ru_msgsnd += job->msgsnd;
		td->td_ru.ru_msgrcv += job->msgrcv;
		aio_free_entry(job);
		AIO_UNLOCK(ki);
		ops->store_aiocb(ujobp, ujob);
		ops->store_error(ujob, error);
		ops->store_status(ujob, status);
	} else
		AIO_UNLOCK(ki);

	return (error);
}
245359288Sjlemon
2454157037Sdavidxuint
2455225617Skmacysys_aio_waitcomplete(struct thread *td, struct aio_waitcomplete_args *uap)
2456157037Sdavidxu{
2457185878Sjhb	struct timespec ts, *tsp;
2458185878Sjhb	int error;
2459185878Sjhb
2460185878Sjhb	if (uap->timeout) {
2461185878Sjhb		/* Get timespec struct. */
2462185878Sjhb		error = copyin(uap->timeout, &ts, sizeof(ts));
2463185878Sjhb		if (error)
2464185878Sjhb			return (error);
2465185878Sjhb		tsp = &ts;
2466185878Sjhb	} else
2467185878Sjhb		tsp = NULL;
2468185878Sjhb
2469185878Sjhb	return (kern_aio_waitcomplete(td, uap->aiocbp, tsp, &aiocb_ops));
2470185878Sjhb}
2471185878Sjhb
2472185878Sjhbstatic int
2473295331Sjhbkern_aio_fsync(struct thread *td, int op, struct aiocb *ujob,
2474185878Sjhb    struct aiocb_ops *ops)
2475185878Sjhb{
2476157037Sdavidxu
2477185878Sjhb	if (op != O_SYNC) /* XXX lack of O_DSYNC */
2478157037Sdavidxu		return (EINVAL);
2479295331Sjhb	return (aio_aqueue(td, ujob, NULL, LIO_SYNC, ops));
2480157037Sdavidxu}
2481157037Sdavidxu
2482185878Sjhbint
2483225617Skmacysys_aio_fsync(struct thread *td, struct aio_fsync_args *uap)
2484185878Sjhb{
2485185878Sjhb
2486185878Sjhb	return (kern_aio_fsync(td, uap->op, uap->aiocbp, &aiocb_ops));
2487185878Sjhb}
2488185878Sjhb
248991690Seivind/* kqueue attach function */
249059288Sjlemonstatic int
249159288Sjlemonfilt_aioattach(struct knote *kn)
249259288Sjlemon{
2493295331Sjhb	struct kaiocb *job = (struct kaiocb *)kn->kn_sdata;
249459288Sjlemon
249559288Sjlemon	/*
2496295331Sjhb	 * The job pointer must be validated before using it, so
249759288Sjlemon	 * registration is restricted to the kernel; the user cannot
249859288Sjlemon	 * set EV_FLAG1.
249959288Sjlemon	 */
250059288Sjlemon	if ((kn->kn_flags & EV_FLAG1) == 0)
250159288Sjlemon		return (EPERM);
2502295331Sjhb	kn->kn_ptr.p_aio = job;
250359288Sjlemon	kn->kn_flags &= ~EV_FLAG1;
250459288Sjlemon
2505295331Sjhb	knlist_add(&job->klist, kn, 0);
250659288Sjlemon
250759288Sjlemon	return (0);
250859288Sjlemon}
250959288Sjlemon
251091690Seivind/* kqueue detach function */
251159288Sjlemonstatic void
251259288Sjlemonfilt_aiodetach(struct knote *kn)
251359288Sjlemon{
2514230778Sambrisko	struct knlist *knl;
251559288Sjlemon
2516230778Sambrisko	knl = &kn->kn_ptr.p_aio->klist;
2517230778Sambrisko	knl->kl_lock(knl->kl_lockarg);
2518230778Sambrisko	if (!knlist_empty(knl))
2519230778Sambrisko		knlist_remove(knl, kn, 1);
2520230778Sambrisko	knl->kl_unlock(knl->kl_lockarg);
252159288Sjlemon}
252259288Sjlemon
252391690Seivind/* kqueue filter function */
252459288Sjlemon/*ARGSUSED*/
252559288Sjlemonstatic int
252659288Sjlemonfilt_aio(struct knote *kn, long hint)
252759288Sjlemon{
2528295331Sjhb	struct kaiocb *job = kn->kn_ptr.p_aio;
252959288Sjlemon
2530295331Sjhb	kn->kn_data = job->uaiocb._aiocb_private.error;
2531296277Sjhb	if (!(job->jobflags & KAIOCB_FINISHED))
253259288Sjlemon		return (0);
2533133660Sjmg	kn->kn_flags |= EV_EOF;
253459288Sjlemon	return (1);
253559288Sjlemon}
2536151260Sambrisko
2537151260Sambrisko/* kqueue attach function */
2538151260Sambriskostatic int
2539151260Sambriskofilt_lioattach(struct knote *kn)
2540151260Sambrisko{
2541154669Sdavidxu	struct aioliojob * lj = (struct aioliojob *)kn->kn_sdata;
2542151260Sambrisko
2543151260Sambrisko	/*
2544154669Sdavidxu	 * The aioliojob pointer must be validated before using it, so
2545151260Sambrisko	 * registration is restricted to the kernel; the user cannot
2546151260Sambrisko	 * set EV_FLAG1.
2547151260Sambrisko	 */
2548151260Sambrisko	if ((kn->kn_flags & EV_FLAG1) == 0)
2549151260Sambrisko		return (EPERM);
2550175642Sdumbbell	kn->kn_ptr.p_lio = lj;
2551151260Sambrisko	kn->kn_flags &= ~EV_FLAG1;
2552151260Sambrisko
2553151260Sambrisko	knlist_add(&lj->klist, kn, 0);
2554151260Sambrisko
2555151260Sambrisko	return (0);
2556151260Sambrisko}
2557151260Sambrisko
2558151260Sambrisko/* kqueue detach function */
2559151260Sambriskostatic void
2560151260Sambriskofilt_liodetach(struct knote *kn)
2561151260Sambrisko{
2562230778Sambrisko	struct knlist *knl;
2563151260Sambrisko
2564230778Sambrisko	knl = &kn->kn_ptr.p_lio->klist;
2565230778Sambrisko	knl->kl_lock(knl->kl_lockarg);
2566230778Sambrisko	if (!knlist_empty(knl))
2567230778Sambrisko		knlist_remove(knl, kn, 1);
2568230778Sambrisko	knl->kl_unlock(knl->kl_lockarg);
2569151260Sambrisko}
2570151260Sambrisko
2571151260Sambrisko/* kqueue filter function */
2572151260Sambrisko/*ARGSUSED*/
2573151260Sambriskostatic int
2574151260Sambriskofilt_lio(struct knote *kn, long hint)
2575151260Sambrisko{
2576175642Sdumbbell	struct aioliojob * lj = kn->kn_ptr.p_lio;
2577154669Sdavidxu
2578151260Sambrisko	return (lj->lioj_flags & LIOJ_KEVENT_POSTED);
2579151260Sambrisko}
2580185878Sjhb
2581205014Snwhitehorn#ifdef COMPAT_FREEBSD32
2582296572Sjhb#include <sys/mount.h>
2583296572Sjhb#include <sys/socket.h>
2584296572Sjhb#include <compat/freebsd32/freebsd32.h>
2585296572Sjhb#include <compat/freebsd32/freebsd32_proto.h>
2586296572Sjhb#include <compat/freebsd32/freebsd32_signal.h>
2587296572Sjhb#include <compat/freebsd32/freebsd32_syscall.h>
2588296572Sjhb#include <compat/freebsd32/freebsd32_util.h>
2589185878Sjhb
2590185878Sjhbstruct __aiocb_private32 {
2591185878Sjhb	int32_t	status;
2592185878Sjhb	int32_t	error;
2593185878Sjhb	uint32_t kernelinfo;
2594185878Sjhb};
2595185878Sjhb
2596296572Sjhb#ifdef COMPAT_FREEBSD6
2597185878Sjhbtypedef struct oaiocb32 {
2598185878Sjhb	int	aio_fildes;		/* File descriptor */
2599185878Sjhb	uint64_t aio_offset __packed;	/* File offset for I/O */
2600185878Sjhb	uint32_t aio_buf;		/* I/O buffer in process space */
2601185878Sjhb	uint32_t aio_nbytes;		/* Number of bytes for I/O */
2602185878Sjhb	struct	osigevent32 aio_sigevent; /* Signal to deliver */
2603185878Sjhb	int	aio_lio_opcode;		/* LIO opcode */
2604185878Sjhb	int	aio_reqprio;		/* Request priority -- ignored */
2605185878Sjhb	struct	__aiocb_private32 _aiocb_private;
2606185878Sjhb} oaiocb32_t;
2607296572Sjhb#endif
2608185878Sjhb
2609185878Sjhbtypedef struct aiocb32 {
2610185878Sjhb	int32_t	aio_fildes;		/* File descriptor */
2611185878Sjhb	uint64_t aio_offset __packed;	/* File offset for I/O */
2612185878Sjhb	uint32_t aio_buf;		/* I/O buffer in process space */
2613185878Sjhb	uint32_t aio_nbytes;		/* Number of bytes for I/O */
2614185878Sjhb	int	__spare__[2];
2615185878Sjhb	uint32_t __spare2__;
2616185878Sjhb	int	aio_lio_opcode;		/* LIO opcode */
2617185878Sjhb	int	aio_reqprio;		/* Request priority -- ignored */
2618294851Sjhb	struct	__aiocb_private32 _aiocb_private;
2619294851Sjhb	struct	sigevent32 aio_sigevent;	/* Signal to deliver */
2620185878Sjhb} aiocb32_t;
2621185878Sjhb
2622296572Sjhb#ifdef COMPAT_FREEBSD6
2623185878Sjhbstatic int
2624185878Sjhbconvert_old_sigevent32(struct osigevent32 *osig, struct sigevent *nsig)
2625185878Sjhb{
2626185878Sjhb
2627185878Sjhb	/*
2628185878Sjhb	 * Only SIGEV_NONE, SIGEV_SIGNAL, and SIGEV_KEVENT are
2629185878Sjhb	 * supported by AIO with the old sigevent structure.
2630185878Sjhb	 */
2631185878Sjhb	CP(*osig, *nsig, sigev_notify);
2632185878Sjhb	switch (nsig->sigev_notify) {
2633185878Sjhb	case SIGEV_NONE:
2634185878Sjhb		break;
2635185878Sjhb	case SIGEV_SIGNAL:
2636185878Sjhb		nsig->sigev_signo = osig->__sigev_u.__sigev_signo;
2637185878Sjhb		break;
2638185878Sjhb	case SIGEV_KEVENT:
2639185878Sjhb		nsig->sigev_notify_kqueue =
2640185878Sjhb		    osig->__sigev_u.__sigev_notify_kqueue;
2641185878Sjhb		PTRIN_CP(*osig, *nsig, sigev_value.sival_ptr);
2642185878Sjhb		break;
2643185878Sjhb	default:
2644185878Sjhb		return (EINVAL);
2645185878Sjhb	}
2646185878Sjhb	return (0);
2647185878Sjhb}
2648185878Sjhb
2649185878Sjhbstatic int
2650185878Sjhbaiocb32_copyin_old_sigevent(struct aiocb *ujob, struct aiocb *kjob)
2651185878Sjhb{
2652185878Sjhb	struct oaiocb32 job32;
2653185878Sjhb	int error;
2654185878Sjhb
2655185878Sjhb	bzero(kjob, sizeof(struct aiocb));
2656185878Sjhb	error = copyin(ujob, &job32, sizeof(job32));
2657185878Sjhb	if (error)
2658185878Sjhb		return (error);
2659185878Sjhb
2660185878Sjhb	CP(job32, *kjob, aio_fildes);
2661185878Sjhb	CP(job32, *kjob, aio_offset);
2662185878Sjhb	PTRIN_CP(job32, *kjob, aio_buf);
2663185878Sjhb	CP(job32, *kjob, aio_nbytes);
2664185878Sjhb	CP(job32, *kjob, aio_lio_opcode);
2665185878Sjhb	CP(job32, *kjob, aio_reqprio);
2666185878Sjhb	CP(job32, *kjob, _aiocb_private.status);
2667185878Sjhb	CP(job32, *kjob, _aiocb_private.error);
2668185878Sjhb	PTRIN_CP(job32, *kjob, _aiocb_private.kernelinfo);
2669185878Sjhb	return (convert_old_sigevent32(&job32.aio_sigevent,
2670185878Sjhb	    &kjob->aio_sigevent));
2671185878Sjhb}
2672296572Sjhb#endif
2673185878Sjhb
2674185878Sjhbstatic int
2675185878Sjhbaiocb32_copyin(struct aiocb *ujob, struct aiocb *kjob)
2676185878Sjhb{
2677185878Sjhb	struct aiocb32 job32;
2678185878Sjhb	int error;
2679185878Sjhb
2680185878Sjhb	error = copyin(ujob, &job32, sizeof(job32));
2681185878Sjhb	if (error)
2682185878Sjhb		return (error);
2683185878Sjhb	CP(job32, *kjob, aio_fildes);
2684185878Sjhb	CP(job32, *kjob, aio_offset);
2685185878Sjhb	PTRIN_CP(job32, *kjob, aio_buf);
2686185878Sjhb	CP(job32, *kjob, aio_nbytes);
2687185878Sjhb	CP(job32, *kjob, aio_lio_opcode);
2688185878Sjhb	CP(job32, *kjob, aio_reqprio);
2689185878Sjhb	CP(job32, *kjob, _aiocb_private.status);
2690185878Sjhb	CP(job32, *kjob, _aiocb_private.error);
2691185878Sjhb	PTRIN_CP(job32, *kjob, _aiocb_private.kernelinfo);
2692185878Sjhb	return (convert_sigevent32(&job32.aio_sigevent, &kjob->aio_sigevent));
2693185878Sjhb}
2694185878Sjhb
2695185878Sjhbstatic long
2696185878Sjhbaiocb32_fetch_status(struct aiocb *ujob)
2697185878Sjhb{
2698185878Sjhb	struct aiocb32 *ujob32;
2699185878Sjhb
2700185878Sjhb	ujob32 = (struct aiocb32 *)ujob;
2701185878Sjhb	return (fuword32(&ujob32->_aiocb_private.status));
2702185878Sjhb}
2703185878Sjhb
2704185878Sjhbstatic long
2705185878Sjhbaiocb32_fetch_error(struct aiocb *ujob)
2706185878Sjhb{
2707185878Sjhb	struct aiocb32 *ujob32;
2708185878Sjhb
2709185878Sjhb	ujob32 = (struct aiocb32 *)ujob;
2710185878Sjhb	return (fuword32(&ujob32->_aiocb_private.error));
2711185878Sjhb}
2712185878Sjhb
2713185878Sjhbstatic int
2714185878Sjhbaiocb32_store_status(struct aiocb *ujob, long status)
2715185878Sjhb{
2716185878Sjhb	struct aiocb32 *ujob32;
2717185878Sjhb
2718185878Sjhb	ujob32 = (struct aiocb32 *)ujob;
2719185878Sjhb	return (suword32(&ujob32->_aiocb_private.status, status));
2720185878Sjhb}
2721185878Sjhb
2722185878Sjhbstatic int
2723185878Sjhbaiocb32_store_error(struct aiocb *ujob, long error)
2724185878Sjhb{
2725185878Sjhb	struct aiocb32 *ujob32;
2726185878Sjhb
2727185878Sjhb	ujob32 = (struct aiocb32 *)ujob;
2728185878Sjhb	return (suword32(&ujob32->_aiocb_private.error, error));
2729185878Sjhb}
2730185878Sjhb
2731185878Sjhbstatic int
2732185878Sjhbaiocb32_store_kernelinfo(struct aiocb *ujob, long jobref)
2733185878Sjhb{
2734185878Sjhb	struct aiocb32 *ujob32;
2735185878Sjhb
2736185878Sjhb	ujob32 = (struct aiocb32 *)ujob;
2737185878Sjhb	return (suword32(&ujob32->_aiocb_private.kernelinfo, jobref));
2738185878Sjhb}
2739185878Sjhb
2740185878Sjhbstatic int
2741185878Sjhbaiocb32_store_aiocb(struct aiocb **ujobp, struct aiocb *ujob)
2742185878Sjhb{
2743185878Sjhb
2744185878Sjhb	return (suword32(ujobp, (long)ujob));
2745185878Sjhb}
2746185878Sjhb
2747185878Sjhbstatic struct aiocb_ops aiocb32_ops = {
2748185878Sjhb	.copyin = aiocb32_copyin,
2749185878Sjhb	.fetch_status = aiocb32_fetch_status,
2750185878Sjhb	.fetch_error = aiocb32_fetch_error,
2751185878Sjhb	.store_status = aiocb32_store_status,
2752185878Sjhb	.store_error = aiocb32_store_error,
2753185878Sjhb	.store_kernelinfo = aiocb32_store_kernelinfo,
2754185878Sjhb	.store_aiocb = aiocb32_store_aiocb,
2755185878Sjhb};
2756185878Sjhb
2757296572Sjhb#ifdef COMPAT_FREEBSD6
2758185878Sjhbstatic struct aiocb_ops aiocb32_ops_osigevent = {
2759185878Sjhb	.copyin = aiocb32_copyin_old_sigevent,
2760185878Sjhb	.fetch_status = aiocb32_fetch_status,
2761185878Sjhb	.fetch_error = aiocb32_fetch_error,
2762185878Sjhb	.store_status = aiocb32_store_status,
2763185878Sjhb	.store_error = aiocb32_store_error,
2764185878Sjhb	.store_kernelinfo = aiocb32_store_kernelinfo,
2765185878Sjhb	.store_aiocb = aiocb32_store_aiocb,
2766185878Sjhb};
2767296572Sjhb#endif
2768185878Sjhb
2769185878Sjhbint
2770185878Sjhbfreebsd32_aio_return(struct thread *td, struct freebsd32_aio_return_args *uap)
2771185878Sjhb{
2772185878Sjhb
2773185878Sjhb	return (kern_aio_return(td, (struct aiocb *)uap->aiocbp, &aiocb32_ops));
2774185878Sjhb}
2775185878Sjhb
2776185878Sjhbint
2777185878Sjhbfreebsd32_aio_suspend(struct thread *td, struct freebsd32_aio_suspend_args *uap)
2778185878Sjhb{
2779185878Sjhb	struct timespec32 ts32;
2780185878Sjhb	struct timespec ts, *tsp;
2781185878Sjhb	struct aiocb **ujoblist;
2782185878Sjhb	uint32_t *ujoblist32;
2783185878Sjhb	int error, i;
2784185878Sjhb
2785326322Sasomers	if (uap->nent < 0 || uap->nent > max_aio_queue_per_proc)
2786185878Sjhb		return (EINVAL);
2787185878Sjhb
2788185878Sjhb	if (uap->timeout) {
2789185878Sjhb		/* Get timespec struct. */
2790185878Sjhb		if ((error = copyin(uap->timeout, &ts32, sizeof(ts32))) != 0)
2791185878Sjhb			return (error);
2792185878Sjhb		CP(ts32, ts, tv_sec);
2793185878Sjhb		CP(ts32, ts, tv_nsec);
2794185878Sjhb		tsp = &ts;
2795185878Sjhb	} else
2796185878Sjhb		tsp = NULL;
2797185878Sjhb
2798326322Sasomers	ujoblist = malloc(uap->nent * sizeof(ujoblist[0]), M_AIOS, M_WAITOK);
2799185878Sjhb	ujoblist32 = (uint32_t *)ujoblist;
2800185878Sjhb	error = copyin(uap->aiocbp, ujoblist32, uap->nent *
2801185878Sjhb	    sizeof(ujoblist32[0]));
2802185878Sjhb	if (error == 0) {
2803326322Sasomers		for (i = uap->nent - 1; i >= 0; i--)
2804185878Sjhb			ujoblist[i] = PTRIN(ujoblist32[i]);
2805185878Sjhb
2806185878Sjhb		error = kern_aio_suspend(td, uap->nent, ujoblist, tsp);
2807185878Sjhb	}
2808326322Sasomers	free(ujoblist, M_AIOS);
2809185878Sjhb	return (error);
2810185878Sjhb}
2811185878Sjhb
2812185878Sjhbint
2813185878Sjhbfreebsd32_aio_error(struct thread *td, struct freebsd32_aio_error_args *uap)
2814185878Sjhb{
2815185878Sjhb
2816185878Sjhb	return (kern_aio_error(td, (struct aiocb *)uap->aiocbp, &aiocb32_ops));
2817185878Sjhb}
2818185878Sjhb
2819296572Sjhb#ifdef COMPAT_FREEBSD6
2820185878Sjhbint
2821296572Sjhbfreebsd6_freebsd32_aio_read(struct thread *td,
2822296572Sjhb    struct freebsd6_freebsd32_aio_read_args *uap)
2823185878Sjhb{
2824185878Sjhb
2825185878Sjhb	return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ,
2826185878Sjhb	    &aiocb32_ops_osigevent));
2827185878Sjhb}
2828296572Sjhb#endif
2829185878Sjhb
2830185878Sjhbint
2831185878Sjhbfreebsd32_aio_read(struct thread *td, struct freebsd32_aio_read_args *uap)
2832185878Sjhb{
2833185878Sjhb
2834185878Sjhb	return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ,
2835185878Sjhb	    &aiocb32_ops));
2836185878Sjhb}
2837185878Sjhb
2838296572Sjhb#ifdef COMPAT_FREEBSD6
2839185878Sjhbint
2840296572Sjhbfreebsd6_freebsd32_aio_write(struct thread *td,
2841296572Sjhb    struct freebsd6_freebsd32_aio_write_args *uap)
2842185878Sjhb{
2843185878Sjhb
2844185878Sjhb	return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE,
2845185878Sjhb	    &aiocb32_ops_osigevent));
2846185878Sjhb}
2847296572Sjhb#endif
2848185878Sjhb
2849185878Sjhbint
2850185878Sjhbfreebsd32_aio_write(struct thread *td, struct freebsd32_aio_write_args *uap)
2851185878Sjhb{
2852185878Sjhb
2853185878Sjhb	return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE,
2854185878Sjhb	    &aiocb32_ops));
2855185878Sjhb}
2856185878Sjhb
2857185878Sjhbint
2858251526Sglebiusfreebsd32_aio_mlock(struct thread *td, struct freebsd32_aio_mlock_args *uap)
2859251526Sglebius{
2860251526Sglebius
2861251526Sglebius	return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_MLOCK,
2862251526Sglebius	    &aiocb32_ops));
2863251526Sglebius}
2864251526Sglebius
2865251526Sglebiusint
2866185878Sjhbfreebsd32_aio_waitcomplete(struct thread *td,
2867185878Sjhb    struct freebsd32_aio_waitcomplete_args *uap)
2868185878Sjhb{
2869187631Sjhb	struct timespec32 ts32;
2870185878Sjhb	struct timespec ts, *tsp;
2871185878Sjhb	int error;
2872185878Sjhb
2873185878Sjhb	if (uap->timeout) {
2874185878Sjhb		/* Get timespec struct. */
2875185878Sjhb		error = copyin(uap->timeout, &ts32, sizeof(ts32));
2876185878Sjhb		if (error)
2877185878Sjhb			return (error);
2878185878Sjhb		CP(ts32, ts, tv_sec);
2879185878Sjhb		CP(ts32, ts, tv_nsec);
2880185878Sjhb		tsp = &ts;
2881185878Sjhb	} else
2882185878Sjhb		tsp = NULL;
2883185878Sjhb
2884185878Sjhb	return (kern_aio_waitcomplete(td, (struct aiocb **)uap->aiocbp, tsp,
2885185878Sjhb	    &aiocb32_ops));
2886185878Sjhb}
2887185878Sjhb
2888185878Sjhbint
2889185878Sjhbfreebsd32_aio_fsync(struct thread *td, struct freebsd32_aio_fsync_args *uap)
2890185878Sjhb{
2891185878Sjhb
2892185878Sjhb	return (kern_aio_fsync(td, uap->op, (struct aiocb *)uap->aiocbp,
2893185878Sjhb	    &aiocb32_ops));
2894185878Sjhb}
2895185878Sjhb
2896296572Sjhb#ifdef COMPAT_FREEBSD6
2897185878Sjhbint
2898296572Sjhbfreebsd6_freebsd32_lio_listio(struct thread *td,
2899296572Sjhb    struct freebsd6_freebsd32_lio_listio_args *uap)
2900185878Sjhb{
2901185878Sjhb	struct aiocb **acb_list;
2902185878Sjhb	struct sigevent *sigp, sig;
2903185878Sjhb	struct osigevent32 osig;
2904185878Sjhb	uint32_t *acb_list32;
2905185878Sjhb	int error, i, nent;
2906185878Sjhb
2907185878Sjhb	if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
2908185878Sjhb		return (EINVAL);
2909185878Sjhb
2910185878Sjhb	nent = uap->nent;
2911326322Sasomers	if (nent < 0 || nent > max_aio_queue_per_proc)
2912185878Sjhb		return (EINVAL);
2913185878Sjhb
2914185878Sjhb	if (uap->sig && (uap->mode == LIO_NOWAIT)) {
2915185878Sjhb		error = copyin(uap->sig, &osig, sizeof(osig));
2916185878Sjhb		if (error)
2917185878Sjhb			return (error);
2918185878Sjhb		error = convert_old_sigevent32(&osig, &sig);
2919185878Sjhb		if (error)
2920185878Sjhb			return (error);
2921185878Sjhb		sigp = &sig;
2922185878Sjhb	} else
2923185878Sjhb		sigp = NULL;
2924185878Sjhb
2925185878Sjhb	acb_list32 = malloc(sizeof(uint32_t) * nent, M_LIO, M_WAITOK);
2926185878Sjhb	error = copyin(uap->acb_list, acb_list32, nent * sizeof(uint32_t));
2927185878Sjhb	if (error) {
2928185878Sjhb		free(acb_list32, M_LIO);
2929185878Sjhb		return (error);
2930185878Sjhb	}
2931185878Sjhb	acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK);
2932185878Sjhb	for (i = 0; i < nent; i++)
2933185878Sjhb		acb_list[i] = PTRIN(acb_list32[i]);
2934185878Sjhb	free(acb_list32, M_LIO);
2935185878Sjhb
2936185878Sjhb	error = kern_lio_listio(td, uap->mode,
2937185878Sjhb	    (struct aiocb * const *)uap->acb_list, acb_list, nent, sigp,
2938185878Sjhb	    &aiocb32_ops_osigevent);
2939185878Sjhb	free(acb_list, M_LIO);
2940185878Sjhb	return (error);
2941185878Sjhb}
2942296572Sjhb#endif
2943185878Sjhb
2944185878Sjhbint
2945185878Sjhbfreebsd32_lio_listio(struct thread *td, struct freebsd32_lio_listio_args *uap)
2946185878Sjhb{
2947185878Sjhb	struct aiocb **acb_list;
2948185878Sjhb	struct sigevent *sigp, sig;
2949185878Sjhb	struct sigevent32 sig32;
2950185878Sjhb	uint32_t *acb_list32;
2951185878Sjhb	int error, i, nent;
2952185878Sjhb
2953185878Sjhb	if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
2954185878Sjhb		return (EINVAL);
2955185878Sjhb
2956185878Sjhb	nent = uap->nent;
2957326322Sasomers	if (nent < 0 || nent > max_aio_queue_per_proc)
2958185878Sjhb		return (EINVAL);
2959185878Sjhb
2960185878Sjhb	if (uap->sig && (uap->mode == LIO_NOWAIT)) {
2961185878Sjhb		error = copyin(uap->sig, &sig32, sizeof(sig32));
2962185878Sjhb		if (error)
2963185878Sjhb			return (error);
2964185878Sjhb		error = convert_sigevent32(&sig32, &sig);
2965185878Sjhb		if (error)
2966185878Sjhb			return (error);
2967185878Sjhb		sigp = &sig;
2968185878Sjhb	} else
2969185878Sjhb		sigp = NULL;
2970185878Sjhb
2971185878Sjhb	acb_list32 = malloc(sizeof(uint32_t) * nent, M_LIO, M_WAITOK);
2972185878Sjhb	error = copyin(uap->acb_list, acb_list32, nent * sizeof(uint32_t));
2973185878Sjhb	if (error) {
2974185878Sjhb		free(acb_list32, M_LIO);
2975185878Sjhb		return (error);
2976185878Sjhb	}
2977185878Sjhb	acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK);
2978185878Sjhb	for (i = 0; i < nent; i++)
2979185878Sjhb		acb_list[i] = PTRIN(acb_list32[i]);
2980185878Sjhb	free(acb_list32, M_LIO);
2981185878Sjhb
2982185878Sjhb	error = kern_lio_listio(td, uap->mode,
2983185878Sjhb	    (struct aiocb * const *)uap->acb_list, acb_list, nent, sigp,
2984185878Sjhb	    &aiocb32_ops);
2985185878Sjhb	free(acb_list, M_LIO);
2986185878Sjhb	return (error);
2987185878Sjhb}
2988185878Sjhb
2989185878Sjhb#endif
2990