vfs_aio.c revision 89465
1/*
2 * Copyright (c) 1997 John S. Dyson.  All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 *    notice, this list of conditions and the following disclaimer.
9 * 2. John S. Dyson's name may not be used to endorse or promote products
10 *    derived from this software without specific prior written permission.
11 *
12 * DISCLAIMER:  This code isn't warranted to do anything useful.  Anything
13 * bad that happens because of using this software isn't the responsibility
14 * of the author.  This software is distributed AS-IS.
15 *
16 * $FreeBSD: head/sys/kern/vfs_aio.c 89465 2002-01-17 17:19:40Z alc $
17 */
18
19/*
20 * This file contains support for the POSIX 1003.1B AIO/LIO facility.
21 */
22
23#include <sys/param.h>
24#include <sys/systm.h>
25#include <sys/bio.h>
26#include <sys/buf.h>
27#include <sys/sysproto.h>
28#include <sys/filedesc.h>
29#include <sys/kernel.h>
30#include <sys/kthread.h>
31#include <sys/fcntl.h>
32#include <sys/file.h>
33#include <sys/lock.h>
34#include <sys/mutex.h>
35#include <sys/unistd.h>
36#include <sys/proc.h>
37#include <sys/resourcevar.h>
38#include <sys/signalvar.h>
39#include <sys/protosw.h>
40#include <sys/socketvar.h>
41#include <sys/syscall.h>
42#include <sys/sysent.h>
43#include <sys/sysctl.h>
44#include <sys/vnode.h>
45#include <sys/conf.h>
46#include <sys/event.h>
47
48#include <vm/vm.h>
49#include <vm/vm_extern.h>
50#include <vm/pmap.h>
51#include <vm/vm_map.h>
52#include <vm/vm_zone.h>
53#include <sys/aio.h>
54
55#include <machine/limits.h>
56
57#include "opt_vfs_aio.h"
58
59static	long jobrefid;
60
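/*
 * Job states: a queued request is JOBST_JOBQGLOBAL while it waits on the
 * global aio_jobs list (or on a socket's queue), JOBST_JOBQPROC while it
 * sits on a particular daemon's jobtorun list, JOBST_JOBRUNNING while a
 * daemon services it, and JOBST_JOBFINISHED once it is on the owning
 * process' done queue.  Physio-backed jobs use JOBST_JOBQBUF while the
 * buffer I/O is outstanding and JOBST_JOBBFINISHED once it completes.
 */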
61#define JOBST_NULL		0x0
62#define	JOBST_JOBQPROC		0x1
63#define JOBST_JOBQGLOBAL	0x2
64#define JOBST_JOBRUNNING	0x3
65#define JOBST_JOBFINISHED	0x4
66#define	JOBST_JOBQBUF		0x5
67#define	JOBST_JOBBFINISHED	0x6
68
69#ifndef MAX_AIO_PER_PROC
70#define MAX_AIO_PER_PROC	32
71#endif
72
73#ifndef MAX_AIO_QUEUE_PER_PROC
74#define MAX_AIO_QUEUE_PER_PROC	256 /* Bigger than AIO_LISTIO_MAX */
75#endif
76
77#ifndef MAX_AIO_PROCS
78#define MAX_AIO_PROCS		32
79#endif
80
81#ifndef MAX_AIO_QUEUE
82#define	MAX_AIO_QUEUE		1024 /* Bigger than AIO_LISTIO_MAX */
83#endif
84
85#ifndef TARGET_AIO_PROCS
86#define TARGET_AIO_PROCS	4
87#endif
88
89#ifndef MAX_BUF_AIO
90#define MAX_BUF_AIO		16
91#endif
92
93#ifndef AIOD_TIMEOUT_DEFAULT
94#define	AIOD_TIMEOUT_DEFAULT	(10 * hz)
95#endif
96
97#ifndef AIOD_LIFETIME_DEFAULT
98#define AIOD_LIFETIME_DEFAULT	(30 * hz)
99#endif
100
101static int max_aio_procs = MAX_AIO_PROCS;
102static int num_aio_procs = 0;
103static int target_aio_procs = TARGET_AIO_PROCS;
104static int max_queue_count = MAX_AIO_QUEUE;
105static int num_queue_count = 0;
106static int num_buf_aio = 0;
107static int num_aio_resv_start = 0;
108static int aiod_timeout;
109static int aiod_lifetime;
110static int unloadable = 0;
111
112static int max_aio_per_proc = MAX_AIO_PER_PROC;
113static int max_aio_queue_per_proc = MAX_AIO_QUEUE_PER_PROC;
114static int max_buf_aio = MAX_BUF_AIO;
115
116SYSCTL_NODE(_vfs, OID_AUTO, aio, CTLFLAG_RW, 0, "AIO mgmt");
117
118SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_per_proc,
119	CTLFLAG_RW, &max_aio_per_proc, 0, "");
120
121SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue_per_proc,
122	CTLFLAG_RW, &max_aio_queue_per_proc, 0, "");
123
124SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_procs,
125	CTLFLAG_RW, &max_aio_procs, 0, "");
126
127SYSCTL_INT(_vfs_aio, OID_AUTO, num_aio_procs,
128	CTLFLAG_RD, &num_aio_procs, 0, "");
129
130SYSCTL_INT(_vfs_aio, OID_AUTO, num_queue_count,
131	CTLFLAG_RD, &num_queue_count, 0, "");
132
133SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue,
134	CTLFLAG_RW, &max_queue_count, 0, "");
135
136SYSCTL_INT(_vfs_aio, OID_AUTO, target_aio_procs,
137	CTLFLAG_RW, &target_aio_procs, 0, "");
138
139SYSCTL_INT(_vfs_aio, OID_AUTO, max_buf_aio,
140	CTLFLAG_RW, &max_buf_aio, 0, "");
141
142SYSCTL_INT(_vfs_aio, OID_AUTO, num_buf_aio,
143	CTLFLAG_RD, &num_buf_aio, 0, "");
144
145SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_lifetime,
146	CTLFLAG_RW, &aiod_lifetime, 0, "");
147
148SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_timeout,
149	CTLFLAG_RW, &aiod_timeout, 0, "");
150
151SYSCTL_INT(_vfs_aio, OID_AUTO, unloadable, CTLFLAG_RW, &unloadable, 0,
152    "Allow unload of aio (not recommended)");
153
154struct aiocblist {
155        TAILQ_ENTRY(aiocblist) list;	/* List of jobs */
156        TAILQ_ENTRY(aiocblist) plist;	/* List of jobs for proc */
157        int	jobflags;
158        int	jobstate;
159	int	inputcharge;
160	int	outputcharge;
161	struct	callout_handle timeouthandle;
162        struct	buf *bp;		/* Buffer pointer */
163        struct	proc *userproc;		/* User process */ /* Not td! */
164        struct	file *fd_file;		/* Pointer to file structure */
165	struct	aiothreadlist *jobaiothread;  /* AIO process descriptor */
166        struct	aio_liojob *lio;	/* Optional lio job */
167        struct	aiocb *uuaiocb;		/* Pointer in userspace of aiocb */
168	struct	klist klist;		/* list of knotes */
169        struct	aiocb uaiocb;		/* Kernel I/O control block */
170};
171
172/* jobflags */
173#define AIOCBLIST_RUNDOWN       0x4
174#define AIOCBLIST_ASYNCFREE     0x8
175#define AIOCBLIST_DONE          0x10
176
177/*
178 * AIO process info
179 */
180#define AIOP_FREE	0x1			/* proc on free queue */
181#define AIOP_SCHED	0x2			/* proc explicitly scheduled */
182
183struct aiothreadlist {
184	int aiothreadflags;			/* AIO proc flags */
185	TAILQ_ENTRY(aiothreadlist) list;	/* List of processes */
186	struct thread *aiothread;		/* The AIO thread */
187	TAILQ_HEAD(,aiocblist) jobtorun;	/* suggested job to run */
188};
189
190/*
191 * data-structure for lio signal management
192 */
193struct aio_liojob {
194	int	lioj_flags;
195	int	lioj_buffer_count;
196	int	lioj_buffer_finished_count;
197	int	lioj_queue_count;
198	int	lioj_queue_finished_count;
199	struct	sigevent lioj_signal;	/* signal on all I/O done */
200	TAILQ_ENTRY(aio_liojob) lioj_list;
201	struct	kaioinfo *lioj_ki;
202};
203#define	LIOJ_SIGNAL		0x1	/* signal on all done (lio) */
204#define	LIOJ_SIGNAL_POSTED	0x2	/* signal has been posted */
205
206/*
207 * per process aio data structure
208 */
209struct kaioinfo {
210	int	kaio_flags;		/* per process kaio flags */
211	int	kaio_maxactive_count;	/* maximum number of AIOs */
212	int	kaio_active_count;	/* number of currently used AIOs */
213	int	kaio_qallowed_count;	/* maximum size of AIO queue */
214	int	kaio_queue_count;	/* size of AIO queue */
215	int	kaio_ballowed_count;	/* maximum number of buffers */
216	int	kaio_queue_finished_count; /* number of daemon jobs finished */
217	int	kaio_buffer_count;	/* number of physio buffers */
218	int	kaio_buffer_finished_count; /* count of I/O done */
219	struct 	proc *kaio_p;		/* process that uses this kaio block */
220	TAILQ_HEAD(,aio_liojob) kaio_liojoblist; /* list of lio jobs */
221	TAILQ_HEAD(,aiocblist) kaio_jobqueue;	/* job queue for process */
222	TAILQ_HEAD(,aiocblist) kaio_jobdone;	/* done queue for process */
223	TAILQ_HEAD(,aiocblist) kaio_bufqueue;	/* buffer job queue for process */
224	TAILQ_HEAD(,aiocblist) kaio_bufdone;	/* buffer done queue for process */
225	TAILQ_HEAD(,aiocblist) kaio_sockqueue;	/* queue for aios waiting on sockets */
226};
227
228#define KAIO_RUNDOWN	0x1	/* process is being run down */
229#define KAIO_WAKEUP	0x2	/* wakeup process when there is a significant event */
230
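/*
 * aio_freeproc and aio_activeproc track idle and busy aio daemons, aio_jobs
 * is the global queue of requests waiting for a daemon, and aio_bufjobs
 * tracks physio requests that are outstanding at the device.
 */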
231static TAILQ_HEAD(,aiothreadlist) aio_freeproc, aio_activeproc;
232static TAILQ_HEAD(,aiocblist) aio_jobs;			/* Async job list */
233static TAILQ_HEAD(,aiocblist) aio_bufjobs;		/* Phys I/O job list */
234
235static void	aio_init_aioinfo(struct proc *p);
236static void	aio_onceonly(void);
237static int	aio_free_entry(struct aiocblist *aiocbe);
238static void	aio_process(struct aiocblist *aiocbe);
239static int	aio_newproc(void);
240static int	aio_aqueue(struct thread *td, struct aiocb *job, int type);
241static void	aio_physwakeup(struct buf *bp);
242static void	aio_proc_rundown(struct proc *p);
243static int	aio_fphysio(struct aiocblist *aiocbe);
244static int	aio_qphysio(struct proc *p, struct aiocblist *iocb);
245static void	aio_daemon(void *uproc);
246static void	aio_swake_cb(struct socket *, struct sockbuf *);
247static int	aio_unload(void);
248static void	process_signal(void *aioj);
249static int	filt_aioattach(struct knote *kn);
250static void	filt_aiodetach(struct knote *kn);
251static int	filt_aio(struct knote *kn, long hint);
252
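/*
 * Zones for the per-process kaioinfo block, the per-daemon aiothreadlist
 * descriptors, aiocblist entries, the aio_suspend() scratch lists, and the
 * lio job structures; all are set up in aio_onceonly().
 */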
253static vm_zone_t kaio_zone, aiop_zone, aiocb_zone, aiol_zone;
254static vm_zone_t aiolio_zone;
255
256static struct filterops aio_filtops =
257	{ 0, filt_aioattach, filt_aiodetach, filt_aio };
258
259static int
260aio_modload(struct module *module, int cmd, void *arg)
261{
262	int error = 0;
263
264	switch (cmd) {
265	case MOD_LOAD:
266		aio_onceonly();
267		break;
268	case MOD_UNLOAD:
269		error = aio_unload();
270		break;
271	case MOD_SHUTDOWN:
272		break;
273	default:
274		error = EINVAL;
275		break;
276	}
277	return (error);
278}
279
280static moduledata_t aio_mod = {
281	"aio",
282	&aio_modload,
283	NULL
284};
285
286SYSCALL_MODULE_HELPER(aio_return);
287SYSCALL_MODULE_HELPER(aio_suspend);
288SYSCALL_MODULE_HELPER(aio_cancel);
289SYSCALL_MODULE_HELPER(aio_error);
290SYSCALL_MODULE_HELPER(aio_read);
291SYSCALL_MODULE_HELPER(aio_write);
292SYSCALL_MODULE_HELPER(aio_waitcomplete);
293SYSCALL_MODULE_HELPER(lio_listio);
294
295DECLARE_MODULE(aio, aio_mod,
296	SI_SUB_VFS, SI_ORDER_ANY);
297MODULE_VERSION(aio, 1);
298
299/*
300 * Startup initialization
301 */
302static void
303aio_onceonly(void)
304{
305
306	/* XXX: should probably just use so->callback */
307	aio_swake = &aio_swake_cb;
308	at_exit(aio_proc_rundown);
309	at_exec(aio_proc_rundown);
310	kqueue_add_filteropts(EVFILT_AIO, &aio_filtops);
311	TAILQ_INIT(&aio_freeproc);
312	TAILQ_INIT(&aio_activeproc);
313	TAILQ_INIT(&aio_jobs);
314	TAILQ_INIT(&aio_bufjobs);
315	kaio_zone = zinit("AIO", sizeof(struct kaioinfo), 0, 0, 1);
316	aiop_zone = zinit("AIOP", sizeof(struct aiothreadlist), 0, 0, 1);
317	aiocb_zone = zinit("AIOCB", sizeof(struct aiocblist), 0, 0, 1);
318	aiol_zone = zinit("AIOL", AIO_LISTIO_MAX*sizeof(intptr_t), 0, 0, 1);
319	aiolio_zone = zinit("AIOLIO", sizeof(struct aio_liojob), 0, 0, 1);
320	aiod_timeout = AIOD_TIMEOUT_DEFAULT;
321	aiod_lifetime = AIOD_LIFETIME_DEFAULT;
322	jobrefid = 1;
323}
324
325static int
326aio_unload(void)
327{
328
329	/*
330	 * XXX: no unloads by default, it's too dangerous.
331	 * perhaps we could do it if locked out callers and then
332	 * did an aio_proc_rundown() on each process.
333	 */
334	if (!unloadable)
335		return (EOPNOTSUPP);
336
337	aio_swake = NULL;
338	rm_at_exit(aio_proc_rundown);
339	rm_at_exec(aio_proc_rundown);
340	kqueue_del_filteropts(EVFILT_AIO);
341	return (0);
342}
343
344/*
345 * Init the per-process aioinfo structure.  The aioinfo limits are set
346 * per-process for user limit (resource) management.
347 */
348static void
349aio_init_aioinfo(struct proc *p)
350{
351	struct kaioinfo *ki;
352	if (p->p_aioinfo == NULL) {
353		ki = zalloc(kaio_zone);
354		p->p_aioinfo = ki;
355		ki->kaio_flags = 0;
356		ki->kaio_maxactive_count = max_aio_per_proc;
357		ki->kaio_active_count = 0;
358		ki->kaio_qallowed_count = max_aio_queue_per_proc;
359		ki->kaio_queue_count = 0;
360		ki->kaio_ballowed_count = max_buf_aio;
361		ki->kaio_buffer_count = 0;
362		ki->kaio_buffer_finished_count = 0;
363		ki->kaio_p = p;
364		TAILQ_INIT(&ki->kaio_jobdone);
365		TAILQ_INIT(&ki->kaio_jobqueue);
366		TAILQ_INIT(&ki->kaio_bufdone);
367		TAILQ_INIT(&ki->kaio_bufqueue);
368		TAILQ_INIT(&ki->kaio_liojoblist);
369		TAILQ_INIT(&ki->kaio_sockqueue);
370	}
371
372	while (num_aio_procs < target_aio_procs)
373		aio_newproc();
374}
375
376/*
377 * Free a job entry.  Wait for completion if it is currently active, but don't
378 * delay forever.  If we delay, we return a flag that says that we have to
379 * restart the queue scan.
380 */
381static int
382aio_free_entry(struct aiocblist *aiocbe)
383{
384	struct kaioinfo *ki;
385	struct aiothreadlist *aiop;
386	struct aio_liojob *lj;
387	struct proc *p;
388	int error;
389	int s;
390
391	if (aiocbe->jobstate == JOBST_NULL)
392		panic("aio_free_entry: freeing already free job");
393
394	p = aiocbe->userproc;
395	ki = p->p_aioinfo;
396	lj = aiocbe->lio;
397	if (ki == NULL)
398		panic("aio_free_entry: missing p->p_aioinfo");
399
400	while (aiocbe->jobstate == JOBST_JOBRUNNING) {
401		if (aiocbe->jobflags & AIOCBLIST_ASYNCFREE)
402			return 0;
403		aiocbe->jobflags |= AIOCBLIST_RUNDOWN;
404		tsleep(aiocbe, PRIBIO, "jobwai", 0);
405	}
406	aiocbe->jobflags &= ~AIOCBLIST_ASYNCFREE;
407
408	if (aiocbe->bp == NULL) {
409		if (ki->kaio_queue_count <= 0)
410			panic("aio_free_entry: process queue size <= 0");
411		if (num_queue_count <= 0)
412			panic("aio_free_entry: system wide queue size <= 0");
413
414		if (lj) {
415			lj->lioj_queue_count--;
416			if (aiocbe->jobflags & AIOCBLIST_DONE)
417				lj->lioj_queue_finished_count--;
418		}
419		ki->kaio_queue_count--;
420		if (aiocbe->jobflags & AIOCBLIST_DONE)
421			ki->kaio_queue_finished_count--;
422		num_queue_count--;
423	} else {
424		if (lj) {
425			lj->lioj_buffer_count--;
426			if (aiocbe->jobflags & AIOCBLIST_DONE)
427				lj->lioj_buffer_finished_count--;
428		}
429		if (aiocbe->jobflags & AIOCBLIST_DONE)
430			ki->kaio_buffer_finished_count--;
431		ki->kaio_buffer_count--;
432		num_buf_aio--;
433	}
434
435	/* aiocbe is going away, we need to destroy any knotes */
436	knote_remove(&p->p_thread, &aiocbe->klist); /* XXXKSE */
437	/* XXXKSE Note the thread here is used to eventually find the
438	 * owning process again, but it is also used to do a fo_close
439	 * and that requires the thread.  (But does it require the
440	 * OWNING thread, or maybe the running thread?)
441	 * There is a semantic problem here...
442	 */
443
444	if ((ki->kaio_flags & KAIO_WAKEUP) || ((ki->kaio_flags & KAIO_RUNDOWN)
445	    && ((ki->kaio_buffer_count == 0) && (ki->kaio_queue_count == 0)))) {
446		ki->kaio_flags &= ~KAIO_WAKEUP;
447		wakeup(p);
448	}
449
450	if (aiocbe->jobstate == JOBST_JOBQBUF) {
451		if ((error = aio_fphysio(aiocbe)) != 0)
452			return error;
453		if (aiocbe->jobstate != JOBST_JOBBFINISHED)
454			panic("aio_free_entry: invalid physio finish-up state");
455		s = splbio();
456		TAILQ_REMOVE(&ki->kaio_bufdone, aiocbe, plist);
457		splx(s);
458	} else if (aiocbe->jobstate == JOBST_JOBQPROC) {
459		aiop = aiocbe->jobaiothread;
460		TAILQ_REMOVE(&aiop->jobtorun, aiocbe, list);
461	} else if (aiocbe->jobstate == JOBST_JOBQGLOBAL) {
462		s = splnet();
463		TAILQ_REMOVE(&aio_jobs, aiocbe, list);
464		TAILQ_REMOVE(&ki->kaio_jobqueue, aiocbe, plist);
465		splx(s);
466	} else if (aiocbe->jobstate == JOBST_JOBFINISHED)
467		TAILQ_REMOVE(&ki->kaio_jobdone, aiocbe, plist);
468	else if (aiocbe->jobstate == JOBST_JOBBFINISHED) {
469		s = splbio();
470		TAILQ_REMOVE(&ki->kaio_bufdone, aiocbe, plist);
471		splx(s);
472		if (aiocbe->bp) {
473			vunmapbuf(aiocbe->bp);
474			relpbuf(aiocbe->bp, NULL);
475			aiocbe->bp = NULL;
476		}
477	}
478	if (lj && (lj->lioj_buffer_count == 0) && (lj->lioj_queue_count == 0)) {
479		TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
480		zfree(aiolio_zone, lj);
481	}
482	aiocbe->jobstate = JOBST_NULL;
483	untimeout(process_signal, aiocbe, aiocbe->timeouthandle);
484	zfree(aiocb_zone, aiocbe);
485	return 0;
486}
487
488/*
489 * Rundown the jobs for a given process.
490 */
491static void
492aio_proc_rundown(struct proc *p)
493{
494	int s;
495	struct kaioinfo *ki;
496	struct aio_liojob *lj, *ljn;
497	struct aiocblist *aiocbe, *aiocbn;
498	struct file *fp;
499	struct filedesc *fdp;
500	struct socket *so;
501
502	ki = p->p_aioinfo;
503	if (ki == NULL)
504		return;
505
506	ki->kaio_flags |= LIOJ_SIGNAL_POSTED;
507	while ((ki->kaio_active_count > 0) || (ki->kaio_buffer_count >
508	    ki->kaio_buffer_finished_count)) {
509		ki->kaio_flags |= KAIO_RUNDOWN;
510		if (tsleep(p, PRIBIO, "kaiowt", aiod_timeout))
511			break;
512	}
513
514	/*
515	 * Move any aio ops that are waiting on socket I/O to the normal job
516	 * queues so they are cleaned up with any others.
517	 */
518	fdp = p->p_fd;
519
520	s = splnet();
521	for (aiocbe = TAILQ_FIRST(&ki->kaio_sockqueue); aiocbe; aiocbe =
522	    aiocbn) {
523		aiocbn = TAILQ_NEXT(aiocbe, plist);
524		fp = fdp->fd_ofiles[aiocbe->uaiocb.aio_fildes];
525
526		/*
527		 * Under some circumstances, the aio_fildes and the file
528		 * structure don't match.  This would leave aiocbe's in the
529		 * TAILQ associated with the socket and cause a panic later.
530		 *
531		 * Detect and fix.
532		 */
533		if ((fp == NULL) || (fp != aiocbe->fd_file))
534			fp = aiocbe->fd_file;
535		if (fp) {
536			so = (struct socket *)fp->f_data;
537			TAILQ_REMOVE(&so->so_aiojobq, aiocbe, list);
538			if (TAILQ_EMPTY(&so->so_aiojobq)) {
539				so->so_snd.sb_flags &= ~SB_AIO;
540				so->so_rcv.sb_flags &= ~SB_AIO;
541			}
542		}
543		TAILQ_REMOVE(&ki->kaio_sockqueue, aiocbe, plist);
544		TAILQ_INSERT_HEAD(&aio_jobs, aiocbe, list);
545		TAILQ_INSERT_HEAD(&ki->kaio_jobqueue, aiocbe, plist);
546	}
547	splx(s);
548
549restart1:
550	for (aiocbe = TAILQ_FIRST(&ki->kaio_jobdone); aiocbe; aiocbe = aiocbn) {
551		aiocbn = TAILQ_NEXT(aiocbe, plist);
552		if (aio_free_entry(aiocbe))
553			goto restart1;
554	}
555
556restart2:
557	for (aiocbe = TAILQ_FIRST(&ki->kaio_jobqueue); aiocbe; aiocbe =
558	    aiocbn) {
559		aiocbn = TAILQ_NEXT(aiocbe, plist);
560		if (aio_free_entry(aiocbe))
561			goto restart2;
562	}
563
564/*
565 * Note the use of lots of splbio here, trying to avoid holding splbio across
566 * long chains of I/O.  Probably unnecessary.
567 */
568restart3:
569	s = splbio();
570	while (TAILQ_FIRST(&ki->kaio_bufqueue)) {
571		ki->kaio_flags |= KAIO_WAKEUP;
572		tsleep(p, PRIBIO, "aioprn", 0);
573		splx(s);
574		goto restart3;
575	}
576	splx(s);
577
578restart4:
579	s = splbio();
580	for (aiocbe = TAILQ_FIRST(&ki->kaio_bufdone); aiocbe; aiocbe = aiocbn) {
581		aiocbn = TAILQ_NEXT(aiocbe, plist);
582		if (aio_free_entry(aiocbe)) {
583			splx(s);
584			goto restart4;
585		}
586	}
587	splx(s);
588
589        /*
590         * If we've slept, jobs might have moved from one queue to another.
591         * Retry rundown if we didn't manage to empty the queues.
592         */
593        if (TAILQ_FIRST(&ki->kaio_jobdone) != NULL ||
594	    TAILQ_FIRST(&ki->kaio_jobqueue) != NULL ||
595	    TAILQ_FIRST(&ki->kaio_bufqueue) != NULL ||
596	    TAILQ_FIRST(&ki->kaio_bufdone) != NULL)
597		goto restart1;
598
599	for (lj = TAILQ_FIRST(&ki->kaio_liojoblist); lj; lj = ljn) {
600		ljn = TAILQ_NEXT(lj, lioj_list);
601		if ((lj->lioj_buffer_count == 0) && (lj->lioj_queue_count ==
602		    0)) {
603			TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
604			zfree(aiolio_zone, lj);
605		} else {
606#ifdef DIAGNOSTIC
607			printf("LIO job not cleaned up: B:%d, BF:%d, Q:%d, "
608			    "QF:%d\n", lj->lioj_buffer_count,
609			    lj->lioj_buffer_finished_count,
610			    lj->lioj_queue_count,
611			    lj->lioj_queue_finished_count);
612#endif
613		}
614	}
615
616	zfree(kaio_zone, ki);
617	p->p_aioinfo = NULL;
618}
619
620/*
621 * Select a job to run (called by an AIO daemon).
622 */
623static struct aiocblist *
624aio_selectjob(struct aiothreadlist *aiop)
625{
626	int s;
627	struct aiocblist *aiocbe;
628	struct kaioinfo *ki;
629	struct proc *userp;
630
631	aiocbe = TAILQ_FIRST(&aiop->jobtorun);
632	if (aiocbe) {
633		TAILQ_REMOVE(&aiop->jobtorun, aiocbe, list);
634		return aiocbe;
635	}
636
637	s = splnet();
638	for (aiocbe = TAILQ_FIRST(&aio_jobs); aiocbe; aiocbe =
639	    TAILQ_NEXT(aiocbe, list)) {
640		userp = aiocbe->userproc;
641		ki = userp->p_aioinfo;
642
643		if (ki->kaio_active_count < ki->kaio_maxactive_count) {
644			TAILQ_REMOVE(&aio_jobs, aiocbe, list);
645			splx(s);
646			return aiocbe;
647		}
648	}
649	splx(s);
650
651	return NULL;
652}
653
654/*
655 * The AIO processing activity.  This is the code that does the I/O request for
656 * the non-physio version of the operations.  The normal vn operations are used,
657 * and this code should work in all instances for every type of file, including
658 * pipes, sockets, fifos, and regular files.
659 */
660static void
661aio_process(struct aiocblist *aiocbe)
662{
663	struct filedesc *fdp;
664	struct thread *td;
665	struct proc *userp;
666	struct proc *mycp;
667	struct aiocb *cb;
668	struct file *fp;
669	struct uio auio;
670	struct iovec aiov;
671	unsigned int fd;
672	int cnt;
673	int error;
674	off_t offset;
675	int oublock_st, oublock_end;
676	int inblock_st, inblock_end;
677
678	userp = aiocbe->userproc;
679	td = curthread;
680	mycp = td->td_proc;
681	cb = &aiocbe->uaiocb;
682
683	fdp = mycp->p_fd;
684	fd = cb->aio_fildes;
685	fp = fdp->fd_ofiles[fd];
686
687	if ((fp == NULL) || (fp != aiocbe->fd_file)) {
688		cb->_aiocb_private.error = EBADF;
689		cb->_aiocb_private.status = -1;
690		return;
691	}
692
693	aiov.iov_base = (void *)(uintptr_t)cb->aio_buf;
694	aiov.iov_len = cb->aio_nbytes;
695
696	auio.uio_iov = &aiov;
697	auio.uio_iovcnt = 1;
698	auio.uio_offset = offset = cb->aio_offset;
699	auio.uio_resid = cb->aio_nbytes;
700	cnt = cb->aio_nbytes;
701	auio.uio_segflg = UIO_USERSPACE;
702	auio.uio_td = td;
703
704	inblock_st = mycp->p_stats->p_ru.ru_inblock;
705	oublock_st = mycp->p_stats->p_ru.ru_oublock;
706	/*
707	 * Temporarily bump the ref count while reading to avoid the
708	 * descriptor being ripped out from under us.
709	 */
710	fhold(fp);
711	if (cb->aio_lio_opcode == LIO_READ) {
712		auio.uio_rw = UIO_READ;
713		error = fo_read(fp, &auio, fp->f_cred, FOF_OFFSET, td);
714	} else {
715		auio.uio_rw = UIO_WRITE;
716		error = fo_write(fp, &auio, fp->f_cred, FOF_OFFSET, td);
717	}
718	fdrop(fp, td);
719	inblock_end = mycp->p_stats->p_ru.ru_inblock;
720	oublock_end = mycp->p_stats->p_ru.ru_oublock;
721
722	aiocbe->inputcharge = inblock_end - inblock_st;
723	aiocbe->outputcharge = oublock_end - oublock_st;
724
725	if ((error) && (auio.uio_resid != cnt)) {
726		if (error == ERESTART || error == EINTR || error == EWOULDBLOCK)
727			error = 0;
728		if ((error == EPIPE) && (cb->aio_lio_opcode == LIO_WRITE)) {
729			PROC_LOCK(userp);
730			psignal(userp, SIGPIPE);
731			PROC_UNLOCK(userp);
732		}
733	}
734
735	cnt -= auio.uio_resid;
736	cb->_aiocb_private.error = error;
737	cb->_aiocb_private.status = cnt;
738}
739
740/*
741 * The AIO daemon: most of the actual work is done in aio_process(),
742 * but the setup (and address space management) is done in this routine.
743 */
744static void
745aio_daemon(void *uproc)
746{
747	int s;
748	struct aio_liojob *lj;
749	struct aiocb *cb;
750	struct aiocblist *aiocbe;
751	struct aiothreadlist *aiop;
752	struct kaioinfo *ki;
753	struct proc *curcp, *mycp, *userp;
754	struct vmspace *myvm, *tmpvm;
755	struct thread *td = curthread;
756
757	mtx_lock(&Giant);
758	/*
759	 * Local copies of curproc (mycp) and vmspace (myvm).
760	 */
761	mycp = td->td_proc;
762	myvm = mycp->p_vmspace;
763
764	if (mycp->p_textvp) {
765		vrele(mycp->p_textvp);
766		mycp->p_textvp = NULL;
767	}
768
769	/*
770	 * Allocate and ready the aio control info.  There is one aiop structure
771	 * per daemon.
772	 */
773	aiop = zalloc(aiop_zone);
774	aiop->aiothread = td;
775	aiop->aiothreadflags |= AIOP_FREE;
776	TAILQ_INIT(&aiop->jobtorun);
777
778	s = splnet();
779
780	/*
781	 * Place thread (lightweight process) onto the AIO free thread list.
782	 */
783	if (TAILQ_EMPTY(&aio_freeproc))
784		wakeup(&aio_freeproc);
785	TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);
786
787	splx(s);
788
789	/*
790	 * Get rid of our current file descriptors.  AIODs don't need any
791	 * file descriptors, except as temporarily inherited from the client.
792	 */
793	fdfree(td);
794	mycp->p_fd = NULL;
795
796	/* The daemon resides in its own pgrp. */
797	enterpgrp(mycp, mycp->p_pid, 1);
798
799	/* Mark special process type. */
800	mycp->p_flag |= P_SYSTEM;
801
802	/*
803	 * Wake up the parent process.  (Parent sleeps to keep from blasting away
804	 * and creating too many daemons.)
805	 */
806	wakeup(mycp);
807
808	for (;;) {
809		/*
810		 * curcp is the current daemon process context.
811		 * userp is the current user process context.
812		 */
813		curcp = mycp;
814
815		/*
816		 * Take daemon off of free queue
817		 */
818		if (aiop->aiothreadflags & AIOP_FREE) {
819			s = splnet();
820			TAILQ_REMOVE(&aio_freeproc, aiop, list);
821			TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list);
822			aiop->aiothreadflags &= ~AIOP_FREE;
823			splx(s);
824		}
825		aiop->aiothreadflags &= ~AIOP_SCHED;
826
827		/*
828		 * Check for jobs.
829		 */
830		while ((aiocbe = aio_selectjob(aiop)) != NULL) {
831			cb = &aiocbe->uaiocb;
832			userp = aiocbe->userproc;
833
834			aiocbe->jobstate = JOBST_JOBRUNNING;
835
836			/*
837			 * Connect to process address space for user program.
838			 */
839			if (userp != curcp) {
840				/*
841				 * Save the current address space that we are
842				 * connected to.
843				 */
844				tmpvm = mycp->p_vmspace;
845
846				/*
847				 * Point to the new user address space, and
848				 * refer to it.
849				 */
850				mycp->p_vmspace = userp->p_vmspace;
851				mycp->p_vmspace->vm_refcnt++;
852
853				/* Activate the new mapping. */
854				pmap_activate(&mycp->p_thread);
855
856				/*
857				 * If the old address space wasn't the daemon's
858				 * own address space, then we need to remove the
859				 * daemon's reference from the other process
860				 * that it was acting on behalf of.
861				 */
862				if (tmpvm != myvm) {
863					vmspace_free(tmpvm);
864				}
865
866				/*
867				 * Disassociate from the previous client's file
868				 * descriptors, and associate to the new client's
869				 * descriptors.  Note that the daemon doesn't
870				 * need to worry about its original descriptors,
871				 * because they were already freed at startup.
872				 */
873				if (mycp->p_fd)
874					fdfree(td);
875				mycp->p_fd = fdshare(userp);
876				curcp = userp;
877			}
878
879			ki = userp->p_aioinfo;
880			lj = aiocbe->lio;
881
882			/* Account for currently active jobs. */
883			ki->kaio_active_count++;
884
885			/* Do the I/O function. */
886			aiocbe->jobaiothread = aiop;
887			aio_process(aiocbe);
888
889			/* Decrement the active job count. */
890			ki->kaio_active_count--;
891
892			/*
893			 * Increment the completion count for wakeup/signal
894			 * comparisons.
895			 */
896			aiocbe->jobflags |= AIOCBLIST_DONE;
897			ki->kaio_queue_finished_count++;
898			if (lj)
899				lj->lioj_queue_finished_count++;
900			if ((ki->kaio_flags & KAIO_WAKEUP) || ((ki->kaio_flags
901			    & KAIO_RUNDOWN) && (ki->kaio_active_count == 0))) {
902				ki->kaio_flags &= ~KAIO_WAKEUP;
903				wakeup(userp);
904			}
905
906			s = splbio();
907			if (lj && (lj->lioj_flags &
908			    (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) == LIOJ_SIGNAL) {
909				if ((lj->lioj_queue_finished_count ==
910				    lj->lioj_queue_count) &&
911				    (lj->lioj_buffer_finished_count ==
912				    lj->lioj_buffer_count)) {
913					PROC_LOCK(userp);
914					psignal(userp,
915					    lj->lioj_signal.sigev_signo);
916					PROC_UNLOCK(userp);
917					lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
918				}
919			}
920			splx(s);
921
922			aiocbe->jobstate = JOBST_JOBFINISHED;
923
924			/*
925			 * If the I/O request should be automatically rundown,
926			 * do the needed cleanup.  Otherwise, place the queue
927			 * entry for the just finished I/O request into the done
928			 * queue for the associated client.
929			 */
930			s = splnet();
931			if (aiocbe->jobflags & AIOCBLIST_ASYNCFREE) {
932				aiocbe->jobflags &= ~AIOCBLIST_ASYNCFREE;
933				zfree(aiocb_zone, aiocbe);
934			} else {
935				TAILQ_REMOVE(&ki->kaio_jobqueue, aiocbe, plist);
936				TAILQ_INSERT_TAIL(&ki->kaio_jobdone, aiocbe,
937				    plist);
938			}
939			splx(s);
940			KNOTE(&aiocbe->klist, 0);
941
942			if (aiocbe->jobflags & AIOCBLIST_RUNDOWN) {
943				wakeup(aiocbe);
944				aiocbe->jobflags &= ~AIOCBLIST_RUNDOWN;
945			}
946
947			if (cb->aio_sigevent.sigev_notify == SIGEV_SIGNAL) {
948				PROC_LOCK(userp);
949				psignal(userp, cb->aio_sigevent.sigev_signo);
950				PROC_UNLOCK(userp);
951			}
952		}
953
954		/*
955		 * Disconnect from user address space.
956		 */
957		if (curcp != mycp) {
958			/* Get the user address space to disconnect from. */
959			tmpvm = mycp->p_vmspace;
960
961			/* Get original address space for daemon. */
962			mycp->p_vmspace = myvm;
963
964			/* Activate the daemon's address space. */
965			pmap_activate(&mycp->p_thread);
966#ifdef DIAGNOSTIC
967			if (tmpvm == myvm) {
968				printf("AIOD: vmspace problem -- %d\n",
969				    mycp->p_pid);
970			}
971#endif
972			/* Remove our vmspace reference. */
973			vmspace_free(tmpvm);
974
975			/*
976			 * Disassociate from the user process's file
977			 * descriptors.
978			 */
979			if (mycp->p_fd)
980				fdfree(td);
981			mycp->p_fd = NULL;
982			curcp = mycp;
983		}
984
985		/*
986		 * If we are the first to be put onto the free queue, wake up
987		 * anyone waiting for a daemon.
988		 */
989		s = splnet();
990		TAILQ_REMOVE(&aio_activeproc, aiop, list);
991		if (TAILQ_EMPTY(&aio_freeproc))
992			wakeup(&aio_freeproc);
993		TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);
994		aiop->aiothreadflags |= AIOP_FREE;
995		splx(s);
996
997		/*
998		 * If daemon is inactive for a long time, allow it to exit,
999		 * thereby freeing resources.
1000		 */
1001		if (((aiop->aiothreadflags & AIOP_SCHED) == 0) && tsleep(mycp,
1002		    PRIBIO, "aiordy", aiod_lifetime)) {
1003			s = splnet();
1004			if ((TAILQ_FIRST(&aio_jobs) == NULL) &&
1005			    (TAILQ_FIRST(&aiop->jobtorun) == NULL)) {
1006				if ((aiop->aiothreadflags & AIOP_FREE) &&
1007				    (num_aio_procs > target_aio_procs)) {
1008					TAILQ_REMOVE(&aio_freeproc, aiop, list);
1009					splx(s);
1010					zfree(aiop_zone, aiop);
1011					num_aio_procs--;
1012#ifdef DIAGNOSTIC
1013					if (mycp->p_vmspace->vm_refcnt <= 1) {
1014						printf("AIOD: bad vm refcnt for"
1015						    " exiting daemon: %d\n",
1016						    mycp->p_vmspace->vm_refcnt);
1017					}
1018#endif
1019					kthread_exit(0);
1020				}
1021			}
1022			splx(s);
1023		}
1024	}
1025}
1026
1027/*
1028 * Create a new AIO daemon.  This is mostly a kernel-thread fork routine.  The
1029 * AIO daemon modifies its environment itself.
1030 */
1031static int
1032aio_newproc()
1033{
1034	int error;
1035	struct proc *p;
1036
1037	error = kthread_create(aio_daemon, curproc, &p, RFNOWAIT, "aiod%d",
1038			       num_aio_procs);
1039	if (error)
1040		return error;
1041
1042	/*
1043	 * Wait until the daemon has started, but time out and continue anyway
1044	 * so that error conditions are handled.
1045	 */
1046	error = tsleep(p, PZERO, "aiosta", aiod_timeout);
1047
1048	num_aio_procs++;
1049
1050	return error;
1051}
1052
1053/*
1054 * Try the high-performance, low-overhead physio method for eligible
1055 * VCHR devices.  This method doesn't use an aio helper thread, and
1056 * thus has very low overhead.
1057 *
1058 * Assumes that the caller, _aio_aqueue(), has incremented the file
1059 * structure's reference count, preventing its deallocation for the
1060 * duration of this call.
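 *
 * Returns 0 if the request was queued as a physio, a positive errno on a
 * hard failure, and -1 to tell the caller to fall back to the daemon
 * (threaded) path instead.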
1061 */
1062static int
1063aio_qphysio(struct proc *p, struct aiocblist *aiocbe)
1064{
1065	int error;
1066	struct aiocb *cb;
1067	struct file *fp;
1068	struct buf *bp;
1069	struct vnode *vp;
1070	struct kaioinfo *ki;
1071	struct filedesc *fdp;
1072	struct aio_liojob *lj;
1073	int fd;
1074	int s;
1075	int notify;
1076
1077	cb = &aiocbe->uaiocb;
1078	fdp = p->p_fd;
1079	fd = cb->aio_fildes;
1080	fp = fdp->fd_ofiles[fd];
1081
1082	if (fp->f_type != DTYPE_VNODE)
1083		return (-1);
1084
1085	vp = (struct vnode *)fp->f_data;
1086
1087	/*
1088	 * If it's not a disk, we don't want to return a positive error.
1089	 * That would keep the aio code from falling through to try the
1090	 * threaded path when you're talking to a regular file.
1091	 */
1092	if (!vn_isdisk(vp, &error)) {
1093		if (error == ENOTBLK)
1094			return (-1);
1095		else
1096			return (error);
1097	}
1098
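	/*
	 * A physio is only attempted when the transfer is a multiple of the
	 * device's physical block size, fits within MAXPHYS once the user
	 * buffer's page offset is accounted for, and the process is under
	 * its physio buffer quota; otherwise fall back to the daemon path.
	 */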
1099 	if (cb->aio_nbytes % vp->v_rdev->si_bsize_phys)
1100		return (-1);
1101
1102	if (cb->aio_nbytes >
1103	    MAXPHYS - (((vm_offset_t) cb->aio_buf) & PAGE_MASK))
1104		return (-1);
1105
1106	ki = p->p_aioinfo;
1107	if (ki->kaio_buffer_count >= ki->kaio_ballowed_count)
1108		return (-1);
1109
1110	ki->kaio_buffer_count++;
1111
1112	lj = aiocbe->lio;
1113	if (lj)
1114		lj->lioj_buffer_count++;
1115
1116	/* Create and build a buffer header for a transfer. */
1117	bp = (struct buf *)getpbuf(NULL);
1118	BUF_KERNPROC(bp);
1119
1120	/*
1121	 * Get a copy of the kva from the physical buffer.
1122	 */
1123	bp->b_caller1 = p;
1124	bp->b_dev = vp->v_rdev;
1125	error = bp->b_error = 0;
1126
1127	bp->b_bcount = cb->aio_nbytes;
1128	bp->b_bufsize = cb->aio_nbytes;
1129	bp->b_flags = B_PHYS;
1130	bp->b_iodone = aio_physwakeup;
1131	bp->b_saveaddr = bp->b_data;
1132	bp->b_data = (void *)(uintptr_t)cb->aio_buf;
1133	bp->b_blkno = btodb(cb->aio_offset);
1134
1135	if (cb->aio_lio_opcode == LIO_WRITE) {
1136		bp->b_iocmd = BIO_WRITE;
1137		if (!useracc(bp->b_data, bp->b_bufsize, VM_PROT_READ)) {
1138			error = EFAULT;
1139			goto doerror;
1140		}
1141	} else {
1142		bp->b_iocmd = BIO_READ;
1143		if (!useracc(bp->b_data, bp->b_bufsize, VM_PROT_WRITE)) {
1144			error = EFAULT;
1145			goto doerror;
1146		}
1147	}
1148
1149	/* Bring buffer into kernel space. */
1150	vmapbuf(bp);
1151
1152	s = splbio();
1153	aiocbe->bp = bp;
1154	bp->b_spc = (void *)aiocbe;
1155	TAILQ_INSERT_TAIL(&aio_bufjobs, aiocbe, list);
1156	TAILQ_INSERT_TAIL(&ki->kaio_bufqueue, aiocbe, plist);
1157	aiocbe->jobstate = JOBST_JOBQBUF;
1158	cb->_aiocb_private.status = cb->aio_nbytes;
1159	num_buf_aio++;
1160	bp->b_error = 0;
1161
1162	splx(s);
1163
1164	/* Perform transfer. */
1165	DEV_STRATEGY(bp, 0);
1166
1167	notify = 0;
1168	s = splbio();
1169
1170	/*
1171	 * If we had an error invoking the request, or an error in processing
1172	 * the request before we have returned, we process it as an error in
1173	 * transfer.  Note that such an I/O error is not indicated immediately,
1174	 * but is returned using the aio_error mechanism.  In this case,
1175	 * aio_suspend will return immediately.
1176	 */
1177	if (bp->b_error || (bp->b_ioflags & BIO_ERROR)) {
1178		struct aiocb *job = aiocbe->uuaiocb;
1179
1180		aiocbe->uaiocb._aiocb_private.status = 0;
1181		suword(&job->_aiocb_private.status, 0);
1182		aiocbe->uaiocb._aiocb_private.error = bp->b_error;
1183		suword(&job->_aiocb_private.error, bp->b_error);
1184
1185		ki->kaio_buffer_finished_count++;
1186
1187		if (aiocbe->jobstate != JOBST_JOBBFINISHED) {
1188			aiocbe->jobstate = JOBST_JOBBFINISHED;
1189			aiocbe->jobflags |= AIOCBLIST_DONE;
1190			TAILQ_REMOVE(&aio_bufjobs, aiocbe, list);
1191			TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist);
1192			TAILQ_INSERT_TAIL(&ki->kaio_bufdone, aiocbe, plist);
1193			notify = 1;
1194		}
1195	}
1196	splx(s);
1197	if (notify)
1198		KNOTE(&aiocbe->klist, 0);
1199	return 0;
1200
1201doerror:
1202	ki->kaio_buffer_count--;
1203	if (lj)
1204		lj->lioj_buffer_count--;
1205	aiocbe->bp = NULL;
1206	relpbuf(bp, NULL);
1207	return error;
1208}
1209
1210/*
1211 * This waits/tests physio completion.
1212 */
1213static int
1214aio_fphysio(struct aiocblist *iocb)
1215{
1216	int s;
1217	struct buf *bp;
1218	int error;
1219
1220	bp = iocb->bp;
1221
1222	s = splbio();
1223	while ((bp->b_flags & B_DONE) == 0) {
1224		if (tsleep(bp, PRIBIO, "physstr", aiod_timeout)) {
1225			if ((bp->b_flags & B_DONE) == 0) {
1226				splx(s);
1227				return EINPROGRESS;
1228			} else
1229				break;
1230		}
1231	}
1232	splx(s);
1233
1234	/* Release mapping into kernel space. */
1235	vunmapbuf(bp);
1236	iocb->bp = 0;
1237
1238	error = 0;
1239
1240	/* Check for an error. */
1241	if (bp->b_ioflags & BIO_ERROR)
1242		error = bp->b_error;
1243
1244	relpbuf(bp, NULL);
1245	return (error);
1246}
1247
1248/*
1249 * Wake up aio requests that may be serviceable now.
1250 */
1251static void
1252aio_swake_cb(struct socket *so, struct sockbuf *sb)
1253{
1254	struct aiocblist *cb,*cbn;
1255	struct proc *p;
1256	struct kaioinfo *ki = NULL;
1257	int opcode, wakecount = 0;
1258	struct aiothreadlist *aiop;
1259
1260	if (sb == &so->so_snd) {
1261		opcode = LIO_WRITE;
1262		so->so_snd.sb_flags &= ~SB_AIO;
1263	} else {
1264		opcode = LIO_READ;
1265		so->so_rcv.sb_flags &= ~SB_AIO;
1266	}
1267
1268	for (cb = TAILQ_FIRST(&so->so_aiojobq); cb; cb = cbn) {
1269		cbn = TAILQ_NEXT(cb, list);
1270		if (opcode == cb->uaiocb.aio_lio_opcode) {
1271			p = cb->userproc;
1272			ki = p->p_aioinfo;
1273			TAILQ_REMOVE(&so->so_aiojobq, cb, list);
1274			TAILQ_REMOVE(&ki->kaio_sockqueue, cb, plist);
1275			TAILQ_INSERT_TAIL(&aio_jobs, cb, list);
1276			TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, cb, plist);
1277			wakecount++;
1278			if (cb->jobstate != JOBST_JOBQGLOBAL)
1279				panic("invalid queue value");
1280		}
1281	}
1282
1283	while (wakecount--) {
1284		if ((aiop = TAILQ_FIRST(&aio_freeproc)) != 0) {
1285			TAILQ_REMOVE(&aio_freeproc, aiop, list);
1286			TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list);
1287			aiop->aiothreadflags &= ~AIOP_FREE;
1288			wakeup(aiop->aiothread);
1289		}
1290	}
1291}
1292
1293/*
1294 * Queue a new AIO request.  The choice between the threaded and the direct
1295 * physio (VCHR) technique is made in this code.
1296 */
1297static int
1298_aio_aqueue(struct thread *td, struct aiocb *job, struct aio_liojob *lj, int type)
1299{
1300	struct proc *p = td->td_proc;
1301	struct filedesc *fdp;
1302	struct file *fp;
1303	unsigned int fd;
1304	struct socket *so;
1305	int s;
1306	int error;
1307	int opcode;
1308	struct aiocblist *aiocbe;
1309	struct aiothreadlist *aiop;
1310	struct kaioinfo *ki;
1311	struct kevent kev;
1312	struct kqueue *kq;
1313	struct file *kq_fp;
1314
1315	aiocbe = zalloc(aiocb_zone);
1316	aiocbe->inputcharge = 0;
1317	aiocbe->outputcharge = 0;
1318	callout_handle_init(&aiocbe->timeouthandle);
1319	SLIST_INIT(&aiocbe->klist);
1320
1321	suword(&job->_aiocb_private.status, -1);
1322	suword(&job->_aiocb_private.error, 0);
1323	suword(&job->_aiocb_private.kernelinfo, -1);
1324
1325	error = copyin(job, &aiocbe->uaiocb, sizeof(aiocbe->uaiocb));
1326	if (error) {
1327		suword(&job->_aiocb_private.error, error);
1328		zfree(aiocb_zone, aiocbe);
1329		return error;
1330	}
1331	if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL &&
1332		!_SIG_VALID(aiocbe->uaiocb.aio_sigevent.sigev_signo)) {
1333		zfree(aiocb_zone, aiocbe);
1334		return EINVAL;
1335	}
1336
1337	/* Save userspace address of the job info. */
1338	aiocbe->uuaiocb = job;
1339
1340	/* Get the opcode. */
1341	if (type != LIO_NOP)
1342		aiocbe->uaiocb.aio_lio_opcode = type;
1343	opcode = aiocbe->uaiocb.aio_lio_opcode;
1344
1345	/* Get the fd info for process. */
1346	fdp = p->p_fd;
1347
1348	/*
1349	 * Range check file descriptor.
1350	 */
1351	fd = aiocbe->uaiocb.aio_fildes;
1352	if (fd >= fdp->fd_nfiles) {
1353		zfree(aiocb_zone, aiocbe);
1354		if (type == 0)
1355			suword(&job->_aiocb_private.error, EBADF);
1356		return EBADF;
1357	}
1358
1359	fp = aiocbe->fd_file = fdp->fd_ofiles[fd];
1360	if ((fp == NULL) || ((opcode == LIO_WRITE) && ((fp->f_flag & FWRITE) ==
1361	    0))) {
1362		zfree(aiocb_zone, aiocbe);
1363		if (type == 0)
1364			suword(&job->_aiocb_private.error, EBADF);
1365		return EBADF;
1366	}
1367
1368	if (aiocbe->uaiocb.aio_offset == -1LL) {
1369		zfree(aiocb_zone, aiocbe);
1370		if (type == 0)
1371			suword(&job->_aiocb_private.error, EINVAL);
1372		return EINVAL;
1373	}
1374
1375	error = suword(&job->_aiocb_private.kernelinfo, jobrefid);
1376	if (error) {
1377		zfree(aiocb_zone, aiocbe);
1378		if (type == 0)
1379			suword(&job->_aiocb_private.error, EINVAL);
1380		return error;
1381	}
1382
1383	aiocbe->uaiocb._aiocb_private.kernelinfo = (void *)(intptr_t)jobrefid;
1384	if (jobrefid == LONG_MAX)
1385		jobrefid = 1;
1386	else
1387		jobrefid++;
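	/*
	 * jobrefid wraps back to 1 rather than 0, since values of 0 and -1
	 * in _aiocb_private.kernelinfo are treated as "no job" by
	 * aio_return() and aio_error().
	 */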
1388
1389	if (opcode == LIO_NOP) {
1390		zfree(aiocb_zone, aiocbe);
1391		if (type == 0) {
1392			suword(&job->_aiocb_private.error, 0);
1393			suword(&job->_aiocb_private.status, 0);
1394			suword(&job->_aiocb_private.kernelinfo, 0);
1395		}
1396		return 0;
1397	}
1398
1399	if ((opcode != LIO_READ) && (opcode != LIO_WRITE)) {
1400		zfree(aiocb_zone, aiocbe);
1401		if (type == 0) {
1402			suword(&job->_aiocb_private.status, 0);
1403			suword(&job->_aiocb_private.error, EINVAL);
1404		}
1405		return EINVAL;
1406	}
1407
1408	fhold(fp);
1409
1410	if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_KEVENT) {
1411		kev.ident = aiocbe->uaiocb.aio_sigevent.sigev_notify_kqueue;
1412		kev.udata = aiocbe->uaiocb.aio_sigevent.sigev_value.sigval_ptr;
1413	}
1414	else {
1415		/*
1416		 * This method for requesting kevent-based notification won't
1417		 * work on the alpha, since we're passing in a pointer
1418		 * via aio_lio_opcode, which is an int.  Use the SIGEV_KEVENT-
1419		 * based method instead.
1420		 */
1421		struct kevent *kevp;
1422
1423		kevp = (struct kevent *)(uintptr_t)job->aio_lio_opcode;
1424		if (kevp == NULL)
1425			goto no_kqueue;
1426
1427		error = copyin(kevp, &kev, sizeof(kev));
1428		if (error)
1429			goto aqueue_fail;
1430	}
1431	if ((u_int)kev.ident >= fdp->fd_nfiles ||
1432	    (kq_fp = fdp->fd_ofiles[kev.ident]) == NULL ||
1433	    (kq_fp->f_type != DTYPE_KQUEUE)) {
1434		error = EBADF;
1435		goto aqueue_fail;
1436	}
1437	kq = (struct kqueue *)kq_fp->f_data;
1438	kev.ident = (uintptr_t)aiocbe;
1439	kev.filter = EVFILT_AIO;
1440	kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1;
1441	error = kqueue_register(kq, &kev, td);
1442aqueue_fail:
1443	if (error) {
1444		zfree(aiocb_zone, aiocbe);
1445		if (type == 0)
1446			suword(&job->_aiocb_private.error, error);
1447		goto done;
1448	}
1449no_kqueue:
1450
1451	suword(&job->_aiocb_private.error, EINPROGRESS);
1452	aiocbe->uaiocb._aiocb_private.error = EINPROGRESS;
1453	aiocbe->userproc = p;
1454	aiocbe->jobflags = 0;
1455	aiocbe->lio = lj;
1456	ki = p->p_aioinfo;
1457
1458	if (fp->f_type == DTYPE_SOCKET) {
1459		/*
1460		 * Alternate queueing for socket ops: Reach down into the
1461		 * descriptor to get the socket data.  Then check to see if the
1462		 * socket is ready to be read or written (based on the requested
1463		 * operation).
1464		 *
1465		 * If it is not ready for I/O, then queue the aiocbe on the
1466		 * socket, and set the flags so we get a call when sbnotify()
1467		 * happens.
1468		 */
1469		so = (struct socket *)fp->f_data;
1470		s = splnet();
1471		if (((opcode == LIO_READ) && (!soreadable(so))) || ((opcode ==
1472		    LIO_WRITE) && (!sowriteable(so)))) {
1473			TAILQ_INSERT_TAIL(&so->so_aiojobq, aiocbe, list);
1474			TAILQ_INSERT_TAIL(&ki->kaio_sockqueue, aiocbe, plist);
1475			if (opcode == LIO_READ)
1476				so->so_rcv.sb_flags |= SB_AIO;
1477			else
1478				so->so_snd.sb_flags |= SB_AIO;
1479			aiocbe->jobstate = JOBST_JOBQGLOBAL; /* XXX */
1480			ki->kaio_queue_count++;
1481			num_queue_count++;
1482			splx(s);
1483			error = 0;
1484			goto done;
1485		}
1486		splx(s);
1487	}
1488
1489	if ((error = aio_qphysio(p, aiocbe)) == 0)
1490		goto done;
1491	if (error > 0) {
1492		suword(&job->_aiocb_private.status, 0);
1493		aiocbe->uaiocb._aiocb_private.error = error;
1494		suword(&job->_aiocb_private.error, error);
1495		goto done;
1496	}
1497
1498	/* No buffer for daemon I/O. */
1499	aiocbe->bp = NULL;
1500
1501	ki->kaio_queue_count++;
1502	if (lj)
1503		lj->lioj_queue_count++;
1504	s = splnet();
1505	TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, aiocbe, plist);
1506	TAILQ_INSERT_TAIL(&aio_jobs, aiocbe, list);
1507	splx(s);
1508	aiocbe->jobstate = JOBST_JOBQGLOBAL;
1509
1510	num_queue_count++;
1511	error = 0;
1512
1513	/*
1514	 * If we don't have a free AIO process, and we are below our quota, then
1515	 * start one.  Otherwise, depend on the subsequent I/O completions to
1516	 * pick up this job.  If we don't successfully create the new process
1517	 * (thread) due to resource issues, we return an error for now (EAGAIN),
1518	 * which is likely not the correct thing to do.
1519	 */
1520	s = splnet();
1521retryproc:
1522	if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) {
1523		TAILQ_REMOVE(&aio_freeproc, aiop, list);
1524		TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list);
1525		aiop->aiothreadflags &= ~AIOP_FREE;
1526		wakeup(aiop->aiothread);
1527	} else if (((num_aio_resv_start + num_aio_procs) < max_aio_procs) &&
1528	    ((ki->kaio_active_count + num_aio_resv_start) <
1529	    ki->kaio_maxactive_count)) {
1530		num_aio_resv_start++;
1531		if ((error = aio_newproc()) == 0) {
1532			num_aio_resv_start--;
1533			td->td_retval[0] = 0;
1534			goto retryproc;
1535		}
1536		num_aio_resv_start--;
1537	}
1538	splx(s);
1539done:
1540	fdrop(fp, td);
1541	return error;
1542}
1543
1544/*
1545 * This routine queues an AIO request, checking for quotas.
1546 */
1547static int
1548aio_aqueue(struct thread *td, struct aiocb *job, int type)
1549{
1550	struct proc *p = td->td_proc;
1551	struct kaioinfo *ki;
1552
1553	if (p->p_aioinfo == NULL)
1554		aio_init_aioinfo(p);
1555
1556	if (num_queue_count >= max_queue_count)
1557		return EAGAIN;
1558
1559	ki = p->p_aioinfo;
1560	if (ki->kaio_queue_count >= ki->kaio_qallowed_count)
1561		return EAGAIN;
1562
1563	return _aio_aqueue(td, job, NULL, type);
1564}
1565
1566/*
1567 * Support the aio_return system call; as a side effect, kernel resources are
1568 * released.
1569 */
1570int
1571aio_return(struct thread *td, struct aio_return_args *uap)
1572{
1573	struct proc *p = td->td_proc;
1574	int s;
1575	int jobref;
1576	struct aiocblist *cb, *ncb;
1577	struct aiocb *ujob;
1578	struct kaioinfo *ki;
1579
1580	ki = p->p_aioinfo;
1581	if (ki == NULL)
1582		return EINVAL;
1583
1584	ujob = uap->aiocbp;
1585
1586	jobref = fuword(&ujob->_aiocb_private.kernelinfo);
1587	if (jobref == -1 || jobref == 0)
1588		return EINVAL;
1589
1590	TAILQ_FOREACH(cb, &ki->kaio_jobdone, plist) {
1591		if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) ==
1592		    jobref) {
1593			if (ujob == cb->uuaiocb) {
1594				td->td_retval[0] =
1595				    cb->uaiocb._aiocb_private.status;
1596			} else
1597				td->td_retval[0] = EFAULT;
1598			if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
1599				p->p_stats->p_ru.ru_oublock +=
1600				    cb->outputcharge;
1601				cb->outputcharge = 0;
1602			} else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
1603				p->p_stats->p_ru.ru_inblock += cb->inputcharge;
1604				cb->inputcharge = 0;
1605			}
1606			aio_free_entry(cb);
1607			return 0;
1608		}
1609	}
1610	s = splbio();
1611	for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb = ncb) {
1612		ncb = TAILQ_NEXT(cb, plist);
1613		if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo)
1614		    == jobref) {
1615			splx(s);
1616			if (ujob == cb->uuaiocb) {
1617				td->td_retval[0] =
1618				    cb->uaiocb._aiocb_private.status;
1619			} else
1620				td->td_retval[0] = EFAULT;
1621			aio_free_entry(cb);
1622			return 0;
1623		}
1624	}
1625	splx(s);
1626
1627	return (EINVAL);
1628}
1629
1630/*
1631 * Allow a process to wake up when any of its I/O requests has completed.
1632 */
1633int
1634aio_suspend(struct thread *td, struct aio_suspend_args *uap)
1635{
1636	struct proc *p = td->td_proc;
1637	struct timeval atv;
1638	struct timespec ts;
1639	struct aiocb *const *cbptr, *cbp;
1640	struct kaioinfo *ki;
1641	struct aiocblist *cb;
1642	int i;
1643	int njoblist;
1644	int error, s, timo;
1645	int *ijoblist;
1646	struct aiocb **ujoblist;
1647
1648	if (uap->nent > AIO_LISTIO_MAX)
1649		return EINVAL;
1650
1651	timo = 0;
1652	if (uap->timeout) {
1653		/* Get timespec struct. */
1654		if ((error = copyin(uap->timeout, &ts, sizeof(ts))) != 0)
1655			return error;
1656
1657		if (ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000)
1658			return (EINVAL);
1659
1660		TIMESPEC_TO_TIMEVAL(&atv, &ts);
1661		if (itimerfix(&atv))
1662			return (EINVAL);
1663		timo = tvtohz(&atv);
1664	}
1665
1666	ki = p->p_aioinfo;
1667	if (ki == NULL)
1668		return EAGAIN;
1669
1670	njoblist = 0;
1671	ijoblist = zalloc(aiol_zone);
1672	ujoblist = zalloc(aiol_zone);
1673	cbptr = uap->aiocbp;
1674
1675	for (i = 0; i < uap->nent; i++) {
1676		cbp = (struct aiocb *)(intptr_t)fuword((caddr_t)&cbptr[i]);
1677		if (cbp == 0)
1678			continue;
1679		ujoblist[njoblist] = cbp;
1680		ijoblist[njoblist] = fuword(&cbp->_aiocb_private.kernelinfo);
1681		njoblist++;
1682	}
1683
1684	if (njoblist == 0) {
1685		zfree(aiol_zone, ijoblist);
1686		zfree(aiol_zone, ujoblist);
1687		return 0;
1688	}
1689
1690	error = 0;
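	/*
	 * Scan the done queues for any of the requested jobs; if none has
	 * finished yet, sleep (optionally with a timeout) and rescan on
	 * wakeup.
	 */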
1691	for (;;) {
1692		TAILQ_FOREACH(cb, &ki->kaio_jobdone, plist) {
1693			for (i = 0; i < njoblist; i++) {
1694				if (((intptr_t)
1695				    cb->uaiocb._aiocb_private.kernelinfo) ==
1696				    ijoblist[i]) {
1697					if (ujoblist[i] != cb->uuaiocb)
1698						error = EINVAL;
1699					zfree(aiol_zone, ijoblist);
1700					zfree(aiol_zone, ujoblist);
1701					return error;
1702				}
1703			}
1704		}
1705
1706		s = splbio();
1707		for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb =
1708		    TAILQ_NEXT(cb, plist)) {
1709			for (i = 0; i < njoblist; i++) {
1710				if (((intptr_t)
1711				    cb->uaiocb._aiocb_private.kernelinfo) ==
1712				    ijoblist[i]) {
1713					splx(s);
1714					if (ujoblist[i] != cb->uuaiocb)
1715						error = EINVAL;
1716					zfree(aiol_zone, ijoblist);
1717					zfree(aiol_zone, ujoblist);
1718					return error;
1719				}
1720			}
1721		}
1722
1723		ki->kaio_flags |= KAIO_WAKEUP;
1724		error = tsleep(p, PRIBIO | PCATCH, "aiospn", timo);
1725		splx(s);
1726
1727		if (error == ERESTART || error == EINTR) {
1728			zfree(aiol_zone, ijoblist);
1729			zfree(aiol_zone, ujoblist);
1730			return EINTR;
1731		} else if (error == EWOULDBLOCK) {
1732			zfree(aiol_zone, ijoblist);
1733			zfree(aiol_zone, ujoblist);
1734			return EAGAIN;
1735		}
1736	}
1737
1738/* NOTREACHED */
1739	return EINVAL;
1740}
1741
1742/*
1743 * aio_cancel cancels any non-physio aio operations not currently in
1744 * progress.
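 *
 * Returns (via td_retval) AIO_NOTCANCELED if a matching request could not be
 * cancelled (disk physio requests are never cancelled here), AIO_CANCELED if
 * the matching requests were cancelled, and AIO_ALLDONE if every matching
 * request had already completed.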
1745 */
1746int
1747aio_cancel(struct thread *td, struct aio_cancel_args *uap)
1748{
1749	struct proc *p = td->td_proc;
1750	struct kaioinfo *ki;
1751	struct aiocblist *cbe, *cbn;
1752	struct file *fp;
1753	struct filedesc *fdp;
1754	struct socket *so;
1755	struct proc *po;
1756	int s,error;
1757	int cancelled=0;
1758	int notcancelled=0;
1759	struct vnode *vp;
1760
1761	fdp = p->p_fd;
1762	if ((u_int)uap->fd >= fdp->fd_nfiles ||
1763	    (fp = fdp->fd_ofiles[uap->fd]) == NULL)
1764		return (EBADF);
1765
1766	if (fp->f_type == DTYPE_VNODE) {
1767		vp = (struct vnode *)fp->f_data;
1768
1769		if (vn_isdisk(vp, &error)) {
1770			td->td_retval[0] = AIO_NOTCANCELED;
1771			return 0;
1772		}
1773	} else if (fp->f_type == DTYPE_SOCKET) {
1774		so = (struct socket *)fp->f_data;
1775
1776		s = splnet();
1777
1778		for (cbe = TAILQ_FIRST(&so->so_aiojobq); cbe; cbe = cbn) {
1779			cbn = TAILQ_NEXT(cbe, list);
1780			if ((uap->aiocbp == NULL) ||
1781				(uap->aiocbp == cbe->uuaiocb) ) {
1782				po = cbe->userproc;
1783				ki = po->p_aioinfo;
1784				TAILQ_REMOVE(&so->so_aiojobq, cbe, list);
1785				TAILQ_REMOVE(&ki->kaio_sockqueue, cbe, plist);
1786				TAILQ_INSERT_TAIL(&ki->kaio_jobdone, cbe, plist);
1787				if (ki->kaio_flags & KAIO_WAKEUP) {
1788					wakeup(po);
1789				}
1790				cbe->jobstate = JOBST_JOBFINISHED;
1791				cbe->uaiocb._aiocb_private.status=-1;
1792				cbe->uaiocb._aiocb_private.error=ECANCELED;
1793				cancelled++;
1794/* XXX cancelled, knote? */
1795			        if (cbe->uaiocb.aio_sigevent.sigev_notify ==
1796				    SIGEV_SIGNAL) {
1797					PROC_LOCK(cbe->userproc);
1798					psignal(cbe->userproc, cbe->uaiocb.aio_sigevent.sigev_signo);
1799					PROC_UNLOCK(cbe->userproc);
1800				}
1801				if (uap->aiocbp)
1802					break;
1803			}
1804		}
1805		splx(s);
1806
1807		if ((cancelled) && (uap->aiocbp)) {
1808			td->td_retval[0] = AIO_CANCELED;
1809			return 0;
1810		}
1811	}
1812	ki=p->p_aioinfo;
1813	s = splnet();
1814
1815	for (cbe = TAILQ_FIRST(&ki->kaio_jobqueue); cbe; cbe = cbn) {
1816		cbn = TAILQ_NEXT(cbe, plist);
1817
1818		if ((uap->fd == cbe->uaiocb.aio_fildes) &&
1819		    ((uap->aiocbp == NULL ) ||
1820		     (uap->aiocbp == cbe->uuaiocb))) {
1821
1822			if (cbe->jobstate == JOBST_JOBQGLOBAL) {
1823				TAILQ_REMOVE(&aio_jobs, cbe, list);
1824				TAILQ_REMOVE(&ki->kaio_jobqueue, cbe, plist);
1825				TAILQ_INSERT_TAIL(&ki->kaio_jobdone, cbe,
1826				    plist);
1827				cancelled++;
1828				ki->kaio_queue_finished_count++;
1829				cbe->jobstate = JOBST_JOBFINISHED;
1830				cbe->uaiocb._aiocb_private.status = -1;
1831				cbe->uaiocb._aiocb_private.error = ECANCELED;
1832/* XXX cancelled, knote? */
1833			        if (cbe->uaiocb.aio_sigevent.sigev_notify ==
1834				    SIGEV_SIGNAL) {
1835					PROC_LOCK(cbe->userproc);
1836					psignal(cbe->userproc, cbe->uaiocb.aio_sigevent.sigev_signo);
1837					PROC_UNLOCK(cbe->userproc);
1838				}
1839			} else {
1840				notcancelled++;
1841			}
1842		}
1843	}
1844	splx(s);
1845
1846	if (notcancelled) {
1847		td->td_retval[0] = AIO_NOTCANCELED;
1848		return 0;
1849	}
1850	if (cancelled) {
1851		td->td_retval[0] = AIO_CANCELED;
1852		return 0;
1853	}
1854	td->td_retval[0] = AIO_ALLDONE;
1855
1856	return 0;
1857}
1858
1859/*
1860 * aio_error is implemented at the kernel level for compatibility purposes only.
1861 * For a user mode async implementation, it would be best to do it in a userland
1862 * subroutine.
1863 */
1864int
1865aio_error(struct thread *td, struct aio_error_args *uap)
1866{
1867	struct proc *p = td->td_proc;
1868	int s;
1869	struct aiocblist *cb;
1870	struct kaioinfo *ki;
1871	int jobref;
1872
1873	ki = p->p_aioinfo;
1874	if (ki == NULL)
1875		return EINVAL;
1876
1877	jobref = fuword(&uap->aiocbp->_aiocb_private.kernelinfo);
1878	if ((jobref == -1) || (jobref == 0))
1879		return EINVAL;
1880
1881	TAILQ_FOREACH(cb, &ki->kaio_jobdone, plist) {
1882		if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
1883		    jobref) {
1884			td->td_retval[0] = cb->uaiocb._aiocb_private.error;
1885			return 0;
1886		}
1887	}
1888
1889	s = splnet();
1890
1891	for (cb = TAILQ_FIRST(&ki->kaio_jobqueue); cb; cb = TAILQ_NEXT(cb,
1892	    plist)) {
1893		if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
1894		    jobref) {
1895			td->td_retval[0] = EINPROGRESS;
1896			splx(s);
1897			return 0;
1898		}
1899	}
1900
1901	for (cb = TAILQ_FIRST(&ki->kaio_sockqueue); cb; cb = TAILQ_NEXT(cb,
1902	    plist)) {
1903		if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
1904		    jobref) {
1905			td->td_retval[0] = EINPROGRESS;
1906			splx(s);
1907			return 0;
1908		}
1909	}
1910	splx(s);
1911
1912	s = splbio();
1913	for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb = TAILQ_NEXT(cb,
1914	    plist)) {
1915		if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
1916		    jobref) {
1917			td->td_retval[0] = cb->uaiocb._aiocb_private.error;
1918			splx(s);
1919			return 0;
1920		}
1921	}
1922
1923	for (cb = TAILQ_FIRST(&ki->kaio_bufqueue); cb; cb = TAILQ_NEXT(cb,
1924	    plist)) {
1925		if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
1926		    jobref) {
1927			td->td_retval[0] = EINPROGRESS;
1928			splx(s);
1929			return 0;
1930		}
1931	}
1932	splx(s);
1933
1934#if (0)
1935	/*
1936	 * Hack for lio.
1937	 */
1938	status = fuword(&uap->aiocbp->_aiocb_private.status);
1939	if (status == -1)
1940		return fuword(&uap->aiocbp->_aiocb_private.error);
1941#endif
1942	return EINVAL;
1943}
1944
1945int
1946aio_read(struct thread *td, struct aio_read_args *uap)
1947{
1948
1949	return aio_aqueue(td, uap->aiocbp, LIO_READ);
1950}
1951
1952int
1953aio_write(struct thread *td, struct aio_write_args *uap)
1954{
1955
1956	return aio_aqueue(td, uap->aiocbp, LIO_WRITE);
1957}
1958
1959int
1960lio_listio(struct thread *td, struct lio_listio_args *uap)
1961{
1962	struct proc *p = td->td_proc;
1963	int nent, nentqueued;
1964	struct aiocb *iocb, * const *cbptr;
1965	struct aiocblist *cb;
1966	struct kaioinfo *ki;
1967	struct aio_liojob *lj;
1968	int error, runningcode;
1969	int nerror;
1970	int i;
1971	int s;
1972
1973	if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
1974		return EINVAL;
1975
1976	nent = uap->nent;
1977	if (nent > AIO_LISTIO_MAX)
1978		return EINVAL;
1979
1980	if (p->p_aioinfo == NULL)
1981		aio_init_aioinfo(p);
1982
1983	if ((nent + num_queue_count) > max_queue_count)
1984		return EAGAIN;
1985
1986	ki = p->p_aioinfo;
1987	if ((nent + ki->kaio_queue_count) > ki->kaio_qallowed_count)
1988		return EAGAIN;
1989
1990	lj = zalloc(aiolio_zone);
1991	if (!lj)
1992		return EAGAIN;
1993
1994	lj->lioj_flags = 0;
1995	lj->lioj_buffer_count = 0;
1996	lj->lioj_buffer_finished_count = 0;
1997	lj->lioj_queue_count = 0;
1998	lj->lioj_queue_finished_count = 0;
1999	lj->lioj_ki = ki;
2000
2001	/*
2002	 * Setup signal.
2003	 */
2004	if (uap->sig && (uap->mode == LIO_NOWAIT)) {
2005		error = copyin(uap->sig, &lj->lioj_signal,
2006			       sizeof(lj->lioj_signal));
2007		if (error) {
2008			zfree(aiolio_zone, lj);
2009			return error;
2010		}
2011		if (!_SIG_VALID(lj->lioj_signal.sigev_signo)) {
2012			zfree(aiolio_zone, lj);
2013			return EINVAL;
2014		}
2015		lj->lioj_flags |= LIOJ_SIGNAL;
2016		lj->lioj_flags &= ~LIOJ_SIGNAL_POSTED;
2017	} else
2018		lj->lioj_flags &= ~LIOJ_SIGNAL;
2019
2020	TAILQ_INSERT_TAIL(&ki->kaio_liojoblist, lj, lioj_list);
2021	/*
2022	 * Get pointers to the list of I/O requests.
2023	 */
2024	nerror = 0;
2025	nentqueued = 0;
2026	cbptr = uap->acb_list;
2027	for (i = 0; i < uap->nent; i++) {
2028		iocb = (struct aiocb *)(intptr_t)fuword((caddr_t)&cbptr[i]);
2029		if (((intptr_t)iocb != -1) && ((intptr_t)iocb != 0)) {
2030			error = _aio_aqueue(td, iocb, lj, 0);
2031			if (error == 0)
2032				nentqueued++;
2033			else
2034				nerror++;
2035		}
2036	}
2037
2038	/*
2039	 * If we haven't queued any requests, just return without an error.
2040	 */
2041	if (nentqueued == 0)
2042		return 0;
2043
2044	/*
2045	 * Calculate the appropriate error return.
2046	 */
2047	runningcode = 0;
2048	if (nerror)
2049		runningcode = EIO;
2050
2051	if (uap->mode == LIO_WAIT) {
2052		int command, found, jobref;
2053
2054		for (;;) {
2055			found = 0;
2056			for (i = 0; i < uap->nent; i++) {
2057				/*
2058				 * Fetch the next control buffer pointer from
2059				 * the user-space list.
2060				 */
2061				iocb = (struct aiocb *)(intptr_t)fuword((caddr_t)&cbptr[i]);
2062				if (((intptr_t)iocb == -1) || ((intptr_t)iocb
2063				    == 0))
2064					continue;
2065
2066				/*
2067				 * Fetch the associated command from user space.
2068				 */
2069				command = fuword(&iocb->aio_lio_opcode);
2070				if (command == LIO_NOP) {
2071					found++;
2072					continue;
2073				}
2074
2075				jobref = fuword(&iocb->_aiocb_private.kernelinfo);
2076
2077				TAILQ_FOREACH(cb, &ki->kaio_jobdone, plist) {
2078					if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo)
2079					    == jobref) {
2080						if (cb->uaiocb.aio_lio_opcode
2081						    == LIO_WRITE) {
2082							p->p_stats->p_ru.ru_oublock
2083							    +=
2084							    cb->outputcharge;
2085							cb->outputcharge = 0;
2086						} else if (cb->uaiocb.aio_lio_opcode
2087						    == LIO_READ) {
2088							p->p_stats->p_ru.ru_inblock
2089							    += cb->inputcharge;
2090							cb->inputcharge = 0;
2091						}
2092						found++;
2093						break;
2094					}
2095				}
2096
2097				s = splbio();
2098				TAILQ_FOREACH(cb, &ki->kaio_bufdone, plist) {
2099					if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo)
2100					    == jobref) {
2101						found++;
2102						break;
2103					}
2104				}
2105				splx(s);
2106			}
2107
2108			/*
2109			 * If all I/Os have been disposed of, then we can
2110			 * return.
2111			 */
2112			if (found == nentqueued)
2113				return runningcode;
2114
2115			ki->kaio_flags |= KAIO_WAKEUP;
2116			error = tsleep(p, PRIBIO | PCATCH, "aiospn", 0);
2117
2118			if (error == EINTR)
2119				return EINTR;
2120			else if (error == EWOULDBLOCK)
2121				return EAGAIN;
2122		}
2123	}
2124
2125	return runningcode;
2126}
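/*
 * Illustrative userland usage of lio_listio (a sketch, not part of this
 * file); 'fd', 'inbuf' and 'outbuf' are assumed to exist:
 *
 *	struct aiocb rd, wr;
 *	struct aiocb *list[2] = { &rd, &wr };
 *
 *	bzero(&rd, sizeof(rd));
 *	rd.aio_fildes = fd;
 *	rd.aio_buf = inbuf;
 *	rd.aio_nbytes = sizeof(inbuf);
 *	rd.aio_lio_opcode = LIO_READ;
 *	bzero(&wr, sizeof(wr));
 *	wr.aio_fildes = fd;
 *	wr.aio_buf = outbuf;
 *	wr.aio_nbytes = sizeof(outbuf);
 *	wr.aio_offset = sizeof(inbuf);
 *	wr.aio_lio_opcode = LIO_WRITE;
 *	if (lio_listio(LIO_WAIT, list, 2, NULL) == -1)
 *		err(1, "lio_listio");
 */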
2127
2128/*
2129 * This is a hack to defer signal posting to a timeout routine: it is safe to
2130 * post a signal from a timeout routine, but *not* from an interrupt routine.
2131 */
2132static void
2133process_signal(void *aioj)
2134{
2135	struct aiocblist *aiocbe = aioj;
2136	struct aio_liojob *lj = aiocbe->lio;
2137	struct aiocb *cb = &aiocbe->uaiocb;
2138
2139	if ((lj) && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL) &&
2140		(lj->lioj_queue_count == lj->lioj_queue_finished_count)) {
2141		PROC_LOCK(lj->lioj_ki->kaio_p);
2142		psignal(lj->lioj_ki->kaio_p, lj->lioj_signal.sigev_signo);
2143		PROC_UNLOCK(lj->lioj_ki->kaio_p);
2144		lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
2145	}
2146
2147	if (cb->aio_sigevent.sigev_notify == SIGEV_SIGNAL) {
2148		PROC_LOCK(aiocbe->userproc);
2149		psignal(aiocbe->userproc, cb->aio_sigevent.sigev_signo);
2150		PROC_UNLOCK(aiocbe->userproc);
2151	}
2152}
2153
2154/*
2155 * Interrupt handler for physio; performs the necessary process wakeups and
2156 * posts signals.
2157 */
2158static void
2159aio_physwakeup(struct buf *bp)
2160{
2161	struct aiocblist *aiocbe;
2162	struct proc *p;
2163	struct kaioinfo *ki;
2164	struct aio_liojob *lj;
2165
2166	wakeup(bp);
2167
2168	aiocbe = (struct aiocblist *)bp->b_spc;
2169	if (aiocbe) {
2170		p = bp->b_caller1;
2171
2172		aiocbe->jobstate = JOBST_JOBBFINISHED;
2173		aiocbe->uaiocb._aiocb_private.status -= bp->b_resid;
2174		aiocbe->uaiocb._aiocb_private.error = 0;
2175		aiocbe->jobflags |= AIOCBLIST_DONE;
2176
2177		if (bp->b_ioflags & BIO_ERROR)
2178			aiocbe->uaiocb._aiocb_private.error = bp->b_error;
2179
2180		lj = aiocbe->lio;
2181		if (lj) {
2182			lj->lioj_buffer_finished_count++;
2183
2184			/*
2185			 * Wakeup/signal if all of the interrupt jobs are done.
2186			 */
2187			if (lj->lioj_buffer_finished_count ==
2188			    lj->lioj_buffer_count) {
2189				/*
2190				 * Post a signal if it is called for.
2191				 */
2192				if ((lj->lioj_flags &
2193				    (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) ==
2194				    LIOJ_SIGNAL) {
2195					lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
2196					aiocbe->timeouthandle =
2197						timeout(process_signal,
2198							aiocbe, 0);
2199				}
2200			}
2201		}
2202
2203		ki = p->p_aioinfo;
2204		if (ki) {
2205			ki->kaio_buffer_finished_count++;
2206			TAILQ_REMOVE(&aio_bufjobs, aiocbe, list);
2207			TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist);
2208			TAILQ_INSERT_TAIL(&ki->kaio_bufdone, aiocbe, plist);
2209
2210			KNOTE(&aiocbe->klist, 0);
2211			/* Do the wakeup. */
2212			if (ki->kaio_flags & (KAIO_RUNDOWN|KAIO_WAKEUP)) {
2213				ki->kaio_flags &= ~KAIO_WAKEUP;
2214				wakeup(p);
2215			}
2216		}
2217
2218		if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL)
2219			aiocbe->timeouthandle =
2220				timeout(process_signal, aiocbe, 0);
2221	}
2222}
2223
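/*
 * aio_waitcomplete: wait (with an optional timeout) for any request owned by
 * the process to finish, store the completed request's userland aiocb pointer
 * through *uap->aiocbp, put the transfer count in td_retval[0], and return
 * the request's error status.
 */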
2224int
2225aio_waitcomplete(struct thread *td, struct aio_waitcomplete_args *uap)
2226{
2227	struct proc *p = td->td_proc;
2228	struct timeval atv;
2229	struct timespec ts;
2230	struct aiocb **cbptr;
2231	struct kaioinfo *ki;
2232	struct aiocblist *cb = NULL;
2233	int error, s, timo;
2234
2235	suword(uap->aiocbp, (long)NULL);
2236
2237	timo = 0;
2238	if (uap->timeout) {
2239		/* Get timespec struct. */
2240		error = copyin(uap->timeout, &ts, sizeof(ts));
2241		if (error)
2242			return error;
2243
2244		if ((ts.tv_nsec < 0) || (ts.tv_nsec >= 1000000000))
2245			return (EINVAL);
2246
2247		TIMESPEC_TO_TIMEVAL(&atv, &ts);
2248		if (itimerfix(&atv))
2249			return (EINVAL);
2250		timo = tvtohz(&atv);
2251	}
2252
2253	ki = p->p_aioinfo;
2254	if (ki == NULL)
2255		return EAGAIN;
2256
2257	cbptr = uap->aiocbp;
2258
2259	for (;;) {
2260		if ((cb = TAILQ_FIRST(&ki->kaio_jobdone)) != 0) {
2261			suword(uap->aiocbp, (uintptr_t)cb->uuaiocb);
2262			td->td_retval[0] = cb->uaiocb._aiocb_private.status;
2263			if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
2264				p->p_stats->p_ru.ru_oublock +=
2265				    cb->outputcharge;
2266				cb->outputcharge = 0;
2267			} else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
2268				p->p_stats->p_ru.ru_inblock += cb->inputcharge;
2269				cb->inputcharge = 0;
2270			}
			/* Save the error before aio_free_entry() frees cb. */
			error = cb->uaiocb._aiocb_private.error;
2271			aio_free_entry(cb);
2272			return error;
2273		}
2274
2275		s = splbio();
2276		if ((cb = TAILQ_FIRST(&ki->kaio_bufdone)) != 0) {
2277			splx(s);
2278			suword(uap->aiocbp, (uintptr_t)cb->uuaiocb);
2279			td->td_retval[0] = cb->uaiocb._aiocb_private.status;
			error = cb->uaiocb._aiocb_private.error;
2280			aio_free_entry(cb);
2281			return error;
2282		}
2283
2284		ki->kaio_flags |= KAIO_WAKEUP;
2285		error = tsleep(p, PRIBIO | PCATCH, "aiowc", timo);
2286		splx(s);
2287
2288		if (error == ERESTART)
2289			return EINTR;
2290		else if (error < 0)
2291			return error;
2292		else if (error == EINTR)
2293			return EINTR;
2294		else if (error == EWOULDBLOCK)
2295			return EAGAIN;
2296	}
2297}
2298
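/*
 * kqueue filter routines for the AIO filter: filt_aioattach() hooks a knote
 * onto an aiocblist (only the kernel may register, via EV_FLAG1),
 * filt_aiodetach() unhooks it, and filt_aio() reports the job's error status
 * once the job has reached a finished state.
 */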
2299static int
2300filt_aioattach(struct knote *kn)
2301{
2302	struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_id;
2303
2304	/*
2305	 * The aiocbe pointer must be validated before using it, so
2306	 * registration is restricted to the kernel; the user cannot
2307	 * set EV_FLAG1.
2308	 */
2309	if ((kn->kn_flags & EV_FLAG1) == 0)
2310		return (EPERM);
2311	kn->kn_flags &= ~EV_FLAG1;
2312
2313	SLIST_INSERT_HEAD(&aiocbe->klist, kn, kn_selnext);
2314
2315	return (0);
2316}
2317
2318static void
2319filt_aiodetach(struct knote *kn)
2320{
2321	struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_id;
2322
2323	SLIST_REMOVE(&aiocbe->klist, kn, knote, kn_selnext);
2324}
2325
2326/*ARGSUSED*/
2327static int
2328filt_aio(struct knote *kn, long hint)
2329{
2330	struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_id;
2331
2332	kn->kn_data = aiocbe->uaiocb._aiocb_private.error;
2333	if (aiocbe->jobstate != JOBST_JOBFINISHED &&
2334	    aiocbe->jobstate != JOBST_JOBBFINISHED)
2335		return (0);
2336	kn->kn_flags |= EV_EOF;
2337	return (1);
2338}
2339