vfs_aio.c revision 31473
1/*
2 * Copyright (c) 1997 John S. Dyson.  All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 *    notice, this list of conditions and the following disclaimer.
9 * 2. John S. Dyson's name may not be used to endorse or promote products
10 *    derived from this software without specific prior written permission.
11 *
12 * DISCLAIMER:  This code isn't warranted to do anything useful.  Anything
13 * bad that happens because of using this software isn't the responsibility
14 * of the author.  This software is distributed AS-IS.
15 *
16 * $Id: vfs_aio.c,v 1.16 1997/11/30 23:21:08 dyson Exp $
17 */
18
19/*
20 * This file contains support for the POSIX.4 AIO/LIO facility.
21 */
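
/*
 * Userland usage sketch (illustrative only; the buffer and descriptor
 * names below are assumed, not defined here):
 *
 *	struct aiocb cb;
 *
 *	bzero(&cb, sizeof cb);
 *	cb.aio_fildes = fd;
 *	cb.aio_buf = buf;
 *	cb.aio_nbytes = sizeof buf;
 *	cb.aio_offset = 0;
 *	aio_read(&cb);
 *	while (aio_error(&cb) == EINPROGRESS)
 *		;
 *	nread = aio_return(&cb);
 *
 * aio_error() returns EINPROGRESS until the request completes (a real
 * program would use aio_suspend() or the sigevent instead of spinning),
 * and aio_return() reaps the final status and releases the kernel
 * resources held for the request.
 */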
22
23#include <sys/param.h>
24#include <sys/systm.h>
25#include <sys/sysproto.h>
26#include <sys/filedesc.h>
27#include <sys/kernel.h>
28#include <sys/fcntl.h>
29#include <sys/file.h>
30#include <sys/lock.h>
31#include <sys/unistd.h>
32#include <sys/proc.h>
33#include <sys/uio.h>
34#include <sys/malloc.h>
35#include <sys/signalvar.h>
36#include <sys/sysctl.h>
37#include <sys/vnode.h>
38#include <sys/conf.h>
39#include <miscfs/specfs/specdev.h>
40
41#include <vm/vm.h>
42#include <vm/vm_param.h>
43#include <vm/vm_extern.h>
44#include <vm/pmap.h>
45#include <vm/vm_map.h>
46#include <vm/vm_zone.h>
47#include <sys/aio.h>
48#include <sys/shm.h>
49#include <sys/user.h>
50
51#include <machine/cpu.h>
52
53static	int jobrefid;
54
55#define JOBST_NULL			0x0
56#define	JOBST_JOBQPROC		0x1
57#define JOBST_JOBQGLOBAL	0x2
58#define JOBST_JOBRUNNING	0x3
59#define JOBST_JOBFINISHED	0x4
60#define	JOBST_JOBQBUF		0x5
61#define	JOBST_JOBBFINISHED	0x6
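
/*
 * Job state machine: free entries carry JOBST_NULL; a queued request moves
 * to JOBST_JOBQGLOBAL (or JOBST_JOBQPROC when handed to a specific daemon),
 * then to JOBST_JOBRUNNING while a daemon services it and JOBST_JOBFINISHED
 * when done.  Physio requests instead pass through JOBST_JOBQBUF and end up
 * in JOBST_JOBBFINISHED.
 */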
62
63#ifndef MAX_AIO_PER_PROC
64#define MAX_AIO_PER_PROC	32
65#endif
66
67#ifndef MAX_AIO_QUEUE_PER_PROC
68#define MAX_AIO_QUEUE_PER_PROC	256 /* Bigger than AIO_LISTIO_MAX */
69#endif
70
71#ifndef MAX_AIO_PROCS
72#define MAX_AIO_PROCS		32
73#endif
74
75#ifndef MAX_AIO_QUEUE
76#define	MAX_AIO_QUEUE		1024 /* Bigger than AIO_LISTIO_MAX */
77#endif
78
79#ifndef TARGET_AIO_PROCS
80#define TARGET_AIO_PROCS	0
81#endif
82
83#ifndef MAX_BUF_AIO
84#define MAX_BUF_AIO 16
85#endif
86
87#ifndef AIOD_TIMEOUT_DEFAULT
88#define	AIOD_TIMEOUT_DEFAULT (10 * hz)
89#endif
90
91#ifndef AIOD_LIFETIME_DEFAULT
92#define AIOD_LIFETIME_DEFAULT (30 * hz)
93#endif
94
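/*
 * The compile-time defaults above seed the vfs.aio sysctl tree below; the
 * limits are exported read-write and may be tuned at run time, while the
 * num_* variables are read-only instantaneous counters.
 */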
95int max_aio_procs = MAX_AIO_PROCS;
96int num_aio_procs = 0;
97int target_aio_procs = TARGET_AIO_PROCS;
98int max_queue_count = MAX_AIO_QUEUE;
99int num_queue_count = 0;
100int num_buf_aio = 0;
101int num_aio_resv_start = 0;
102int aiod_timeout;
103int aiod_lifetime;
104
105int max_aio_per_proc = MAX_AIO_PER_PROC,
106	max_aio_queue_per_proc = MAX_AIO_QUEUE_PER_PROC;
107
108int max_buf_aio = MAX_BUF_AIO;
109
110SYSCTL_NODE(_vfs, OID_AUTO, aio, CTLFLAG_RW, 0, "AIO mgmt");
111
112SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_per_proc,
113	CTLFLAG_RW, &max_aio_per_proc, 0, "");
114
115SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue_per_proc,
116	CTLFLAG_RW, &max_aio_queue_per_proc, 0, "");
117
118SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_procs,
119	CTLFLAG_RW, &max_aio_procs, 0, "");
120
121SYSCTL_INT(_vfs_aio, OID_AUTO, num_aio_procs,
122	CTLFLAG_RD, &num_aio_procs, 0, "");
123
124SYSCTL_INT(_vfs_aio, OID_AUTO, num_queue_count,
125	CTLFLAG_RD, &num_queue_count, 0, "");
126
127SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue,
128	CTLFLAG_RW, &max_queue_count, 0, "");
129
130SYSCTL_INT(_vfs_aio, OID_AUTO, target_aio_procs,
131	CTLFLAG_RW, &target_aio_procs, 0, "");
132
133SYSCTL_INT(_vfs_aio, OID_AUTO, max_buf_aio,
134	CTLFLAG_RW, &max_buf_aio, 0, "");
135
136SYSCTL_INT(_vfs_aio, OID_AUTO, num_buf_aio,
137	CTLFLAG_RD, &num_buf_aio, 0, "");
138
139SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_lifetime,
140	CTLFLAG_RW, &aiod_lifetime, 0, "");
141
142SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_timeout,
143	CTLFLAG_RW, &aiod_timeout, 0, "");
144
145
146/*
147 * Job queue item
148 */
149
150#define AIOCBLIST_CANCELLED	0x1
151#define AIOCBLIST_RUNDOWN	0x4
152#define AIOCBLIST_ASYNCFREE	0x8
153#define AIOCBLIST_DONE		0x10
154
155struct aiocblist {
156	TAILQ_ENTRY (aiocblist) list;		/* List of jobs */
157	TAILQ_ENTRY (aiocblist) plist;		/* List of jobs for proc */
158	int	jobflags;
159	int	jobstate;
160	int inputcharge, outputcharge;
161	struct	buf *bp;				/* buffer pointer */
162	struct	proc *userproc;			/* User process */
163	struct	aioproclist	*jobaioproc;	/* AIO process descriptor */
164	struct	aio_liojob	*lio;		/* optional lio job */
165	struct	aiocb *uuaiocb;			/* pointer in userspace of aiocb */
166	struct	aiocb uaiocb;			/* Kernel I/O control block */
167};
168
169
170/*
171 * AIO process info
172 */
173#define AIOP_FREE	0x1			/* proc on free queue */
174#define AIOP_SCHED	0x2			/* proc explicitly scheduled */
175
176struct aioproclist {
177	int aioprocflags;			/* AIO proc flags */
178	TAILQ_ENTRY(aioproclist) list;		/* List of processes */
179	struct proc *aioproc;			/* The AIO thread */
180	TAILQ_HEAD (,aiocblist) jobtorun;	/* suggested job to run */
181};
182
183/*
184 * data-structure for lio signal management
185 */
186struct aio_liojob {
187	int lioj_flags;
188	int	lioj_buffer_count;
189	int	lioj_buffer_finished_count;
190	int	lioj_queue_count;
191	int	lioj_queue_finished_count;
192	struct sigevent lioj_signal;	/* signal on all I/O done */
193	TAILQ_ENTRY (aio_liojob) lioj_list;
194	struct kaioinfo *lioj_ki;
195};
196#define	LIOJ_SIGNAL			0x1 /* signal on all done (lio) */
197#define	LIOJ_SIGNAL_POSTED	0x2	/* signal has been posted */
198
199/*
200 * per process aio data structure
201 */
202struct kaioinfo {
203	int	kaio_flags;			/* per process kaio flags */
204	int	kaio_maxactive_count;	/* maximum number of AIOs */
205	int	kaio_active_count;	/* number of currently used AIOs */
206	int	kaio_qallowed_count;	/* maximum size of AIO queue */
207	int	kaio_queue_count;	/* size of AIO queue */
208	int	kaio_ballowed_count;	/* maximum number of buffers */
209	int	kaio_queue_finished_count;	/* number of daemon jobs finished */
210	int	kaio_buffer_count;	/* number of physio buffers */
211	int	kaio_buffer_finished_count;	/* count of I/O done */
212	struct proc *kaio_p;			/* process that uses this kaio block */
213	TAILQ_HEAD (,aio_liojob) kaio_liojoblist;	/* list of lio jobs */
214	TAILQ_HEAD (,aiocblist)	kaio_jobqueue;	/* job queue for process */
215	TAILQ_HEAD (,aiocblist)	kaio_jobdone;	/* done queue for process */
216	TAILQ_HEAD (,aiocblist)	kaio_bufqueue;	/* buffer job queue for process */
217	TAILQ_HEAD (,aiocblist)	kaio_bufdone;	/* buffer done queue for process */
218};
219
220#define KAIO_RUNDOWN 0x1		/* process is being run down */
221#define KAIO_WAKEUP 0x2			/* wakeup process when there is a significant
222								   event */
223
224
225TAILQ_HEAD (,aioproclist) aio_freeproc, aio_activeproc;
226TAILQ_HEAD(,aiocblist) aio_jobs;			/* Async job list */
227TAILQ_HEAD(,aiocblist) aio_bufjobs;			/* Phys I/O job list */
228TAILQ_HEAD(,aiocblist) aio_freejobs;		/* Pool of free jobs */
229
230static void aio_init_aioinfo(struct proc *p);
231static void aio_onceonly(void *);
232static int aio_free_entry(struct aiocblist *aiocbe);
233static void aio_process(struct aiocblist *aiocbe);
234static int aio_newproc(void);
235static int aio_aqueue(struct proc *p, struct aiocb *job, int type);
236static void aio_physwakeup(struct buf *bp);
237static int aio_fphysio(struct proc *p, struct aiocblist *aiocbe, int type);
238static int aio_qphysio(struct proc *p, struct aiocblist *iocb);
239static void aio_daemon(void *uproc);
240
241SYSINIT(aio, SI_SUB_VFS, SI_ORDER_ANY, aio_onceonly, NULL);
242
243static vm_zone_t kaio_zone=0, aiop_zone=0,
244	aiocb_zone=0, aiol_zone=0, aiolio_zone=0;
245
246/*
247 * Single AIOD vmspace shared amongst all of them
248 */
249static struct vmspace *aiovmspace = NULL;
250
251/*
252 * Startup initialization
253 */
254static void
255aio_onceonly(void *na)
256{
257	TAILQ_INIT(&aio_freeproc);
258	TAILQ_INIT(&aio_activeproc);
259	TAILQ_INIT(&aio_jobs);
260	TAILQ_INIT(&aio_bufjobs);
261	TAILQ_INIT(&aio_freejobs);
262	kaio_zone = zinit("AIO", sizeof (struct kaioinfo), 0, 0, 1);
263	aiop_zone = zinit("AIOP", sizeof (struct aioproclist), 0, 0, 1);
264	aiocb_zone = zinit("AIOCB", sizeof (struct aiocblist), 0, 0, 1);
265	aiol_zone = zinit("AIOL", AIO_LISTIO_MAX * sizeof (int), 0, 0, 1);
266	aiolio_zone = zinit("AIOLIO",
267		AIO_LISTIO_MAX * sizeof (struct aio_liojob), 0, 0, 1);
268	aiod_timeout = AIOD_TIMEOUT_DEFAULT;
269	aiod_lifetime = AIOD_LIFETIME_DEFAULT;
270	jobrefid = 1;
271}
272
273/*
274 * Init the per-process aioinfo structure.
275 * The aioinfo limits are set per-process for user limit (resource) management.
276 */
277static void
278aio_init_aioinfo(struct proc *p)
279{
280	struct kaioinfo *ki;
281	if (p->p_aioinfo == NULL) {
282		ki = zalloc(kaio_zone);
283		p->p_aioinfo = ki;
284		ki->kaio_flags = 0;
285		ki->kaio_maxactive_count = max_aio_per_proc;
286		ki->kaio_active_count = 0;
287		ki->kaio_qallowed_count = max_aio_queue_per_proc;
288		ki->kaio_queue_count = 0;
289		ki->kaio_ballowed_count = max_buf_aio;
290		ki->kaio_buffer_count = 0;
291		ki->kaio_buffer_finished_count = 0;
292		ki->kaio_p = p;
293		TAILQ_INIT(&ki->kaio_jobdone);
294		TAILQ_INIT(&ki->kaio_jobqueue);
295		TAILQ_INIT(&ki->kaio_bufdone);
296		TAILQ_INIT(&ki->kaio_bufqueue);
297		TAILQ_INIT(&ki->kaio_liojoblist);
298	}
299}
300
301/*
302 * Free a job entry.  Wait for completion if it is currently
303 * active, but don't delay forever.  If we delay, we return
304 * a flag that says that we have to restart the queue scan.
305 */
306int
307aio_free_entry(struct aiocblist *aiocbe)
308{
309	struct kaioinfo *ki;
310	struct aioproclist *aiop;
311	struct aio_liojob *lj;
312	struct proc *p;
313	int error;
314	int s;
315
316	if (aiocbe->jobstate == JOBST_NULL)
317		panic("aio_free_entry: freeing already free job");
318
319	p = aiocbe->userproc;
320	ki = p->p_aioinfo;
321	lj = aiocbe->lio;
322	if (ki == NULL)
323		panic("aio_free_entry: missing p->p_aioinfo");
324
325	if (aiocbe->jobstate == JOBST_JOBRUNNING) {
326		if (aiocbe->jobflags & AIOCBLIST_ASYNCFREE)
327			return 0;
328		aiocbe->jobflags |= AIOCBLIST_RUNDOWN;
329		tsleep(aiocbe, PRIBIO|PCATCH, "jobwai", 0);
330	}
331	aiocbe->jobflags &= ~AIOCBLIST_ASYNCFREE;
332
333	if (aiocbe->bp == NULL) {
334		if (ki->kaio_queue_count <= 0)
335			panic("aio_free_entry: process queue size <= 0");
336		if (num_queue_count <= 0)
337			panic("aio_free_entry: system wide queue size <= 0");
338
339		if(lj) {
340			lj->lioj_queue_count--;
341			if (aiocbe->jobflags & AIOCBLIST_DONE)
342				lj->lioj_queue_finished_count--;
343		}
344		ki->kaio_queue_count--;
345		if (aiocbe->jobflags & AIOCBLIST_DONE)
346			ki->kaio_queue_finished_count--;
347		num_queue_count--;
348
349	} else {
350		if(lj) {
351			lj->lioj_buffer_count--;
352			if (aiocbe->jobflags & AIOCBLIST_DONE)
353				lj->lioj_buffer_finished_count--;
354		}
355		if (aiocbe->jobflags & AIOCBLIST_DONE)
356			ki->kaio_buffer_finished_count--;
357		ki->kaio_buffer_count--;
358		num_buf_aio--;
359
360	}
361
362	if ((ki->kaio_flags & KAIO_WAKEUP) ||
363		((ki->kaio_flags & KAIO_RUNDOWN) &&
364		((ki->kaio_buffer_count == 0) && (ki->kaio_queue_count == 0)))) {
365		ki->kaio_flags &= ~KAIO_WAKEUP;
366		wakeup(p);
367	}
368
369	if ( aiocbe->jobstate == JOBST_JOBQBUF) {
370		if ((error = aio_fphysio(p, aiocbe, 1)) != 0)
371			return error;
372		if (aiocbe->jobstate != JOBST_JOBBFINISHED)
373			panic("aio_free_entry: invalid physio finish-up state");
374		s = splbio();
375		TAILQ_REMOVE(&ki->kaio_bufdone, aiocbe, plist);
376		splx(s);
377	} else if ( aiocbe->jobstate == JOBST_JOBQPROC) {
378		aiop = aiocbe->jobaioproc;
379		TAILQ_REMOVE(&aiop->jobtorun, aiocbe, list);
380	} else if ( aiocbe->jobstate == JOBST_JOBQGLOBAL) {
381		TAILQ_REMOVE(&aio_jobs, aiocbe, list);
382	} else if ( aiocbe->jobstate == JOBST_JOBFINISHED) {
383		TAILQ_REMOVE(&ki->kaio_jobdone, aiocbe, plist);
384	} else if ( aiocbe->jobstate == JOBST_JOBBFINISHED) {
385		s = splbio();
386		TAILQ_REMOVE(&ki->kaio_bufdone, aiocbe, plist);
387		splx(s);
388		if (aiocbe->bp) {
389			vunmapbuf(aiocbe->bp);
390			relpbuf(aiocbe->bp);
391			aiocbe->bp = NULL;
392		}
393	}
394	if (lj && (lj->lioj_buffer_count == 0) && (lj->lioj_queue_count == 0)) {
395		TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
396		zfree(aiolio_zone, lj);
397	}
398	TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
399	aiocbe->jobstate = JOBST_NULL;
400	return 0;
401}
402
403/*
404 * Rundown the jobs for a given process.
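 * Called on process exit: let active daemon jobs and outstanding physio
 * buffers drain (bounded by aiod_timeout), then free every queued and
 * completed entry and any leftover lio descriptors before releasing the
 * kaioinfo block itself.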
405 */
406void
407aio_proc_rundown(struct proc *p)
408{
409	int s;
410	struct kaioinfo *ki;
411	struct aio_liojob *lj, *ljn;
412	struct aiocblist *aiocbe, *aiocbn;
413
414	ki = p->p_aioinfo;
415	if (ki == NULL)
416		return;
417
418	ki->kaio_flags |= KAIO_RUNDOWN;
419	while ((ki->kaio_active_count > 0) ||
420		(ki->kaio_buffer_count > ki->kaio_buffer_finished_count)) {
421		ki->kaio_flags |= KAIO_RUNDOWN;
422		if (tsleep(p, PRIBIO, "kaiowt", aiod_timeout))
423			break;
424	}
425
426restart1:
427	for ( aiocbe = TAILQ_FIRST(&ki->kaio_jobdone);
428		aiocbe;
429		aiocbe = aiocbn) {
430		aiocbn = TAILQ_NEXT(aiocbe, plist);
431		if (aio_free_entry(aiocbe))
432			goto restart1;
433	}
434
435restart2:
436	for ( aiocbe = TAILQ_FIRST(&ki->kaio_jobqueue);
437		aiocbe;
438		aiocbe = aiocbn) {
439		aiocbn = TAILQ_NEXT(aiocbe, plist);
440		if (aio_free_entry(aiocbe))
441			goto restart2;
442	}
443
444/*
445 * Note the use of lots of splbio here, trying to avoid holding
446 * splbio across long chains of I/O.  Probably unnecessary.
447 */
448
449restart3:
450	s = splbio();
451	while (TAILQ_FIRST(&ki->kaio_bufqueue)) {
452		ki->kaio_flags |= KAIO_WAKEUP;
453		tsleep (p, PRIBIO, "aioprn", 0);
454		splx(s);
455		goto restart3;
456	}
457	splx(s);
458
459restart4:
460	s = splbio();
461	for ( aiocbe = TAILQ_FIRST(&ki->kaio_bufdone);
462		aiocbe;
463		aiocbe = aiocbn) {
464		aiocbn = TAILQ_NEXT(aiocbe, plist);
465		if (aio_free_entry(aiocbe)) {
466			splx(s);
467			goto restart4;
468		}
469	}
470	splx(s);
471
472	for ( lj = TAILQ_FIRST(&ki->kaio_liojoblist);
473		  lj;
474		  lj = ljn) {
475			ljn = TAILQ_NEXT(lj, lioj_list);
476			if ((lj->lioj_buffer_count == 0) && (lj->lioj_queue_count == 0)) {
477				TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
478				zfree(aiolio_zone, lj);
479			} else {
480#if defined(DIAGNOSTIC)
481				printf("LIO job not cleaned up: B:%d, BF:%d, Q:%d, QF:%d\n",
482					lj->lioj_buffer_count, lj->lioj_buffer_finished_count,
483					lj->lioj_queue_count, lj->lioj_queue_finished_count);
484#endif
485			}
486	}
487
488	zfree(kaio_zone, ki);
489	p->p_aioinfo = NULL;
490}
491
492/*
493 * Select a job to run (called by an AIO daemon)
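 * Jobs explicitly handed to this daemon (the jobtorun list) are taken
 * first; otherwise the global queue is scanned for the first job whose
 * owning process is still below its kaio_maxactive_count quota.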
494 */
495static struct aiocblist *
496aio_selectjob(struct aioproclist *aiop)
497{
498
499	struct aiocblist *aiocbe;
500
501	aiocbe = TAILQ_FIRST(&aiop->jobtorun);
502	if (aiocbe) {
503		TAILQ_REMOVE(&aiop->jobtorun, aiocbe, list);
504		return aiocbe;
505	}
506
507	for (aiocbe = TAILQ_FIRST(&aio_jobs);
508		aiocbe;
509		aiocbe = TAILQ_NEXT(aiocbe, list)) {
510		struct kaioinfo *ki;
511		struct proc *userp;
512
513		userp = aiocbe->userproc;
514		ki = userp->p_aioinfo;
515
516		if (ki->kaio_active_count < ki->kaio_maxactive_count) {
517			TAILQ_REMOVE(&aio_jobs, aiocbe, list);
518			return aiocbe;
519		}
520	}
521
522	return NULL;
523}
524
525/*
526 * The AIO processing activity.  This is the code that does the
527 * I/O request for the non-physio version of the operations.  The
528 * normal vn operations are used, and this code should work in
529 * all instances for every type of file, including pipes, sockets,
530 * fifos, and regular files.
531 */
532static void
533aio_process(struct aiocblist *aiocbe)
534{
535	struct filedesc *fdp;
536	struct proc *userp, *mycp;
537	struct aiocb *cb;
538	struct file *fp;
539	struct uio auio;
540	struct iovec aiov;
541	unsigned int fd;
542	int cnt;
544	int error;
545	off_t offset;
546	int oublock_st, oublock_end;
547	int inblock_st, inblock_end;
548
549	userp = aiocbe->userproc;
550	cb = &aiocbe->uaiocb;
551
552	mycp = curproc;
553
554	fdp = mycp->p_fd;
555	fd = cb->aio_fildes;
556	fp = fdp->fd_ofiles[fd];
557
558	aiov.iov_base = cb->aio_buf;
559	aiov.iov_len = cb->aio_nbytes;
560
561	auio.uio_iov = &aiov;
562	auio.uio_iovcnt = 1;
563	auio.uio_offset = offset = cb->aio_offset;
564	auio.uio_resid = cb->aio_nbytes;
565	cnt = cb->aio_nbytes;
566	auio.uio_segflg = UIO_USERSPACE;
567	auio.uio_procp = mycp;
568
569	inblock_st = mycp->p_stats->p_ru.ru_inblock;
570	oublock_st = mycp->p_stats->p_ru.ru_oublock;
571	if (cb->aio_lio_opcode == LIO_READ) {
572		auio.uio_rw = UIO_READ;
573		error = (*fp->f_ops->fo_read)(fp, &auio, fp->f_cred);
574	} else {
575		auio.uio_rw = UIO_WRITE;
576		error = (*fp->f_ops->fo_write)(fp, &auio, fp->f_cred);
577	}
578	inblock_end = mycp->p_stats->p_ru.ru_inblock;
579	oublock_end = mycp->p_stats->p_ru.ru_oublock;
580
581	aiocbe->inputcharge = inblock_end - inblock_st;
582	aiocbe->outputcharge = oublock_end - oublock_st;
583
584	if (error) {
585		if (auio.uio_resid != cnt) {
586			if (error == ERESTART || error == EINTR || error == EWOULDBLOCK)
587				error = 0;
588			if ((error == EPIPE) && (cb->aio_lio_opcode == LIO_WRITE))
589				psignal(userp, SIGPIPE);
590		}
591	}
592
593	cnt -= auio.uio_resid;
594	cb->_aiocb_private.error = error;
595	cb->_aiocb_private.status = cnt;
596
597	return;
598
599}
600
601/*
602 * The AIO daemon: most of the actual work is done in aio_process(),
603 * but the setup (and address space management) is done in this routine.
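 * An idle daemon parks itself on the free list; if it stays idle for
 * aiod_lifetime ticks and the daemon count is above target_aio_procs,
 * it exits to give the resources back.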
604 */
605static void
606aio_daemon(void *uproc)
607{
608	int s;
609	struct aioproclist *aiop;
610	struct vmspace *myvm, *aiovm;
611	struct proc *mycp;
612
613	/*
614	 * Local copies of curproc (cp) and vmspace (myvm)
615	 */
616	mycp = curproc;
617	myvm = mycp->p_vmspace;
618
619	/*
620	 * We create only one VM space, shared by all AIOD processes.
621	 * The VM space for the first AIOD created becomes the shared VM
622	 * space for all of them.  We add an additional reference count,
623	 * even for the first AIOD, so the address space does not go away,
624	 * and we continue to use that original VM space even if the first
625	 * AIOD exits.
626	 */
627	if ((aiovm = aiovmspace) == NULL) {
628		aiovmspace = myvm;
629		myvm->vm_refcnt++;
630		/*
631		 * Remove userland cruft from address space.
632		 */
633		if (myvm->vm_shm)
634			shmexit(mycp);
635		pmap_remove_pages(&myvm->vm_pmap, 0, USRSTACK);
636		vm_map_remove(&myvm->vm_map, 0, USRSTACK);
637		myvm->vm_tsize = 0;
638		myvm->vm_dsize = 0;
639		myvm->vm_ssize = 0;
640	} else {
641		aiovm->vm_refcnt++;
642		mycp->p_vmspace = aiovm;
643		pmap_activate(mycp);
644		vmspace_free(myvm);
645		myvm = aiovm;
646	}
647
648	if (mycp->p_textvp) {
649		vrele(mycp->p_textvp);
650		mycp->p_textvp = NULL;
651	}
652
653	/*
654	 * Allocate and ready the aio control info.  There is one
655	 * aiop structure per daemon.
656	 */
657	aiop = zalloc(aiop_zone);
658	aiop->aioproc = mycp;
659	aiop->aioprocflags |= AIOP_FREE;
660	TAILQ_INIT(&aiop->jobtorun);
661
662	/*
663	 * Place thread (lightweight process) onto the AIO free thread list
664	 */
665	if (TAILQ_EMPTY(&aio_freeproc))
666		wakeup(&aio_freeproc);
667	TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);
668
669	/*
670	 * Make up a name for the daemon
671	 */
672	strcpy(mycp->p_comm, "aiod");
673
674	/*
675	 * Get rid of our current file descriptors.  AIODs don't need any
676	 * file descriptors, except those temporarily inherited from the client.
677	 * Credentials are also cloned, and made equivalent to "root."
678	 */
679	fdfree(mycp);
680	mycp->p_fd = NULL;
681	mycp->p_ucred = crcopy(mycp->p_ucred);
682	mycp->p_ucred->cr_uid = 0;
683	mycp->p_ucred->cr_ngroups = 1;
684	mycp->p_ucred->cr_groups[0] = 1;
685
686	/*
687	 * The daemon resides in its own pgrp.
688	 */
689	enterpgrp(mycp, mycp->p_pid, 1);
690
691	/*
692	 * Mark special process type
693	 */
694	mycp->p_flag |= P_SYSTEM|P_KTHREADP;
695
696	/*
697	 * Wakeup the parent process.  (The parent sleeps to keep from blasting
698	 * away and creating too many daemons.)
699	 */
700	wakeup(mycp);
701
702	while(1) {
703		struct proc *curcp;
704		struct	aiocblist *aiocbe;
705
706		/*
707		 * curcp is the current daemon process context.
708		 * userp is the current user process context.
709		 */
710		curcp = mycp;
711
712		/*
713		 * Take daemon off of free queue
714		 */
715		if (aiop->aioprocflags & AIOP_FREE) {
716			TAILQ_REMOVE(&aio_freeproc, aiop, list);
717			TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list);
718			aiop->aioprocflags &= ~AIOP_FREE;
719		}
720		aiop->aioprocflags &= ~AIOP_SCHED;
721
722		/*
723		 * Check for jobs
724		 */
725		while ((aiocbe = aio_selectjob(aiop)) != NULL) {
726			struct proc *userp;
727			struct aiocb *cb;
728			struct kaioinfo *ki;
729			struct aio_liojob *lj;
730
731			cb = &aiocbe->uaiocb;
732			userp = aiocbe->userproc;
733
734			aiocbe->jobstate = JOBST_JOBRUNNING;
735
736			/*
737			 * Connect to process address space for user program
738			 */
739			if (userp != curcp) {
740				struct vmspace *tmpvm;
741				/*
742				 * Save the current address space that we are connected to.
743				 */
744				tmpvm = mycp->p_vmspace;
745				/*
746				 * Point to the new user address space, and refer to it.
747				 */
748				mycp->p_vmspace = userp->p_vmspace;
749				mycp->p_vmspace->vm_refcnt++;
750				/*
751				 * Activate the new mapping.
752				 */
753				pmap_activate(mycp);
754				/*
755				 * If the old address space wasn't the daemon's own address
756				 * space, then we need to remove the daemon's reference from
757				 * the other process that it was acting on behalf of.
758				 */
759				if (tmpvm != myvm) {
760					vmspace_free(tmpvm);
761				}
762				/*
763				 * Disassociate from the previous client's file descriptors, and
764				 * associate with the new client's descriptors.  Note that
765				 * the daemon doesn't need to worry about its original
766				 * descriptors, because they were originally freed.
767				 */
768				if (mycp->p_fd)
769					fdfree(mycp);
770				mycp->p_fd = fdshare(userp);
771				curcp = userp;
772			}
773
774			ki = userp->p_aioinfo;
775			lj = aiocbe->lio;
776
777			/*
778			 * Account for currently active jobs
779			 */
780			ki->kaio_active_count++;
781
782			/*
783			 * Do the I/O function
784			 */
785			aiocbe->jobaioproc = aiop;
786			aio_process(aiocbe);
787
788			/*
789			 * decrement the active job count
790			 */
791			ki->kaio_active_count--;
792
793			/*
794			 * increment the completion count for wakeup/signal comparisons
795			 */
796			aiocbe->jobflags |= AIOCBLIST_DONE;
797			ki->kaio_queue_finished_count++;
798			if (lj) {
799				lj->lioj_queue_finished_count++;
800			}
801			if ((ki->kaio_flags & KAIO_WAKEUP) ||
802				((ki->kaio_flags & KAIO_RUNDOWN) &&
803				(ki->kaio_active_count == 0))) {
804				ki->kaio_flags &= ~KAIO_WAKEUP;
805				wakeup(userp);
806			}
807
808			s = splbio();
809			if (lj && (lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) ==
810				LIOJ_SIGNAL) {
811				if ((lj->lioj_queue_finished_count == lj->lioj_queue_count) &&
812					(lj->lioj_buffer_finished_count == lj->lioj_buffer_count)) {
813						psignal(userp, lj->lioj_signal.sigev_signo);
814						lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
815				}
816			}
817			splx(s);
818
819			aiocbe->jobstate = JOBST_JOBFINISHED;
820
821			/*
822			 * If the I/O request should be automatically rundown, do the
823			 * needed cleanup.  Otherwise, place the queue entry for
824			 * the just finished I/O request into the done queue for the
825			 * associated client.
826			 */
827			if (aiocbe->jobflags & AIOCBLIST_ASYNCFREE) {
828				aiocbe->jobflags &= ~AIOCBLIST_ASYNCFREE;
829				TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
830			} else {
831				TAILQ_REMOVE(&ki->kaio_jobqueue,
832					aiocbe, plist);
833				TAILQ_INSERT_TAIL(&ki->kaio_jobdone,
834					aiocbe, plist);
835			}
836
837			if (aiocbe->jobflags & AIOCBLIST_RUNDOWN) {
838				wakeup(aiocbe);
839				aiocbe->jobflags &= ~AIOCBLIST_RUNDOWN;
840			}
841
842			if (cb->aio_sigevent.sigev_notify == SIGEV_SIGNAL) {
843				psignal(userp, cb->aio_sigevent.sigev_signo);
844			}
845		}
846
847		/*
848		 * Disconnect from user address space
849		 */
850		if (curcp != mycp) {
851			struct vmspace *tmpvm;
852			/*
853			 * Get the user address space to disconnect from.
854			 */
855			tmpvm = mycp->p_vmspace;
856			/*
857			 * Get original address space for daemon.
858			 */
859			mycp->p_vmspace = myvm;
860			/*
861			 * Activate the daemon's address space.
862			 */
863			pmap_activate(mycp);
864#if defined(DIAGNOSTIC)
865			if (tmpvm == myvm)
866				printf("AIOD: vmspace problem -- %d\n", mycp->p_pid);
867#endif
868			/*
869			 * remove our vmspace reference.
870			 */
871			vmspace_free(tmpvm);
872			/*
873			 * disassociate from the user process's file descriptors.
874			 */
875			if (mycp->p_fd)
876				fdfree(mycp);
877			mycp->p_fd = NULL;
878			curcp = mycp;
879		}
880
881		/*
882		 * If we are the first to be put onto the free queue, wakeup
883		 * anyone waiting for a daemon.
884		 */
885		TAILQ_REMOVE(&aio_activeproc, aiop, list);
886		if (TAILQ_EMPTY(&aio_freeproc))
887			wakeup(&aio_freeproc);
888		TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);
889		aiop->aioprocflags |= AIOP_FREE;
890
891		/*
892		 * If daemon is inactive for a long time, allow it to exit, thereby
893		 * freeing resources.
894		 */
895		if (((aiop->aioprocflags & AIOP_SCHED) == 0) &&
896			tsleep(mycp, PRIBIO, "aiordy", aiod_lifetime)) {
897			if ((TAILQ_FIRST(&aio_jobs) == NULL) &&
898				(TAILQ_FIRST(&aiop->jobtorun) == NULL)) {
899				if ((aiop->aioprocflags & AIOP_FREE) &&
900					(num_aio_procs > target_aio_procs)) {
901					TAILQ_REMOVE(&aio_freeproc, aiop, list);
902					zfree(aiop_zone, aiop);
903					num_aio_procs--;
904#if defined(DIAGNOSTIC)
905					if (mycp->p_vmspace->vm_refcnt <= 1)
906						printf("AIOD: bad vm refcnt for exiting daemon: %d\n",
907							mycp->p_vmspace->vm_refcnt);
908#endif
909					exit1(mycp, 0);
910				}
911			}
912		}
913	}
914}
915
916/*
917 * Create a new AIO daemon.  This is mostly a kernel-thread fork routine.
918 * The AIO daemon modifies its environment itself.
919 */
920static int
921aio_newproc()
922{
923	int error;
924	struct rfork_args rfa;
925	struct proc *p, *np;
926
927	rfa.flags = RFPROC | RFCFDG;
928
929	p = curproc;
930	if ((error = rfork(p, &rfa)) != 0)
931		return error;
932
933	np = pfind(p->p_retval[0]);
934	cpu_set_fork_handler(np, aio_daemon, p);
935
936	/*
937	 * Wait until the daemon is started, but continue on just in case (to
938	 * handle error conditions).
939	 */
940	error = tsleep(np, PZERO, "aiosta", aiod_timeout);
941	num_aio_procs++;
942
943	return error;
944
945}
946
947/*
948 * Try the high-performance physio method for eligible VCHR devices.  This
949 * routine doesn't require the use of any additional threads, and has
950 * very low overhead.
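 * To be eligible the descriptor must reference a non-tty VCHR vnode whose
 * character device has a block-device counterpart with a strategy routine,
 * the transfer must be a multiple of DEV_BSIZE and no larger than MAXPHYS,
 * and the per-process physio buffer quota (kaio_ballowed_count) must not be
 * exhausted.  Otherwise -1 is returned and the caller falls back to the
 * daemon-based path.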
951 */
952static int
953aio_qphysio(p, aiocbe)
954	struct proc *p;
955	struct aiocblist *aiocbe;
956{
957	int error;
958	caddr_t sa;
959	struct aiocb *cb;
960	struct file *fp;
961	struct buf *bp;
962	int bflags;
963	struct vnode *vp;
964	struct kaioinfo *ki;
965	struct filedesc *fdp;
966	struct aio_liojob *lj;
967	int fd;
968	int majordev;
969	int s;
970	int cnt;
971	dev_t dev;
972	int rw;
973	d_strategy_t *fstrategy;
974	struct cdevsw *cdev;
975	struct bdevsw *bdev;
976
977	cb = &aiocbe->uaiocb;
978	fdp = p->p_fd;
979	fd = cb->aio_fildes;
980	fp = fdp->fd_ofiles[fd];
981
982	if (fp->f_type != DTYPE_VNODE) {
983		return -1;
984	}
985
986	vp = (struct vnode *)fp->f_data;
987	if (vp->v_type != VCHR || ((cb->aio_nbytes & (DEV_BSIZE - 1)) != 0)) {
988		return -1;
989	}
990
991	if ((cb->aio_nbytes > MAXPHYS) && (num_buf_aio >= max_buf_aio)) {
992		return -1;
993	}
994
995	if ((vp->v_specinfo == NULL) || (vp->v_flag & VISTTY)) {
996		return -1;
997	}
998
999	majordev = major(vp->v_rdev);
1000	if (majordev == NODEV) {
1001		return -1;
1002	}
1003
1004	cdev = cdevsw[major(vp->v_rdev)];
1005	if (cdev == NULL) {
1006		return -1;
1007	}
1008	bdev = cdev->d_bdev;
1009	if (bdev == NULL) {
1010		return -1;
1011	}
1012
1013	ki = p->p_aioinfo;
1014	if (ki->kaio_buffer_count >= ki->kaio_ballowed_count) {
1015		return -1;
1016	}
1017
1018	cnt = cb->aio_nbytes;
1019	if (cnt > MAXPHYS) {
1020		return -1;
1021	}
1022
1023	dev = makedev(bdev->d_maj, minor(vp->v_rdev));
1024
1025	/*
1026	 * Physical I/O is charged directly to the process, so we don't have
1027	 * to fake it.
1028	 */
1029	aiocbe->inputcharge = 0;
1030	aiocbe->outputcharge = 0;
1031
1032	ki->kaio_buffer_count++;
1033
1034	lj = aiocbe->lio;
1035	if (lj) {
1036		lj->lioj_buffer_count++;
1037	}
1038
1039	/* create and build a buffer header for a transfer */
1040	bp = (struct buf *)getpbuf();
1041
1042	/*
1043	 * get a copy of the kva from the physical buffer
1044	 */
1045	bp->b_proc = p;
1046	bp->b_dev = dev;
1047	error = bp->b_error = 0;
1048
1049	if (cb->aio_lio_opcode == LIO_WRITE) {
1050		rw = 0;
1051		bflags = B_WRITE;
1052	} else {
1053		rw = 1;
1054		bflags = B_READ;
1055	}
1056
1057	bp->b_bcount = cb->aio_nbytes;
1058	bp->b_bufsize = cb->aio_nbytes;
1059	bp->b_flags = B_BUSY | B_PHYS | B_CALL | bflags;
1060	bp->b_iodone = aio_physwakeup;
1061	bp->b_saveaddr = bp->b_data;
1062	bp->b_data = cb->aio_buf;
1063	bp->b_blkno = btodb(cb->aio_offset);
1064
1065	if (rw && !useracc(bp->b_data, bp->b_bufsize, B_WRITE)) {
1066		error = EFAULT;
1067		goto doerror;
1068	}
1069	if (!rw && !useracc(bp->b_data, bp->b_bufsize, B_READ)) {
1070		error = EFAULT;
1071		goto doerror;
1072	}
1073
1074	/* bring buffer into kernel space */
1075	vmapbuf(bp);
1076
1077	s = splbio();
1078	aiocbe->bp = bp;
1079	bp->b_spc = (void *)aiocbe;
1080	TAILQ_INSERT_TAIL(&aio_bufjobs, aiocbe, list);
1081	TAILQ_INSERT_TAIL(&ki->kaio_bufqueue, aiocbe, plist);
1082	aiocbe->jobstate = JOBST_JOBQBUF;
1083	cb->_aiocb_private.status = cb->aio_nbytes;
1084	num_buf_aio++;
1085	fstrategy = bdev->d_strategy;
1086	bp->b_error = 0;
1087
1088	splx(s);
1089	/* perform transfer */
1090	(*fstrategy)(bp);
1091
1092	s = splbio();
1093	/*
1094	 * If we had an error invoking the request, or an error in processing
1095	 * the request before we have returned, we process it as an error
1096	 * in transfer.  Note that such an I/O error is not indicated immediately,
1097	 * but is returned using the aio_error mechanism.  In this case, aio_suspend
1098	 * will return immediately.
1099	 */
1100	if (bp->b_error || (bp->b_flags & B_ERROR)) {
1101		struct aiocb *job = aiocbe->uuaiocb;
1102
1103		aiocbe->uaiocb._aiocb_private.status = 0;
1104		suword(&job->_aiocb_private.status, 0);
1105		aiocbe->uaiocb._aiocb_private.error = bp->b_error;
1106		suword(&job->_aiocb_private.error, bp->b_error);
1107
1108		ki->kaio_buffer_finished_count++;
1109
1110		if (aiocbe->jobstate != JOBST_JOBBFINISHED) {
1111			aiocbe->jobstate = JOBST_JOBBFINISHED;
1112			aiocbe->jobflags |= AIOCBLIST_DONE;
1113			TAILQ_REMOVE(&aio_bufjobs, aiocbe, list);
1114			TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist);
1115			TAILQ_INSERT_TAIL(&ki->kaio_bufdone, aiocbe, plist);
1116		}
1117	}
1118	splx(s);
1119	return 0;
1120
1121doerror:
1122	ki->kaio_buffer_count--;
1123	if (lj) {
1124		lj->lioj_buffer_count--;
1125	}
1126	aiocbe->bp = NULL;
1127	relpbuf(bp);
1128	return error;
1129}
1130
1131/*
1132 * This waits/tests physio completion.
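 * With flgwait == 0 this only polls: EINPROGRESS is returned if the buffer
 * has not completed.  Otherwise it sleeps in aiod_timeout increments until
 * the buffer is marked B_DONE (returning EINPROGRESS if a timeout expires
 * first), then unmaps and releases the buffer and returns any recorded
 * error.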
1133 */
1134static int
1135aio_fphysio(p, iocb, flgwait)
1136	struct proc *p;
1137	struct aiocblist *iocb;
1138	int flgwait;
1139{
1140	int s;
1141	struct buf *bp;
1142	int error;
1143
1144	bp = iocb->bp;
1145
1146	s = splbio();
1147	if (flgwait == 0) {
1148		if ((bp->b_flags & B_DONE) == 0) {
1149			splx(s);
1150			return EINPROGRESS;
1151		}
1152	}
1153
1154	while ((bp->b_flags & B_DONE) == 0) {
1155		if (tsleep((caddr_t)bp, PRIBIO, "physstr", aiod_timeout)) {
1156			if ((bp->b_flags & B_DONE) == 0) {
1157				splx(s);
1158				return EINPROGRESS;
1159			} else {
1160				break;
1161			}
1162		}
1163	}
1164
1165	/* release mapping into kernel space */
1166	vunmapbuf(bp);
1167	iocb->bp = 0;
1168
1169	error = 0;
1170	/*
1171	 * check for an error
1172	 */
1173	if (bp->b_flags & B_ERROR) {
1174		error = bp->b_error;
1175	}
1176
1177	relpbuf(bp);
1178	return (error);
1179}
1180
1181/*
1182 * Queue a new AIO request.  Choosing either the threaded or direct physio
1183 * VCHR technique is done in this code.
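 * The user aiocb is copied in and validated, a kernel job reference is
 * written back into _aiocb_private.kernelinfo, and aio_qphysio() gets the
 * first shot at the request; if it declines (-1), the job is placed on the
 * global daemon queue and a new daemon is started if none is free and both
 * the global and per-process daemon limits allow it.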
1184 */
1185static int
1186_aio_aqueue(struct proc *p, struct aiocb *job, struct aio_liojob *lj, int type)
1187{
1188	struct filedesc *fdp;
1189	struct file *fp;
1190	unsigned int fd;
1191
1192	int error;
1193	int opcode;
1194	struct aiocblist *aiocbe;
1195	struct aioproclist *aiop;
1196	struct kaioinfo *ki;
1197
1198	if ((aiocbe = TAILQ_FIRST(&aio_freejobs)) != NULL) {
1199		TAILQ_REMOVE(&aio_freejobs, aiocbe, list);
1200	} else {
1201		aiocbe = zalloc (aiocb_zone);
1202	}
1203
1204	aiocbe->inputcharge = 0;
1205	aiocbe->outputcharge = 0;
1206
1207	suword(&job->_aiocb_private.status, -1);
1208	suword(&job->_aiocb_private.error, 0);
1209	suword(&job->_aiocb_private.kernelinfo, -1);
1210
1211	error = copyin((caddr_t)job,
1212		(caddr_t) &aiocbe->uaiocb, sizeof aiocbe->uaiocb);
1213	if (error) {
1214		suword(&job->_aiocb_private.error, error);
1215
1216		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
1217		return error;
1218	}
1219
1220	/*
1221	 * Save userspace address of the job info
1222	 */
1223	aiocbe->uuaiocb = job;
1224
1225	/*
1226	 * Get the opcode
1227	 */
1228	if (type != LIO_NOP) {
1229		aiocbe->uaiocb.aio_lio_opcode = type;
1230	}
1231	opcode = aiocbe->uaiocb.aio_lio_opcode;
1232
1233	/*
1234	 * Get the fd info for process
1235	 */
1236	fdp = p->p_fd;
1237
1238	/*
1239	 * Range check file descriptor
1240	 */
1241	fd = aiocbe->uaiocb.aio_fildes;
1242	if (fd >= fdp->fd_nfiles) {
1243		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
1244		if (type == 0) {
1245			suword(&job->_aiocb_private.error, EBADF);
1246		}
1247		return EBADF;
1248	}
1249
1250	fp = fdp->fd_ofiles[fd];
1251	if ((fp == NULL) ||
1252		((opcode == LIO_WRITE) && ((fp->f_flag & FWRITE) == 0))) {
1253		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
1254		if (type == 0) {
1255			suword(&job->_aiocb_private.error, EBADF);
1256		}
1257		return EBADF;
1258	}
1259
1260	if (aiocbe->uaiocb.aio_offset == -1LL) {
1261		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
1262		if (type == 0) {
1263			suword(&job->_aiocb_private.error, EINVAL);
1264		}
1265		return EINVAL;
1266	}
1267
1268	error = suword(&job->_aiocb_private.kernelinfo, jobrefid);
1269	if (error) {
1270		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
1271		if (type == 0) {
1272			suword(&job->_aiocb_private.error, EINVAL);
1273		}
1274		return error;
1275	}
1276
1277	aiocbe->uaiocb._aiocb_private.kernelinfo = (void *)jobrefid;
1278	jobrefid++;
1279	if (jobrefid > INT_MAX)
1280		jobrefid = 1;
1281
1282	if (opcode == LIO_NOP) {
1283		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
1284		if (type == 0) {
1285			suword(&job->_aiocb_private.error, 0);
1286			suword(&job->_aiocb_private.status, 0);
1287			suword(&job->_aiocb_private.kernelinfo, 0);
1288		}
1289		return 0;
1290	}
1291
1292	if ((opcode != LIO_READ) && (opcode != LIO_WRITE)) {
1293		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
1294		if (type == 0) {
1295			suword(&job->_aiocb_private.status, 0);
1296			suword(&job->_aiocb_private.error, EINVAL);
1297		}
1298		return EINVAL;
1299	}
1300
1301	suword(&job->_aiocb_private.error, EINPROGRESS);
1302	aiocbe->uaiocb._aiocb_private.error = EINPROGRESS;
1303	aiocbe->userproc = p;
1304	aiocbe->jobflags = 0;
1305	aiocbe->lio = lj;
1306	ki = p->p_aioinfo;
1307
1308	if ((error = aio_qphysio(p, aiocbe)) == 0) {
1309		return 0;
1310	} else if (error > 0) {
1311		suword(&job->_aiocb_private.status, 0);
1312		aiocbe->uaiocb._aiocb_private.error = error;
1313		suword(&job->_aiocb_private.error, error);
1314		return error;
1315	}
1316
1317	/*
1318	 * No buffer for daemon I/O
1319	 */
1320	aiocbe->bp = NULL;
1321
1322	ki->kaio_queue_count++;
1323	if (lj) {
1324		lj->lioj_queue_count++;
1325	}
1326	TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, aiocbe, plist);
1327	TAILQ_INSERT_TAIL(&aio_jobs, aiocbe, list);
1328	aiocbe->jobstate = JOBST_JOBQGLOBAL;
1329
1330	num_queue_count++;
1331	error = 0;
1332
1333	/*
1334	 * If we don't have a free AIO process, and we are below our
1335	 * quota, then start one.  Otherwise, depend on the subsequent
1336	 * I/O completions to pick-up this job.  If we don't sucessfully
1337	 * I/O completions to pick up this job.  If we don't successfully
1338	 * return an error for now (EAGAIN), which is likely not the
1339	 * correct thing to do.
1340	 */
1341retryproc:
1342	if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) {
1343		TAILQ_REMOVE(&aio_freeproc, aiop, list);
1344		TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list);
1345		aiop->aioprocflags &= ~AIOP_FREE;
1346		wakeup(aiop->aioproc);
1347	} else if (((num_aio_resv_start + num_aio_procs) < max_aio_procs) &&
1348			((ki->kaio_active_count + num_aio_resv_start) <
1349				ki->kaio_maxactive_count)) {
1350		num_aio_resv_start++;
1351		if ((error = aio_newproc()) == 0) {
1352			num_aio_resv_start--;
1353			goto retryproc;
1354		}
1355		num_aio_resv_start--;
1356	}
1357	return error;
1358}
1359
1360/*
1361 * This routine queues an AIO request, checking for quotas.
1362 */
1363static int
1364aio_aqueue(struct proc *p, struct aiocb *job, int type)
1365{
1366	struct kaioinfo *ki;
1367
1368	if (p->p_aioinfo == NULL) {
1369		aio_init_aioinfo(p);
1370	}
1371
1372	if (num_queue_count >= max_queue_count)
1373		return EAGAIN;
1374
1375	ki = p->p_aioinfo;
1376	if (ki->kaio_queue_count >= ki->kaio_qallowed_count)
1377		return EAGAIN;
1378
1379	return _aio_aqueue(p, job, NULL, type);
1380}
1381
1382/*
1383 * Support the aio_return system call; as a side effect, kernel
1384 * resources are released.
1385 */
1386int
1387aio_return(struct proc *p, struct aio_return_args *uap)
1388{
1389	int s;
1390	int jobref, status;
1391	struct aiocblist *cb, *ncb;
1392	struct aiocb *ujob;
1393	struct kaioinfo *ki;
1394	struct proc *userp;
1395
1396	ki = p->p_aioinfo;
1397	if (ki == NULL) {
1398		return EINVAL;
1399	}
1400
1401	ujob = uap->aiocbp;
1402
1403	jobref = fuword(&ujob->_aiocb_private.kernelinfo);
1404	if (jobref == -1 || jobref == 0)
1405		return EINVAL;
1406
1407	for (cb = TAILQ_FIRST(&ki->kaio_jobdone);
1408		cb;
1409		cb = TAILQ_NEXT(cb, plist)) {
1410		if (((int) cb->uaiocb._aiocb_private.kernelinfo) == jobref) {
1411			if (ujob == cb->uuaiocb) {
1412				p->p_retval[0] = cb->uaiocb._aiocb_private.status;
1413			} else {
1414				p->p_retval[0] = EFAULT;
1415			}
1416			if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
1417				curproc->p_stats->p_ru.ru_oublock += cb->outputcharge;
1418				cb->outputcharge = 0;
1419			} else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
1420				curproc->p_stats->p_ru.ru_inblock += cb->inputcharge;
1421				cb->inputcharge = 0;
1422			}
1423			aio_free_entry(cb);
1424			return 0;
1425		}
1426	}
1427
1428	s = splbio();
1429	for (cb = TAILQ_FIRST(&ki->kaio_bufdone);
1430		cb;
1431		cb = ncb) {
1432		ncb = TAILQ_NEXT(cb, plist);
1433		if (((int) cb->uaiocb._aiocb_private.kernelinfo) == jobref) {
1434			splx(s);
1435			if (ujob == cb->uuaiocb) {
1436				p->p_retval[0] = cb->uaiocb._aiocb_private.status;
1437			} else {
1438				p->p_retval[0] = EFAULT;
1439			}
1440			aio_free_entry(cb);
1441			return 0;
1442		}
1443	}
1444	splx(s);
1445
1446	return (EINVAL);
1447}
1448
1449/*
1450 * Allow a process to wake up when any of the I/O requests are
1451 * completed.
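 * The caller's aiocb pointers are snapshotted into kernel arrays
 * (ujoblist/ijoblist); the done queues are then polled, sleeping with the
 * optional timeout until one of the listed jobs has completed.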
1452 */
1453int
1454aio_suspend(struct proc *p, struct aio_suspend_args *uap)
1455{
1456	struct timeval atv;
1457	struct timespec ts;
1458	struct aiocb *const *cbptr, *cbp;
1459	struct kaioinfo *ki;
1460	struct aiocblist *cb;
1461	int i;
1462	int njoblist;
1463	int error, s, timo;
1464	int *ijoblist;
1465	struct aiocb **ujoblist;
1466
1467	if (uap->nent >= AIO_LISTIO_MAX)
1468		return EINVAL;
1469
1470	timo = 0;
1471	if (uap->timeout) {
1472		/*
1473		 * Get timespec struct
1474		 */
1475		if ((error = copyin((caddr_t) uap->timeout, (caddr_t) &ts, sizeof ts)) != 0) {
1476			return error;
1477		}
1478
1479		if (ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000)
1480			return (EINVAL);
1481
1482		TIMESPEC_TO_TIMEVAL(&atv, &ts);
1483		if (itimerfix(&atv))
1484			return (EINVAL);
1485		/*
1486		 * XXX this is not as careful as settimeofday() about minimising
1487		 * interrupt latency.  The hzto() interface is inconvenient as usual.
1488		 */
1489		s = splclock();
1490		timevaladd(&atv, &time);
1491		timo = hzto(&atv);
1492		splx(s);
1493		if (timo == 0)
1494			timo = 1;
1495	}
1496
1497	ki = p->p_aioinfo;
1498	if (ki == NULL)
1499		return EAGAIN;
1500
1501	njoblist = 0;
1502	ijoblist = zalloc(aiol_zone);
1503	ujoblist = zalloc(aiol_zone);
1504	cbptr = uap->aiocbp;
1505
1506	for(i = 0; i < uap->nent; i++) {
1507		cbp = (struct aiocb *) fuword((caddr_t) &cbptr[i]);
1508		if (cbp == 0)
1509			continue;
1510		ujoblist[njoblist] = cbp;
1511		ijoblist[njoblist] = fuword(&cbp->_aiocb_private.kernelinfo);
1512		njoblist++;
1513	}
1514	if (njoblist == 0) {
1515		zfree(aiol_zone, ijoblist);
1516		zfree(aiol_zone, ujoblist);
1517		return 0;
1518	}
1519
1520	error = 0;
1521	while (1) {
1522		for (cb = TAILQ_FIRST(&ki->kaio_jobdone);
1523			cb; cb = TAILQ_NEXT(cb, plist)) {
1524			for(i = 0; i < njoblist; i++) {
1525				if (((int) cb->uaiocb._aiocb_private.kernelinfo) ==
1526					ijoblist[i]) {
1527					if (ujoblist[i] != cb->uuaiocb)
1528						error = EINVAL;
1529					zfree(aiol_zone, ijoblist);
1530					zfree(aiol_zone, ujoblist);
1531					return error;
1532				}
1533			}
1534		}
1535
1536		s = splbio();
1537		for (cb = TAILQ_FIRST(&ki->kaio_bufdone);
1538			cb; cb = TAILQ_NEXT(cb, plist)) {
1539			for(i = 0; i < njoblist; i++) {
1540				if (((int) cb->uaiocb._aiocb_private.kernelinfo) ==
1541					ijoblist[i]) {
1542					splx(s);
1543					if (ujoblist[i] != cb->uuaiocb)
1544						error = EINVAL;
1545					zfree(aiol_zone, ijoblist);
1546					zfree(aiol_zone, ujoblist);
1547					return error;
1548				}
1549			}
1550		}
1551
1552		ki->kaio_flags |= KAIO_WAKEUP;
1553		error = tsleep(p, PRIBIO|PCATCH, "aiospn", timo);
1554		splx(s);
1555
1556		if (error == EINTR) {
1557			zfree(aiol_zone, ijoblist);
1558			zfree(aiol_zone, ujoblist);
1559			return EINTR;
1560		} else if (error == EWOULDBLOCK) {
1561			zfree(aiol_zone, ijoblist);
1562			zfree(aiol_zone, ujoblist);
1563			return EAGAIN;
1564		}
1565	}
1566
1567/* NOTREACHED */
1568	return EINVAL;
1569}
1570
1571/*
1572 * aio_cancel at the kernel level is a NOOP right now.  It
1573 * might be possible to support it partially in user mode, or
1574 * in kernel mode later on.
1575 */
1576int
1577aio_cancel(struct proc *p, struct aio_cancel_args *uap)
1578{
1579	return AIO_NOTCANCELLED;
1580}
1581
1582/*
1583 * aio_error is implemented at the kernel level for compatibility
1584 * purposes only.  For a user mode async implementation, it would be
1585 * best to do it in a userland subroutine.
1586 */
1587int
1588aio_error(struct proc *p, struct aio_error_args *uap)
1589{
1590	int s;
1591	struct aiocblist *cb;
1592	struct kaioinfo *ki;
1593	int jobref;
1594	int error, status;
1595
1596	ki = p->p_aioinfo;
1597	if (ki == NULL)
1598		return EINVAL;
1599
1600	jobref = fuword(&uap->aiocbp->_aiocb_private.kernelinfo);
1601	if ((jobref == -1) || (jobref == 0))
1602		return EINVAL;
1603
1604	for (cb = TAILQ_FIRST(&ki->kaio_jobdone);
1605		cb;
1606		cb = TAILQ_NEXT(cb, plist)) {
1607
1608		if (((int) cb->uaiocb._aiocb_private.kernelinfo) == jobref) {
1609			p->p_retval[0] = cb->uaiocb._aiocb_private.error;
1610			return 0;
1611		}
1612	}
1613
1614	for (cb = TAILQ_FIRST(&ki->kaio_jobqueue);
1615		cb;
1616		cb = TAILQ_NEXT(cb, plist)) {
1617
1618		if (((int) cb->uaiocb._aiocb_private.kernelinfo) == jobref) {
1619			p->p_retval[0] = EINPROGRESS;
1620			return 0;
1621		}
1622	}
1623
1624	s = splbio();
1625	for (cb = TAILQ_FIRST(&ki->kaio_bufdone);
1626		cb;
1627		cb = TAILQ_NEXT(cb, plist)) {
1628		if (((int) cb->uaiocb._aiocb_private.kernelinfo) == jobref) {
1629			p->p_retval[0] = cb->uaiocb._aiocb_private.error;
1630			splx(s);
1631			return 0;
1632		}
1633	}
1634
1635	for (cb = TAILQ_FIRST(&ki->kaio_bufqueue);
1636		cb;
1637		cb = TAILQ_NEXT(cb, plist)) {
1638		if (((int) cb->uaiocb._aiocb_private.kernelinfo) == jobref) {
1639			p->p_retval[0] = EINPROGRESS;
1640			splx(s);
1641			return 0;
1642		}
1643	}
1644	splx(s);
1645
1646
1647	/*
1648	 * Hack for lio
1649	 */
1650/*
1651	status = fuword(&uap->aiocbp->_aiocb_private.status);
1652	if (status == -1) {
1653		return fuword(&uap->aiocbp->_aiocb_private.error);
1654	}
1655*/
1656	return EINVAL;
1657}
1658
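/*
 * The aio_read system call.  Requests are normally queued for the AIO
 * daemons via aio_aqueue(); the synchronous path below is only taken when
 * AIO_PMODE_SYNC is set in the control block's private mode word.
 */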
1659int
1660aio_read(struct proc *p, struct aio_read_args *uap)
1661{
1662	struct filedesc *fdp;
1663	struct file *fp;
1664	struct uio auio;
1665	struct iovec aiov;
1666	unsigned int fd;
1667	int cnt;
1668	struct aiocb iocb;
1669	int error, pmodes;
1670
1671	pmodes = fuword(&uap->aiocbp->_aiocb_private.privatemodes);
1672	if ((pmodes & AIO_PMODE_SYNC) == 0) {
1673		return aio_aqueue(p, (struct aiocb *) uap->aiocbp, LIO_READ);
1674	}
1675
1676	/*
1677	 * Get control block
1678	 */
1679	if ((error = copyin((caddr_t) uap->aiocbp, (caddr_t) &iocb, sizeof iocb)) != 0)
1680		return error;
1681
1682	/*
1683	 * Get the fd info for process
1684	 */
1685	fdp = p->p_fd;
1686
1687	/*
1688	 * Range check file descriptor
1689	 */
1690	fd = iocb.aio_fildes;
1691	if (fd >= fdp->fd_nfiles)
1692		return EBADF;
1693	fp = fdp->fd_ofiles[fd];
1694	if ((fp == NULL) || ((fp->f_flag & FREAD) == 0))
1695		return EBADF;
1696	if (iocb.aio_offset == -1LL)
1697		return EINVAL;
1698
1699	auio.uio_resid = iocb.aio_nbytes;
1700	if (auio.uio_resid < 0)
1701		return (EINVAL);
1702
1703	/*
1704	 * If this is not a synchronous request, queue it for asynchronous processing.
1705	 */
1706	if ((iocb._aiocb_private.privatemodes & AIO_PMODE_SYNC) == 0) {
1707		return aio_aqueue(p, (struct aiocb *) uap->aiocbp, LIO_READ);
1708	}
1709
1710	aiov.iov_base = iocb.aio_buf;
1711	aiov.iov_len = iocb.aio_nbytes;
1712
1713	auio.uio_iov = &aiov;
1714	auio.uio_iovcnt = 1;
1715	auio.uio_offset = iocb.aio_offset;
1716	auio.uio_rw = UIO_READ;
1717	auio.uio_segflg = UIO_USERSPACE;
1718	auio.uio_procp = p;
1719
1720	cnt = iocb.aio_nbytes;
1721	error = (*fp->f_ops->fo_read)(fp, &auio, fp->f_cred);
1722	if (error &&
1723		(auio.uio_resid != cnt) &&
1724		(error == ERESTART || error == EINTR || error == EWOULDBLOCK))
1725			error = 0;
1726	cnt -= auio.uio_resid;
1727	p->p_retval[0] = cnt;
1728	return error;
1729}
1730
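/*
 * The aio_write system call.  As with aio_read, the request is normally
 * queued via aio_aqueue(); the synchronous path below only runs when
 * AIO_PMODE_SYNC is set in the control block's private mode word.
 */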
1731int
1732aio_write(struct proc *p, struct aio_write_args *uap)
1733{
1734	struct filedesc *fdp;
1735	struct file *fp;
1736	struct uio auio;
1737	struct iovec aiov;
1738	unsigned int fd;
1739	int cnt;
1740	struct aiocb iocb;
1741	int error;
1742	int pmodes;
1743
1744	/*
1745	 * If this is not a synchronous request, queue it for asynchronous processing.
1746	 */
1747	pmodes = fuword(&uap->aiocbp->_aiocb_private.privatemodes);
1748	if ((pmodes & AIO_PMODE_SYNC) == 0) {
1749		return aio_aqueue(p, (struct aiocb *) uap->aiocbp, LIO_WRITE);
1750	}
1751
1752	if ((error = copyin((caddr_t) uap->aiocbp, (caddr_t) &iocb, sizeof iocb)) != 0)
1753		return error;
1754
1755	/*
1756	 * Get the fd info for process
1757	 */
1758	fdp = p->p_fd;
1759
1760	/*
1761	 * Range check file descriptor
1762	 */
1763	fd = iocb.aio_fildes;
1764	if (fd >= fdp->fd_nfiles)
1765		return EBADF;
1766	fp = fdp->fd_ofiles[fd];
1767	if ((fp == NULL) || ((fp->f_flag & FWRITE) == 0))
1768		return EBADF;
1769	if (iocb.aio_offset == -1LL)
1770		return EINVAL;
1771
1772	aiov.iov_base = iocb.aio_buf;
1773	aiov.iov_len = iocb.aio_nbytes;
1774	auio.uio_iov = &aiov;
1775	auio.uio_iovcnt = 1;
1776	auio.uio_offset = iocb.aio_offset;
1777
1778	auio.uio_resid = iocb.aio_nbytes;
1779	if (auio.uio_resid < 0)
1780		return (EINVAL);
1781
1782	auio.uio_rw = UIO_WRITE;
1783	auio.uio_segflg = UIO_USERSPACE;
1784	auio.uio_procp = p;
1785
1786	cnt = iocb.aio_nbytes;
1787	error = (*fp->f_ops->fo_write)(fp, &auio, fp->f_cred);
1788	if (error) {
1789		if (auio.uio_resid != cnt) {
1790			if (error == ERESTART || error == EINTR || error == EWOULDBLOCK)
1791				error = 0;
1792			if (error == EPIPE)
1793				psignal(p, SIGPIPE);
1794		}
1795	}
1796	cnt -= auio.uio_resid;
1797	p->p_retval[0] = cnt;
1798	return error;
1799}
1800
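/*
 * The lio_listio system call: queue up to AIO_LISTIO_MAX requests in one
 * shot.  In LIO_WAIT mode we sleep until every queued request has been
 * disposed of; in LIO_NOWAIT mode an optional sigevent is delivered once
 * the whole batch has completed.
 */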
1801int
1802lio_listio(struct proc *p, struct lio_listio_args *uap)
1803{
1804	int nent, nentqueued;
1805	struct aiocb *iocb, * const *cbptr;
1806	struct aiocblist *cb;
1807	struct kaioinfo *ki;
1808	struct aio_liojob *lj;
1809	int error, runningcode;
1810	int nerror;
1811	int i;
1812	int s;
1813
1814	if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT)) {
1815		return EINVAL;
1816	}
1817
1818	nent = uap->nent;
1819	if (nent > AIO_LISTIO_MAX) {
1820		return EINVAL;
1821	}
1822
1823	if (p->p_aioinfo == NULL) {
1824		aio_init_aioinfo(p);
1825	}
1826
1827	if ((nent + num_queue_count) > max_queue_count) {
1828		return EAGAIN;
1829	}
1830
1831	ki = p->p_aioinfo;
1832	if ((nent + ki->kaio_queue_count) > ki->kaio_qallowed_count) {
1833		return EAGAIN;
1834	}
1835
1836	lj = zalloc(aiolio_zone);
1837	if (!lj) {
1838		return EAGAIN;
1839	}
1840
1841	lj->lioj_flags = 0;
1842	lj->lioj_buffer_count = 0;
1843	lj->lioj_buffer_finished_count = 0;
1844	lj->lioj_queue_count = 0;
1845	lj->lioj_queue_finished_count = 0;
1846	lj->lioj_ki = ki;
1847	TAILQ_INSERT_TAIL(&ki->kaio_liojoblist, lj, lioj_list);
1848
1849	/*
1850	 * Setup signal
1851	 */
1852	if (uap->sig && (uap->mode == LIO_NOWAIT)) {
1853		error = copyin(uap->sig, &lj->lioj_signal, sizeof lj->lioj_signal);
1854		if (error)
1855			return error;
1856		lj->lioj_flags |= LIOJ_SIGNAL;
1857		lj->lioj_flags &= ~LIOJ_SIGNAL_POSTED;
1858	} else {
1859		lj->lioj_flags &= ~LIOJ_SIGNAL;
1860	}
1861
1862/*
1863 * get pointers to the list of I/O requests
1864 */
1865
1866	nerror = 0;
1867	nentqueued = 0;
1868	cbptr = uap->acb_list;
1869	for(i = 0; i < uap->nent; i++) {
1870		iocb = (struct aiocb *) fuword((caddr_t) &cbptr[i]);
1871		if (((int) iocb != -1) && ((int) iocb != 0)) {
1872			error = _aio_aqueue(p, iocb, lj, 0);
1873			if (error == 0) {
1874				nentqueued++;
1875			} else {
1876				nerror++;
1877			}
1878		}
1879	}
1880
1881	/*
1882	 * If we haven't queued any, then just return; any per-request errors were reported in the aiocbs
1883	 */
1884	if (nentqueued == 0) {
1885		return 0;
1886	}
1887
1888	/*
1889	 * Calculate the appropriate error return
1890	 */
1891	runningcode = 0;
1892	if (nerror)
1893		runningcode = EIO;
1894
1895	if (uap->mode == LIO_WAIT) {
1896		while (1) {
1897			int found;
1898			found = 0;
1899			for(i = 0; i < uap->nent; i++) {
1900				int jobref, command;
1901
1902				/*
1903				 * Fetch address of the control buf pointer in user space
1904				 */
1905				iocb = (struct aiocb *) fuword((caddr_t) &cbptr[i]);
1906				if (((int) iocb == -1) || ((int) iocb == 0))
1907					continue;
1908
1909				/*
1910				 * Fetch the associated command from user space
1911				 */
1912				command = fuword(&iocb->aio_lio_opcode);
1913				if (command == LIO_NOP) {
1914					found++;
1915					continue;
1916				}
1917
1918				jobref = fuword(&iocb->_aiocb_private.kernelinfo);
1919
1920				for (cb = TAILQ_FIRST(&ki->kaio_jobdone);
1921					cb;
1922					cb = TAILQ_NEXT(cb, plist)) {
1923					if (((int) cb->uaiocb._aiocb_private.kernelinfo) ==
1924						jobref) {
1925						if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
1926							curproc->p_stats->p_ru.ru_oublock +=
1927								cb->outputcharge;
1928							cb->outputcharge = 0;
1929						} else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
1930							curproc->p_stats->p_ru.ru_inblock +=
1931								cb->inputcharge;
1932							cb->inputcharge = 0;
1933						}
1934						found++;
1935						break;
1936					}
1937				}
1938
1939				s = splbio();
1940				for (cb = TAILQ_FIRST(&ki->kaio_bufdone);
1941					cb;
1942					cb = TAILQ_NEXT(cb, plist)) {
1943					if (((int) cb->uaiocb._aiocb_private.kernelinfo) ==
1944						jobref) {
1945						found++;
1946						break;
1947					}
1948				}
1949				splx(s);
1950
1951			}
1952
1953			/*
1954			 * If all I/Os have been disposed of, then we can return
1955			 */
1956			if (found == nentqueued) {
1957				return runningcode;
1958			}
1959
1960			ki->kaio_flags |= KAIO_WAKEUP;
1961			error = tsleep(p, PRIBIO|PCATCH, "aiospn", 0);
1962
1963			if (error == EINTR) {
1964				return EINTR;
1965			} else if (error == EWOULDBLOCK) {
1966				return EAGAIN;
1967			}
1968
1969		}
1970	}
1971
1972	return runningcode;
1973}
1974
1975/*
1976 * This is a weird hack so that we can post a signal.  It is safe
1977 * to do so from a timeout routine, but *not* from an interrupt routine.
1978 */
1979static void
1980process_signal(void *ljarg)
1981{
1982	struct aio_liojob *lj = ljarg;
1983	if (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL) {
1984		if (lj->lioj_queue_count == lj->lioj_queue_finished_count) {
1985			psignal(lj->lioj_ki->kaio_p, lj->lioj_signal.sigev_signo);
1986			lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
1987		}
1988	}
1989}
1990
1991/*
1992 * Interrupt handler for physio, performs the necessary process wakeups,
1993 * and signals.
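 * This runs from biodone() at interrupt time, so any lio completion signal
 * is deferred to process_signal() via timeout() instead of being posted
 * directly from here.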
1994 */
1995static void
1996aio_physwakeup(bp)
1997	struct buf *bp;
1998{
1999	struct aiocblist *aiocbe;
2000	struct proc *p;
2001	struct kaioinfo *ki;
2002	struct aio_liojob *lj;
2003	int s;
2004	s = splbio();
2005
2006	wakeup((caddr_t) bp);
2007	bp->b_flags &= ~B_CALL;
2008	bp->b_flags |= B_DONE;
2009
2010	aiocbe = (struct aiocblist *)bp->b_spc;
2011	if (aiocbe) {
2012		p = bp->b_proc;
2013
2014		aiocbe->jobstate = JOBST_JOBBFINISHED;
2015		aiocbe->uaiocb._aiocb_private.status -= bp->b_resid;
2016		aiocbe->uaiocb._aiocb_private.error = 0;
2017		aiocbe->jobflags |= AIOCBLIST_DONE;
2018
2019		if (bp->b_flags & B_ERROR) {
2020			aiocbe->uaiocb._aiocb_private.error = bp->b_error;
2021		}
2022
2023		lj = aiocbe->lio;
2024		if (lj) {
2025			lj->lioj_buffer_finished_count++;
2026			/*
2027			 * wakeup/signal if all of the interrupt jobs are done
2028			 */
2029			if (lj->lioj_buffer_finished_count == lj->lioj_buffer_count) {
2030				/*
2031				 * post a signal if it is called for
2032				 */
2033				if ((lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) ==
2034					LIOJ_SIGNAL) {
2035					lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
2036					timeout(process_signal, lj, 0);
2037				}
2038			}
2039		}
2040
2041		ki = p->p_aioinfo;
2042		if (ki) {
2043			ki->kaio_buffer_finished_count++;
2044			TAILQ_REMOVE(&aio_bufjobs, aiocbe, list);
2045			TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist);
2046			TAILQ_INSERT_TAIL(&ki->kaio_bufdone, aiocbe, plist);
2047			/*
2048			 * and do the wakeup
2049			 */
2050			if (ki->kaio_flags & (KAIO_RUNDOWN|KAIO_WAKEUP)) {
2051				ki->kaio_flags &= ~KAIO_WAKEUP;
2052				wakeup(p);
2053			}
2054		}
2055	}
2056	splx(s);
2057}
2058