vfs_aio.c revision 37406
1/*
2 * Copyright (c) 1997 John S. Dyson.  All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 *    notice, this list of conditions and the following disclaimer.
9 * 2. John S. Dyson's name may not be used to endorse or promote products
10 *    derived from this software without specific prior written permission.
11 *
12 * DISCLAIMER:  This code isn't warranted to do anything useful.  Anything
13 * bad that happens because of using this software isn't the responsibility
14 * of the author.  This software is distributed AS-IS.
15 *
16 * $Id: vfs_aio.c,v 1.30 1998/07/04 22:30:22 julian Exp $
17 */
18
19/*
20 * This file contains support for the POSIX 1003.1B AIO/LIO facility.
21 */
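/*
 * Illustrative userland sketch (not part of this file), assuming the standard
 * POSIX aio(3) interfaces: the submit/poll/reap cycle that the code below
 * implements.  The file name, buffer size, and polling interval are made-up
 * placeholders; error handling is abbreviated.
 */
#include <aio.h>
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
example_aio_read(void)
{
	static char buf[512];
	struct aiocb cb;
	int fd, error;
	ssize_t nread;

	fd = open("/tmp/example.dat", O_RDONLY);	/* placeholder path */
	if (fd < 0)
		return (-1);

	memset(&cb, 0, sizeof cb);
	cb.aio_fildes = fd;
	cb.aio_buf = buf;
	cb.aio_nbytes = sizeof buf;
	cb.aio_offset = 0;			/* must not be -1 (see _aio_aqueue) */

	if (aio_read(&cb) != 0) {		/* queued via aio_aqueue() below */
		close(fd);
		return (-1);
	}

	while ((error = aio_error(&cb)) == EINPROGRESS)
		usleep(1000);			/* poll; aio_suspend() also works */

	nread = aio_return(&cb);		/* reaps status, frees the kernel entry */
	printf("aio_read: error %d, %ld bytes\n", error, (long)nread);
	close(fd);
	return (error);
}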
22
23#include <sys/param.h>
24#include <sys/systm.h>
25#include <sys/sysproto.h>
26#include <sys/filedesc.h>
27#include <sys/kernel.h>
28#include <sys/fcntl.h>
29#include <sys/file.h>
30#include <sys/lock.h>
31#include <sys/unistd.h>
32#include <sys/proc.h>
33#include <sys/uio.h>
34#include <sys/malloc.h>
35#include <sys/signalvar.h>
36#include <sys/sysctl.h>
37#include <sys/vnode.h>
38#include <sys/conf.h>
39#include <miscfs/specfs/specdev.h>
40
41#include <vm/vm.h>
42#include <vm/vm_param.h>
43#include <vm/vm_extern.h>
44#include <vm/pmap.h>
45#include <vm/vm_map.h>
46#include <vm/vm_zone.h>
47#include <sys/aio.h>
48#include <sys/shm.h>
49#include <sys/user.h>
50
51#include <machine/cpu.h>
52#include <machine/limits.h>
53
54static	long jobrefid;
55
56#define JOBST_NULL			0x0
57#define	JOBST_JOBQPROC		0x1
58#define JOBST_JOBQGLOBAL	0x2
59#define JOBST_JOBRUNNING	0x3
60#define JOBST_JOBFINISHED	0x4
61#define	JOBST_JOBQBUF		0x5
62#define	JOBST_JOBBFINISHED	0x6
63
64#ifndef MAX_AIO_PER_PROC
65#define MAX_AIO_PER_PROC	32
66#endif
67
68#ifndef MAX_AIO_QUEUE_PER_PROC
69#define MAX_AIO_QUEUE_PER_PROC	256 /* Bigger than AIO_LISTIO_MAX */
70#endif
71
72#ifndef MAX_AIO_PROCS
73#define MAX_AIO_PROCS		32
74#endif
75
76#ifndef MAX_AIO_QUEUE
77#define	MAX_AIO_QUEUE		1024 /* Bigger than AIO_LISTIO_MAX */
78#endif
79
80#ifndef TARGET_AIO_PROCS
81#define TARGET_AIO_PROCS	0
82#endif
83
84#ifndef MAX_BUF_AIO
85#define MAX_BUF_AIO 16
86#endif
87
88#ifndef AIOD_TIMEOUT_DEFAULT
89#define	AIOD_TIMEOUT_DEFAULT (10 * hz)
90#endif
91
92#ifndef AIOD_LIFETIME_DEFAULT
93#define AIOD_LIFETIME_DEFAULT (30 * hz)
94#endif
95
96static int max_aio_procs = MAX_AIO_PROCS;
97static int num_aio_procs = 0;
98static int target_aio_procs = TARGET_AIO_PROCS;
99static int max_queue_count = MAX_AIO_QUEUE;
100static int num_queue_count = 0;
101static int num_buf_aio = 0;
102static int num_aio_resv_start = 0;
103static int aiod_timeout;
104static int aiod_lifetime;
105
106static int max_aio_per_proc = MAX_AIO_PER_PROC,
107	max_aio_queue_per_proc=MAX_AIO_QUEUE_PER_PROC;
108
109static int max_buf_aio = MAX_BUF_AIO;
110
111SYSCTL_NODE(_vfs, OID_AUTO, aio, CTLFLAG_RW, 0, "AIO mgmt");
112
113SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_per_proc,
114	CTLFLAG_RW, &max_aio_per_proc, 0, "");
115
116SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue_per_proc,
117	CTLFLAG_RW, &max_aio_queue_per_proc, 0, "");
118
119SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_procs,
120	CTLFLAG_RW, &max_aio_procs, 0, "");
121
122SYSCTL_INT(_vfs_aio, OID_AUTO, num_aio_procs,
123	CTLFLAG_RD, &num_aio_procs, 0, "");
124
125SYSCTL_INT(_vfs_aio, OID_AUTO, num_queue_count,
126	CTLFLAG_RD, &num_queue_count, 0, "");
127
128SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue,
129	CTLFLAG_RW, &max_queue_count, 0, "");
130
131SYSCTL_INT(_vfs_aio, OID_AUTO, target_aio_procs,
132	CTLFLAG_RW, &target_aio_procs, 0, "");
133
134SYSCTL_INT(_vfs_aio, OID_AUTO, max_buf_aio,
135	CTLFLAG_RW, &max_buf_aio, 0, "");
136
137SYSCTL_INT(_vfs_aio, OID_AUTO, num_buf_aio,
138	CTLFLAG_RD, &num_buf_aio, 0, "");
139
140SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_lifetime,
141	CTLFLAG_RW, &aiod_lifetime, 0, "");
142
143SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_timeout,
144	CTLFLAG_RW, &aiod_timeout, 0, "");
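/*
 * Illustrative userland sketch (not part of this file): reading and adjusting
 * the tunables declared above through sysctlbyname(3).  The new limit of 64
 * is an arbitrary example value; raising it requires sufficient privilege.
 */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
example_tune_aio(void)
{
	int cur, newlim = 64;
	size_t len = sizeof cur;

	if (sysctlbyname("vfs.aio.max_aio_per_proc", &cur, &len, NULL, 0) != 0)
		return (-1);
	printf("vfs.aio.max_aio_per_proc = %d\n", cur);

	/* Set a new per-process limit (CTLFLAG_RW makes it writable). */
	return (sysctlbyname("vfs.aio.max_aio_per_proc", NULL, NULL,
	    &newlim, sizeof newlim));
}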
145
146
147/*
148 * Job queue item
149 */
150
151#define AIOCBLIST_CANCELLED	0x1
152#define AIOCBLIST_RUNDOWN	0x4
153#define AIOCBLIST_ASYNCFREE	0x8
154#define AIOCBLIST_DONE		0x10
155
156struct aiocblist {
157	TAILQ_ENTRY (aiocblist) list;		/* List of jobs */
158	TAILQ_ENTRY (aiocblist) plist;		/* List of jobs for proc */
159	int	jobflags;
160	int	jobstate;
161	int inputcharge, outputcharge;
162	struct	buf *bp;				/* buffer pointer */
163	struct	proc *userproc;			/* User process */
164	struct	aioproclist	*jobaioproc;	/* AIO process descriptor */
165	struct	aio_liojob	*lio;		/* optional lio job */
166	struct	aiocb *uuaiocb;			/* pointer in userspace of aiocb */
167	struct	aiocb uaiocb;			/* Kernel I/O control block */
168};
169
170
171/*
172 * AIO process info
173 */
174#define AIOP_FREE	0x1			/* proc on free queue */
175#define AIOP_SCHED	0x2			/* proc explicitly scheduled */
176
177struct aioproclist {
178	int aioprocflags;			/* AIO proc flags */
179	TAILQ_ENTRY(aioproclist) list;		/* List of processes */
180	struct proc *aioproc;			/* The AIO thread */
181	TAILQ_HEAD (,aiocblist) jobtorun;	/* suggested job to run */
182};
183
184/*
185 * data-structure for lio signal management
186 */
187struct aio_liojob {
188	int lioj_flags;
189	int	lioj_buffer_count;
190	int	lioj_buffer_finished_count;
191	int	lioj_queue_count;
192	int	lioj_queue_finished_count;
193	struct sigevent lioj_signal;	/* signal on all I/O done */
194	TAILQ_ENTRY (aio_liojob) lioj_list;
195	struct kaioinfo *lioj_ki;
196};
197#define	LIOJ_SIGNAL			0x1 /* signal on all done (lio) */
198#define	LIOJ_SIGNAL_POSTED	0x2	/* signal has been posted */
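/*
 * Illustrative userland sketch (not part of this file): how lioj_signal gets
 * populated.  A LIO_NOWAIT list submitted with SIGEV_SIGNAL asks for a single
 * signal (SIGUSR1 here, chosen arbitrarily) once every request in the list
 * has finished.  The descriptor, buffers, and lengths are placeholders.
 */
#include <aio.h>
#include <signal.h>
#include <string.h>

int
example_lio_nowait(int fd, char *buf0, char *buf1, size_t len)
{
	struct aiocb cb0, cb1;
	struct aiocb *list[2];
	struct sigevent sev;

	memset(&cb0, 0, sizeof cb0);
	cb0.aio_fildes = fd;
	cb0.aio_buf = buf0;
	cb0.aio_nbytes = len;
	cb0.aio_offset = 0;
	cb0.aio_lio_opcode = LIO_READ;

	cb1 = cb0;				/* second request: a write at the next block */
	cb1.aio_buf = buf1;
	cb1.aio_offset = (off_t)len;
	cb1.aio_lio_opcode = LIO_WRITE;

	list[0] = &cb0;
	list[1] = &cb1;

	memset(&sev, 0, sizeof sev);
	sev.sigev_notify = SIGEV_SIGNAL;
	sev.sigev_signo = SIGUSR1;		/* posted when the whole list is done */

	return (lio_listio(LIO_NOWAIT, list, 2, &sev));
}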
199
200/*
201 * per process aio data structure
202 */
203struct kaioinfo {
204	int	kaio_flags;			/* per process kaio flags */
205	int	kaio_maxactive_count;	/* maximum number of AIOs */
206	int	kaio_active_count;	/* number of currently used AIOs */
207	int	kaio_qallowed_count;	/* maximum size of AIO queue */
208	int	kaio_queue_count;	/* size of AIO queue */
209	int	kaio_ballowed_count;	/* maximum number of buffers */
210	int	kaio_queue_finished_count;	/* number of daemon jobs finished */
211	int	kaio_buffer_count;	/* number of physio buffers */
212	int	kaio_buffer_finished_count;	/* count of I/O done */
213	struct proc *kaio_p;			/* process that uses this kaio block */
214	TAILQ_HEAD (,aio_liojob) kaio_liojoblist;	/* list of lio jobs */
215	TAILQ_HEAD (,aiocblist)	kaio_jobqueue;	/* job queue for process */
216	TAILQ_HEAD (,aiocblist)	kaio_jobdone;	/* done queue for process */
217	TAILQ_HEAD (,aiocblist)	kaio_bufqueue;	/* buffer job queue for process */
218	TAILQ_HEAD (,aiocblist)	kaio_bufdone;	/* buffer done queue for process */
219};
220
221#define KAIO_RUNDOWN 0x1		/* process is being run down */
222#define KAIO_WAKEUP 0x2			/* wakeup process when there is a significant
223								   event */
224
225
226static TAILQ_HEAD (,aioproclist) aio_freeproc, aio_activeproc;
227static TAILQ_HEAD(,aiocblist) aio_jobs;			/* Async job list */
228static TAILQ_HEAD(,aiocblist) aio_bufjobs;		/* Phys I/O job list */
229static TAILQ_HEAD(,aiocblist) aio_freejobs;		/* Pool of free jobs */
230
231static void aio_init_aioinfo(struct proc *p) ;
232static void aio_onceonly(void *) ;
233static int aio_free_entry(struct aiocblist *aiocbe);
234static void aio_process(struct aiocblist *aiocbe);
235static int aio_newproc(void) ;
236static int aio_aqueue(struct proc *p, struct aiocb *job, int type) ;
237static void aio_physwakeup(struct buf *bp);
238static int aio_fphysio(struct proc *p, struct aiocblist *aiocbe, int type);
239static int aio_qphysio(struct proc *p, struct aiocblist *iocb);
240static void aio_daemon(void *uproc);
241
242SYSINIT(aio, SI_SUB_VFS, SI_ORDER_ANY, aio_onceonly, NULL);
243
244static vm_zone_t kaio_zone=0, aiop_zone=0,
245	aiocb_zone=0, aiol_zone=0, aiolio_zone=0;
246
247/*
248 * Single AIOD vmspace shared amongst all of them
249 */
250static struct vmspace *aiovmspace = NULL;
251
252/*
253 * Startup initialization
254 */
255void
256aio_onceonly(void *na)
257{
258	TAILQ_INIT(&aio_freeproc);
259	TAILQ_INIT(&aio_activeproc);
260	TAILQ_INIT(&aio_jobs);
261	TAILQ_INIT(&aio_bufjobs);
262	TAILQ_INIT(&aio_freejobs);
263	kaio_zone = zinit("AIO", sizeof (struct kaioinfo), 0, 0, 1);
264	aiop_zone = zinit("AIOP", sizeof (struct aioproclist), 0, 0, 1);
265	aiocb_zone = zinit("AIOCB", sizeof (struct aiocblist), 0, 0, 1);
266	aiol_zone = zinit("AIOL", AIO_LISTIO_MAX * sizeof (int), 0, 0, 1);
267	aiolio_zone = zinit("AIOLIO",
268		AIO_LISTIO_MAX * sizeof (struct aio_liojob), 0, 0, 1);
269	aiod_timeout = AIOD_TIMEOUT_DEFAULT;
270	aiod_lifetime = AIOD_LIFETIME_DEFAULT;
271	jobrefid = 1;
272}
273
274/*
275 * Init the per-process aioinfo structure.
276 * The aioinfo limits are set per-process for user limit (resource) management.
277 */
278void
279aio_init_aioinfo(struct proc *p)
280{
281	struct kaioinfo *ki;
282	if (p->p_aioinfo == NULL) {
283		ki = zalloc(kaio_zone);
284		p->p_aioinfo = ki;
285		ki->kaio_flags = 0;
286		ki->kaio_maxactive_count = max_aio_per_proc;
287		ki->kaio_active_count = 0;
288		ki->kaio_qallowed_count = max_aio_queue_per_proc;
289		ki->kaio_queue_count = 0;
290		ki->kaio_ballowed_count = max_buf_aio;
291		ki->kaio_buffer_count = 0;
292		ki->kaio_buffer_finished_count = 0;
293		ki->kaio_p = p;
294		TAILQ_INIT(&ki->kaio_jobdone);
295		TAILQ_INIT(&ki->kaio_jobqueue);
296		TAILQ_INIT(&ki->kaio_bufdone);
297		TAILQ_INIT(&ki->kaio_bufqueue);
298		TAILQ_INIT(&ki->kaio_liojoblist);
299	}
300}
301
302/*
303 * Free a job entry.  Wait for completion if it is currently
304 * active, but don't delay forever.  If we delay, we return
305 * a flag that says that we have to restart the queue scan.
306 */
307int
308aio_free_entry(struct aiocblist *aiocbe)
309{
310	struct kaioinfo *ki;
311	struct aioproclist *aiop;
312	struct aio_liojob *lj;
313	struct proc *p;
314	int error;
315	int s;
316
317	if (aiocbe->jobstate == JOBST_NULL)
318		panic("aio_free_entry: freeing already free job");
319
320	p = aiocbe->userproc;
321	ki = p->p_aioinfo;
322	lj = aiocbe->lio;
323	if (ki == NULL)
324		panic("aio_free_entry: missing p->p_aioinfo");
325
326	if (aiocbe->jobstate == JOBST_JOBRUNNING) {
327		if (aiocbe->jobflags & AIOCBLIST_ASYNCFREE)
328			return 0;
329		aiocbe->jobflags |= AIOCBLIST_RUNDOWN;
330		tsleep(aiocbe, PRIBIO|PCATCH, "jobwai", 0);
331	}
332	aiocbe->jobflags &= ~AIOCBLIST_ASYNCFREE;
333
334	if (aiocbe->bp == NULL) {
335		if (ki->kaio_queue_count <= 0)
336			panic("aio_free_entry: process queue size <= 0");
337		if (num_queue_count <= 0)
338			panic("aio_free_entry: system wide queue size <= 0");
339
340		if(lj) {
341			lj->lioj_queue_count--;
342			if (aiocbe->jobflags & AIOCBLIST_DONE)
343				lj->lioj_queue_finished_count--;
344		}
345		ki->kaio_queue_count--;
346		if (aiocbe->jobflags & AIOCBLIST_DONE)
347			ki->kaio_queue_finished_count--;
348		num_queue_count--;
349
350	} else {
351		if(lj) {
352			lj->lioj_buffer_count--;
353			if (aiocbe->jobflags & AIOCBLIST_DONE)
354				lj->lioj_buffer_finished_count--;
355		}
356		if (aiocbe->jobflags & AIOCBLIST_DONE)
357			ki->kaio_buffer_finished_count--;
358		ki->kaio_buffer_count--;
359		num_buf_aio--;
360
361	}
362
363	if ((ki->kaio_flags & KAIO_WAKEUP) ||
364		((ki->kaio_flags & KAIO_RUNDOWN) &&
365		(ki->kaio_buffer_count == 0) && (ki->kaio_queue_count == 0))) {
366		ki->kaio_flags &= ~KAIO_WAKEUP;
367		wakeup(p);
368	}
369
370	if ( aiocbe->jobstate == JOBST_JOBQBUF) {
371		if ((error = aio_fphysio(p, aiocbe, 1)) != 0)
372			return error;
373		if (aiocbe->jobstate != JOBST_JOBBFINISHED)
374			panic("aio_free_entry: invalid physio finish-up state");
375		s = splbio();
376		TAILQ_REMOVE(&ki->kaio_bufdone, aiocbe, plist);
377		splx(s);
378	} else if ( aiocbe->jobstate == JOBST_JOBQPROC) {
379		aiop = aiocbe->jobaioproc;
380		TAILQ_REMOVE(&aiop->jobtorun, aiocbe, list);
381	} else if ( aiocbe->jobstate == JOBST_JOBQGLOBAL) {
382		TAILQ_REMOVE(&aio_jobs, aiocbe, list);
383	} else if ( aiocbe->jobstate == JOBST_JOBFINISHED) {
384		TAILQ_REMOVE(&ki->kaio_jobdone, aiocbe, plist);
385	} else if ( aiocbe->jobstate == JOBST_JOBBFINISHED) {
386		s = splbio();
387		TAILQ_REMOVE(&ki->kaio_bufdone, aiocbe, plist);
388		splx(s);
389		if (aiocbe->bp) {
390			vunmapbuf(aiocbe->bp);
391			relpbuf(aiocbe->bp);
392			aiocbe->bp = NULL;
393		}
394	}
395	if (lj && (lj->lioj_buffer_count == 0) && (lj->lioj_queue_count == 0)) {
396		TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
397		zfree(aiolio_zone, lj);
398	}
399	TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
400	aiocbe->jobstate = JOBST_NULL;
401	return 0;
402}
403
404/*
405 * Rundown the jobs for a given process.
406 */
407void
408aio_proc_rundown(struct proc *p)
409{
410	int s;
411	struct kaioinfo *ki;
412	struct aio_liojob *lj, *ljn;
413	struct aiocblist *aiocbe, *aiocbn;
414
415	ki = p->p_aioinfo;
416	if (ki == NULL)
417		return;
418
419	ki->kaio_flags |= LIOJ_SIGNAL_POSTED;
420	while ((ki->kaio_active_count > 0) ||
421		(ki->kaio_buffer_count > ki->kaio_buffer_finished_count)) {
422		ki->kaio_flags |= KAIO_RUNDOWN;
423		if (tsleep(p, PRIBIO, "kaiowt", aiod_timeout))
424			break;
425	}
426
427restart1:
428	for ( aiocbe = TAILQ_FIRST(&ki->kaio_jobdone);
429		aiocbe;
430		aiocbe = aiocbn) {
431		aiocbn = TAILQ_NEXT(aiocbe, plist);
432		if (aio_free_entry(aiocbe))
433			goto restart1;
434	}
435
436restart2:
437	for ( aiocbe = TAILQ_FIRST(&ki->kaio_jobqueue);
438		aiocbe;
439		aiocbe = aiocbn) {
440		aiocbn = TAILQ_NEXT(aiocbe, plist);
441		if (aio_free_entry(aiocbe))
442			goto restart2;
443	}
444
445/*
446 * Note the use of many short splbio sections here, to avoid holding
447 * splbio across long chains of I/O.  Probably unnecessary.
448 */
449
450restart3:
451	s = splbio();
452	while (TAILQ_FIRST(&ki->kaio_bufqueue)) {
453		ki->kaio_flags |= KAIO_WAKEUP;
454		tsleep (p, PRIBIO, "aioprn", 0);
455		splx(s);
456		goto restart3;
457	}
458	splx(s);
459
460restart4:
461	s = splbio();
462	for ( aiocbe = TAILQ_FIRST(&ki->kaio_bufdone);
463		aiocbe;
464		aiocbe = aiocbn) {
465		aiocbn = TAILQ_NEXT(aiocbe, plist);
466		if (aio_free_entry(aiocbe)) {
467			splx(s);
468			goto restart4;
469		}
470	}
471	splx(s);
472
473	for ( lj = TAILQ_FIRST(&ki->kaio_liojoblist);
474		  lj;
475		  lj = ljn) {
476			ljn = TAILQ_NEXT(lj, lioj_list);
477			if ((lj->lioj_buffer_count == 0) && (lj->lioj_queue_count == 0)) {
478				TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
479				zfree(aiolio_zone, lj);
480			} else {
481#if defined(DIAGNOSTIC)
482				printf("LIO job not cleaned up: B:%d, BF:%d, Q:%d, QF:%d\n",
483					lj->lioj_buffer_count, lj->lioj_buffer_finished_count,
484					lj->lioj_queue_count, lj->lioj_queue_finished_count);
485#endif
486			}
487	}
488
489	zfree(kaio_zone, ki);
490	p->p_aioinfo = NULL;
491}
492
493/*
494 * Select a job to run (called by an AIO daemon)
495 */
496static struct aiocblist *
497aio_selectjob(struct aioproclist *aiop)
498{
499
500	struct aiocblist *aiocbe;
501
502	aiocbe = TAILQ_FIRST(&aiop->jobtorun);
503	if (aiocbe) {
504		TAILQ_REMOVE(&aiop->jobtorun, aiocbe, list);
505		return aiocbe;
506	}
507
508	for (aiocbe = TAILQ_FIRST(&aio_jobs);
509		aiocbe;
510		aiocbe = TAILQ_NEXT(aiocbe, list)) {
511		struct kaioinfo *ki;
512		struct proc *userp;
513
514		userp = aiocbe->userproc;
515		ki = userp->p_aioinfo;
516
517		if (ki->kaio_active_count < ki->kaio_maxactive_count) {
518			TAILQ_REMOVE(&aio_jobs, aiocbe, list);
519			return aiocbe;
520		}
521	}
522
523	return NULL;
524}
525
526/*
527 * The AIO processing activity.  This is the code that does the
528 * I/O request for the non-physio version of the operations.  The
529 * normal vn operations are used, and this code should work in
530 * all instances for every type of file, including pipes, sockets,
531 * fifos, and regular files.
532 */
533void
534aio_process(struct aiocblist *aiocbe)
535{
536	struct filedesc *fdp;
537	struct proc *userp, *mycp;
538	struct aiocb *cb;
539	struct file *fp;
540	struct uio auio;
541	struct iovec aiov;
542	unsigned int fd;
543	int cnt;
545	int error;
546	off_t offset;
547	int oublock_st, oublock_end;
548	int inblock_st, inblock_end;
549
550	userp = aiocbe->userproc;
551	cb = &aiocbe->uaiocb;
552
553	mycp = curproc;
554
555	fdp = mycp->p_fd;
556	fd = cb->aio_fildes;
557	fp = fdp->fd_ofiles[fd];
558
559	aiov.iov_base = (void *) cb->aio_buf;
560	aiov.iov_len = cb->aio_nbytes;
561
562	auio.uio_iov = &aiov;
563	auio.uio_iovcnt = 1;
564	auio.uio_offset = offset = cb->aio_offset;
565	auio.uio_resid = cb->aio_nbytes;
566	cnt = cb->aio_nbytes;
567	auio.uio_segflg = UIO_USERSPACE;
568	auio.uio_procp = mycp;
569
570	inblock_st = mycp->p_stats->p_ru.ru_inblock;
571	oublock_st = mycp->p_stats->p_ru.ru_oublock;
572	if (cb->aio_lio_opcode == LIO_READ) {
573		auio.uio_rw = UIO_READ;
574		error = (*fp->f_ops->fo_read)(fp, &auio, fp->f_cred);
575	} else {
576		auio.uio_rw = UIO_WRITE;
577		error = (*fp->f_ops->fo_write)(fp, &auio, fp->f_cred);
578	}
579	inblock_end = mycp->p_stats->p_ru.ru_inblock;
580	oublock_end = mycp->p_stats->p_ru.ru_oublock;
581
582	aiocbe->inputcharge = inblock_end - inblock_st;
583	aiocbe->outputcharge = oublock_end - oublock_st;
584
585	if (error) {
586		if (auio.uio_resid != cnt) {
587			if (error == ERESTART || error == EINTR || error == EWOULDBLOCK)
588				error = 0;
589			if ((error == EPIPE) && (cb->aio_lio_opcode == LIO_WRITE))
590				psignal(userp, SIGPIPE);
591		}
592	}
593
594	cnt -= auio.uio_resid;
595	cb->_aiocb_private.error = error;
596	cb->_aiocb_private.status = cnt;
597
598	return;
599
600}
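/*
 * Illustrative userland sketch (not part of this file): because aio_process()
 * above goes through the generic fo_read/fo_write operations, the same aiocb
 * works on descriptors the physio path can never take, such as a pipe.  The
 * pipe, payload, and polling interval are placeholders.
 */
#include <aio.h>
#include <errno.h>
#include <string.h>
#include <unistd.h>

int
example_aio_on_pipe(void)
{
	static char buf[64];
	struct aiocb cb;
	ssize_t nread;
	int pfd[2];

	if (pipe(pfd) != 0)
		return (-1);

	memset(&cb, 0, sizeof cb);
	cb.aio_fildes = pfd[0];			/* read side of the pipe */
	cb.aio_buf = buf;
	cb.aio_nbytes = sizeof buf;
	cb.aio_offset = 0;			/* ignored by the pipe, but must not be -1 */

	if (aio_read(&cb) != 0)			/* serviced by an aiod, not physio */
		return (-1);

	write(pfd[1], "hello", 5);		/* completion arrives asynchronously */
	while (aio_error(&cb) == EINPROGRESS)
		usleep(1000);
	nread = aio_return(&cb);

	close(pfd[0]);
	close(pfd[1]);
	return ((int)nread);
}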
601
602/*
603 * The AIO daemon.  Most of the actual work is done in aio_process(), but
604 * the setup (and address space management) is done in this routine.
605 */
606static void
607aio_daemon(void *uproc)
608{
609	int s;
610	struct aioproclist *aiop;
611	struct vmspace *myvm, *aiovm;
612	struct proc *mycp;
613
614	/*
615	 * Local copies of curproc (cp) and vmspace (myvm)
616	 */
617	mycp = curproc;
618	myvm = mycp->p_vmspace;
619
620	/*
621	 * We manage to create only one VM space for all AIOD processes.
622	 * The VM space for the first AIOD created becomes the shared VM
623	 * space for all of them.  We add an additional reference count,
624	 * even for the first AIOD, so the address space does not go away,
625	 * and we continue to use that original VM space even if the first
626	 * AIOD exits.
627	 */
628	if ((aiovm = aiovmspace) == NULL) {
629		aiovmspace = myvm;
630		myvm->vm_refcnt++;
631		/*
632		 * Remove userland cruft from address space.
633		 */
634		if (myvm->vm_shm)
635			shmexit(mycp);
636		pmap_remove_pages(&myvm->vm_pmap, 0, USRSTACK);
637		vm_map_remove(&myvm->vm_map, 0, USRSTACK);
638		myvm->vm_tsize = 0;
639		myvm->vm_dsize = 0;
640		myvm->vm_ssize = 0;
641	} else {
642		aiovm->vm_refcnt++;
643		mycp->p_vmspace = aiovm;
644		pmap_activate(mycp);
645		vmspace_free(myvm);
646		myvm = aiovm;
647	}
648
649	if (mycp->p_textvp) {
650		vrele(mycp->p_textvp);
651		mycp->p_textvp = NULL;
652	}
653
654	/*
655	 * Allocate and ready the aio control info.  There is one
656	 * aiop structure per daemon.
657	 */
658	aiop = zalloc(aiop_zone);
659	aiop->aioproc = mycp;
660	aiop->aioprocflags |= AIOP_FREE;
661	TAILQ_INIT(&aiop->jobtorun);
662
663	/*
664	 * Place thread (lightweight process) onto the AIO free thread list
665	 */
666	if (TAILQ_EMPTY(&aio_freeproc))
667		wakeup(&aio_freeproc);
668	TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);
669
670	/*
671	 * Make up a name for the daemon
672	 */
673	strcpy(mycp->p_comm, "aiod");
674
675	/*
676	 * Get rid of our current file descriptors.  AIODs don't need any
677	 * file descriptors, except as temporarily inherited from the client.
678	 * Credentials are also cloned, and made equivalent to "root."
679	 */
680	fdfree(mycp);
681	mycp->p_fd = NULL;
682	mycp->p_ucred = crcopy(mycp->p_ucred);
683	mycp->p_ucred->cr_uid = 0;
684	mycp->p_ucred->cr_ngroups = 1;
685	mycp->p_ucred->cr_groups[0] = 1;
686
687	/*
688	 * The daemon resides in its own pgrp.
689	 */
690	enterpgrp(mycp, mycp->p_pid, 1);
691
692	/*
693	 * Mark special process type
694	 */
695	mycp->p_flag |= P_SYSTEM|P_KTHREADP;
696
697	/*
698	 * Wake up the parent process.  (The parent sleeps to keep from
699	 * creating too many daemons at once.)
700	 */
701	wakeup(mycp);
702
703	while(1) {
704		struct proc *curcp;
705		struct	aiocblist *aiocbe;
706
707		/*
708		 * curcp is the current daemon process context.
709		 * userp is the current user process context.
710		 */
711		curcp = mycp;
712
713		/*
714		 * Take daemon off of free queue
715		 */
716		if (aiop->aioprocflags & AIOP_FREE) {
717			TAILQ_REMOVE(&aio_freeproc, aiop, list);
718			TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list);
719			aiop->aioprocflags &= ~AIOP_FREE;
720		}
721		aiop->aioprocflags &= ~AIOP_SCHED;
722
723		/*
724		 * Check for jobs
725		 */
726		while ((aiocbe = aio_selectjob(aiop)) != NULL) {
727			struct proc *userp;
728			struct aiocb *cb;
729			struct kaioinfo *ki;
730			struct aio_liojob *lj;
731
732			cb = &aiocbe->uaiocb;
733			userp = aiocbe->userproc;
734
735			aiocbe->jobstate = JOBST_JOBRUNNING;
736
737			/*
738			 * Connect to process address space for user program
739			 */
740			if (userp != curcp) {
741				struct vmspace *tmpvm;
742				/*
743				 * Save the current address space that we are connected to.
744				 */
745				tmpvm = mycp->p_vmspace;
746				/*
747				 * Point to the new user address space, and refer to it.
748				 */
749				mycp->p_vmspace = userp->p_vmspace;
750				mycp->p_vmspace->vm_refcnt++;
751				/*
752				 * Activate the new mapping.
753				 */
754				pmap_activate(mycp);
755				/*
756				 * If the old address space wasn't the daemons own address
757				 * space, then we need to remove the daemon's reference from
758				 * the other process that it was acting on behalf of.
759				 */
760				if (tmpvm != myvm) {
761					vmspace_free(tmpvm);
762				}
763				/*
764				 * Disassociate from previous clients file descriptors, and
765				 * associate to the new clients descriptors.  Note that
766			 * the daemon doesn't need to worry about its original
767				 * descriptors, because they were originally freed.
768				 */
769				if (mycp->p_fd)
770					fdfree(mycp);
771				mycp->p_fd = fdshare(userp);
772				curcp = userp;
773			}
774
775			ki = userp->p_aioinfo;
776			lj = aiocbe->lio;
777
778			/*
779			 * Account for currently active jobs
780			 */
781			ki->kaio_active_count++;
782
783			/*
784			 * Do the I/O function
785			 */
786			aiocbe->jobaioproc = aiop;
787			aio_process(aiocbe);
788
789			/*
790			 * decrement the active job count
791			 */
792			ki->kaio_active_count--;
793
794			/*
795			 * increment the completion count for wakeup/signal comparisons
796			 */
797			aiocbe->jobflags |= AIOCBLIST_DONE;
798			ki->kaio_queue_finished_count++;
799			if (lj) {
800				lj->lioj_queue_finished_count++;
801			}
802			if ((ki->kaio_flags & KAIO_WAKEUP) ||
803				((ki->kaio_flags & KAIO_RUNDOWN) &&
804				(ki->kaio_active_count == 0))) {
805				ki->kaio_flags &= ~KAIO_WAKEUP;
806				wakeup(userp);
807			}
808
809			s = splbio();
810			if (lj && (lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) ==
811				LIOJ_SIGNAL) {
812				if ((lj->lioj_queue_finished_count == lj->lioj_queue_count) &&
813					(lj->lioj_buffer_finished_count == lj->lioj_buffer_count)) {
814						psignal(userp, lj->lioj_signal.sigev_signo);
815						lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
816				}
817			}
818			splx(s);
819
820			aiocbe->jobstate = JOBST_JOBFINISHED;
821
822			/*
823			 * If the I/O request should be automatically rundown, do the
824			 * needed cleanup.  Otherwise, place the queue entry for
825			 * the just finished I/O request into the done queue for the
826			 * associated client.
827			 */
828			if (aiocbe->jobflags & AIOCBLIST_ASYNCFREE) {
829				aiocbe->jobflags &= ~AIOCBLIST_ASYNCFREE;
830				TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
831			} else {
832				TAILQ_REMOVE(&ki->kaio_jobqueue,
833					aiocbe, plist);
834				TAILQ_INSERT_TAIL(&ki->kaio_jobdone,
835					aiocbe, plist);
836			}
837
838			if (aiocbe->jobflags & AIOCBLIST_RUNDOWN) {
839				wakeup(aiocbe);
840				aiocbe->jobflags &= ~AIOCBLIST_RUNDOWN;
841			}
842
843			if (cb->aio_sigevent.sigev_notify == SIGEV_SIGNAL) {
844				psignal(userp, cb->aio_sigevent.sigev_signo);
845			}
846		}
847
848		/*
849		 * Disconnect from user address space
850		 */
851		if (curcp != mycp) {
852			struct vmspace *tmpvm;
853			/*
854			 * Get the user address space to disconnect from.
855			 */
856			tmpvm = mycp->p_vmspace;
857			/*
858			 * Get original address space for daemon.
859			 */
860			mycp->p_vmspace = myvm;
861			/*
862			 * Activate the daemon's address space.
863			 */
864			pmap_activate(mycp);
865#if defined(DIAGNOSTIC)
866			if (tmpvm == myvm)
867				printf("AIOD: vmspace problem -- %d\n", mycp->p_pid);
868#endif
869			/*
870			 * remove our vmspace reference.
871			 */
872			vmspace_free(tmpvm);
873			/*
874			 * disassociate from the user process's file descriptors.
875			 */
876			if (mycp->p_fd)
877				fdfree(mycp);
878			mycp->p_fd = NULL;
879			curcp = mycp;
880		}
881
882		/*
883		 * If we are the first to be put onto the free queue, wakeup
884		 * anyone waiting for a daemon.
885		 */
886		TAILQ_REMOVE(&aio_activeproc, aiop, list);
887		if (TAILQ_EMPTY(&aio_freeproc))
888			wakeup(&aio_freeproc);
889		TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);
890		aiop->aioprocflags |= AIOP_FREE;
891
892		/*
893		 * If daemon is inactive for a long time, allow it to exit, thereby
894		 * freeing resources.
895		 */
896		if (((aiop->aioprocflags & AIOP_SCHED) == 0) &&
897			tsleep(mycp, PRIBIO, "aiordy", aiod_lifetime)) {
898			if ((TAILQ_FIRST(&aio_jobs) == NULL) &&
899				(TAILQ_FIRST(&aiop->jobtorun) == NULL)) {
900				if ((aiop->aioprocflags & AIOP_FREE) &&
901					(num_aio_procs > target_aio_procs)) {
902					TAILQ_REMOVE(&aio_freeproc, aiop, list);
903					zfree(aiop_zone, aiop);
904					num_aio_procs--;
905#if defined(DIAGNOSTIC)
906					if (mycp->p_vmspace->vm_refcnt <= 1)
907						printf("AIOD: bad vm refcnt for exiting daemon: %d\n",
908							mycp->p_vmspace->vm_refcnt);
909#endif
910					exit1(mycp, 0);
911				}
912			}
913		}
914	}
915}
916
917/*
918 * Create a new AIO daemon.  This is mostly a kernel-thread fork routine.
919 * The AIO daemon modifies its environment itself.
920 */
921static int
922aio_newproc()
923{
924	int error;
925	struct rfork_args rfa;
926	struct proc *p, *np;
927
928	rfa.flags = RFPROC | RFCFDG;
929
930	p = curproc;
931	if ((error = rfork(p, &rfa)) != 0)
932		return error;
933
934	np = pfind(p->p_retval[0]);
935	cpu_set_fork_handler(np, aio_daemon, p);
936
937	/*
938	 * Wait until daemon is started, but continue on just in case (to
939	 * handle error conditions).
940	 */
941	error = tsleep(np, PZERO, "aiosta", aiod_timeout);
942	num_aio_procs++;
943
944	return error;
945
946}
947
948/*
949 * Try the high-performance physio method for eligible VCHR devices.  This
950 * routine doesn't require the use of any additional threads, and has low
951 * overhead.
952 */
953int
954aio_qphysio(p, aiocbe)
955	struct proc *p;
956	struct aiocblist *aiocbe;
957{
958	int error;
959	caddr_t sa;
960	struct aiocb *cb;
961	struct file *fp;
962	struct buf *bp;
963	int bflags;
964	struct vnode *vp;
965	struct kaioinfo *ki;
966	struct filedesc *fdp;
967	struct aio_liojob *lj;
968	int fd;
969	int majordev;
970	int s;
971	int cnt;
972	dev_t dev;
973	int rw;
974	d_strategy_t *fstrategy;
975	struct cdevsw *cdev;
976	struct cdevsw *bdev;
977
978	cb = &aiocbe->uaiocb;
979	fdp = p->p_fd;
980	fd = cb->aio_fildes;
981	fp = fdp->fd_ofiles[fd];
982
983	if (fp->f_type != DTYPE_VNODE) {
984		return -1;
985	}
986
987	vp = (struct vnode *)fp->f_data;
988	if (vp->v_type != VCHR || ((cb->aio_nbytes & (DEV_BSIZE - 1)) != 0)) {
989		return -1;
990	}
991
992	if ((cb->aio_nbytes > MAXPHYS) && (num_buf_aio >= max_buf_aio)) {
993		return -1;
994	}
995
996	if ((vp->v_specinfo == NULL) || (vp->v_flag & VISTTY)) {
997		return -1;
998	}
999
1000	majordev = major(vp->v_rdev);
1001	if (majordev == NODEV) {
1002		return -1;
1003	}
1004
1005	cdev = cdevsw[major(vp->v_rdev)];
1006	if (cdev == NULL) {
1007		return -1;
1008	}
1009
1010	if (cdev->d_bmaj == -1) {
1011		return -1;
1012	}
1013	bdev = cdev;
1014
1015	ki = p->p_aioinfo;
1016	if (ki->kaio_buffer_count >= ki->kaio_ballowed_count) {
1017		return -1;
1018	}
1019
1020	cnt = cb->aio_nbytes;
1021	if (cnt > MAXPHYS) {
1022		return -1;
1023	}
1024
1025	dev = makedev(bdev->d_bmaj, minor(vp->v_rdev));
1026
1027	/*
1028	 * Physical I/O is charged directly to the process, so we don't have
1029	 * to fake it.
1030	 */
1031	aiocbe->inputcharge = 0;
1032	aiocbe->outputcharge = 0;
1033
1034	ki->kaio_buffer_count++;
1035
1036	lj = aiocbe->lio;
1037	if (lj) {
1038		lj->lioj_buffer_count++;
1039	}
1040
1041	/* create and build a buffer header for a transfer */
1042	bp = (struct buf *)getpbuf();
1043
1044	/*
1045	 * get a copy of the kva from the physical buffer
1046	 */
1047	bp->b_proc = p;
1048	bp->b_dev = dev;
1049	error = bp->b_error = 0;
1050
1051	if (cb->aio_lio_opcode == LIO_WRITE) {
1052		rw = 0;
1053		bflags = B_WRITE;
1054	} else {
1055		rw = 1;
1056		bflags = B_READ;
1057	}
1058
1059	bp->b_bcount = cb->aio_nbytes;
1060	bp->b_bufsize = cb->aio_nbytes;
1061	bp->b_flags = B_BUSY | B_PHYS | B_CALL | bflags;
1062	bp->b_iodone = aio_physwakeup;
1063	bp->b_saveaddr = bp->b_data;
1064	bp->b_data = (void *) cb->aio_buf;
1065	bp->b_blkno = btodb(cb->aio_offset);
1066
1067	if (rw && !useracc(bp->b_data, bp->b_bufsize, B_WRITE)) {
1068		error = EFAULT;
1069		goto doerror;
1070	}
1071	if (!rw && !useracc(bp->b_data, bp->b_bufsize, B_READ)) {
1072		error = EFAULT;
1073		goto doerror;
1074	}
1075
1076	/* bring buffer into kernel space */
1077	vmapbuf(bp);
1078
1079	s = splbio();
1080	aiocbe->bp = bp;
1081	bp->b_spc = (void *)aiocbe;
1082	TAILQ_INSERT_TAIL(&aio_bufjobs, aiocbe, list);
1083	TAILQ_INSERT_TAIL(&ki->kaio_bufqueue, aiocbe, plist);
1084	aiocbe->jobstate = JOBST_JOBQBUF;
1085	cb->_aiocb_private.status = cb->aio_nbytes;
1086	num_buf_aio++;
1087	fstrategy = bdev->d_strategy;
1088	bp->b_error = 0;
1089
1090	splx(s);
1091	/* perform transfer */
1092	(*fstrategy)(bp);
1093
1094	s = splbio();
1095	/*
1096	 * If we had an error invoking the request, or an error in processing
1097	 * the request before we have returned, we process it as an error
1098	 * in transfer.  Note that such an I/O error is not indicated immediately,
1099	 * but is returned using the aio_error mechanism.  In this case, aio_suspend
1100	 * will return immediately.
1101	 */
1102	if (bp->b_error || (bp->b_flags & B_ERROR)) {
1103		struct aiocb *job = aiocbe->uuaiocb;
1104
1105		aiocbe->uaiocb._aiocb_private.status = 0;
1106		suword(&job->_aiocb_private.status, 0);
1107		aiocbe->uaiocb._aiocb_private.error = bp->b_error;
1108		suword(&job->_aiocb_private.error, bp->b_error);
1109
1110		ki->kaio_buffer_finished_count++;
1111
1112		if (aiocbe->jobstate != JOBST_JOBBFINISHED) {
1113			aiocbe->jobstate = JOBST_JOBBFINISHED;
1114			aiocbe->jobflags |= AIOCBLIST_DONE;
1115			TAILQ_REMOVE(&aio_bufjobs, aiocbe, list);
1116			TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist);
1117			TAILQ_INSERT_TAIL(&ki->kaio_bufdone, aiocbe, plist);
1118		}
1119	}
1120	splx(s);
1121	return 0;
1122
1123doerror:
1124	ki->kaio_buffer_count--;
1125	if (lj) {
1126		lj->lioj_buffer_count--;
1127	}
1128	aiocbe->bp = NULL;
1129	relpbuf(bp);
1130	return error;
1131}
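/*
 * Illustrative userland sketch (not part of this file): a request shaped so
 * that aio_qphysio() above can take it directly, i.e. a raw VCHR device, a
 * transfer that is a multiple of DEV_BSIZE and no larger than MAXPHYS, and
 * room under the per-process buffer quota.  The device name and sizes are
 * placeholders; anything that fails the checks falls back to the aiod queue.
 */
#include <aio.h>
#include <fcntl.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int
example_aio_raw_read(off_t offset)
{
	struct aiocb cb;
	void *buf;
	int fd;

	fd = open("/dev/rda0", O_RDONLY);	/* placeholder raw device */
	if (fd < 0)
		return (-1);
	buf = malloc(8192);			/* 16 * DEV_BSIZE, physio-eligible */
	if (buf == NULL) {
		close(fd);
		return (-1);
	}

	memset(&cb, 0, sizeof cb);
	cb.aio_fildes = fd;
	cb.aio_buf = buf;
	cb.aio_nbytes = 8192;			/* multiple of DEV_BSIZE (512) */
	cb.aio_offset = offset;			/* device offset, sector aligned */

	/* The caller then polls aio_error()/aio_return() as usual. */
	return (aio_read(&cb));
}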
1132
1133/*
1134 * This waits/tests physio completion.
1135 */
1136int
1137aio_fphysio(p, iocb, flgwait)
1138	struct proc *p;
1139	struct aiocblist *iocb;
1140	int flgwait;
1141{
1142	int s;
1143	struct buf *bp;
1144	int error;
1145
1146	bp = iocb->bp;
1147
1148	s = splbio();
1149	if (flgwait == 0) {
1150		if ((bp->b_flags & B_DONE) == 0) {
1151			splx(s);
1152			return EINPROGRESS;
1153		}
1154	}
1155
1156	while ((bp->b_flags & B_DONE) == 0) {
1157		if (tsleep((caddr_t)bp, PRIBIO, "physstr", aiod_timeout)) {
1158			if ((bp->b_flags & B_DONE) == 0) {
1159				splx(s);
1160				return EINPROGRESS;
1161			} else {
1162				break;
1163			}
1164		}
1165	}
1166
1167	/* release mapping into kernel space */
1168	vunmapbuf(bp);
1169	iocb->bp = 0;
1170
1171	error = 0;
1172	/*
1173	 * check for an error
1174	 */
1175	if (bp->b_flags & B_ERROR) {
1176		error = bp->b_error;
1177	}
1178
1179	relpbuf(bp);
1180	return (error);
1181}
1182
1183/*
1184 * Queue a new AIO request.  The choice between the threaded (aiod) path and
1185 * the direct physio path for VCHR devices is made here.
1186 */
1187static int
1188_aio_aqueue(struct proc *p, struct aiocb *job, struct aio_liojob *lj, int type)
1189{
1190	struct filedesc *fdp;
1191	struct file *fp;
1192	unsigned int fd;
1193
1194	int error;
1195	int opcode;
1196	struct aiocblist *aiocbe;
1197	struct aioproclist *aiop;
1198	struct kaioinfo *ki;
1199
1200	if ((aiocbe = TAILQ_FIRST(&aio_freejobs)) != NULL) {
1201		TAILQ_REMOVE(&aio_freejobs, aiocbe, list);
1202	} else {
1203		aiocbe = zalloc (aiocb_zone);
1204	}
1205
1206	aiocbe->inputcharge = 0;
1207	aiocbe->outputcharge = 0;
1208
1209	suword(&job->_aiocb_private.status, -1);
1210	suword(&job->_aiocb_private.error, 0);
1211	suword(&job->_aiocb_private.kernelinfo, -1);
1212
1213	error = copyin((caddr_t)job,
1214		(caddr_t) &aiocbe->uaiocb, sizeof aiocbe->uaiocb);
1215	if (error) {
1216		suword(&job->_aiocb_private.error, error);
1217
1218		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
1219		return error;
1220	}
1221
1222	/*
1223	 * Save userspace address of the job info
1224	 */
1225	aiocbe->uuaiocb = job;
1226
1227	/*
1228	 * Get the opcode
1229	 */
1230	if (type != LIO_NOP) {
1231		aiocbe->uaiocb.aio_lio_opcode = type;
1232	}
1233	opcode = aiocbe->uaiocb.aio_lio_opcode;
1234
1235	/*
1236	 * Get the fd info for process
1237	 */
1238	fdp = p->p_fd;
1239
1240	/*
1241	 * Range check file descriptor
1242	 */
1243	fd = aiocbe->uaiocb.aio_fildes;
1244	if (fd >= fdp->fd_nfiles) {
1245		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
1246		if (type == 0) {
1247			suword(&job->_aiocb_private.error, EBADF);
1248		}
1249		return EBADF;
1250	}
1251
1252	fp = fdp->fd_ofiles[fd];
1253	if ((fp == NULL) ||
1254		((opcode == LIO_WRITE) && ((fp->f_flag & FWRITE) == 0))) {
1255		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
1256		if (type == 0) {
1257			suword(&job->_aiocb_private.error, EBADF);
1258		}
1259		return EBADF;
1260	}
1261
1262	if (aiocbe->uaiocb.aio_offset == -1LL) {
1263		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
1264		if (type == 0) {
1265			suword(&job->_aiocb_private.error, EINVAL);
1266		}
1267		return EINVAL;
1268	}
1269
1270	error = suword(&job->_aiocb_private.kernelinfo, jobrefid);
1271	if (error) {
1272		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
1273		if (type == 0) {
1274			suword(&job->_aiocb_private.error, EINVAL);
1275		}
1276		return error;
1277	}
1278
1279	aiocbe->uaiocb._aiocb_private.kernelinfo = (void *)jobrefid;
1280	jobrefid++;
1281	if (jobrefid > INT_MAX)
1282		jobrefid = 1;
1283
1284	if (opcode == LIO_NOP) {
1285		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
1286		if (type == 0) {
1287			suword(&job->_aiocb_private.error, 0);
1288			suword(&job->_aiocb_private.status, 0);
1289			suword(&job->_aiocb_private.kernelinfo, 0);
1290		}
1291		return 0;
1292	}
1293
1294	if ((opcode != LIO_READ) && (opcode != LIO_WRITE)) {
1295		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
1296		if (type == 0) {
1297			suword(&job->_aiocb_private.status, 0);
1298			suword(&job->_aiocb_private.error, EINVAL);
1299		}
1300		return EINVAL;
1301	}
1302
1303	suword(&job->_aiocb_private.error, EINPROGRESS);
1304	aiocbe->uaiocb._aiocb_private.error = EINPROGRESS;
1305	aiocbe->userproc = p;
1306	aiocbe->jobflags = 0;
1307	aiocbe->lio = lj;
1308	ki = p->p_aioinfo;
1309
1310	if ((error = aio_qphysio(p, aiocbe)) == 0) {
1311		return 0;
1312	} else if (error > 0) {
1313		suword(&job->_aiocb_private.status, 0);
1314		aiocbe->uaiocb._aiocb_private.error = error;
1315		suword(&job->_aiocb_private.error, error);
1316		return error;
1317	}
1318
1319	/*
1320	 * No buffer for daemon I/O
1321	 */
1322	aiocbe->bp = NULL;
1323
1324	ki->kaio_queue_count++;
1325	if (lj) {
1326		lj->lioj_queue_count++;
1327	}
1328	TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, aiocbe, plist);
1329	TAILQ_INSERT_TAIL(&aio_jobs, aiocbe, list);
1330	aiocbe->jobstate = JOBST_JOBQGLOBAL;
1331
1332	num_queue_count++;
1333	error = 0;
1334
1335	/*
1336	 * If we don't have a free AIO process, and we are below our
1337	 * quota, then start one.  Otherwise, depend on the subsequent
1338	 * I/O completions to pick up this job.  If we don't successfully
1339	 * create the new process (thread) due to resource issues, we
1340	 * return an error for now (EAGAIN), which is likely not the
1341	 * correct thing to do.
1342	 */
1343retryproc:
1344	if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) {
1345		TAILQ_REMOVE(&aio_freeproc, aiop, list);
1346		TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list);
1347		aiop->aioprocflags &= ~AIOP_FREE;
1348		wakeup(aiop->aioproc);
1349	} else if (((num_aio_resv_start + num_aio_procs) < max_aio_procs) &&
1350			((ki->kaio_active_count + num_aio_resv_start) <
1351				ki->kaio_maxactive_count)) {
1352		num_aio_resv_start++;
1353		if ((error = aio_newproc()) == 0) {
1354			num_aio_resv_start--;
1355			p->p_retval[0] = 0;
1356			goto retryproc;
1357		}
1358		num_aio_resv_start--;
1359	}
1360	return error;
1361}
1362
1363/*
1364 * This routine queues an AIO request, checking for quotas.
1365 */
1366static int
1367aio_aqueue(struct proc *p, struct aiocb *job, int type)
1368{
1369	struct kaioinfo *ki;
1370
1371	if (p->p_aioinfo == NULL) {
1372		aio_init_aioinfo(p);
1373	}
1374
1375	if (num_queue_count >= max_queue_count)
1376		return EAGAIN;
1377
1378	ki = p->p_aioinfo;
1379	if (ki->kaio_queue_count >= ki->kaio_qallowed_count)
1380		return EAGAIN;
1381
1382	return _aio_aqueue(p, job, NULL, type);
1383}
1384
1385/*
1386 * Support the aio_return system call; as a side effect, kernel
1387 * resources are released.
1388 */
1389int
1390aio_return(struct proc *p, struct aio_return_args *uap)
1391{
1392	int s;
1393	int jobref, status;
1394	struct aiocblist *cb, *ncb;
1395	struct aiocb *ujob;
1396	struct kaioinfo *ki;
1397	struct proc *userp;
1398
1399	ki = p->p_aioinfo;
1400	if (ki == NULL) {
1401		return EINVAL;
1402	}
1403
1404	ujob = uap->aiocbp;
1405
1406	jobref = fuword(&ujob->_aiocb_private.kernelinfo);
1407	if (jobref == -1 || jobref == 0)
1408		return EINVAL;
1409
1410	for (cb = TAILQ_FIRST(&ki->kaio_jobdone);
1411		cb;
1412		cb = TAILQ_NEXT(cb, plist)) {
1413		if (((long) cb->uaiocb._aiocb_private.kernelinfo) == jobref) {
1414			if (ujob == cb->uuaiocb) {
1415				p->p_retval[0] = cb->uaiocb._aiocb_private.status;
1416			} else {
1417				p->p_retval[0] = EFAULT;
1418			}
1419			if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
1420				curproc->p_stats->p_ru.ru_oublock += cb->outputcharge;
1421				cb->outputcharge = 0;
1422			} else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
1423				curproc->p_stats->p_ru.ru_inblock += cb->inputcharge;
1424				cb->inputcharge = 0;
1425			}
1426			aio_free_entry(cb);
1427			return 0;
1428		}
1429	}
1430
1431	s = splbio();
1432	for (cb = TAILQ_FIRST(&ki->kaio_bufdone);
1433		cb;
1434		cb = ncb) {
1435		ncb = TAILQ_NEXT(cb, plist);
1436		if (((long) cb->uaiocb._aiocb_private.kernelinfo) == jobref) {
1437			splx(s);
1438			if (ujob == cb->uuaiocb) {
1439				p->p_retval[0] = cb->uaiocb._aiocb_private.status;
1440			} else {
1441				p->p_retval[0] = EFAULT;
1442			}
1443			aio_free_entry(cb);
1444			return 0;
1445		}
1446	}
1447	splx(s);
1448
1449	return (EINVAL);
1450}
1451
1452/*
1453 * Allow a process to wake up when any of its outstanding I/O
1454 * requests completes.
1455 */
1456int
1457aio_suspend(struct proc *p, struct aio_suspend_args *uap)
1458{
1459	struct timeval atv;
1460	struct timespec ts;
1461	struct aiocb *const *cbptr, *cbp;
1462	struct kaioinfo *ki;
1463	struct aiocblist *cb;
1464	int i;
1465	int njoblist;
1466	int error, s, timo;
1467	int *ijoblist;
1468	struct aiocb **ujoblist;
1469
1470	if (uap->nent >= AIO_LISTIO_MAX)
1471		return EINVAL;
1472
1473	timo = 0;
1474	if (uap->timeout) {
1475		/*
1476		 * Get timespec struct
1477		 */
1478		if ((error = copyin((caddr_t) uap->timeout, (caddr_t) &ts, sizeof ts)) != 0) {
1479			return error;
1480		}
1481
1482		if (ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000)
1483			return (EINVAL);
1484
1485		TIMESPEC_TO_TIMEVAL(&atv, &ts)
1486		if (itimerfix(&atv))
1487			return (EINVAL);
1488		timo = tvtohz(&atv);
1489	}
1490
1491	ki = p->p_aioinfo;
1492	if (ki == NULL)
1493		return EAGAIN;
1494
1495	njoblist = 0;
1496	ijoblist = zalloc(aiol_zone);
1497	ujoblist = zalloc(aiol_zone);
1498	cbptr = uap->aiocbp;
1499
1500	for(i = 0; i < uap->nent; i++) {
1501		cbp = (struct aiocb *) fuword((caddr_t) &cbptr[i]);
1502		if (cbp == 0)
1503			continue;
1504		ujoblist[njoblist] = cbp;
1505		ijoblist[njoblist] = fuword(&cbp->_aiocb_private.kernelinfo);
1506		njoblist++;
1507	}
1508	if (njoblist == 0) {
1509		zfree(aiol_zone, ijoblist);
1510		zfree(aiol_zone, ujoblist);
1511		return 0;
1512	}
1513
1514	error = 0;
1515	while (1) {
1516		for (cb = TAILQ_FIRST(&ki->kaio_jobdone);
1517			cb; cb = TAILQ_NEXT(cb, plist)) {
1518			for(i = 0; i < njoblist; i++) {
1519				if (((long) cb->uaiocb._aiocb_private.kernelinfo) ==
1520					ijoblist[i]) {
1521					if (ujoblist[i] != cb->uuaiocb)
1522						error = EINVAL;
1523					zfree(aiol_zone, ijoblist);
1524					zfree(aiol_zone, ujoblist);
1525					return error;
1526				}
1527			}
1528		}
1529
1530		s = splbio();
1531		for (cb = TAILQ_FIRST(&ki->kaio_bufdone);
1532			cb; cb = TAILQ_NEXT(cb, plist)) {
1533			for(i = 0; i < njoblist; i++) {
1534				if (((long) cb->uaiocb._aiocb_private.kernelinfo) ==
1535					ijoblist[i]) {
1536					splx(s);
1537					if (ujoblist[i] != cb->uuaiocb)
1538						error = EINVAL;
1539					zfree(aiol_zone, ijoblist);
1540					zfree(aiol_zone, ujoblist);
1541					return error;
1542				}
1543			}
1544		}
1545
1546		ki->kaio_flags |= KAIO_WAKEUP;
1547		error = tsleep(p, PRIBIO|PCATCH, "aiospn", timo);
1548		splx(s);
1549
1550		if (error == EINTR) {
1551			zfree(aiol_zone, ijoblist);
1552			zfree(aiol_zone, ujoblist);
1553			return EINTR;
1554		} else if (error == EWOULDBLOCK) {
1555			zfree(aiol_zone, ijoblist);
1556			zfree(aiol_zone, ujoblist);
1557			return EAGAIN;
1558		}
1559	}
1560
1561/* NOTREACHED */
1562	return EINVAL;
1563}
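/*
 * Illustrative userland sketch (not part of this file): waiting for any of
 * several outstanding requests with aio_suspend() and a timeout rather than
 * polling aio_error().  The one-second timeout is an arbitrary example.
 */
#include <aio.h>
#include <errno.h>
#include <time.h>

int
example_wait_any(struct aiocb *cbs[], int n)
{
	struct timespec ts;
	int i;

	ts.tv_sec = 1;
	ts.tv_nsec = 0;

	if (aio_suspend((const struct aiocb * const *)cbs, n, &ts) != 0)
		return ((errno == EAGAIN) ? 0 : -1);	/* timed out, or error */

	/* At least one request has finished; find it and reap its status. */
	for (i = 0; i < n; i++) {
		if (cbs[i] != NULL && aio_error(cbs[i]) != EINPROGRESS)
			return ((int)aio_return(cbs[i]));
	}
	return (0);
}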
1564
1565/*
1566 * aio_cancel at the kernel level is a NOOP right now.  It
1567 * might be possible to support it partially in user mode, or
1568 * in kernel mode later on.
1569 */
1570int
1571aio_cancel(struct proc *p, struct aio_cancel_args *uap)
1572{
1573	return ENOSYS;
1574}
1575
1576/*
1577 * aio_error is implemented at the kernel level for compatibility
1578 * purposes only.  For a user mode async implementation, it would be
1579 * best to do it in a userland subroutine.
1580 */
1581int
1582aio_error(struct proc *p, struct aio_error_args *uap)
1583{
1584	int s;
1585	struct aiocblist *cb;
1586	struct kaioinfo *ki;
1587	int jobref;
1588	int error, status;
1589
1590	ki = p->p_aioinfo;
1591	if (ki == NULL)
1592		return EINVAL;
1593
1594	jobref = fuword(&uap->aiocbp->_aiocb_private.kernelinfo);
1595	if ((jobref == -1) || (jobref == 0))
1596		return EINVAL;
1597
1598	for (cb = TAILQ_FIRST(&ki->kaio_jobdone);
1599		cb;
1600		cb = TAILQ_NEXT(cb, plist)) {
1601
1602		if (((long) cb->uaiocb._aiocb_private.kernelinfo) == jobref) {
1603			p->p_retval[0] = cb->uaiocb._aiocb_private.error;
1604			return 0;
1605		}
1606	}
1607
1608	for (cb = TAILQ_FIRST(&ki->kaio_jobqueue);
1609		cb;
1610		cb = TAILQ_NEXT(cb, plist)) {
1611
1612		if (((long) cb->uaiocb._aiocb_private.kernelinfo) == jobref) {
1613			p->p_retval[0] = EINPROGRESS;
1614			return 0;
1615		}
1616	}
1617
1618	s = splbio();
1619	for (cb = TAILQ_FIRST(&ki->kaio_bufdone);
1620		cb;
1621		cb = TAILQ_NEXT(cb, plist)) {
1622		if (((long) cb->uaiocb._aiocb_private.kernelinfo) == jobref) {
1623			p->p_retval[0] = cb->uaiocb._aiocb_private.error;
1624			splx(s);
1625			return 0;
1626		}
1627	}
1628
1629	for (cb = TAILQ_FIRST(&ki->kaio_bufqueue);
1630		cb;
1631		cb = TAILQ_NEXT(cb, plist)) {
1632		if (((long) cb->uaiocb._aiocb_private.kernelinfo) == jobref) {
1633			p->p_retval[0] = EINPROGRESS;
1634			splx(s);
1635			return 0;
1636		}
1637	}
1638	splx(s);
1639
1640
1641	/*
1642	 * Hack for lio
1643	 */
1644/*
1645	status = fuword(&uap->aiocbp->_aiocb_private.status);
1646	if (status == -1) {
1647		return fuword(&uap->aiocbp->_aiocb_private.error);
1648	}
1649*/
1650	return EINVAL;
1651}
1652
1653int
1654aio_read(struct proc *p, struct aio_read_args *uap)
1655{
1656	struct filedesc *fdp;
1657	struct file *fp;
1658	struct uio auio;
1659	struct iovec aiov;
1660	unsigned int fd;
1661	int cnt;
1662	struct aiocb iocb;
1663	int error, pmodes;
1664
1665	pmodes = fuword(&uap->aiocbp->_aiocb_private.privatemodes);
1666	if ((pmodes & AIO_PMODE_SYNC) == 0) {
1667		return aio_aqueue(p, (struct aiocb *) uap->aiocbp, LIO_READ);
1668	}
1669
1670	/*
1671	 * Get control block
1672	 */
1673	if ((error = copyin((caddr_t) uap->aiocbp, (caddr_t) &iocb, sizeof iocb)) != 0)
1674		return error;
1675
1676	/*
1677	 * Get the fd info for process
1678	 */
1679	fdp = p->p_fd;
1680
1681	/*
1682	 * Range check file descriptor
1683	 */
1684	fd = iocb.aio_fildes;
1685	if (fd >= fdp->fd_nfiles)
1686		return EBADF;
1687	fp = fdp->fd_ofiles[fd];
1688	if ((fp == NULL) || ((fp->f_flag & FREAD) == 0))
1689		return EBADF;
1690	if (iocb.aio_offset == -1LL)
1691		return EINVAL;
1692
1693	auio.uio_resid = iocb.aio_nbytes;
1694	if (auio.uio_resid < 0)
1695		return (EINVAL);
1696
1697	/*
1698	 * Process sync simply -- queue async request.
1699	 */
1700	if ((iocb._aiocb_private.privatemodes & AIO_PMODE_SYNC) == 0) {
1701		return aio_aqueue(p, (struct aiocb *) uap->aiocbp, LIO_READ);
1702	}
1703
1704	aiov.iov_base = (void *) iocb.aio_buf;
1705	aiov.iov_len = iocb.aio_nbytes;
1706
1707	auio.uio_iov = &aiov;
1708	auio.uio_iovcnt = 1;
1709	auio.uio_offset = iocb.aio_offset;
1710	auio.uio_rw = UIO_READ;
1711	auio.uio_segflg = UIO_USERSPACE;
1712	auio.uio_procp = p;
1713
1714	cnt = iocb.aio_nbytes;
1715	error = (*fp->f_ops->fo_read)(fp, &auio, fp->f_cred);
1716	if (error &&
1717		(auio.uio_resid != cnt) &&
1718		(error == ERESTART || error == EINTR || error == EWOULDBLOCK))
1719			error = 0;
1720	cnt -= auio.uio_resid;
1721	p->p_retval[0] = cnt;
1722	return error;
1723}
1724
1725int
1726aio_write(struct proc *p, struct aio_write_args *uap)
1727{
1728	struct filedesc *fdp;
1729	struct file *fp;
1730	struct uio auio;
1731	struct iovec aiov;
1732	unsigned int fd;
1733	int cnt;
1734	struct aiocb iocb;
1735	int error;
1736	int pmodes;
1737
1738	/*
1739	 * Process sync simply -- queue async request.
1740	 */
1741	pmodes = fuword(&uap->aiocbp->_aiocb_private.privatemodes);
1742	if ((pmodes & AIO_PMODE_SYNC) == 0) {
1743		return aio_aqueue(p, (struct aiocb *) uap->aiocbp, LIO_WRITE);
1744	}
1745
1746	if ((error = copyin((caddr_t) uap->aiocbp, (caddr_t) &iocb, sizeof iocb)) != 0)
1747		return error;
1748
1749	/*
1750	 * Get the fd info for process
1751	 */
1752	fdp = p->p_fd;
1753
1754	/*
1755	 * Range check file descriptor
1756	 */
1757	fd = iocb.aio_fildes;
1758	if (fd >= fdp->fd_nfiles)
1759		return EBADF;
1760	fp = fdp->fd_ofiles[fd];
1761	if ((fp == NULL) || ((fp->f_flag & FWRITE) == 0))
1762		return EBADF;
1763	if (iocb.aio_offset == -1LL)
1764		return EINVAL;
1765
1766	aiov.iov_base = (void *) iocb.aio_buf;
1767	aiov.iov_len = iocb.aio_nbytes;
1768	auio.uio_iov = &aiov;
1769	auio.uio_iovcnt = 1;
1770	auio.uio_offset = iocb.aio_offset;
1771
1772	auio.uio_resid = iocb.aio_nbytes;
1773	if (auio.uio_resid < 0)
1774		return (EINVAL);
1775
1776	auio.uio_rw = UIO_WRITE;
1777	auio.uio_segflg = UIO_USERSPACE;
1778	auio.uio_procp = p;
1779
1780	cnt = iocb.aio_nbytes;
1781	error = (*fp->f_ops->fo_write)(fp, &auio, fp->f_cred);
1782	if (error) {
1783		if (auio.uio_resid != cnt) {
1784			if (error == ERESTART || error == EINTR || error == EWOULDBLOCK)
1785				error = 0;
1786			if (error == EPIPE)
1787				psignal(p, SIGPIPE);
1788		}
1789	}
1790	cnt -= auio.uio_resid;
1791	p->p_retval[0] = cnt;
1792	return error;
1793}
1794
1795int
1796lio_listio(struct proc *p, struct lio_listio_args *uap)
1797{
1798	int nent, nentqueued;
1799	struct aiocb *iocb, * const *cbptr;
1800	struct aiocblist *cb;
1801	struct kaioinfo *ki;
1802	struct aio_liojob *lj;
1803	int error, runningcode;
1804	int nerror;
1805	int i;
1806	int s;
1807
1808	if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT)) {
1809		return EINVAL;
1810	}
1811
1812	nent = uap->nent;
1813	if (nent > AIO_LISTIO_MAX) {
1814		return EINVAL;
1815	}
1816
1817	if (p->p_aioinfo == NULL) {
1818		aio_init_aioinfo(p);
1819	}
1820
1821	if ((nent + num_queue_count) > max_queue_count) {
1822		return EAGAIN;
1823	}
1824
1825	ki = p->p_aioinfo;
1826	if ((nent + ki->kaio_queue_count) > ki->kaio_qallowed_count) {
1827		return EAGAIN;
1828	}
1829
1830	lj = zalloc(aiolio_zone);
1831	if (!lj) {
1832		return EAGAIN;
1833	}
1834
1835	lj->lioj_flags = 0;
1836	lj->lioj_buffer_count = 0;
1837	lj->lioj_buffer_finished_count = 0;
1838	lj->lioj_queue_count = 0;
1839	lj->lioj_queue_finished_count = 0;
1840	lj->lioj_ki = ki;
1841	TAILQ_INSERT_TAIL(&ki->kaio_liojoblist, lj, lioj_list);
1842
1843	/*
1844	 * Setup signal
1845	 */
1846	if (uap->sig && (uap->mode == LIO_NOWAIT)) {
1847		error = copyin(uap->sig, &lj->lioj_signal, sizeof lj->lioj_signal);
1848		if (error)
1849			return error;
1850		lj->lioj_flags |= LIOJ_SIGNAL;
1851		lj->lioj_flags &= ~LIOJ_SIGNAL_POSTED;
1852	} else {
1853		lj->lioj_flags &= ~LIOJ_SIGNAL;
1854	}
1855
1856/*
1857 * get pointers to the list of I/O requests
1858 */
1859
1860	nerror = 0;
1861	nentqueued = 0;
1862	cbptr = uap->acb_list;
1863	for(i = 0; i < uap->nent; i++) {
1864		iocb = (struct aiocb *) fuword((caddr_t) &cbptr[i]);
1865		if (((long) iocb != -1) && ((long) iocb != 0)) {
1866			error = _aio_aqueue(p, iocb, lj, 0);
1867			if (error == 0) {
1868				nentqueued++;
1869			} else {
1870				nerror++;
1871			}
1872		}
1873	}
1874
1875	/*
1876	 * If we haven't queued any, then just return error
1877	 * If we haven't queued any, then just return without error.
1878	if (nentqueued == 0) {
1879		return 0;
1880	}
1881
1882	/*
1883	 * Calculate the appropriate error return
1884	 */
1885	runningcode = 0;
1886	if (nerror)
1887		runningcode = EIO;
1888
1889	if (uap->mode == LIO_WAIT) {
1890		while (1) {
1891			int found;
1892			found = 0;
1893			for(i = 0; i < uap->nent; i++) {
1894				int jobref, command;
1895
1896				/*
1897				 * Fetch address of the control buf pointer in user space
1898				 */
1899				iocb = (struct aiocb *) fuword((caddr_t) &cbptr[i]);
1900				if (((long) iocb == -1) || ((long) iocb == 0))
1901					continue;
1902
1903				/*
1904				 * Fetch the associated command from user space
1905				 */
1906				command = fuword(&iocb->aio_lio_opcode);
1907				if (command == LIO_NOP) {
1908					found++;
1909					continue;
1910				}
1911
1912				jobref = fuword(&iocb->_aiocb_private.kernelinfo);
1913
1914				for (cb = TAILQ_FIRST(&ki->kaio_jobdone);
1915					cb;
1916					cb = TAILQ_NEXT(cb, plist)) {
1917					if (((long) cb->uaiocb._aiocb_private.kernelinfo) ==
1918						jobref) {
1919						if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
1920							curproc->p_stats->p_ru.ru_oublock +=
1921								cb->outputcharge;
1922							cb->outputcharge = 0;
1923						} else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
1924							curproc->p_stats->p_ru.ru_inblock +=
1925								cb->inputcharge;
1926							cb->inputcharge = 0;
1927						}
1928						found++;
1929						break;
1930					}
1931				}
1932
1933				s = splbio();
1934				for (cb = TAILQ_FIRST(&ki->kaio_bufdone);
1935					cb;
1936					cb = TAILQ_NEXT(cb, plist)) {
1937					if (((long) cb->uaiocb._aiocb_private.kernelinfo) ==
1938						jobref) {
1939						found++;
1940						break;
1941					}
1942				}
1943				splx(s);
1944
1945			}
1946
1947			/*
1948			 * If all I/Os have been disposed of, then we can return
1949			 */
1950			if (found == nentqueued) {
1951				return runningcode;
1952			}
1953
1954			ki->kaio_flags |= KAIO_WAKEUP;
1955			error = tsleep(p, PRIBIO|PCATCH, "aiospn", 0);
1956
1957			if (error == EINTR) {
1958				return EINTR;
1959			} else if (error == EWOULDBLOCK) {
1960				return EAGAIN;
1961			}
1962
1963		}
1964	}
1965
1966	return runningcode;
1967}
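/*
 * Illustrative userland sketch (not part of this file): the LIO_WAIT mode
 * handled above.  The call returns only after every queued request in the
 * list has been disposed of, so no sigevent is needed; the code above returns
 * EIO if any request failed to queue.  The descriptor, buffers, and lengths
 * are placeholders.
 */
#include <aio.h>
#include <string.h>

int
example_lio_wait(int fd, char *a, char *b, size_t len)
{
	struct aiocb cb[2];
	struct aiocb *list[2];
	int i;

	memset(cb, 0, sizeof cb);
	for (i = 0; i < 2; i++) {
		cb[i].aio_fildes = fd;
		cb[i].aio_nbytes = len;
		cb[i].aio_lio_opcode = LIO_READ;
		list[i] = &cb[i];
	}
	cb[0].aio_buf = a;
	cb[0].aio_offset = 0;
	cb[1].aio_buf = b;
	cb[1].aio_offset = (off_t)len;

	/* Blocks until both reads have completed. */
	return (lio_listio(LIO_WAIT, list, 2, NULL));
}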
1968
1969/*
1970 * This is a weird hack so that we can post a signal.  It is safe
1971 * to do so from a timeout routine, but *not* from an interrupt routine.
1972 */
1973static void
1974process_signal(void *ljarg)
1975{
1976	struct aio_liojob *lj = ljarg;
1977	if (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL) {
1978		if (lj->lioj_queue_count == lj->lioj_queue_finished_count) {
1979			psignal(lj->lioj_ki->kaio_p, lj->lioj_signal.sigev_signo);
1980			lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
1981		}
1982	}
1983}
1984
1985/*
1986 * Interrupt handler for physio, performs the necessary process wakeups,
1987 * and signals.
1988 */
1989static void
1990aio_physwakeup(bp)
1991	struct buf *bp;
1992{
1993	struct aiocblist *aiocbe;
1994	struct proc *p;
1995	struct kaioinfo *ki;
1996	struct aio_liojob *lj;
1997	int s;
1998	s = splbio();
1999
2000	wakeup((caddr_t) bp);
2001	bp->b_flags &= ~B_CALL;
2002	bp->b_flags |= B_DONE;
2003
2004	aiocbe = (struct aiocblist *)bp->b_spc;
2005	if (aiocbe) {
2006		p = bp->b_proc;
2007
2008		aiocbe->jobstate = JOBST_JOBBFINISHED;
2009		aiocbe->uaiocb._aiocb_private.status -= bp->b_resid;
2010		aiocbe->uaiocb._aiocb_private.error = 0;
2011		aiocbe->jobflags |= AIOCBLIST_DONE;
2012
2013		if (bp->b_flags & B_ERROR) {
2014			aiocbe->uaiocb._aiocb_private.error = bp->b_error;
2015		}
2016
2017		lj = aiocbe->lio;
2018		if (lj) {
2019			lj->lioj_buffer_finished_count++;
2020			/*
2021			 * wakeup/signal if all of the interrupt jobs are done
2022			 */
2023			if (lj->lioj_buffer_finished_count == lj->lioj_buffer_count) {
2024				/*
2025				 * post a signal if it is called for
2026				 */
2027				if ((lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) ==
2028					LIOJ_SIGNAL) {
2029					lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
2030					timeout(process_signal, lj, 0);
2031				}
2032			}
2033		}
2034
2035		ki = p->p_aioinfo;
2036		if (ki) {
2037			ki->kaio_buffer_finished_count++;
2038			TAILQ_REMOVE(&aio_bufjobs, aiocbe, list);
2039			TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist);
2040			TAILQ_INSERT_TAIL(&ki->kaio_bufdone, aiocbe, plist);
2041			/*
2042			 * and do the wakeup
2043			 */
2044			if (ki->kaio_flags & (KAIO_RUNDOWN|KAIO_WAKEUP)) {
2045				ki->kaio_flags &= ~KAIO_WAKEUP;
2046				wakeup(p);
2047			}
2048		}
2049	}
2050	splx(s);
2051}
2052