vfs_aio.c (90576) → vfs_aio.c (91140)
1/*
2 * Copyright (c) 1997 John S. Dyson. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. John S. Dyson's name may not be used to endorse or promote products
10 * derived from this software without specific prior written permission.
11 *
12 * DISCLAIMER: This code isn't warranted to do anything useful. Anything
13 * bad that happens because of using this software isn't the responsibility
14 * of the author. This software is distributed AS-IS.
15 *
16 * $FreeBSD: head/sys/kern/vfs_aio.c 90576 2002-02-12 17:40:41Z alc $
16 * $FreeBSD: head/sys/kern/vfs_aio.c 91140 2002-02-23 11:12:57Z tanimura $
17 */
18
19/*
20 * This file contains support for the POSIX 1003.1B AIO/LIO facility.
21 */
22
23#include <sys/param.h>
24#include <sys/systm.h>
25#include <sys/malloc.h>
25#include <sys/bio.h>
26#include <sys/buf.h>
27#include <sys/sysproto.h>
28#include <sys/filedesc.h>
29#include <sys/kernel.h>
30#include <sys/kthread.h>
31#include <sys/fcntl.h>
32#include <sys/file.h>
33#include <sys/lock.h>
34#include <sys/mutex.h>
35#include <sys/unistd.h>
36#include <sys/proc.h>
37#include <sys/resourcevar.h>
38#include <sys/signalvar.h>
39#include <sys/protosw.h>
40#include <sys/socketvar.h>
41#include <sys/syscall.h>
42#include <sys/sysent.h>
43#include <sys/sysctl.h>
44#include <sys/vnode.h>
45#include <sys/conf.h>
46#include <sys/event.h>
47
48#include <vm/vm.h>
49#include <vm/vm_extern.h>
50#include <vm/pmap.h>
51#include <vm/vm_map.h>
52#include <vm/vm_zone.h>
53#include <sys/aio.h>
54
55#include <machine/limits.h>
56
57#include "opt_vfs_aio.h"
58
59static long jobrefid;
60
61#define JOBST_NULL 0x0
62#define JOBST_JOBQGLOBAL 0x2
63#define JOBST_JOBRUNNING 0x3
64#define JOBST_JOBFINISHED 0x4
65#define JOBST_JOBQBUF 0x5
66#define JOBST_JOBBFINISHED 0x6
67
68#ifndef MAX_AIO_PER_PROC
69#define MAX_AIO_PER_PROC 32
70#endif
71
72#ifndef MAX_AIO_QUEUE_PER_PROC
73#define MAX_AIO_QUEUE_PER_PROC 256 /* Bigger than AIO_LISTIO_MAX */
74#endif
75
76#ifndef MAX_AIO_PROCS
77#define MAX_AIO_PROCS 32
78#endif
79
80#ifndef MAX_AIO_QUEUE
81#define MAX_AIO_QUEUE 1024 /* Bigger than AIO_LISTIO_MAX */
82#endif
83
84#ifndef TARGET_AIO_PROCS
85#define TARGET_AIO_PROCS 4
86#endif
87
88#ifndef MAX_BUF_AIO
89#define MAX_BUF_AIO 16
90#endif
91
92#ifndef AIOD_TIMEOUT_DEFAULT
93#define AIOD_TIMEOUT_DEFAULT (10 * hz)
94#endif
95
96#ifndef AIOD_LIFETIME_DEFAULT
97#define AIOD_LIFETIME_DEFAULT (30 * hz)
98#endif
99
100static int max_aio_procs = MAX_AIO_PROCS;
101static int num_aio_procs = 0;
102static int target_aio_procs = TARGET_AIO_PROCS;
103static int max_queue_count = MAX_AIO_QUEUE;
104static int num_queue_count = 0;
105static int num_buf_aio = 0;
106static int num_aio_resv_start = 0;
107static int aiod_timeout;
108static int aiod_lifetime;
109static int unloadable = 0;
110
111static int max_aio_per_proc = MAX_AIO_PER_PROC;
112static int max_aio_queue_per_proc = MAX_AIO_QUEUE_PER_PROC;
113static int max_buf_aio = MAX_BUF_AIO;
114
115SYSCTL_NODE(_vfs, OID_AUTO, aio, CTLFLAG_RW, 0, "AIO mgmt");
116
117SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_per_proc,
118 CTLFLAG_RW, &max_aio_per_proc, 0, "");
119
120SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue_per_proc,
121 CTLFLAG_RW, &max_aio_queue_per_proc, 0, "");
122
123SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_procs,
124 CTLFLAG_RW, &max_aio_procs, 0, "");
125
126SYSCTL_INT(_vfs_aio, OID_AUTO, num_aio_procs,
127 CTLFLAG_RD, &num_aio_procs, 0, "");
128
129SYSCTL_INT(_vfs_aio, OID_AUTO, num_queue_count,
130 CTLFLAG_RD, &num_queue_count, 0, "");
131
132SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue,
133 CTLFLAG_RW, &max_queue_count, 0, "");
134
135SYSCTL_INT(_vfs_aio, OID_AUTO, target_aio_procs,
136 CTLFLAG_RW, &target_aio_procs, 0, "");
137
138SYSCTL_INT(_vfs_aio, OID_AUTO, max_buf_aio,
139 CTLFLAG_RW, &max_buf_aio, 0, "");
140
141SYSCTL_INT(_vfs_aio, OID_AUTO, num_buf_aio,
142 CTLFLAG_RD, &num_buf_aio, 0, "");
143
144SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_lifetime,
145 CTLFLAG_RW, &aiod_lifetime, 0, "");
146
147SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_timeout,
148 CTLFLAG_RW, &aiod_timeout, 0, "");
149
150SYSCTL_INT(_vfs_aio, OID_AUTO, unloadable, CTLFLAG_RW, &unloadable, 0,
151 "Allow unload of aio (not recommended)");
152
153struct aiocblist {
154 TAILQ_ENTRY(aiocblist) list; /* List of jobs */
155 TAILQ_ENTRY(aiocblist) plist; /* List of jobs for proc */
156 int jobflags;
157 int jobstate;
158 int inputcharge;
159 int outputcharge;
160 struct callout_handle timeouthandle;
161 struct buf *bp; /* Buffer pointer */
162 struct proc *userproc; /* User process */ /* Not td! */
163 struct file *fd_file; /* Pointer to file structure */
164 struct aiothreadlist *jobaiothread; /* AIO process descriptor */
165 struct aio_liojob *lio; /* Optional lio job */
166 struct aiocb *uuaiocb; /* Pointer in userspace of aiocb */
167 struct klist klist; /* list of knotes */
168 struct aiocb uaiocb; /* Kernel I/O control block */
169};
170
171/* jobflags */
172#define AIOCBLIST_RUNDOWN 0x4
173#define AIOCBLIST_ASYNCFREE 0x8
174#define AIOCBLIST_DONE 0x10
175
176/*
177 * AIO process info
178 */
179#define AIOP_FREE 0x1 /* proc on free queue */
180#define AIOP_SCHED 0x2 /* proc explicitly scheduled */
181
182struct aiothreadlist {
183 int aiothreadflags; /* AIO proc flags */
184 TAILQ_ENTRY(aiothreadlist) list; /* List of processes */
185 struct thread *aiothread; /* The AIO thread */
186};
187
188/*
189 * data-structure for lio signal management
190 */
191struct aio_liojob {
192 int lioj_flags;
193 int lioj_buffer_count;
194 int lioj_buffer_finished_count;
195 int lioj_queue_count;
196 int lioj_queue_finished_count;
197 struct sigevent lioj_signal; /* signal on all I/O done */
198 TAILQ_ENTRY(aio_liojob) lioj_list;
199 struct kaioinfo *lioj_ki;
200};
201#define LIOJ_SIGNAL 0x1 /* signal on all done (lio) */
202#define LIOJ_SIGNAL_POSTED 0x2 /* signal has been posted */
203
204/*
205 * per process aio data structure
206 */
207struct kaioinfo {
208 int kaio_flags; /* per process kaio flags */
209 int kaio_maxactive_count; /* maximum number of AIOs */
210 int kaio_active_count; /* number of currently used AIOs */
  211 	int	kaio_qallowed_count;	/* maximum size of AIO queue */
212 int kaio_queue_count; /* size of AIO queue */
213 int kaio_ballowed_count; /* maximum number of buffers */
214 int kaio_queue_finished_count; /* number of daemon jobs finished */
215 int kaio_buffer_count; /* number of physio buffers */
216 int kaio_buffer_finished_count; /* count of I/O done */
217 struct proc *kaio_p; /* process that uses this kaio block */
218 TAILQ_HEAD(,aio_liojob) kaio_liojoblist; /* list of lio jobs */
219 TAILQ_HEAD(,aiocblist) kaio_jobqueue; /* job queue for process */
220 TAILQ_HEAD(,aiocblist) kaio_jobdone; /* done queue for process */
221 TAILQ_HEAD(,aiocblist) kaio_bufqueue; /* buffer job queue for process */
222 TAILQ_HEAD(,aiocblist) kaio_bufdone; /* buffer done queue for process */
223 TAILQ_HEAD(,aiocblist) kaio_sockqueue; /* queue for aios waiting on sockets */
224};
225
226#define KAIO_RUNDOWN 0x1 /* process is being run down */
227#define KAIO_WAKEUP 0x2 /* wakeup process when there is a significant event */
228
229static TAILQ_HEAD(,aiothreadlist) aio_freeproc, aio_activeproc;
230static TAILQ_HEAD(,aiocblist) aio_jobs; /* Async job list */
231static TAILQ_HEAD(,aiocblist) aio_bufjobs; /* Phys I/O job list */
232
233static void aio_init_aioinfo(struct proc *p);
234static void aio_onceonly(void);
235static int aio_free_entry(struct aiocblist *aiocbe);
236static void aio_process(struct aiocblist *aiocbe);
237static int aio_newproc(void);
238static int aio_aqueue(struct thread *td, struct aiocb *job, int type);
239static void aio_physwakeup(struct buf *bp);
240static void aio_proc_rundown(struct proc *p);
241static int aio_fphysio(struct aiocblist *aiocbe);
242static int aio_qphysio(struct proc *p, struct aiocblist *iocb);
243static void aio_daemon(void *uproc);
244static void aio_swake_cb(struct socket *, struct sockbuf *);
245static int aio_unload(void);
246static void process_signal(void *aioj);
247static int filt_aioattach(struct knote *kn);
248static void filt_aiodetach(struct knote *kn);
249static int filt_aio(struct knote *kn, long hint);
250
251static vm_zone_t kaio_zone, aiop_zone, aiocb_zone, aiol_zone;
252static vm_zone_t aiolio_zone;
253
254static struct filterops aio_filtops =
255 { 0, filt_aioattach, filt_aiodetach, filt_aio };
256
257static int
258aio_modload(struct module *module, int cmd, void *arg)
259{
260 int error = 0;
261
262 switch (cmd) {
263 case MOD_LOAD:
264 aio_onceonly();
265 break;
266 case MOD_UNLOAD:
267 error = aio_unload();
268 break;
269 case MOD_SHUTDOWN:
270 break;
271 default:
272 error = EINVAL;
273 break;
274 }
275 return (error);
276}
277
278static moduledata_t aio_mod = {
279 "aio",
280 &aio_modload,
281 NULL
282};
283
284SYSCALL_MODULE_HELPER(aio_return);
285SYSCALL_MODULE_HELPER(aio_suspend);
286SYSCALL_MODULE_HELPER(aio_cancel);
287SYSCALL_MODULE_HELPER(aio_error);
288SYSCALL_MODULE_HELPER(aio_read);
289SYSCALL_MODULE_HELPER(aio_write);
290SYSCALL_MODULE_HELPER(aio_waitcomplete);
291SYSCALL_MODULE_HELPER(lio_listio);
292
293DECLARE_MODULE(aio, aio_mod,
294 SI_SUB_VFS, SI_ORDER_ANY);
295MODULE_VERSION(aio, 1);
296
297/*
298 * Startup initialization
299 */
300static void
301aio_onceonly(void)
302{
303
304 /* XXX: should probably just use so->callback */
305 aio_swake = &aio_swake_cb;
306 at_exit(aio_proc_rundown);
307 at_exec(aio_proc_rundown);
308 kqueue_add_filteropts(EVFILT_AIO, &aio_filtops);
309 TAILQ_INIT(&aio_freeproc);
310 TAILQ_INIT(&aio_activeproc);
311 TAILQ_INIT(&aio_jobs);
312 TAILQ_INIT(&aio_bufjobs);
313 kaio_zone = zinit("AIO", sizeof(struct kaioinfo), 0, 0, 1);
314 aiop_zone = zinit("AIOP", sizeof(struct aiothreadlist), 0, 0, 1);
315 aiocb_zone = zinit("AIOCB", sizeof(struct aiocblist), 0, 0, 1);
316 aiol_zone = zinit("AIOL", AIO_LISTIO_MAX*sizeof(intptr_t), 0, 0, 1);
317 aiolio_zone = zinit("AIOLIO", sizeof(struct aio_liojob), 0, 0, 1);
318 aiod_timeout = AIOD_TIMEOUT_DEFAULT;
319 aiod_lifetime = AIOD_LIFETIME_DEFAULT;
320 jobrefid = 1;
321}
322
323static int
324aio_unload(void)
325{
326
327 /*
328 * XXX: no unloads by default, it's too dangerous.
329 * perhaps we could do it if locked out callers and then
330 * did an aio_proc_rundown() on each process.
331 */
332 if (!unloadable)
333 return (EOPNOTSUPP);
334
335 aio_swake = NULL;
336 rm_at_exit(aio_proc_rundown);
337 rm_at_exec(aio_proc_rundown);
338 kqueue_del_filteropts(EVFILT_AIO);
339 return (0);
340}
341
342/*
343 * Init the per-process aioinfo structure. The aioinfo limits are set
344 * per-process for user limit (resource) management.
345 */
346static void
347aio_init_aioinfo(struct proc *p)
348{
349 struct kaioinfo *ki;
350 if (p->p_aioinfo == NULL) {
351 ki = zalloc(kaio_zone);
352 p->p_aioinfo = ki;
353 ki->kaio_flags = 0;
354 ki->kaio_maxactive_count = max_aio_per_proc;
355 ki->kaio_active_count = 0;
356 ki->kaio_qallowed_count = max_aio_queue_per_proc;
357 ki->kaio_queue_count = 0;
358 ki->kaio_ballowed_count = max_buf_aio;
359 ki->kaio_buffer_count = 0;
360 ki->kaio_buffer_finished_count = 0;
361 ki->kaio_p = p;
362 TAILQ_INIT(&ki->kaio_jobdone);
363 TAILQ_INIT(&ki->kaio_jobqueue);
364 TAILQ_INIT(&ki->kaio_bufdone);
365 TAILQ_INIT(&ki->kaio_bufqueue);
366 TAILQ_INIT(&ki->kaio_liojoblist);
367 TAILQ_INIT(&ki->kaio_sockqueue);
368 }
369
370 while (num_aio_procs < target_aio_procs)
371 aio_newproc();
372}
373
374/*
375 * Free a job entry. Wait for completion if it is currently active, but don't
376 * delay forever. If we delay, we return a flag that says that we have to
377 * restart the queue scan.
378 */
379static int
380aio_free_entry(struct aiocblist *aiocbe)
381{
382 struct kaioinfo *ki;
383 struct aio_liojob *lj;
384 struct proc *p;
385 int error;
386 int s;
387
388 if (aiocbe->jobstate == JOBST_NULL)
389 panic("aio_free_entry: freeing already free job");
390
391 p = aiocbe->userproc;
392 ki = p->p_aioinfo;
393 lj = aiocbe->lio;
394 if (ki == NULL)
395 panic("aio_free_entry: missing p->p_aioinfo");
396
397 while (aiocbe->jobstate == JOBST_JOBRUNNING) {
398 if (aiocbe->jobflags & AIOCBLIST_ASYNCFREE)
399 return 0;
400 aiocbe->jobflags |= AIOCBLIST_RUNDOWN;
401 tsleep(aiocbe, PRIBIO, "jobwai", 0);
402 }
403 aiocbe->jobflags &= ~AIOCBLIST_ASYNCFREE;
404
405 if (aiocbe->bp == NULL) {
406 if (ki->kaio_queue_count <= 0)
407 panic("aio_free_entry: process queue size <= 0");
408 if (num_queue_count <= 0)
409 panic("aio_free_entry: system wide queue size <= 0");
410
411 if (lj) {
412 lj->lioj_queue_count--;
413 if (aiocbe->jobflags & AIOCBLIST_DONE)
414 lj->lioj_queue_finished_count--;
415 }
416 ki->kaio_queue_count--;
417 if (aiocbe->jobflags & AIOCBLIST_DONE)
418 ki->kaio_queue_finished_count--;
419 num_queue_count--;
420 } else {
421 if (lj) {
422 lj->lioj_buffer_count--;
423 if (aiocbe->jobflags & AIOCBLIST_DONE)
424 lj->lioj_buffer_finished_count--;
425 }
426 if (aiocbe->jobflags & AIOCBLIST_DONE)
427 ki->kaio_buffer_finished_count--;
428 ki->kaio_buffer_count--;
429 num_buf_aio--;
430 }
431
432 /* aiocbe is going away, we need to destroy any knotes */
433 /* XXXKSE Note the thread here is used to eventually find the
434 * owning process again, but it is also used to do a fo_close
435 * and that requires the thread. (but does it require the
436 * OWNING thread? (or maybe the running thread?)
437 * There is a semantic problem here...
438 */
439 knote_remove(FIRST_THREAD_IN_PROC(p), &aiocbe->klist); /* XXXKSE */
440
441 if ((ki->kaio_flags & KAIO_WAKEUP) || ((ki->kaio_flags & KAIO_RUNDOWN)
442 && ((ki->kaio_buffer_count == 0) && (ki->kaio_queue_count == 0)))) {
443 ki->kaio_flags &= ~KAIO_WAKEUP;
444 wakeup(p);
445 }
446
447 if (aiocbe->jobstate == JOBST_JOBQBUF) {
448 if ((error = aio_fphysio(aiocbe)) != 0)
449 return error;
450 if (aiocbe->jobstate != JOBST_JOBBFINISHED)
451 panic("aio_free_entry: invalid physio finish-up state");
452 s = splbio();
453 TAILQ_REMOVE(&ki->kaio_bufdone, aiocbe, plist);
454 splx(s);
455 } else if (aiocbe->jobstate == JOBST_JOBQGLOBAL) {
456 s = splnet();
457 TAILQ_REMOVE(&aio_jobs, aiocbe, list);
458 TAILQ_REMOVE(&ki->kaio_jobqueue, aiocbe, plist);
459 splx(s);
460 } else if (aiocbe->jobstate == JOBST_JOBFINISHED)
461 TAILQ_REMOVE(&ki->kaio_jobdone, aiocbe, plist);
462 else if (aiocbe->jobstate == JOBST_JOBBFINISHED) {
463 s = splbio();
464 TAILQ_REMOVE(&ki->kaio_bufdone, aiocbe, plist);
465 splx(s);
466 if (aiocbe->bp) {
467 vunmapbuf(aiocbe->bp);
468 relpbuf(aiocbe->bp, NULL);
469 aiocbe->bp = NULL;
470 }
471 }
472 if (lj && (lj->lioj_buffer_count == 0) && (lj->lioj_queue_count == 0)) {
473 TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
474 zfree(aiolio_zone, lj);
475 }
476 aiocbe->jobstate = JOBST_NULL;
477 untimeout(process_signal, aiocbe, aiocbe->timeouthandle);
478 zfree(aiocb_zone, aiocbe);
479 return 0;
480}
481
482/*
483 * Rundown the jobs for a given process.
484 */
485static void
486aio_proc_rundown(struct proc *p)
487{
488 int s;
489 struct kaioinfo *ki;
490 struct aio_liojob *lj, *ljn;
491 struct aiocblist *aiocbe, *aiocbn;
492 struct file *fp;
493 struct filedesc *fdp;
494 struct socket *so;
495
496 ki = p->p_aioinfo;
497 if (ki == NULL)
498 return;
499
500 ki->kaio_flags |= LIOJ_SIGNAL_POSTED;
501 while ((ki->kaio_active_count > 0) || (ki->kaio_buffer_count >
502 ki->kaio_buffer_finished_count)) {
503 ki->kaio_flags |= KAIO_RUNDOWN;
504 if (tsleep(p, PRIBIO, "kaiowt", aiod_timeout))
505 break;
506 }
507
508 /*
509 * Move any aio ops that are waiting on socket I/O to the normal job
510 * queues so they are cleaned up with any others.
511 */
512 fdp = p->p_fd;
513
514 s = splnet();
515 for (aiocbe = TAILQ_FIRST(&ki->kaio_sockqueue); aiocbe; aiocbe =
516 aiocbn) {
517 aiocbn = TAILQ_NEXT(aiocbe, plist);
518 fp = fdp->fd_ofiles[aiocbe->uaiocb.aio_fildes];
519
520 /*
521 * Under some circumstances, the aio_fildes and the file
522 * structure don't match. This would leave aiocbe's in the
523 * TAILQ associated with the socket and cause a panic later.
524 *
525 * Detect and fix.
526 */
527 if ((fp == NULL) || (fp != aiocbe->fd_file))
528 fp = aiocbe->fd_file;
529 if (fp) {
530 so = (struct socket *)fp->f_data;
531 TAILQ_REMOVE(&so->so_aiojobq, aiocbe, list);
532 if (TAILQ_EMPTY(&so->so_aiojobq)) {
533 so->so_snd.sb_flags &= ~SB_AIO;
534 so->so_rcv.sb_flags &= ~SB_AIO;
535 }
536 }
537 TAILQ_REMOVE(&ki->kaio_sockqueue, aiocbe, plist);
538 TAILQ_INSERT_HEAD(&aio_jobs, aiocbe, list);
539 TAILQ_INSERT_HEAD(&ki->kaio_jobqueue, aiocbe, plist);
540 }
541 splx(s);
542
543restart1:
544 for (aiocbe = TAILQ_FIRST(&ki->kaio_jobdone); aiocbe; aiocbe = aiocbn) {
545 aiocbn = TAILQ_NEXT(aiocbe, plist);
546 if (aio_free_entry(aiocbe))
547 goto restart1;
548 }
549
550restart2:
551 for (aiocbe = TAILQ_FIRST(&ki->kaio_jobqueue); aiocbe; aiocbe =
552 aiocbn) {
553 aiocbn = TAILQ_NEXT(aiocbe, plist);
554 if (aio_free_entry(aiocbe))
555 goto restart2;
556 }
557
558/*
559 * Note the use of lots of splbio here, trying to avoid splbio for long chains
560 * of I/O. Probably unnecessary.
561 */
562restart3:
563 s = splbio();
564 while (TAILQ_FIRST(&ki->kaio_bufqueue)) {
565 ki->kaio_flags |= KAIO_WAKEUP;
566 tsleep(p, PRIBIO, "aioprn", 0);
567 splx(s);
568 goto restart3;
569 }
570 splx(s);
571
572restart4:
573 s = splbio();
574 for (aiocbe = TAILQ_FIRST(&ki->kaio_bufdone); aiocbe; aiocbe = aiocbn) {
575 aiocbn = TAILQ_NEXT(aiocbe, plist);
576 if (aio_free_entry(aiocbe)) {
577 splx(s);
578 goto restart4;
579 }
580 }
581 splx(s);
582
583 /*
584 * If we've slept, jobs might have moved from one queue to another.
585 * Retry rundown if we didn't manage to empty the queues.
586 */
587 if (TAILQ_FIRST(&ki->kaio_jobdone) != NULL ||
588 TAILQ_FIRST(&ki->kaio_jobqueue) != NULL ||
589 TAILQ_FIRST(&ki->kaio_bufqueue) != NULL ||
590 TAILQ_FIRST(&ki->kaio_bufdone) != NULL)
591 goto restart1;
592
593 for (lj = TAILQ_FIRST(&ki->kaio_liojoblist); lj; lj = ljn) {
594 ljn = TAILQ_NEXT(lj, lioj_list);
595 if ((lj->lioj_buffer_count == 0) && (lj->lioj_queue_count ==
596 0)) {
597 TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
598 zfree(aiolio_zone, lj);
599 } else {
600#ifdef DIAGNOSTIC
601 printf("LIO job not cleaned up: B:%d, BF:%d, Q:%d, "
602 "QF:%d\n", lj->lioj_buffer_count,
603 lj->lioj_buffer_finished_count,
604 lj->lioj_queue_count,
605 lj->lioj_queue_finished_count);
606#endif
607 }
608 }
609
610 zfree(kaio_zone, ki);
611 p->p_aioinfo = NULL;
612}
613
614/*
615 * Select a job to run (called by an AIO daemon).
616 */
617static struct aiocblist *
618aio_selectjob(struct aiothreadlist *aiop)
619{
620 int s;
621 struct aiocblist *aiocbe;
622 struct kaioinfo *ki;
623 struct proc *userp;
624
625 s = splnet();
626 for (aiocbe = TAILQ_FIRST(&aio_jobs); aiocbe; aiocbe =
627 TAILQ_NEXT(aiocbe, list)) {
628 userp = aiocbe->userproc;
629 ki = userp->p_aioinfo;
630
631 if (ki->kaio_active_count < ki->kaio_maxactive_count) {
632 TAILQ_REMOVE(&aio_jobs, aiocbe, list);
633 splx(s);
634 return aiocbe;
635 }
636 }
637 splx(s);
638
639 return NULL;
640}
641
642/*
643 * The AIO processing activity. This is the code that does the I/O request for
644 * the non-physio version of the operations. The normal vn operations are used,
645 * and this code should work in all instances for every type of file, including
646 * pipes, sockets, fifos, and regular files.
647 */
648static void
649aio_process(struct aiocblist *aiocbe)
650{
651 struct filedesc *fdp;
652 struct thread *td;
653 struct proc *userp;
654 struct proc *mycp;
655 struct aiocb *cb;
656 struct file *fp;
657 struct uio auio;
658 struct iovec aiov;
659 unsigned int fd;
660 int cnt;
661 int error;
662 off_t offset;
663 int oublock_st, oublock_end;
664 int inblock_st, inblock_end;
665
666 userp = aiocbe->userproc;
667 td = curthread;
668 mycp = td->td_proc;
669 cb = &aiocbe->uaiocb;
670
671 fdp = mycp->p_fd;
672 fd = cb->aio_fildes;
673 fp = fdp->fd_ofiles[fd];
674
675 if ((fp == NULL) || (fp != aiocbe->fd_file)) {
676 cb->_aiocb_private.error = EBADF;
677 cb->_aiocb_private.status = -1;
678 return;
679 }
680
681 aiov.iov_base = (void *)(uintptr_t)cb->aio_buf;
682 aiov.iov_len = cb->aio_nbytes;
683
684 auio.uio_iov = &aiov;
685 auio.uio_iovcnt = 1;
686 auio.uio_offset = offset = cb->aio_offset;
687 auio.uio_resid = cb->aio_nbytes;
688 cnt = cb->aio_nbytes;
689 auio.uio_segflg = UIO_USERSPACE;
690 auio.uio_td = td;
691
692 inblock_st = mycp->p_stats->p_ru.ru_inblock;
693 oublock_st = mycp->p_stats->p_ru.ru_oublock;
694 /*
695 * Temporarily bump the ref count while reading to avoid the
696 * descriptor being ripped out from under us.
697 */
698 fhold(fp);
699 if (cb->aio_lio_opcode == LIO_READ) {
700 auio.uio_rw = UIO_READ;
701 error = fo_read(fp, &auio, fp->f_cred, FOF_OFFSET, td);
702 } else {
703 auio.uio_rw = UIO_WRITE;
704 error = fo_write(fp, &auio, fp->f_cred, FOF_OFFSET, td);
705 }
706 fdrop(fp, td);
707 inblock_end = mycp->p_stats->p_ru.ru_inblock;
708 oublock_end = mycp->p_stats->p_ru.ru_oublock;
709
710 aiocbe->inputcharge = inblock_end - inblock_st;
711 aiocbe->outputcharge = oublock_end - oublock_st;
712
713 if ((error) && (auio.uio_resid != cnt)) {
714 if (error == ERESTART || error == EINTR || error == EWOULDBLOCK)
715 error = 0;
716 if ((error == EPIPE) && (cb->aio_lio_opcode == LIO_WRITE)) {
717 PROC_LOCK(userp);
718 psignal(userp, SIGPIPE);
719 PROC_UNLOCK(userp);
720 }
721 }
722
723 cnt -= auio.uio_resid;
724 cb->_aiocb_private.error = error;
725 cb->_aiocb_private.status = cnt;
726}
727
728/*
  729 * The AIO daemon: most of the actual work is done in aio_process,
730 * but the setup (and address space mgmt) is done in this routine.
731 */
732static void
733aio_daemon(void *uproc)
734{
735 int s;
736 struct aio_liojob *lj;
737 struct aiocb *cb;
738 struct aiocblist *aiocbe;
739 struct aiothreadlist *aiop;
740 struct kaioinfo *ki;
741 struct proc *curcp, *mycp, *userp;
742 struct vmspace *myvm, *tmpvm;
743 struct thread *td = curthread;
745 struct pgrp *newpgrp;
746 struct session *newsess;
744
745 mtx_lock(&Giant);
746 /*
747 * Local copies of curproc (cp) and vmspace (myvm)
748 */
749 mycp = td->td_proc;
750 myvm = mycp->p_vmspace;
751
752 if (mycp->p_textvp) {
753 vrele(mycp->p_textvp);
754 mycp->p_textvp = NULL;
755 }
756
757 /*
758 * Allocate and ready the aio control info. There is one aiop structure
759 * per daemon.
760 */
761 aiop = zalloc(aiop_zone);
762 aiop->aiothread = td;
763 aiop->aiothreadflags |= AIOP_FREE;
764
765 s = splnet();
766
767 /*
768 * Place thread (lightweight process) onto the AIO free thread list.
769 */
770 if (TAILQ_EMPTY(&aio_freeproc))
771 wakeup(&aio_freeproc);
772 TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);
773
774 splx(s);
775
776 /*
777 * Get rid of our current filedescriptors. AIOD's don't need any
778 * filedescriptors, except as temporarily inherited from the client.
779 */
780 fdfree(td);
781 mycp->p_fd = NULL;
782
783 /* The daemon resides in its own pgrp. */
  784 	enterpgrp(mycp, mycp->p_pid, 1);
  785
  787 	MALLOC(newpgrp, struct pgrp *, sizeof(struct pgrp), M_PGRP, M_WAITOK | M_ZERO);
  788 	MALLOC(newsess, struct session *, sizeof(struct session), M_SESSION, M_WAITOK | M_ZERO);
  789
  790 	PGRPSESS_XLOCK();
  791 	enterpgrp(mycp, mycp->p_pid, newpgrp, newsess);
  792 	PGRPSESS_XUNLOCK();
  793
786 /* Mark special process type. */
787 mycp->p_flag |= P_SYSTEM;
788
789 /*
790 * Wakeup parent process. (Parent sleeps to keep from blasting away
791 * and creating too many daemons.)
792 */
793 wakeup(mycp);
794
795 for (;;) {
796 /*
797 * curcp is the current daemon process context.
798 * userp is the current user process context.
799 */
800 curcp = mycp;
801
802 /*
803 * Take daemon off of free queue
804 */
805 if (aiop->aiothreadflags & AIOP_FREE) {
806 s = splnet();
807 TAILQ_REMOVE(&aio_freeproc, aiop, list);
808 TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list);
809 aiop->aiothreadflags &= ~AIOP_FREE;
810 splx(s);
811 }
812 aiop->aiothreadflags &= ~AIOP_SCHED;
813
814 /*
815 * Check for jobs.
816 */
817 while ((aiocbe = aio_selectjob(aiop)) != NULL) {
818 cb = &aiocbe->uaiocb;
819 userp = aiocbe->userproc;
820
821 aiocbe->jobstate = JOBST_JOBRUNNING;
822
823 /*
824 * Connect to process address space for user program.
825 */
826 if (userp != curcp) {
827 /*
828 * Save the current address space that we are
829 * connected to.
830 */
831 tmpvm = mycp->p_vmspace;
832
833 /*
834 * Point to the new user address space, and
835 * refer to it.
836 */
837 mycp->p_vmspace = userp->p_vmspace;
838 mycp->p_vmspace->vm_refcnt++;
839
840 /* Activate the new mapping. */
841 pmap_activate(FIRST_THREAD_IN_PROC(mycp));
842
843 /*
  844 * If the old address space wasn't the daemon's
845 * own address space, then we need to remove the
846 * daemon's reference from the other process
847 * that it was acting on behalf of.
848 */
849 if (tmpvm != myvm) {
850 vmspace_free(tmpvm);
851 }
852
853 /*
854 * Disassociate from previous clients file
855 * descriptors, and associate to the new clients
856 * descriptors. Note that the daemon doesn't
  857 * need to worry about its original descriptors,
858 * because they were originally freed.
859 */
860 if (mycp->p_fd)
861 fdfree(td);
862 mycp->p_fd = fdshare(userp);
863 curcp = userp;
864 }
865
866 ki = userp->p_aioinfo;
867 lj = aiocbe->lio;
868
869 /* Account for currently active jobs. */
870 ki->kaio_active_count++;
871
872 /* Do the I/O function. */
873 aiocbe->jobaiothread = aiop;
874 aio_process(aiocbe);
875
876 /* Decrement the active job count. */
877 ki->kaio_active_count--;
878
879 /*
880 * Increment the completion count for wakeup/signal
881 * comparisons.
882 */
883 aiocbe->jobflags |= AIOCBLIST_DONE;
884 ki->kaio_queue_finished_count++;
885 if (lj)
886 lj->lioj_queue_finished_count++;
887 if ((ki->kaio_flags & KAIO_WAKEUP) || ((ki->kaio_flags
888 & KAIO_RUNDOWN) && (ki->kaio_active_count == 0))) {
889 ki->kaio_flags &= ~KAIO_WAKEUP;
890 wakeup(userp);
891 }
892
893 s = splbio();
894 if (lj && (lj->lioj_flags &
895 (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) == LIOJ_SIGNAL) {
896 if ((lj->lioj_queue_finished_count ==
897 lj->lioj_queue_count) &&
898 (lj->lioj_buffer_finished_count ==
899 lj->lioj_buffer_count)) {
900 PROC_LOCK(userp);
901 psignal(userp,
902 lj->lioj_signal.sigev_signo);
903 PROC_UNLOCK(userp);
904 lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
905 }
906 }
907 splx(s);
908
909 aiocbe->jobstate = JOBST_JOBFINISHED;
910
911 /*
912 * If the I/O request should be automatically rundown,
913 * do the needed cleanup. Otherwise, place the queue
914 * entry for the just finished I/O request into the done
915 * queue for the associated client.
916 */
917 s = splnet();
918 if (aiocbe->jobflags & AIOCBLIST_ASYNCFREE) {
919 aiocbe->jobflags &= ~AIOCBLIST_ASYNCFREE;
920 zfree(aiocb_zone, aiocbe);
921 } else {
922 TAILQ_REMOVE(&ki->kaio_jobqueue, aiocbe, plist);
923 TAILQ_INSERT_TAIL(&ki->kaio_jobdone, aiocbe,
924 plist);
925 }
926 splx(s);
927 KNOTE(&aiocbe->klist, 0);
928
929 if (aiocbe->jobflags & AIOCBLIST_RUNDOWN) {
930 wakeup(aiocbe);
931 aiocbe->jobflags &= ~AIOCBLIST_RUNDOWN;
932 }
933
934 if (cb->aio_sigevent.sigev_notify == SIGEV_SIGNAL) {
935 PROC_LOCK(userp);
936 psignal(userp, cb->aio_sigevent.sigev_signo);
937 PROC_UNLOCK(userp);
938 }
939 }
940
941 /*
942 * Disconnect from user address space.
943 */
944 if (curcp != mycp) {
945 /* Get the user address space to disconnect from. */
946 tmpvm = mycp->p_vmspace;
947
948 /* Get original address space for daemon. */
949 mycp->p_vmspace = myvm;
950
951 /* Activate the daemon's address space. */
952 pmap_activate(FIRST_THREAD_IN_PROC(mycp));
953#ifdef DIAGNOSTIC
954 if (tmpvm == myvm) {
955 printf("AIOD: vmspace problem -- %d\n",
956 mycp->p_pid);
957 }
958#endif
959 /* Remove our vmspace reference. */
960 vmspace_free(tmpvm);
961
962 /*
963 * Disassociate from the user process's file
964 * descriptors.
965 */
966 if (mycp->p_fd)
967 fdfree(td);
968 mycp->p_fd = NULL;
969 curcp = mycp;
970 }
971
972 /*
973 * If we are the first to be put onto the free queue, wakeup
974 * anyone waiting for a daemon.
975 */
976 s = splnet();
977 TAILQ_REMOVE(&aio_activeproc, aiop, list);
978 if (TAILQ_EMPTY(&aio_freeproc))
979 wakeup(&aio_freeproc);
980 TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);
981 aiop->aiothreadflags |= AIOP_FREE;
982 splx(s);
983
984 /*
985 * If daemon is inactive for a long time, allow it to exit,
986 * thereby freeing resources.
987 */
988 if ((aiop->aiothreadflags & AIOP_SCHED) == 0 &&
989 tsleep(aiop->aiothread, PRIBIO, "aiordy", aiod_lifetime)) {
990 s = splnet();
991 if (TAILQ_EMPTY(&aio_jobs)) {
992 if ((aiop->aiothreadflags & AIOP_FREE) &&
993 (num_aio_procs > target_aio_procs)) {
994 TAILQ_REMOVE(&aio_freeproc, aiop, list);
995 splx(s);
996 zfree(aiop_zone, aiop);
997 num_aio_procs--;
998#ifdef DIAGNOSTIC
999 if (mycp->p_vmspace->vm_refcnt <= 1) {
1000 printf("AIOD: bad vm refcnt for"
1001 " exiting daemon: %d\n",
1002 mycp->p_vmspace->vm_refcnt);
1003 }
1004#endif
1005 kthread_exit(0);
1006 }
1007 }
1008 splx(s);
1009 }
1010 }
1011}
1012
1013/*
1014 * Create a new AIO daemon. This is mostly a kernel-thread fork routine. The
1015 * AIO daemon modifies its environment itself.
1016 */
1017static int
1018aio_newproc()
1019{
1020 int error;
1021 struct proc *p;
1022
1023 error = kthread_create(aio_daemon, curproc, &p, RFNOWAIT, "aiod%d",
1024 num_aio_procs);
1025 if (error)
1026 return error;
1027
1028 /*
1029 * Wait until daemon is started, but continue on just in case to
1030 * handle error conditions.
1031 */
1032 error = tsleep(p, PZERO, "aiosta", aiod_timeout);
1033
1034 num_aio_procs++;
1035
1036 return error;
1037}
1038
1039/*
1040 * Try the high-performance, low-overhead physio method for eligible
1041 * VCHR devices. This method doesn't use an aio helper thread, and
1042 * thus has very low overhead.
1043 *
1044 * Assumes that the caller, _aio_aqueue(), has incremented the file
1045 * structure's reference count, preventing its deallocation for the
1046 * duration of this call.
1047 */
1048static int
1049aio_qphysio(struct proc *p, struct aiocblist *aiocbe)
1050{
1051 int error;
1052 struct aiocb *cb;
1053 struct file *fp;
1054 struct buf *bp;
1055 struct vnode *vp;
1056 struct kaioinfo *ki;
1057 struct aio_liojob *lj;
1058 int s;
1059 int notify;
1060
1061 cb = &aiocbe->uaiocb;
1062 fp = aiocbe->fd_file;
1063
1064 if (fp->f_type != DTYPE_VNODE)
1065 return (-1);
1066
1067 vp = (struct vnode *)fp->f_data;
1068
1069 /*
 1070 * If it's not a disk, we don't want to return a positive error.
1071 * It causes the aio code to not fall through to try the thread
1072 * way when you're talking to a regular file.
1073 */
1074 if (!vn_isdisk(vp, &error)) {
1075 if (error == ENOTBLK)
1076 return (-1);
1077 else
1078 return (error);
1079 }
1080
1081 if (cb->aio_nbytes % vp->v_rdev->si_bsize_phys)
1082 return (-1);
1083
1084 if (cb->aio_nbytes >
1085 MAXPHYS - (((vm_offset_t) cb->aio_buf) & PAGE_MASK))
1086 return (-1);
1087
1088 ki = p->p_aioinfo;
1089 if (ki->kaio_buffer_count >= ki->kaio_ballowed_count)
1090 return (-1);
1091
1092 ki->kaio_buffer_count++;
1093
1094 lj = aiocbe->lio;
1095 if (lj)
1096 lj->lioj_buffer_count++;
1097
1098 /* Create and build a buffer header for a transfer. */
1099 bp = (struct buf *)getpbuf(NULL);
1100 BUF_KERNPROC(bp);
1101
1102 /*
1103 * Get a copy of the kva from the physical buffer.
1104 */
1105 bp->b_caller1 = p;
1106 bp->b_dev = vp->v_rdev;
1107 error = bp->b_error = 0;
1108
1109 bp->b_bcount = cb->aio_nbytes;
1110 bp->b_bufsize = cb->aio_nbytes;
1111 bp->b_flags = B_PHYS;
1112 bp->b_iodone = aio_physwakeup;
1113 bp->b_saveaddr = bp->b_data;
1114 bp->b_data = (void *)(uintptr_t)cb->aio_buf;
1115 bp->b_blkno = btodb(cb->aio_offset);
1116
1117 if (cb->aio_lio_opcode == LIO_WRITE) {
1118 bp->b_iocmd = BIO_WRITE;
1119 if (!useracc(bp->b_data, bp->b_bufsize, VM_PROT_READ)) {
1120 error = EFAULT;
1121 goto doerror;
1122 }
1123 } else {
1124 bp->b_iocmd = BIO_READ;
1125 if (!useracc(bp->b_data, bp->b_bufsize, VM_PROT_WRITE)) {
1126 error = EFAULT;
1127 goto doerror;
1128 }
1129 }
1130
1131 /* Bring buffer into kernel space. */
1132 vmapbuf(bp);
1133
1134 s = splbio();
1135 aiocbe->bp = bp;
1136 bp->b_spc = (void *)aiocbe;
1137 TAILQ_INSERT_TAIL(&aio_bufjobs, aiocbe, list);
1138 TAILQ_INSERT_TAIL(&ki->kaio_bufqueue, aiocbe, plist);
1139 aiocbe->jobstate = JOBST_JOBQBUF;
1140 cb->_aiocb_private.status = cb->aio_nbytes;
1141 num_buf_aio++;
1142 bp->b_error = 0;
1143
1144 splx(s);
1145
1146 /* Perform transfer. */
1147 DEV_STRATEGY(bp, 0);
1148
1149 notify = 0;
1150 s = splbio();
1151
1152 /*
1153 * If we had an error invoking the request, or an error in processing
1154 * the request before we have returned, we process it as an error in
1155 * transfer. Note that such an I/O error is not indicated immediately,
1156 * but is returned using the aio_error mechanism. In this case,
1157 * aio_suspend will return immediately.
1158 */
1159 if (bp->b_error || (bp->b_ioflags & BIO_ERROR)) {
1160 struct aiocb *job = aiocbe->uuaiocb;
1161
1162 aiocbe->uaiocb._aiocb_private.status = 0;
1163 suword(&job->_aiocb_private.status, 0);
1164 aiocbe->uaiocb._aiocb_private.error = bp->b_error;
1165 suword(&job->_aiocb_private.error, bp->b_error);
1166
1167 ki->kaio_buffer_finished_count++;
1168
1169 if (aiocbe->jobstate != JOBST_JOBBFINISHED) {
1170 aiocbe->jobstate = JOBST_JOBBFINISHED;
1171 aiocbe->jobflags |= AIOCBLIST_DONE;
1172 TAILQ_REMOVE(&aio_bufjobs, aiocbe, list);
1173 TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist);
1174 TAILQ_INSERT_TAIL(&ki->kaio_bufdone, aiocbe, plist);
1175 notify = 1;
1176 }
1177 }
1178 splx(s);
1179 if (notify)
1180 KNOTE(&aiocbe->klist, 0);
1181 return 0;
1182
1183doerror:
1184 ki->kaio_buffer_count--;
1185 if (lj)
1186 lj->lioj_buffer_count--;
1187 aiocbe->bp = NULL;
1188 relpbuf(bp, NULL);
1189 return error;
1190}
1191
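/*
 * [Illustrative userland sketch -- not part of vfs_aio.c.]
 * An aio_read() that can take the aio_qphysio() fast path above: the
 * descriptor names a character disk device, the length is a multiple of
 * the device's physical block size, and the transfer fits the MAXPHYS
 * limit checked above.  The device path and the 512-byte sector size are
 * assumptions for the example only.
 */
#include <sys/types.h>
#include <aio.h>
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int
read_raw_blocks(const char *devpath, off_t offset)
{
        static char buf[65536];         /* well under MAXPHYS */
        const struct aiocb *list[1];
        struct aiocb iocb;
        int fd;

        fd = open(devpath, O_RDONLY);
        if (fd == -1)
                return (-1);

        memset(&iocb, 0, sizeof(iocb));
        iocb.aio_fildes = fd;
        iocb.aio_buf = buf;
        iocb.aio_nbytes = sizeof(buf);  /* multiple of the 512-byte sector size */
        iocb.aio_offset = offset;       /* must not be -1 (rejected at queue time) */

        if (aio_read(&iocb) == -1) {
                close(fd);
                return (-1);
        }

        /* Sleep until this one request has finished, then reap it. */
        list[0] = &iocb;
        while (aio_error(&iocb) == EINPROGRESS)
                aio_suspend(list, 1, NULL);
        printf("read %zd bytes\n", aio_return(&iocb));
        close(fd);
        return (0);
}
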
1192/*
1193 * This waits/tests physio completion.
1194 */
1195static int
1196aio_fphysio(struct aiocblist *iocb)
1197{
1198 int s;
1199 struct buf *bp;
1200 int error;
1201
1202 bp = iocb->bp;
1203
1204 s = splbio();
1205 while ((bp->b_flags & B_DONE) == 0) {
1206 if (tsleep(bp, PRIBIO, "physstr", aiod_timeout)) {
1207 if ((bp->b_flags & B_DONE) == 0) {
1208 splx(s);
1209 return EINPROGRESS;
1210 } else
1211 break;
1212 }
1213 }
1214 splx(s);
1215
1216 /* Release mapping into kernel space. */
1217 vunmapbuf(bp);
1218 iocb->bp = 0;
1219
1220 error = 0;
1221
1222 /* Check for an error. */
1223 if (bp->b_ioflags & BIO_ERROR)
1224 error = bp->b_error;
1225
1226 relpbuf(bp, NULL);
1227 return (error);
1228}
1229
1230/*
1231 * Wake up aio requests that may be serviceable now.
1232 */
1233static void
1234aio_swake_cb(struct socket *so, struct sockbuf *sb)
1235{
1236 struct aiocblist *cb,*cbn;
1237 struct proc *p;
1238 struct kaioinfo *ki = NULL;
1239 int opcode, wakecount = 0;
1240 struct aiothreadlist *aiop;
1241
1242 if (sb == &so->so_snd) {
1243 opcode = LIO_WRITE;
1244 so->so_snd.sb_flags &= ~SB_AIO;
1245 } else {
1246 opcode = LIO_READ;
1247 so->so_rcv.sb_flags &= ~SB_AIO;
1248 }
1249
1250 for (cb = TAILQ_FIRST(&so->so_aiojobq); cb; cb = cbn) {
1251 cbn = TAILQ_NEXT(cb, list);
1252 if (opcode == cb->uaiocb.aio_lio_opcode) {
1253 p = cb->userproc;
1254 ki = p->p_aioinfo;
1255 TAILQ_REMOVE(&so->so_aiojobq, cb, list);
1256 TAILQ_REMOVE(&ki->kaio_sockqueue, cb, plist);
1257 TAILQ_INSERT_TAIL(&aio_jobs, cb, list);
1258 TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, cb, plist);
1259 wakecount++;
1260 if (cb->jobstate != JOBST_JOBQGLOBAL)
1261 panic("invalid queue value");
1262 }
1263 }
1264
1265 while (wakecount--) {
1266 if ((aiop = TAILQ_FIRST(&aio_freeproc)) != 0) {
1267 TAILQ_REMOVE(&aio_freeproc, aiop, list);
1268 TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list);
1269 aiop->aiothreadflags &= ~AIOP_FREE;
1270 wakeup(aiop->aiothread);
1271 }
1272 }
1273}
1274
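/*
 * [Illustrative userland sketch -- not part of vfs_aio.c.]
 * Exercises the socket path serviced by aio_swake_cb() above: an aio_read()
 * on a socket with no data pending is parked on the socket's AIO queue and
 * only handed to an aio daemon once the socket becomes readable.  A
 * socketpair stands in for a real peer here.
 */
#include <sys/types.h>
#include <sys/socket.h>
#include <aio.h>
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static void
socket_aio_demo(void)
{
        const struct aiocb *list[1];
        struct aiocb iocb;
        char buf[64];
        int sv[2];

        if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv) == -1)
                return;

        memset(&iocb, 0, sizeof(iocb));
        iocb.aio_fildes = sv[0];
        iocb.aio_buf = buf;
        iocb.aio_nbytes = sizeof(buf);

        /* Nothing readable yet, so the request waits on the socket queue. */
        if (aio_read(&iocb) == -1)
                return;

        /* Making the socket readable lets the kernel schedule the job. */
        write(sv[1], "hello", 5);

        list[0] = &iocb;
        while (aio_error(&iocb) == EINPROGRESS)
                aio_suspend(list, 1, NULL);
        printf("socket aio_read returned %zd bytes\n", aio_return(&iocb));
        close(sv[0]);
        close(sv[1]);
}
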
1275/*
1276 * Queue a new AIO request. The choice between the aiod-thread technique and
1277 * direct physio for VCHR devices is made in this code.
1278 */
1279static int
1280_aio_aqueue(struct thread *td, struct aiocb *job, struct aio_liojob *lj, int type)
1281{
1282 struct proc *p = td->td_proc;
1283 struct filedesc *fdp;
1284 struct file *fp;
1285 unsigned int fd;
1286 struct socket *so;
1287 int s;
1288 int error;
1289 int opcode;
1290 struct aiocblist *aiocbe;
1291 struct aiothreadlist *aiop;
1292 struct kaioinfo *ki;
1293 struct kevent kev;
1294 struct kqueue *kq;
1295 struct file *kq_fp;
1296
1297 aiocbe = zalloc(aiocb_zone);
1298 aiocbe->inputcharge = 0;
1299 aiocbe->outputcharge = 0;
1300 callout_handle_init(&aiocbe->timeouthandle);
1301 SLIST_INIT(&aiocbe->klist);
1302
1303 suword(&job->_aiocb_private.status, -1);
1304 suword(&job->_aiocb_private.error, 0);
1305 suword(&job->_aiocb_private.kernelinfo, -1);
1306
1307 error = copyin(job, &aiocbe->uaiocb, sizeof(aiocbe->uaiocb));
1308 if (error) {
1309 suword(&job->_aiocb_private.error, error);
1310 zfree(aiocb_zone, aiocbe);
1311 return error;
1312 }
1313 if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL &&
1314 !_SIG_VALID(aiocbe->uaiocb.aio_sigevent.sigev_signo)) {
1315 zfree(aiocb_zone, aiocbe);
1316 return EINVAL;
1317 }
1318
1319 /* Save userspace address of the job info. */
1320 aiocbe->uuaiocb = job;
1321
1322 /* Get the opcode. */
1323 if (type != LIO_NOP)
1324 aiocbe->uaiocb.aio_lio_opcode = type;
1325 opcode = aiocbe->uaiocb.aio_lio_opcode;
1326
1327 /* Get the fd info for process. */
1328 fdp = p->p_fd;
1329
1330 /*
1331 * Range check file descriptor.
1332 */
1333 fd = aiocbe->uaiocb.aio_fildes;
1334 if (fd >= fdp->fd_nfiles) {
1335 zfree(aiocb_zone, aiocbe);
1336 if (type == 0)
1337 suword(&job->_aiocb_private.error, EBADF);
1338 return EBADF;
1339 }
1340
1341 fp = aiocbe->fd_file = fdp->fd_ofiles[fd];
1342 if ((fp == NULL) || ((opcode == LIO_WRITE) && ((fp->f_flag & FWRITE) ==
1343 0))) {
1344 zfree(aiocb_zone, aiocbe);
1345 if (type == 0)
1346 suword(&job->_aiocb_private.error, EBADF);
1347 return EBADF;
1348 }
1349
1350 if (aiocbe->uaiocb.aio_offset == -1LL) {
1351 zfree(aiocb_zone, aiocbe);
1352 if (type == 0)
1353 suword(&job->_aiocb_private.error, EINVAL);
1354 return EINVAL;
1355 }
1356
1357 error = suword(&job->_aiocb_private.kernelinfo, jobrefid);
1358 if (error) {
1359 zfree(aiocb_zone, aiocbe);
1360 if (type == 0)
1361 suword(&job->_aiocb_private.error, EINVAL);
1362 return error;
1363 }
1364
1365 aiocbe->uaiocb._aiocb_private.kernelinfo = (void *)(intptr_t)jobrefid;
1366 if (jobrefid == LONG_MAX)
1367 jobrefid = 1;
1368 else
1369 jobrefid++;
1370
1371 if (opcode == LIO_NOP) {
1372 zfree(aiocb_zone, aiocbe);
1373 if (type == 0) {
1374 suword(&job->_aiocb_private.error, 0);
1375 suword(&job->_aiocb_private.status, 0);
1376 suword(&job->_aiocb_private.kernelinfo, 0);
1377 }
1378 return 0;
1379 }
1380
1381 if ((opcode != LIO_READ) && (opcode != LIO_WRITE)) {
1382 zfree(aiocb_zone, aiocbe);
1383 if (type == 0) {
1384 suword(&job->_aiocb_private.status, 0);
1385 suword(&job->_aiocb_private.error, EINVAL);
1386 }
1387 return EINVAL;
1388 }
1389
1390 fhold(fp);
1391
1392 if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_KEVENT) {
1393 kev.ident = aiocbe->uaiocb.aio_sigevent.sigev_notify_kqueue;
1394 kev.udata = aiocbe->uaiocb.aio_sigevent.sigev_value.sigval_ptr;
1395 }
1396 else {
1397 /*
1398 * This method for requesting kevent-based notification won't
1399 * work on the alpha, since we're passing in a pointer
1400 * via aio_lio_opcode, which is an int. Use the SIGEV_KEVENT-
1401 * based method instead.
1402 */
1403 struct kevent *kevp;
1404
1405 kevp = (struct kevent *)(uintptr_t)job->aio_lio_opcode;
1406 if (kevp == NULL)
1407 goto no_kqueue;
1408
1409 error = copyin(kevp, &kev, sizeof(kev));
1410 if (error)
1411 goto aqueue_fail;
1412 }
1413 if ((u_int)kev.ident >= fdp->fd_nfiles ||
1414 (kq_fp = fdp->fd_ofiles[kev.ident]) == NULL ||
1415 (kq_fp->f_type != DTYPE_KQUEUE)) {
1416 error = EBADF;
1417 goto aqueue_fail;
1418 }
1419 kq = (struct kqueue *)kq_fp->f_data;
1420 kev.ident = (uintptr_t)aiocbe;
1421 kev.filter = EVFILT_AIO;
1422 kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1;
1423 error = kqueue_register(kq, &kev, td);
1424aqueue_fail:
1425 if (error) {
1426 zfree(aiocb_zone, aiocbe);
1427 if (type == 0)
1428 suword(&job->_aiocb_private.error, error);
1429 goto done;
1430 }
1431no_kqueue:
1432
1433 suword(&job->_aiocb_private.error, EINPROGRESS);
1434 aiocbe->uaiocb._aiocb_private.error = EINPROGRESS;
1435 aiocbe->userproc = p;
1436 aiocbe->jobflags = 0;
1437 aiocbe->lio = lj;
1438 ki = p->p_aioinfo;
1439
1440 if (fp->f_type == DTYPE_SOCKET) {
1441 /*
1442 * Alternate queueing for socket ops: Reach down into the
1443 * descriptor to get the socket data. Then check to see if the
1444 * socket is ready to be read or written (based on the requested
1445 * operation).
1446 *
1447 * If it is not ready for I/O, then queue the aiocbe on the
1448 * socket, and set the flags so we get a call when sbnotify()
1449 * happens.
1450 */
1451 so = (struct socket *)fp->f_data;
1452 s = splnet();
1453 if (((opcode == LIO_READ) && (!soreadable(so))) || ((opcode ==
1454 LIO_WRITE) && (!sowriteable(so)))) {
1455 TAILQ_INSERT_TAIL(&so->so_aiojobq, aiocbe, list);
1456 TAILQ_INSERT_TAIL(&ki->kaio_sockqueue, aiocbe, plist);
1457 if (opcode == LIO_READ)
1458 so->so_rcv.sb_flags |= SB_AIO;
1459 else
1460 so->so_snd.sb_flags |= SB_AIO;
1461 aiocbe->jobstate = JOBST_JOBQGLOBAL; /* XXX */
1462 ki->kaio_queue_count++;
1463 num_queue_count++;
1464 splx(s);
1465 error = 0;
1466 goto done;
1467 }
1468 splx(s);
1469 }
1470
1471 if ((error = aio_qphysio(p, aiocbe)) == 0)
1472 goto done;
1473 if (error > 0) {
1474 suword(&job->_aiocb_private.status, 0);
1475 aiocbe->uaiocb._aiocb_private.error = error;
1476 suword(&job->_aiocb_private.error, error);
1477 goto done;
1478 }
1479
1480 /* No buffer for daemon I/O. */
1481 aiocbe->bp = NULL;
1482
1483 ki->kaio_queue_count++;
1484 if (lj)
1485 lj->lioj_queue_count++;
1486 s = splnet();
1487 TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, aiocbe, plist);
1488 TAILQ_INSERT_TAIL(&aio_jobs, aiocbe, list);
1489 splx(s);
1490 aiocbe->jobstate = JOBST_JOBQGLOBAL;
1491
1492 num_queue_count++;
1493 error = 0;
1494
1495 /*
1496 * If we don't have a free AIO process, and we are below our quota, then
1497 * start one. Otherwise, depend on the subsequent I/O completions to
1498 * pick up this job. If we don't successfully create the new process
1499 * (thread) due to resource issues, we return an error for now (EAGAIN),
1500 * which is likely not the correct thing to do.
1501 */
1502 s = splnet();
1503retryproc:
1504 if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) {
1505 TAILQ_REMOVE(&aio_freeproc, aiop, list);
1506 TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list);
1507 aiop->aiothreadflags &= ~AIOP_FREE;
1508 wakeup(aiop->aiothread);
1509 } else if (((num_aio_resv_start + num_aio_procs) < max_aio_procs) &&
1510 ((ki->kaio_active_count + num_aio_resv_start) <
1511 ki->kaio_maxactive_count)) {
1512 num_aio_resv_start++;
1513 if ((error = aio_newproc()) == 0) {
1514 num_aio_resv_start--;
1515 goto retryproc;
1516 }
1517 num_aio_resv_start--;
1518 }
1519 splx(s);
1520done:
1521 fdrop(fp, td);
1522 return error;
1523}
1524
1525/*
1526 * This routine queues an AIO request, checking for quotas.
1527 */
1528static int
1529aio_aqueue(struct thread *td, struct aiocb *job, int type)
1530{
1531 struct proc *p = td->td_proc;
1532 struct kaioinfo *ki;
1533
1534 if (p->p_aioinfo == NULL)
1535 aio_init_aioinfo(p);
1536
1537 if (num_queue_count >= max_queue_count)
1538 return EAGAIN;
1539
1540 ki = p->p_aioinfo;
1541 if (ki->kaio_queue_count >= ki->kaio_qallowed_count)
1542 return EAGAIN;
1543
1544 return _aio_aqueue(td, job, NULL, type);
1545}
1546
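/*
 * [Illustrative userland sketch -- not part of vfs_aio.c.]
 * aio_aqueue() above fails with EAGAIN once the system-wide or per-process
 * queue limits are reached, so a heavy submitter can back off and retry.
 * enqueue_with_retry() is a hypothetical helper name used only here.
 */
#include <aio.h>
#include <errno.h>
#include <unistd.h>

static int
enqueue_with_retry(struct aiocb *iocb)
{
        while (aio_read(iocb) == -1) {
                if (errno != EAGAIN)
                        return (-1);
                /* Queue full: give already-queued requests time to drain. */
                usleep(1000);
        }
        return (0);
}
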
1547/*
1548 * Support the aio_return system call; as a side effect, kernel resources
1549 * are released.
1550 */
1551int
1552aio_return(struct thread *td, struct aio_return_args *uap)
1553{
1554 struct proc *p = td->td_proc;
1555 int s;
1556 int jobref;
1557 struct aiocblist *cb, *ncb;
1558 struct aiocb *ujob;
1559 struct kaioinfo *ki;
1560
1561 ki = p->p_aioinfo;
1562 if (ki == NULL)
1563 return EINVAL;
1564
1565 ujob = uap->aiocbp;
1566
1567 jobref = fuword(&ujob->_aiocb_private.kernelinfo);
1568 if (jobref == -1 || jobref == 0)
1569 return EINVAL;
1570
1571 TAILQ_FOREACH(cb, &ki->kaio_jobdone, plist) {
1572 if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) ==
1573 jobref) {
1574 if (ujob == cb->uuaiocb) {
1575 td->td_retval[0] =
1576 cb->uaiocb._aiocb_private.status;
1577 } else
1578 td->td_retval[0] = EFAULT;
1579 if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
1580 p->p_stats->p_ru.ru_oublock +=
1581 cb->outputcharge;
1582 cb->outputcharge = 0;
1583 } else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
1584 p->p_stats->p_ru.ru_inblock += cb->inputcharge;
1585 cb->inputcharge = 0;
1586 }
1587 aio_free_entry(cb);
1588 return 0;
1589 }
1590 }
1591 s = splbio();
1592 for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb = ncb) {
1593 ncb = TAILQ_NEXT(cb, plist);
1594 if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo)
1595 == jobref) {
1596 splx(s);
1597 if (ujob == cb->uuaiocb) {
1598 td->td_retval[0] =
1599 cb->uaiocb._aiocb_private.status;
1600 } else
1601 td->td_retval[0] = EFAULT;
1602 aio_free_entry(cb);
1603 return 0;
1604 }
1605 }
1606 splx(s);
1607
1608 return (EINVAL);
1609}
1610
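/*
 * [Illustrative userland sketch -- not part of vfs_aio.c.]
 * The completion protocol implied by aio_return() above: poll aio_error()
 * until it stops returning EINPROGRESS, then call aio_return() exactly once,
 * which also releases the kernel-side bookkeeping for the request.
 */
#include <sys/types.h>
#include <aio.h>
#include <errno.h>

static ssize_t
wait_and_reap(struct aiocb *iocb)
{
        const struct aiocb *list[1];
        int err;

        list[0] = iocb;
        while ((err = aio_error(iocb)) == EINPROGRESS)
                aio_suspend(list, 1, NULL);     /* sleep instead of spinning */
        if (err != 0) {
                errno = err;
                return (-1);
        }
        return (aio_return(iocb));              /* valid exactly once per request */
}
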
1611/*
1612 * Allow a process to wake up when any of the I/O requests has completed.
1613 */
1614int
1615aio_suspend(struct thread *td, struct aio_suspend_args *uap)
1616{
1617 struct proc *p = td->td_proc;
1618 struct timeval atv;
1619 struct timespec ts;
1620 struct aiocb *const *cbptr, *cbp;
1621 struct kaioinfo *ki;
1622 struct aiocblist *cb;
1623 int i;
1624 int njoblist;
1625 int error, s, timo;
1626 int *ijoblist;
1627 struct aiocb **ujoblist;
1628
1629 if (uap->nent > AIO_LISTIO_MAX)
1630 return EINVAL;
1631
1632 timo = 0;
1633 if (uap->timeout) {
1634 /* Get timespec struct. */
1635 if ((error = copyin(uap->timeout, &ts, sizeof(ts))) != 0)
1636 return error;
1637
1638 if (ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000)
1639 return (EINVAL);
1640
1641 TIMESPEC_TO_TIMEVAL(&atv, &ts);
1642 if (itimerfix(&atv))
1643 return (EINVAL);
1644 timo = tvtohz(&atv);
1645 }
1646
1647 ki = p->p_aioinfo;
1648 if (ki == NULL)
1649 return EAGAIN;
1650
1651 njoblist = 0;
1652 ijoblist = zalloc(aiol_zone);
1653 ujoblist = zalloc(aiol_zone);
1654 cbptr = uap->aiocbp;
1655
1656 for (i = 0; i < uap->nent; i++) {
1657 cbp = (struct aiocb *)(intptr_t)fuword((caddr_t)&cbptr[i]);
1658 if (cbp == 0)
1659 continue;
1660 ujoblist[njoblist] = cbp;
1661 ijoblist[njoblist] = fuword(&cbp->_aiocb_private.kernelinfo);
1662 njoblist++;
1663 }
1664
1665 if (njoblist == 0) {
1666 zfree(aiol_zone, ijoblist);
1667 zfree(aiol_zone, ujoblist);
1668 return 0;
1669 }
1670
1671 error = 0;
1672 for (;;) {
1673 TAILQ_FOREACH(cb, &ki->kaio_jobdone, plist) {
1674 for (i = 0; i < njoblist; i++) {
1675 if (((intptr_t)
1676 cb->uaiocb._aiocb_private.kernelinfo) ==
1677 ijoblist[i]) {
1678 if (ujoblist[i] != cb->uuaiocb)
1679 error = EINVAL;
1680 zfree(aiol_zone, ijoblist);
1681 zfree(aiol_zone, ujoblist);
1682 return error;
1683 }
1684 }
1685 }
1686
1687 s = splbio();
1688 for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb =
1689 TAILQ_NEXT(cb, plist)) {
1690 for (i = 0; i < njoblist; i++) {
1691 if (((intptr_t)
1692 cb->uaiocb._aiocb_private.kernelinfo) ==
1693 ijoblist[i]) {
1694 splx(s);
1695 if (ujoblist[i] != cb->uuaiocb)
1696 error = EINVAL;
1697 zfree(aiol_zone, ijoblist);
1698 zfree(aiol_zone, ujoblist);
1699 return error;
1700 }
1701 }
1702 }
1703
1704 ki->kaio_flags |= KAIO_WAKEUP;
1705 error = tsleep(p, PRIBIO | PCATCH, "aiospn", timo);
1706 splx(s);
1707
1708 if (error == ERESTART || error == EINTR) {
1709 zfree(aiol_zone, ijoblist);
1710 zfree(aiol_zone, ujoblist);
1711 return EINTR;
1712 } else if (error == EWOULDBLOCK) {
1713 zfree(aiol_zone, ijoblist);
1714 zfree(aiol_zone, ujoblist);
1715 return EAGAIN;
1716 }
1717 }
1718
1719/* NOTREACHED */
1720 return EINVAL;
1721}
1722
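/*
 * [Illustrative userland sketch -- not part of vfs_aio.c.]
 * Waiting on several outstanding requests with a bounded wait, matching the
 * EAGAIN-on-timeout and EINTR-on-signal behaviour of aio_suspend() above.
 */
#include <aio.h>
#include <errno.h>
#include <stdio.h>
#include <time.h>

static void
wait_for_any(const struct aiocb *list[], int nent)
{
        struct timespec ts;

        ts.tv_sec = 1;
        ts.tv_nsec = 0;
        if (aio_suspend(list, nent, &ts) == -1) {
                if (errno == EAGAIN)
                        printf("no request finished within 1s\n");
                else if (errno == EINTR)
                        printf("interrupted by a signal\n");
        } else
                printf("at least one request has completed\n");
}
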
1723/*
1724 * aio_cancel cancels any non-physio aio operations not currently in
1725 * progress.
1726 */
1727int
1728aio_cancel(struct thread *td, struct aio_cancel_args *uap)
1729{
1730 struct proc *p = td->td_proc;
1731 struct kaioinfo *ki;
1732 struct aiocblist *cbe, *cbn;
1733 struct file *fp;
1734 struct filedesc *fdp;
1735 struct socket *so;
1736 struct proc *po;
1737 int s, error;
1738 int cancelled = 0;
1739 int notcancelled = 0;
1740 struct vnode *vp;
1741
1742 fdp = p->p_fd;
1743 if ((u_int)uap->fd >= fdp->fd_nfiles ||
1744 (fp = fdp->fd_ofiles[uap->fd]) == NULL)
1745 return (EBADF);
1746
1747 if (fp->f_type == DTYPE_VNODE) {
1748 vp = (struct vnode *)fp->f_data;
1749
1750 if (vn_isdisk(vp,&error)) {
1751 td->td_retval[0] = AIO_NOTCANCELED;
1752 return 0;
1753 }
1754 } else if (fp->f_type == DTYPE_SOCKET) {
1755 so = (struct socket *)fp->f_data;
1756
1757 s = splnet();
1758
1759 for (cbe = TAILQ_FIRST(&so->so_aiojobq); cbe; cbe = cbn) {
1760 cbn = TAILQ_NEXT(cbe, list);
1761 if ((uap->aiocbp == NULL) ||
1762 (uap->aiocbp == cbe->uuaiocb) ) {
1763 po = cbe->userproc;
1764 ki = po->p_aioinfo;
1765 TAILQ_REMOVE(&so->so_aiojobq, cbe, list);
1766 TAILQ_REMOVE(&ki->kaio_sockqueue, cbe, plist);
1767 TAILQ_INSERT_TAIL(&ki->kaio_jobdone, cbe, plist);
1768 if (ki->kaio_flags & KAIO_WAKEUP) {
1769 wakeup(po);
1770 }
1771 cbe->jobstate = JOBST_JOBFINISHED;
1772 cbe->uaiocb._aiocb_private.status = -1;
1773 cbe->uaiocb._aiocb_private.error = ECANCELED;
1774 cancelled++;
1775/* XXX cancelled, knote? */
1776 if (cbe->uaiocb.aio_sigevent.sigev_notify ==
1777 SIGEV_SIGNAL) {
1778 PROC_LOCK(cbe->userproc);
1779 psignal(cbe->userproc, cbe->uaiocb.aio_sigevent.sigev_signo);
1780 PROC_UNLOCK(cbe->userproc);
1781 }
1782 if (uap->aiocbp)
1783 break;
1784 }
1785 }
1786 splx(s);
1787
1788 if ((cancelled) && (uap->aiocbp)) {
1789 td->td_retval[0] = AIO_CANCELED;
1790 return 0;
1791 }
1792 }
1793 ki = p->p_aioinfo;
1794 s = splnet();
1795
1796 for (cbe = TAILQ_FIRST(&ki->kaio_jobqueue); cbe; cbe = cbn) {
1797 cbn = TAILQ_NEXT(cbe, plist);
1798
1799 if ((uap->fd == cbe->uaiocb.aio_fildes) &&
1800 ((uap->aiocbp == NULL ) ||
1801 (uap->aiocbp == cbe->uuaiocb))) {
1802
1803 if (cbe->jobstate == JOBST_JOBQGLOBAL) {
1804 TAILQ_REMOVE(&aio_jobs, cbe, list);
1805 TAILQ_REMOVE(&ki->kaio_jobqueue, cbe, plist);
1806 TAILQ_INSERT_TAIL(&ki->kaio_jobdone, cbe,
1807 plist);
1808 cancelled++;
1809 ki->kaio_queue_finished_count++;
1810 cbe->jobstate = JOBST_JOBFINISHED;
1811 cbe->uaiocb._aiocb_private.status = -1;
1812 cbe->uaiocb._aiocb_private.error = ECANCELED;
1813/* XXX cancelled, knote? */
1814 if (cbe->uaiocb.aio_sigevent.sigev_notify ==
1815 SIGEV_SIGNAL) {
1816 PROC_LOCK(cbe->userproc);
1817 psignal(cbe->userproc, cbe->uaiocb.aio_sigevent.sigev_signo);
1818 PROC_UNLOCK(cbe->userproc);
1819 }
1820 } else {
1821 notcancelled++;
1822 }
1823 }
1824 }
1825 splx(s);
1826
1827 if (notcancelled) {
1828 td->td_retval[0] = AIO_NOTCANCELED;
1829 return 0;
1830 }
1831 if (cancelled) {
1832 td->td_retval[0] = AIO_CANCELED;
1833 return 0;
1834 }
1835 td->td_retval[0] = AIO_ALLDONE;
1836
1837 return 0;
1838}
1839
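/*
 * [Illustrative userland sketch -- not part of vfs_aio.c.]
 * Interpreting aio_cancel() as implemented above: jobs still sitting on a
 * queue are marked ECANCELED, raw-disk physio transfers are never cancelled,
 * and jobs that already finished leave nothing to cancel.
 */
#include <aio.h>
#include <stdio.h>

static void
cancel_one(int fd, struct aiocb *iocb)
{
        switch (aio_cancel(fd, iocb)) {
        case AIO_CANCELED:
                /* aio_error() now reports ECANCELED; still reap with aio_return(). */
                printf("cancelled\n");
                break;
        case AIO_NOTCANCELED:
                /* Typically a disk transfer already in flight; wait for it. */
                printf("still in progress\n");
                break;
        case AIO_ALLDONE:
                printf("already completed\n");
                break;
        default:
                perror("aio_cancel");
        }
}
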
1840/*
1841 * aio_error is implemented at the kernel level for compatibility purposes only.
1842 * For a user mode async implementation, it would be best to do it in a userland
1843 * subroutine.
1844 */
1845int
1846aio_error(struct thread *td, struct aio_error_args *uap)
1847{
1848 struct proc *p = td->td_proc;
1849 int s;
1850 struct aiocblist *cb;
1851 struct kaioinfo *ki;
1852 int jobref;
1853
1854 ki = p->p_aioinfo;
1855 if (ki == NULL)
1856 return EINVAL;
1857
1858 jobref = fuword(&uap->aiocbp->_aiocb_private.kernelinfo);
1859 if ((jobref == -1) || (jobref == 0))
1860 return EINVAL;
1861
1862 TAILQ_FOREACH(cb, &ki->kaio_jobdone, plist) {
1863 if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
1864 jobref) {
1865 td->td_retval[0] = cb->uaiocb._aiocb_private.error;
1866 return 0;
1867 }
1868 }
1869
1870 s = splnet();
1871
1872 for (cb = TAILQ_FIRST(&ki->kaio_jobqueue); cb; cb = TAILQ_NEXT(cb,
1873 plist)) {
1874 if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
1875 jobref) {
1876 td->td_retval[0] = EINPROGRESS;
1877 splx(s);
1878 return 0;
1879 }
1880 }
1881
1882 for (cb = TAILQ_FIRST(&ki->kaio_sockqueue); cb; cb = TAILQ_NEXT(cb,
1883 plist)) {
1884 if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
1885 jobref) {
1886 td->td_retval[0] = EINPROGRESS;
1887 splx(s);
1888 return 0;
1889 }
1890 }
1891 splx(s);
1892
1893 s = splbio();
1894 for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb = TAILQ_NEXT(cb,
1895 plist)) {
1896 if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
1897 jobref) {
1898 td->td_retval[0] = cb->uaiocb._aiocb_private.error;
1899 splx(s);
1900 return 0;
1901 }
1902 }
1903
1904 for (cb = TAILQ_FIRST(&ki->kaio_bufqueue); cb; cb = TAILQ_NEXT(cb,
1905 plist)) {
1906 if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
1907 jobref) {
1908 td->td_retval[0] = EINPROGRESS;
1909 splx(s);
1910 return 0;
1911 }
1912 }
1913 splx(s);
1914
1915#if (0)
1916 /*
1917 * Hack for lio.
1918 */
1919 status = fuword(&uap->aiocbp->_aiocb_private.status);
1920 if (status == -1)
1921 return fuword(&uap->aiocbp->_aiocb_private.error);
1922#endif
1923 return EINVAL;
1924}
1925
1926int
1927aio_read(struct thread *td, struct aio_read_args *uap)
1928{
1929
1930 return aio_aqueue(td, uap->aiocbp, LIO_READ);
1931}
1932
1933int
1934aio_write(struct thread *td, struct aio_write_args *uap)
1935{
1936
1937 return aio_aqueue(td, uap->aiocbp, LIO_WRITE);
1938}
1939
1940int
1941lio_listio(struct thread *td, struct lio_listio_args *uap)
1942{
1943 struct proc *p = td->td_proc;
1944 int nent, nentqueued;
1945 struct aiocb *iocb, * const *cbptr;
1946 struct aiocblist *cb;
1947 struct kaioinfo *ki;
1948 struct aio_liojob *lj;
1949 int error, runningcode;
1950 int nerror;
1951 int i;
1952 int s;
1953
1954 if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
1955 return EINVAL;
1956
1957 nent = uap->nent;
1958 if (nent > AIO_LISTIO_MAX)
1959 return EINVAL;
1960
1961 if (p->p_aioinfo == NULL)
1962 aio_init_aioinfo(p);
1963
1964 if ((nent + num_queue_count) > max_queue_count)
1965 return EAGAIN;
1966
1967 ki = p->p_aioinfo;
1968 if ((nent + ki->kaio_queue_count) > ki->kaio_qallowed_count)
1969 return EAGAIN;
1970
1971 lj = zalloc(aiolio_zone);
1972 if (!lj)
1973 return EAGAIN;
1974
1975 lj->lioj_flags = 0;
1976 lj->lioj_buffer_count = 0;
1977 lj->lioj_buffer_finished_count = 0;
1978 lj->lioj_queue_count = 0;
1979 lj->lioj_queue_finished_count = 0;
1980 lj->lioj_ki = ki;
1981
1982 /*
1983 * Setup signal.
1984 */
1985 if (uap->sig && (uap->mode == LIO_NOWAIT)) {
1986 error = copyin(uap->sig, &lj->lioj_signal,
1987 sizeof(lj->lioj_signal));
1988 if (error) {
1989 zfree(aiolio_zone, lj);
1990 return error;
1991 }
1992 if (!_SIG_VALID(lj->lioj_signal.sigev_signo)) {
1993 zfree(aiolio_zone, lj);
1994 return EINVAL;
1995 }
1996 lj->lioj_flags |= LIOJ_SIGNAL;
1997 lj->lioj_flags &= ~LIOJ_SIGNAL_POSTED;
1998 } else
1999 lj->lioj_flags &= ~LIOJ_SIGNAL;
2000
2001 TAILQ_INSERT_TAIL(&ki->kaio_liojoblist, lj, lioj_list);
2002 /*
2003 * Get pointers to the list of I/O requests.
2004 */
2005 nerror = 0;
2006 nentqueued = 0;
2007 cbptr = uap->acb_list;
2008 for (i = 0; i < uap->nent; i++) {
2009 iocb = (struct aiocb *)(intptr_t)fuword((caddr_t)&cbptr[i]);
2010 if (((intptr_t)iocb != -1) && ((intptr_t)iocb != NULL)) {
2011 error = _aio_aqueue(td, iocb, lj, 0);
2012 if (error == 0)
2013 nentqueued++;
2014 else
2015 nerror++;
2016 }
2017 }
2018
2019 /*
2020 * If we haven't queued any, then just return error.
2021 */
2022 if (nentqueued == 0)
2023 return 0;
2024
2025 /*
2026 * Calculate the appropriate error return.
2027 */
2028 runningcode = 0;
2029 if (nerror)
2030 runningcode = EIO;
2031
2032 if (uap->mode == LIO_WAIT) {
2033 int command, found, jobref;
2034
2035 for (;;) {
2036 found = 0;
2037 for (i = 0; i < uap->nent; i++) {
2038 /*
2039 * Fetch address of the control buf pointer in
2040 * user space.
2041 */
2042 iocb = (struct aiocb *)(intptr_t)fuword((caddr_t)&cbptr[i]);
2043 if (((intptr_t)iocb == -1) || ((intptr_t)iocb
2044 == 0))
2045 continue;
2046
2047 /*
2048 * Fetch the associated command from user space.
2049 */
2050 command = fuword(&iocb->aio_lio_opcode);
2051 if (command == LIO_NOP) {
2052 found++;
2053 continue;
2054 }
2055
2056 jobref = fuword(&iocb->_aiocb_private.kernelinfo);
2057
2058 TAILQ_FOREACH(cb, &ki->kaio_jobdone, plist) {
2059 if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo)
2060 == jobref) {
2061 if (cb->uaiocb.aio_lio_opcode
2062 == LIO_WRITE) {
2063 p->p_stats->p_ru.ru_oublock
2064 +=
2065 cb->outputcharge;
2066 cb->outputcharge = 0;
2067 } else if (cb->uaiocb.aio_lio_opcode
2068 == LIO_READ) {
2069 p->p_stats->p_ru.ru_inblock
2070 += cb->inputcharge;
2071 cb->inputcharge = 0;
2072 }
2073 found++;
2074 break;
2075 }
2076 }
2077
2078 s = splbio();
2079 TAILQ_FOREACH(cb, &ki->kaio_bufdone, plist) {
2080 if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo)
2081 == jobref) {
2082 found++;
2083 break;
2084 }
2085 }
2086 splx(s);
2087 }
2088
2089 /*
2090 * If all I/Os have been disposed of, then we can
2091 * return.
2092 */
2093 if (found == nentqueued)
2094 return runningcode;
2095
2096 ki->kaio_flags |= KAIO_WAKEUP;
2097 error = tsleep(p, PRIBIO | PCATCH, "aiospn", 0);
2098
2099 if (error == EINTR)
2100 return EINTR;
2101 else if (error == EWOULDBLOCK)
2102 return EAGAIN;
2103 }
2104 }
2105
2106 return runningcode;
2107}
2108
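/*
 * [Illustrative userland sketch -- not part of vfs_aio.c.]
 * Submitting a batch through lio_listio() as implemented above: LIO_WAIT
 * blocks until every successfully queued entry has completed, and entries
 * may mix LIO_READ, LIO_WRITE and LIO_NOP opcodes.
 */
#include <aio.h>
#include <stdio.h>
#include <string.h>

static int
batch_read_write(int fd_in, int fd_out, char *inbuf, char *outbuf, size_t len)
{
        struct aiocb rd, wr;
        struct aiocb *list[2];

        memset(&rd, 0, sizeof(rd));
        rd.aio_fildes = fd_in;
        rd.aio_buf = inbuf;
        rd.aio_nbytes = len;
        rd.aio_lio_opcode = LIO_READ;

        memset(&wr, 0, sizeof(wr));
        wr.aio_fildes = fd_out;
        wr.aio_buf = outbuf;
        wr.aio_nbytes = len;
        wr.aio_lio_opcode = LIO_WRITE;

        list[0] = &rd;
        list[1] = &wr;

        /* LIO_WAIT: do not return until both transfers are done. */
        if (lio_listio(LIO_WAIT, list, 2, NULL) == -1) {
                perror("lio_listio");
                return (-1);
        }
        printf("read %zd, wrote %zd bytes\n", aio_return(&rd), aio_return(&wr));
        return (0);
}
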
2109/*
2110 * This is a weird hack so that we can post a signal. It is safe to do so from
2111 * a timeout routine, but *not* from an interrupt routine.
2112 */
2113static void
2114process_signal(void *aioj)
2115{
2116 struct aiocblist *aiocbe = aioj;
2117 struct aio_liojob *lj = aiocbe->lio;
2118 struct aiocb *cb = &aiocbe->uaiocb;
2119
2120 if ((lj) && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL) &&
2121 (lj->lioj_queue_count == lj->lioj_queue_finished_count)) {
2122 PROC_LOCK(lj->lioj_ki->kaio_p);
2123 psignal(lj->lioj_ki->kaio_p, lj->lioj_signal.sigev_signo);
2124 PROC_UNLOCK(lj->lioj_ki->kaio_p);
2125 lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
2126 }
2127
2128 if (cb->aio_sigevent.sigev_notify == SIGEV_SIGNAL) {
2129 PROC_LOCK(aiocbe->userproc);
2130 psignal(aiocbe->userproc, cb->aio_sigevent.sigev_signo);
2131 PROC_UNLOCK(aiocbe->userproc);
2132 }
2133}
2134
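/*
 * [Illustrative userland sketch -- not part of vfs_aio.c.]
 * The SIGEV_SIGNAL notification driven by process_signal() above: the caller
 * installs a handler and asks for a signal when the request completes.  This
 * revision posts a plain signal, so no siginfo value is relied upon here.
 */
#include <sys/types.h>
#include <aio.h>
#include <signal.h>
#include <string.h>
#include <unistd.h>

static volatile sig_atomic_t aio_done;

static void
on_aio_signal(int sig)
{
        (void)sig;
        aio_done = 1;           /* async-signal-safe: just set a flag */
}

static ssize_t
read_with_signal(int fd, char *buf, size_t len)
{
        struct sigaction sa;
        struct aiocb iocb;
        sigset_t block, waitmask;

        memset(&sa, 0, sizeof(sa));
        sa.sa_handler = on_aio_signal;
        sigemptyset(&sa.sa_mask);
        sigaction(SIGUSR1, &sa, NULL);

        /* Block SIGUSR1 so the completion signal cannot race sigsuspend(). */
        sigemptyset(&block);
        sigaddset(&block, SIGUSR1);
        sigprocmask(SIG_BLOCK, &block, &waitmask);
        sigdelset(&waitmask, SIGUSR1);

        aio_done = 0;
        memset(&iocb, 0, sizeof(iocb));
        iocb.aio_fildes = fd;
        iocb.aio_buf = buf;
        iocb.aio_nbytes = len;
        iocb.aio_sigevent.sigev_notify = SIGEV_SIGNAL;
        iocb.aio_sigevent.sigev_signo = SIGUSR1;

        if (aio_read(&iocb) == -1)
                return (-1);
        while (!aio_done)
                sigsuspend(&waitmask);  /* atomically unblock and wait */
        return (aio_return(&iocb));
}
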
2135/*
2136 * Interrupt handler for physio; performs the necessary process wakeups and
2137 * signals.
2138 */
2139static void
2140aio_physwakeup(struct buf *bp)
2141{
2142 struct aiocblist *aiocbe;
2143 struct proc *p;
2144 struct kaioinfo *ki;
2145 struct aio_liojob *lj;
2146
2147 wakeup(bp);
2148
2149 aiocbe = (struct aiocblist *)bp->b_spc;
2150 if (aiocbe) {
2151 p = bp->b_caller1;
2152
2153 aiocbe->jobstate = JOBST_JOBBFINISHED;
2154 aiocbe->uaiocb._aiocb_private.status -= bp->b_resid;
2155 aiocbe->uaiocb._aiocb_private.error = 0;
2156 aiocbe->jobflags |= AIOCBLIST_DONE;
2157
2158 if (bp->b_ioflags & BIO_ERROR)
2159 aiocbe->uaiocb._aiocb_private.error = bp->b_error;
2160
2161 lj = aiocbe->lio;
2162 if (lj) {
2163 lj->lioj_buffer_finished_count++;
2164
2165 /*
2166 * wakeup/signal if all of the interrupt jobs are done.
2167 */
2168 if (lj->lioj_buffer_finished_count ==
2169 lj->lioj_buffer_count) {
2170 /*
2171 * Post a signal if it is called for.
2172 */
2173 if ((lj->lioj_flags &
2174 (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) ==
2175 LIOJ_SIGNAL) {
2176 lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
2177 aiocbe->timeouthandle =
2178 timeout(process_signal,
2179 aiocbe, 0);
2180 }
2181 }
2182 }
2183
2184 ki = p->p_aioinfo;
2185 if (ki) {
2186 ki->kaio_buffer_finished_count++;
2187 TAILQ_REMOVE(&aio_bufjobs, aiocbe, list);
2188 TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist);
2189 TAILQ_INSERT_TAIL(&ki->kaio_bufdone, aiocbe, plist);
2190
2191 KNOTE(&aiocbe->klist, 0);
2192 /* Do the wakeup. */
2193 if (ki->kaio_flags & (KAIO_RUNDOWN|KAIO_WAKEUP)) {
2194 ki->kaio_flags &= ~KAIO_WAKEUP;
2195 wakeup(p);
2196 }
2197 }
2198
2199 if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL)
2200 aiocbe->timeouthandle =
2201 timeout(process_signal, aiocbe, 0);
2202 }
2203}
2204
2205int
2206aio_waitcomplete(struct thread *td, struct aio_waitcomplete_args *uap)
2207{
2208 struct proc *p = td->td_proc;
2209 struct timeval atv;
2210 struct timespec ts;
2211 struct aiocb **cbptr;
2212 struct kaioinfo *ki;
2213 struct aiocblist *cb = NULL;
2214 int error, s, timo;
2215
2216 suword(uap->aiocbp, (int)NULL);
2217
2218 timo = 0;
2219 if (uap->timeout) {
2220 /* Get timespec struct. */
2221 error = copyin(uap->timeout, &ts, sizeof(ts));
2222 if (error)
2223 return error;
2224
2225 if ((ts.tv_nsec < 0) || (ts.tv_nsec >= 1000000000))
2226 return (EINVAL);
2227
2228 TIMESPEC_TO_TIMEVAL(&atv, &ts);
2229 if (itimerfix(&atv))
2230 return (EINVAL);
2231 timo = tvtohz(&atv);
2232 }
2233
2234 ki = p->p_aioinfo;
2235 if (ki == NULL)
2236 return EAGAIN;
2237
2238 cbptr = uap->aiocbp;
2239
2240 for (;;) {
2241 if ((cb = TAILQ_FIRST(&ki->kaio_jobdone)) != 0) {
2242 suword(uap->aiocbp, (uintptr_t)cb->uuaiocb);
2243 td->td_retval[0] = cb->uaiocb._aiocb_private.status;
2244 if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
2245 p->p_stats->p_ru.ru_oublock +=
2246 cb->outputcharge;
2247 cb->outputcharge = 0;
2248 } else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
2249 p->p_stats->p_ru.ru_inblock += cb->inputcharge;
2250 cb->inputcharge = 0;
2251 }
2252 aio_free_entry(cb);
2253 return cb->uaiocb._aiocb_private.error;
2254 }
2255
2256 s = splbio();
2257 if ((cb = TAILQ_FIRST(&ki->kaio_bufdone)) != 0 ) {
2258 splx(s);
2259 suword(uap->aiocbp, (uintptr_t)cb->uuaiocb);
2260 td->td_retval[0] = cb->uaiocb._aiocb_private.status;
2261 aio_free_entry(cb);
2262 return cb->uaiocb._aiocb_private.error;
2263 }
2264
2265 ki->kaio_flags |= KAIO_WAKEUP;
2266 error = tsleep(p, PRIBIO | PCATCH, "aiowc", timo);
2267 splx(s);
2268
2269 if (error == ERESTART)
2270 return EINTR;
2271 else if (error < 0)
2272 return error;
2273 else if (error == EINTR)
2274 return EINTR;
2275 else if (error == EWOULDBLOCK)
2276 return EAGAIN;
2277 }
2278}
2279
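/*
 * [Illustrative userland sketch -- not part of vfs_aio.c.]
 * aio_waitcomplete() is the FreeBSD-specific "hand me whatever finishes
 * next" interface implemented above; it waits and reaps in one step, so no
 * separate aio_return() call is needed.  The userland prototype assumed here
 * is aio_waitcomplete(struct aiocb **, struct timespec *).
 */
#include <sys/types.h>
#include <aio.h>
#include <errno.h>
#include <stdio.h>
#include <time.h>

static void
reap_next_completion(void)
{
        struct aiocb *done;
        struct timespec ts;
        ssize_t n;

        ts.tv_sec = 5;
        ts.tv_nsec = 0;
        n = aio_waitcomplete(&done, &ts);
        if (n == -1) {
                if (errno == EAGAIN)
                        printf("nothing completed within 5 seconds\n");
                else
                        perror("aio_waitcomplete");
                return;
        }
        printf("request %p transferred %zd bytes\n", (void *)done, n);
}
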
2280static int
2281filt_aioattach(struct knote *kn)
2282{
2283 struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_id;
2284
2285 /*
2286 * The aiocbe pointer must be validated before using it, so
2287 * registration is restricted to the kernel; the user cannot
2288 * set EV_FLAG1.
2289 */
2290 if ((kn->kn_flags & EV_FLAG1) == 0)
2291 return (EPERM);
2292 kn->kn_flags &= ~EV_FLAG1;
2293
2294 SLIST_INSERT_HEAD(&aiocbe->klist, kn, kn_selnext);
2295
2296 return (0);
2297}
2298
2299static void
2300filt_aiodetach(struct knote *kn)
2301{
2302 struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_id;
2303
2304 SLIST_REMOVE(&aiocbe->klist, kn, knote, kn_selnext);
2305}
2306
2307/*ARGSUSED*/
2308static int
2309filt_aio(struct knote *kn, long hint)
2310{
2311 struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_id;
2312
2313 kn->kn_data = aiocbe->uaiocb._aiocb_private.error;
2314 if (aiocbe->jobstate != JOBST_JOBFINISHED &&
2315 aiocbe->jobstate != JOBST_JOBBFINISHED)
2316 return (0);
2317 kn->kn_flags |= EV_EOF;
2318 return (1);
2319}
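/*
 * [Illustrative userland sketch -- not part of vfs_aio.c.]
 * Kevent-based completion as wired up by the EVFILT_AIO filter above: the
 * request carries SIGEV_KEVENT plus the target kqueue descriptor, and the
 * udata of the delivered event is the sigev_value pointer stored at
 * submission time.  Assumption: the sigval member is spelled sival_ptr in
 * current headers (older FreeBSD headers spelled it sigval_ptr).
 */
#include <sys/types.h>
#include <sys/event.h>
#include <aio.h>
#include <stdio.h>
#include <string.h>

static void
kqueue_aio_demo(int fd, char *buf, size_t len)
{
        struct aiocb iocb, *done;
        struct kevent ev;
        int kq;

        kq = kqueue();
        if (kq == -1)
                return;

        memset(&iocb, 0, sizeof(iocb));
        iocb.aio_fildes = fd;
        iocb.aio_buf = buf;
        iocb.aio_nbytes = len;
        iocb.aio_sigevent.sigev_notify = SIGEV_KEVENT;
        iocb.aio_sigevent.sigev_notify_kqueue = kq;
        iocb.aio_sigevent.sigev_value.sival_ptr = &iocb;

        if (aio_read(&iocb) == -1)
                return;

        /* Block until the EVFILT_AIO event for this request fires. */
        if (kevent(kq, NULL, 0, &ev, 1, NULL) == 1) {
                done = ev.udata;        /* sigev_value round-trips here */
                printf("aio done, %zd bytes\n", aio_return(done));
        }
}
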
794 /* Mark special process type. */
795 mycp->p_flag |= P_SYSTEM;
796
797 /*
798 * Wake up the parent process. (Parent sleeps to keep from blasting away
799 * and creating too many daemons.)
800 */
801 wakeup(mycp);
802
803 for (;;) {
804 /*
805 * curcp is the current daemon process context.
806 * userp is the current user process context.
807 */
808 curcp = mycp;
809
810 /*
811 * Take daemon off of free queue
812 */
813 if (aiop->aiothreadflags & AIOP_FREE) {
814 s = splnet();
815 TAILQ_REMOVE(&aio_freeproc, aiop, list);
816 TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list);
817 aiop->aiothreadflags &= ~AIOP_FREE;
818 splx(s);
819 }
820 aiop->aiothreadflags &= ~AIOP_SCHED;
821
822 /*
823 * Check for jobs.
824 */
825 while ((aiocbe = aio_selectjob(aiop)) != NULL) {
826 cb = &aiocbe->uaiocb;
827 userp = aiocbe->userproc;
828
829 aiocbe->jobstate = JOBST_JOBRUNNING;
830
831 /*
832 * Connect to process address space for user program.
833 */
834 if (userp != curcp) {
835 /*
836 * Save the current address space that we are
837 * connected to.
838 */
839 tmpvm = mycp->p_vmspace;
840
841 /*
842 * Point to the new user address space, and
843 * refer to it.
844 */
845 mycp->p_vmspace = userp->p_vmspace;
846 mycp->p_vmspace->vm_refcnt++;
847
848 /* Activate the new mapping. */
849 pmap_activate(FIRST_THREAD_IN_PROC(mycp));
850
851 /*
852 * If the old address space wasn't the daemon's
853 * own address space, then we need to remove the
854 * daemon's reference from the other process
855 * that it was acting on behalf of.
856 */
857 if (tmpvm != myvm) {
858 vmspace_free(tmpvm);
859 }
860
861 /*
862 * Disassociate from the previous client's file
863 * descriptors, and associate with the new client's
864 * descriptors. Note that the daemon doesn't
865 * need to worry about its original descriptors,
866 * because they were originally freed.
867 */
868 if (mycp->p_fd)
869 fdfree(td);
870 mycp->p_fd = fdshare(userp);
871 curcp = userp;
872 }
873
874 ki = userp->p_aioinfo;
875 lj = aiocbe->lio;
876
877 /* Account for currently active jobs. */
878 ki->kaio_active_count++;
879
880 /* Do the I/O function. */
881 aiocbe->jobaiothread = aiop;
882 aio_process(aiocbe);
883
884 /* Decrement the active job count. */
885 ki->kaio_active_count--;
886
887 /*
888 * Increment the completion count for wakeup/signal
889 * comparisons.
890 */
891 aiocbe->jobflags |= AIOCBLIST_DONE;
892 ki->kaio_queue_finished_count++;
893 if (lj)
894 lj->lioj_queue_finished_count++;
895 if ((ki->kaio_flags & KAIO_WAKEUP) || ((ki->kaio_flags
896 & KAIO_RUNDOWN) && (ki->kaio_active_count == 0))) {
897 ki->kaio_flags &= ~KAIO_WAKEUP;
898 wakeup(userp);
899 }
900
901 s = splbio();
902 if (lj && (lj->lioj_flags &
903 (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) == LIOJ_SIGNAL) {
904 if ((lj->lioj_queue_finished_count ==
905 lj->lioj_queue_count) &&
906 (lj->lioj_buffer_finished_count ==
907 lj->lioj_buffer_count)) {
908 PROC_LOCK(userp);
909 psignal(userp,
910 lj->lioj_signal.sigev_signo);
911 PROC_UNLOCK(userp);
912 lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
913 }
914 }
915 splx(s);
916
917 aiocbe->jobstate = JOBST_JOBFINISHED;
918
919 /*
920 * If the I/O request should be automatically rundown,
921 * do the needed cleanup. Otherwise, place the queue
922 * entry for the just finished I/O request into the done
923 * queue for the associated client.
924 */
925 s = splnet();
926 if (aiocbe->jobflags & AIOCBLIST_ASYNCFREE) {
927 aiocbe->jobflags &= ~AIOCBLIST_ASYNCFREE;
928 zfree(aiocb_zone, aiocbe);
929 } else {
930 TAILQ_REMOVE(&ki->kaio_jobqueue, aiocbe, plist);
931 TAILQ_INSERT_TAIL(&ki->kaio_jobdone, aiocbe,
932 plist);
933 }
934 splx(s);
935 KNOTE(&aiocbe->klist, 0);
936
937 if (aiocbe->jobflags & AIOCBLIST_RUNDOWN) {
938 wakeup(aiocbe);
939 aiocbe->jobflags &= ~AIOCBLIST_RUNDOWN;
940 }
941
942 if (cb->aio_sigevent.sigev_notify == SIGEV_SIGNAL) {
943 PROC_LOCK(userp);
944 psignal(userp, cb->aio_sigevent.sigev_signo);
945 PROC_UNLOCK(userp);
946 }
947 }
948
949 /*
950 * Disconnect from user address space.
951 */
952 if (curcp != mycp) {
953 /* Get the user address space to disconnect from. */
954 tmpvm = mycp->p_vmspace;
955
956 /* Get original address space for daemon. */
957 mycp->p_vmspace = myvm;
958
959 /* Activate the daemon's address space. */
960 pmap_activate(FIRST_THREAD_IN_PROC(mycp));
961#ifdef DIAGNOSTIC
962 if (tmpvm == myvm) {
963 printf("AIOD: vmspace problem -- %d\n",
964 mycp->p_pid);
965 }
966#endif
967 /* Remove our vmspace reference. */
968 vmspace_free(tmpvm);
969
970 /*
971 * Disassociate from the user process's file
972 * descriptors.
973 */
974 if (mycp->p_fd)
975 fdfree(td);
976 mycp->p_fd = NULL;
977 curcp = mycp;
978 }
979
980 /*
981 * If we are the first to be put onto the free queue, wake up
982 * anyone waiting for a daemon.
983 */
984 s = splnet();
985 TAILQ_REMOVE(&aio_activeproc, aiop, list);
986 if (TAILQ_EMPTY(&aio_freeproc))
987 wakeup(&aio_freeproc);
988 TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);
989 aiop->aiothreadflags |= AIOP_FREE;
990 splx(s);
991
992 /*
993 * If daemon is inactive for a long time, allow it to exit,
994 * thereby freeing resources.
995 */
996 if ((aiop->aiothreadflags & AIOP_SCHED) == 0 &&
997 tsleep(aiop->aiothread, PRIBIO, "aiordy", aiod_lifetime)) {
998 s = splnet();
999 if (TAILQ_EMPTY(&aio_jobs)) {
1000 if ((aiop->aiothreadflags & AIOP_FREE) &&
1001 (num_aio_procs > target_aio_procs)) {
1002 TAILQ_REMOVE(&aio_freeproc, aiop, list);
1003 splx(s);
1004 zfree(aiop_zone, aiop);
1005 num_aio_procs--;
1006#ifdef DIAGNOSTIC
1007 if (mycp->p_vmspace->vm_refcnt <= 1) {
1008 printf("AIOD: bad vm refcnt for"
1009 " exiting daemon: %d\n",
1010 mycp->p_vmspace->vm_refcnt);
1011 }
1012#endif
1013 kthread_exit(0);
1014 }
1015 }
1016 splx(s);
1017 }
1018 }
1019}
1020
1021/*
1022 * Create a new AIO daemon. This is mostly a kernel-thread fork routine. The
1023 * AIO daemon modifies its environment itself.
1024 */
1025static int
1026aio_newproc()
1027{
1028 int error;
1029 struct proc *p;
1030
1031 error = kthread_create(aio_daemon, curproc, &p, RFNOWAIT, "aiod%d",
1032 num_aio_procs);
1033 if (error)
1034 return error;
1035
1036 /*
1037 * Wait until the daemon has started, but continue on anyway so that
1038 * error conditions are still handled.
1039 */
1040 error = tsleep(p, PZERO, "aiosta", aiod_timeout);
1041
1042 num_aio_procs++;
1043
1044 return error;
1045}
1046
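/*
 * [Illustrative userland sketch -- not part of vfs_aio.c.]
 * The aiod pool grown by aio_newproc() above is bounded by the tunables
 * near the top of this file.  The vfs.aio.num_aio_procs and
 * vfs.aio.max_aio_procs sysctl names used below are assumptions and may
 * differ by release.
 */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

static void
show_aiod_pool(void)
{
        int cur, max;
        size_t len;

        len = sizeof(cur);
        if (sysctlbyname("vfs.aio.num_aio_procs", &cur, &len, NULL, 0) != 0)
                return;
        len = sizeof(max);
        if (sysctlbyname("vfs.aio.max_aio_procs", &max, &len, NULL, 0) != 0)
                return;
        printf("aio daemons: %d active, limit %d\n", cur, max);
}
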
1047/*
1048 * Try the high-performance, low-overhead physio method for eligible
1049 * VCHR devices. This method doesn't use an aio helper thread, and
1050 * thus has very low overhead.
1051 *
1052 * Assumes that the caller, _aio_aqueue(), has incremented the file
1053 * structure's reference count, preventing its deallocation for the
1054 * duration of this call.
1055 */
1056static int
1057aio_qphysio(struct proc *p, struct aiocblist *aiocbe)
1058{
1059 int error;
1060 struct aiocb *cb;
1061 struct file *fp;
1062 struct buf *bp;
1063 struct vnode *vp;
1064 struct kaioinfo *ki;
1065 struct aio_liojob *lj;
1066 int s;
1067 int notify;
1068
1069 cb = &aiocbe->uaiocb;
1070 fp = aiocbe->fd_file;
1071
1072 if (fp->f_type != DTYPE_VNODE)
1073 return (-1);
1074
1075 vp = (struct vnode *)fp->f_data;
1076
1077 /*
1078 * If it's not a disk, we don't want to return a positive error.
1079 * A positive error would keep the aio code from falling through to
1080 * the thread-based path when the target is a regular file.
1081 */
1082 if (!vn_isdisk(vp, &error)) {
1083 if (error == ENOTBLK)
1084 return (-1);
1085 else
1086 return (error);
1087 }
1088
1089 if (cb->aio_nbytes % vp->v_rdev->si_bsize_phys)
1090 return (-1);
1091
1092 if (cb->aio_nbytes >
1093 MAXPHYS - (((vm_offset_t) cb->aio_buf) & PAGE_MASK))
1094 return (-1);
1095
1096 ki = p->p_aioinfo;
1097 if (ki->kaio_buffer_count >= ki->kaio_ballowed_count)
1098 return (-1);
1099
1100 ki->kaio_buffer_count++;
1101
1102 lj = aiocbe->lio;
1103 if (lj)
1104 lj->lioj_buffer_count++;
1105
1106 /* Create and build a buffer header for a transfer. */
1107 bp = (struct buf *)getpbuf(NULL);
1108 BUF_KERNPROC(bp);
1109
1110 /*
1111 * Get a copy of the kva from the physical buffer.
1112 */
1113 bp->b_caller1 = p;
1114 bp->b_dev = vp->v_rdev;
1115 error = bp->b_error = 0;
1116
1117 bp->b_bcount = cb->aio_nbytes;
1118 bp->b_bufsize = cb->aio_nbytes;
1119 bp->b_flags = B_PHYS;
1120 bp->b_iodone = aio_physwakeup;
1121 bp->b_saveaddr = bp->b_data;
1122 bp->b_data = (void *)(uintptr_t)cb->aio_buf;
1123 bp->b_blkno = btodb(cb->aio_offset);
1124
1125 if (cb->aio_lio_opcode == LIO_WRITE) {
1126 bp->b_iocmd = BIO_WRITE;
1127 if (!useracc(bp->b_data, bp->b_bufsize, VM_PROT_READ)) {
1128 error = EFAULT;
1129 goto doerror;
1130 }
1131 } else {
1132 bp->b_iocmd = BIO_READ;
1133 if (!useracc(bp->b_data, bp->b_bufsize, VM_PROT_WRITE)) {
1134 error = EFAULT;
1135 goto doerror;
1136 }
1137 }
1138
1139 /* Bring buffer into kernel space. */
1140 vmapbuf(bp);
1141
1142 s = splbio();
1143 aiocbe->bp = bp;
1144 bp->b_spc = (void *)aiocbe;
1145 TAILQ_INSERT_TAIL(&aio_bufjobs, aiocbe, list);
1146 TAILQ_INSERT_TAIL(&ki->kaio_bufqueue, aiocbe, plist);
1147 aiocbe->jobstate = JOBST_JOBQBUF;
1148 cb->_aiocb_private.status = cb->aio_nbytes;
1149 num_buf_aio++;
1150 bp->b_error = 0;
1151
1152 splx(s);
1153
1154 /* Perform transfer. */
1155 DEV_STRATEGY(bp, 0);
1156
1157 notify = 0;
1158 s = splbio();
1159
1160 /*
1161 * If we had an error invoking the request, or an error in processing
1162 * the request before we have returned, we process it as an error in
1163 * transfer. Note that such an I/O error is not indicated immediately,
1164 * but is returned using the aio_error mechanism. In this case,
1165 * aio_suspend will return immediately.
1166 */
1167 if (bp->b_error || (bp->b_ioflags & BIO_ERROR)) {
1168 struct aiocb *job = aiocbe->uuaiocb;
1169
1170 aiocbe->uaiocb._aiocb_private.status = 0;
1171 suword(&job->_aiocb_private.status, 0);
1172 aiocbe->uaiocb._aiocb_private.error = bp->b_error;
1173 suword(&job->_aiocb_private.error, bp->b_error);
1174
1175 ki->kaio_buffer_finished_count++;
1176
1177 if (aiocbe->jobstate != JOBST_JOBBFINISHED) {
1178 aiocbe->jobstate = JOBST_JOBBFINISHED;
1179 aiocbe->jobflags |= AIOCBLIST_DONE;
1180 TAILQ_REMOVE(&aio_bufjobs, aiocbe, list);
1181 TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist);
1182 TAILQ_INSERT_TAIL(&ki->kaio_bufdone, aiocbe, plist);
1183 notify = 1;
1184 }
1185 }
1186 splx(s);
1187 if (notify)
1188 KNOTE(&aiocbe->klist, 0);
1189 return 0;
1190
1191doerror:
1192 ki->kaio_buffer_count--;
1193 if (lj)
1194 lj->lioj_buffer_count--;
1195 aiocbe->bp = NULL;
1196 relpbuf(bp, NULL);
1197 return error;
1198}
1199
1200/*
1201 * This waits/tests physio completion.
1202 */
1203static int
1204aio_fphysio(struct aiocblist *iocb)
1205{
1206 int s;
1207 struct buf *bp;
1208 int error;
1209
1210 bp = iocb->bp;
1211
1212 s = splbio();
1213 while ((bp->b_flags & B_DONE) == 0) {
1214 if (tsleep(bp, PRIBIO, "physstr", aiod_timeout)) {
1215 if ((bp->b_flags & B_DONE) == 0) {
1216 splx(s);
1217 return EINPROGRESS;
1218 } else
1219 break;
1220 }
1221 }
1222 splx(s);
1223
1224 /* Release mapping into kernel space. */
1225 vunmapbuf(bp);
1226 iocb->bp = 0;
1227
1228 error = 0;
1229
1230 /* Check for an error. */
1231 if (bp->b_ioflags & BIO_ERROR)
1232 error = bp->b_error;
1233
1234 relpbuf(bp, NULL);
1235 return (error);
1236}
1237
1238/*
1239 * Wake up aio requests that may be serviceable now.
1240 */
1241static void
1242aio_swake_cb(struct socket *so, struct sockbuf *sb)
1243{
1244 struct aiocblist *cb,*cbn;
1245 struct proc *p;
1246 struct kaioinfo *ki = NULL;
1247 int opcode, wakecount = 0;
1248 struct aiothreadlist *aiop;
1249
1250 if (sb == &so->so_snd) {
1251 opcode = LIO_WRITE;
1252 so->so_snd.sb_flags &= ~SB_AIO;
1253 } else {
1254 opcode = LIO_READ;
1255 so->so_rcv.sb_flags &= ~SB_AIO;
1256 }
1257
1258 for (cb = TAILQ_FIRST(&so->so_aiojobq); cb; cb = cbn) {
1259 cbn = TAILQ_NEXT(cb, list);
1260 if (opcode == cb->uaiocb.aio_lio_opcode) {
1261 p = cb->userproc;
1262 ki = p->p_aioinfo;
1263 TAILQ_REMOVE(&so->so_aiojobq, cb, list);
1264 TAILQ_REMOVE(&ki->kaio_sockqueue, cb, plist);
1265 TAILQ_INSERT_TAIL(&aio_jobs, cb, list);
1266 TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, cb, plist);
1267 wakecount++;
1268 if (cb->jobstate != JOBST_JOBQGLOBAL)
1269 panic("invalid queue value");
1270 }
1271 }
1272
1273 while (wakecount--) {
1274 if ((aiop = TAILQ_FIRST(&aio_freeproc)) != 0) {
1275 TAILQ_REMOVE(&aio_freeproc, aiop, list);
1276 TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list);
1277 aiop->aiothreadflags &= ~AIOP_FREE;
1278 wakeup(aiop->aiothread);
1279 }
1280 }
1281}
1282
1283/*
1284 * Queue a new AIO request. The choice between the aiod-thread technique and
1285 * direct physio for VCHR devices is made in this code.
1286 */
1287static int
1288_aio_aqueue(struct thread *td, struct aiocb *job, struct aio_liojob *lj, int type)
1289{
1290 struct proc *p = td->td_proc;
1291 struct filedesc *fdp;
1292 struct file *fp;
1293 unsigned int fd;
1294 struct socket *so;
1295 int s;
1296 int error;
1297 int opcode;
1298 struct aiocblist *aiocbe;
1299 struct aiothreadlist *aiop;
1300 struct kaioinfo *ki;
1301 struct kevent kev;
1302 struct kqueue *kq;
1303 struct file *kq_fp;
1304
1305 aiocbe = zalloc(aiocb_zone);
1306 aiocbe->inputcharge = 0;
1307 aiocbe->outputcharge = 0;
1308 callout_handle_init(&aiocbe->timeouthandle);
1309 SLIST_INIT(&aiocbe->klist);
1310
1311 suword(&job->_aiocb_private.status, -1);
1312 suword(&job->_aiocb_private.error, 0);
1313 suword(&job->_aiocb_private.kernelinfo, -1);
1314
1315 error = copyin(job, &aiocbe->uaiocb, sizeof(aiocbe->uaiocb));
1316 if (error) {
1317 suword(&job->_aiocb_private.error, error);
1318 zfree(aiocb_zone, aiocbe);
1319 return error;
1320 }
1321 if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL &&
1322 !_SIG_VALID(aiocbe->uaiocb.aio_sigevent.sigev_signo)) {
1323 zfree(aiocb_zone, aiocbe);
1324 return EINVAL;
1325 }
1326
1327 /* Save userspace address of the job info. */
1328 aiocbe->uuaiocb = job;
1329
1330 /* Get the opcode. */
1331 if (type != LIO_NOP)
1332 aiocbe->uaiocb.aio_lio_opcode = type;
1333 opcode = aiocbe->uaiocb.aio_lio_opcode;
1334
1335 /* Get the fd info for process. */
1336 fdp = p->p_fd;
1337
1338 /*
1339 * Range check file descriptor.
1340 */
1341 fd = aiocbe->uaiocb.aio_fildes;
1342 if (fd >= fdp->fd_nfiles) {
1343 zfree(aiocb_zone, aiocbe);
1344 if (type == 0)
1345 suword(&job->_aiocb_private.error, EBADF);
1346 return EBADF;
1347 }
1348
1349 fp = aiocbe->fd_file = fdp->fd_ofiles[fd];
1350 if ((fp == NULL) || ((opcode == LIO_WRITE) && ((fp->f_flag & FWRITE) ==
1351 0))) {
1352 zfree(aiocb_zone, aiocbe);
1353 if (type == 0)
1354 suword(&job->_aiocb_private.error, EBADF);
1355 return EBADF;
1356 }
1357
1358 if (aiocbe->uaiocb.aio_offset == -1LL) {
1359 zfree(aiocb_zone, aiocbe);
1360 if (type == 0)
1361 suword(&job->_aiocb_private.error, EINVAL);
1362 return EINVAL;
1363 }
1364
1365 error = suword(&job->_aiocb_private.kernelinfo, jobrefid);
1366 if (error) {
1367 zfree(aiocb_zone, aiocbe);
1368 if (type == 0)
1369 suword(&job->_aiocb_private.error, EINVAL);
1370 return error;
1371 }
1372
1373 aiocbe->uaiocb._aiocb_private.kernelinfo = (void *)(intptr_t)jobrefid;
1374 if (jobrefid == LONG_MAX)
1375 jobrefid = 1;
1376 else
1377 jobrefid++;
1378
1379 if (opcode == LIO_NOP) {
1380 zfree(aiocb_zone, aiocbe);
1381 if (type == 0) {
1382 suword(&job->_aiocb_private.error, 0);
1383 suword(&job->_aiocb_private.status, 0);
1384 suword(&job->_aiocb_private.kernelinfo, 0);
1385 }
1386 return 0;
1387 }
1388
1389 if ((opcode != LIO_READ) && (opcode != LIO_WRITE)) {
1390 zfree(aiocb_zone, aiocbe);
1391 if (type == 0) {
1392 suword(&job->_aiocb_private.status, 0);
1393 suword(&job->_aiocb_private.error, EINVAL);
1394 }
1395 return EINVAL;
1396 }
1397
1398 fhold(fp);
1399
1400 if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_KEVENT) {
1401 kev.ident = aiocbe->uaiocb.aio_sigevent.sigev_notify_kqueue;
1402 kev.udata = aiocbe->uaiocb.aio_sigevent.sigev_value.sigval_ptr;
1403 }
1404 else {
1405 /*
1406 * This method for requesting kevent-based notification won't
1407 * work on the alpha, since we're passing in a pointer
1408 * via aio_lio_opcode, which is an int. Use the SIGEV_KEVENT-
1409 * based method instead.
1410 */
1411 struct kevent *kevp;
1412
1413 kevp = (struct kevent *)(uintptr_t)job->aio_lio_opcode;
1414 if (kevp == NULL)
1415 goto no_kqueue;
1416
1417 error = copyin(kevp, &kev, sizeof(kev));
1418 if (error)
1419 goto aqueue_fail;
1420 }
1421 if ((u_int)kev.ident >= fdp->fd_nfiles ||
1422 (kq_fp = fdp->fd_ofiles[kev.ident]) == NULL ||
1423 (kq_fp->f_type != DTYPE_KQUEUE)) {
1424 error = EBADF;
1425 goto aqueue_fail;
1426 }
1427 kq = (struct kqueue *)kq_fp->f_data;
1428 kev.ident = (uintptr_t)aiocbe;
1429 kev.filter = EVFILT_AIO;
1430 kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1;
1431 error = kqueue_register(kq, &kev, td);
1432aqueue_fail:
1433 if (error) {
1434 zfree(aiocb_zone, aiocbe);
1435 if (type == 0)
1436 suword(&job->_aiocb_private.error, error);
1437 goto done;
1438 }
1439no_kqueue:
1440
1441 suword(&job->_aiocb_private.error, EINPROGRESS);
1442 aiocbe->uaiocb._aiocb_private.error = EINPROGRESS;
1443 aiocbe->userproc = p;
1444 aiocbe->jobflags = 0;
1445 aiocbe->lio = lj;
1446 ki = p->p_aioinfo;
1447
1448 if (fp->f_type == DTYPE_SOCKET) {
1449 /*
1450 * Alternate queueing for socket ops: Reach down into the
1451 * descriptor to get the socket data. Then check to see if the
1452 * socket is ready to be read or written (based on the requested
1453 * operation).
1454 *
1455 * If it is not ready for I/O, then queue the aiocbe on the
1456 * socket, and set the flags so we get a call when sbnotify()
1457 * happens.
1458 */
1459 so = (struct socket *)fp->f_data;
1460 s = splnet();
1461 if (((opcode == LIO_READ) && (!soreadable(so))) || ((opcode ==
1462 LIO_WRITE) && (!sowriteable(so)))) {
1463 TAILQ_INSERT_TAIL(&so->so_aiojobq, aiocbe, list);
1464 TAILQ_INSERT_TAIL(&ki->kaio_sockqueue, aiocbe, plist);
1465 if (opcode == LIO_READ)
1466 so->so_rcv.sb_flags |= SB_AIO;
1467 else
1468 so->so_snd.sb_flags |= SB_AIO;
1469 aiocbe->jobstate = JOBST_JOBQGLOBAL; /* XXX */
1470 ki->kaio_queue_count++;
1471 num_queue_count++;
1472 splx(s);
1473 error = 0;
1474 goto done;
1475 }
1476 splx(s);
1477 }
1478
1479 if ((error = aio_qphysio(p, aiocbe)) == 0)
1480 goto done;
1481 if (error > 0) {
1482 suword(&job->_aiocb_private.status, 0);
1483 aiocbe->uaiocb._aiocb_private.error = error;
1484 suword(&job->_aiocb_private.error, error);
1485 goto done;
1486 }
1487
1488 /* No buffer for daemon I/O. */
1489 aiocbe->bp = NULL;
1490
1491 ki->kaio_queue_count++;
1492 if (lj)
1493 lj->lioj_queue_count++;
1494 s = splnet();
1495 TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, aiocbe, plist);
1496 TAILQ_INSERT_TAIL(&aio_jobs, aiocbe, list);
1497 splx(s);
1498 aiocbe->jobstate = JOBST_JOBQGLOBAL;
1499
1500 num_queue_count++;
1501 error = 0;
1502
1503 /*
1504 * If we don't have a free AIO process, and we are below our quota, then
1505 * start one. Otherwise, depend on the subsequent I/O completions to
1506 * pick up this job. If we don't successfully create the new process
1507 * (thread) due to resource issues, we return an error for now (EAGAIN),
1508 * which is likely not the correct thing to do.
1509 */
1510 s = splnet();
1511retryproc:
1512 if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) {
1513 TAILQ_REMOVE(&aio_freeproc, aiop, list);
1514 TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list);
1515 aiop->aiothreadflags &= ~AIOP_FREE;
1516 wakeup(aiop->aiothread);
1517 } else if (((num_aio_resv_start + num_aio_procs) < max_aio_procs) &&
1518 ((ki->kaio_active_count + num_aio_resv_start) <
1519 ki->kaio_maxactive_count)) {
1520 num_aio_resv_start++;
1521 if ((error = aio_newproc()) == 0) {
1522 num_aio_resv_start--;
1523 goto retryproc;
1524 }
1525 num_aio_resv_start--;
1526 }
1527 splx(s);
1528done:
1529 fdrop(fp, td);
1530 return error;
1531}
1532
1533/*
1534 * This routine queues an AIO request, checking for quotas.
1535 */
1536static int
1537aio_aqueue(struct thread *td, struct aiocb *job, int type)
1538{
1539 struct proc *p = td->td_proc;
1540 struct kaioinfo *ki;
1541
1542 if (p->p_aioinfo == NULL)
1543 aio_init_aioinfo(p);
1544
1545 if (num_queue_count >= max_queue_count)
1546 return EAGAIN;
1547
1548 ki = p->p_aioinfo;
1549 if (ki->kaio_queue_count >= ki->kaio_qallowed_count)
1550 return EAGAIN;
1551
1552 return _aio_aqueue(td, job, NULL, type);
1553}
1554
1555/*
1556 * Support the aio_return system call; as a side effect, kernel resources
1557 * are released.
1558 */
1559int
1560aio_return(struct thread *td, struct aio_return_args *uap)
1561{
1562 struct proc *p = td->td_proc;
1563 int s;
1564 int jobref;
1565 struct aiocblist *cb, *ncb;
1566 struct aiocb *ujob;
1567 struct kaioinfo *ki;
1568
1569 ki = p->p_aioinfo;
1570 if (ki == NULL)
1571 return EINVAL;
1572
1573 ujob = uap->aiocbp;
1574
1575 jobref = fuword(&ujob->_aiocb_private.kernelinfo);
1576 if (jobref == -1 || jobref == 0)
1577 return EINVAL;
1578
1579 TAILQ_FOREACH(cb, &ki->kaio_jobdone, plist) {
1580 if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) ==
1581 jobref) {
1582 if (ujob == cb->uuaiocb) {
1583 td->td_retval[0] =
1584 cb->uaiocb._aiocb_private.status;
1585 } else
1586 td->td_retval[0] = EFAULT;
1587 if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
1588 p->p_stats->p_ru.ru_oublock +=
1589 cb->outputcharge;
1590 cb->outputcharge = 0;
1591 } else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
1592 p->p_stats->p_ru.ru_inblock += cb->inputcharge;
1593 cb->inputcharge = 0;
1594 }
1595 aio_free_entry(cb);
1596 return 0;
1597 }
1598 }
1599 s = splbio();
1600 for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb = ncb) {
1601 ncb = TAILQ_NEXT(cb, plist);
1602 if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo)
1603 == jobref) {
1604 splx(s);
1605 if (ujob == cb->uuaiocb) {
1606 td->td_retval[0] =
1607 cb->uaiocb._aiocb_private.status;
1608 } else
1609 td->td_retval[0] = EFAULT;
1610 aio_free_entry(cb);
1611 return 0;
1612 }
1613 }
1614 splx(s);
1615
1616 return (EINVAL);
1617}
1618
1619/*
1620 * Allow a process to wake up when any of the I/O requests has completed.
1621 */
1622int
1623aio_suspend(struct thread *td, struct aio_suspend_args *uap)
1624{
1625 struct proc *p = td->td_proc;
1626 struct timeval atv;
1627 struct timespec ts;
1628 struct aiocb *const *cbptr, *cbp;
1629 struct kaioinfo *ki;
1630 struct aiocblist *cb;
1631 int i;
1632 int njoblist;
1633 int error, s, timo;
1634 int *ijoblist;
1635 struct aiocb **ujoblist;
1636
1637 if (uap->nent > AIO_LISTIO_MAX)
1638 return EINVAL;
1639
1640 timo = 0;
1641 if (uap->timeout) {
1642 /* Get timespec struct. */
1643 if ((error = copyin(uap->timeout, &ts, sizeof(ts))) != 0)
1644 return error;
1645
1646 if (ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000)
1647 return (EINVAL);
1648
1649 TIMESPEC_TO_TIMEVAL(&atv, &ts);
1650 if (itimerfix(&atv))
1651 return (EINVAL);
1652 timo = tvtohz(&atv);
1653 }
1654
1655 ki = p->p_aioinfo;
1656 if (ki == NULL)
1657 return EAGAIN;
1658
1659 njoblist = 0;
1660 ijoblist = zalloc(aiol_zone);
1661 ujoblist = zalloc(aiol_zone);
1662 cbptr = uap->aiocbp;
1663
1664 for (i = 0; i < uap->nent; i++) {
1665 cbp = (struct aiocb *)(intptr_t)fuword((caddr_t)&cbptr[i]);
1666 if (cbp == 0)
1667 continue;
1668 ujoblist[njoblist] = cbp;
1669 ijoblist[njoblist] = fuword(&cbp->_aiocb_private.kernelinfo);
1670 njoblist++;
1671 }
1672
1673 if (njoblist == 0) {
1674 zfree(aiol_zone, ijoblist);
1675 zfree(aiol_zone, ujoblist);
1676 return 0;
1677 }
1678
1679 error = 0;
1680 for (;;) {
1681 TAILQ_FOREACH(cb, &ki->kaio_jobdone, plist) {
1682 for (i = 0; i < njoblist; i++) {
1683 if (((intptr_t)
1684 cb->uaiocb._aiocb_private.kernelinfo) ==
1685 ijoblist[i]) {
1686 if (ujoblist[i] != cb->uuaiocb)
1687 error = EINVAL;
1688 zfree(aiol_zone, ijoblist);
1689 zfree(aiol_zone, ujoblist);
1690 return error;
1691 }
1692 }
1693 }
1694
1695 s = splbio();
1696 for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb =
1697 TAILQ_NEXT(cb, plist)) {
1698 for (i = 0; i < njoblist; i++) {
1699 if (((intptr_t)
1700 cb->uaiocb._aiocb_private.kernelinfo) ==
1701 ijoblist[i]) {
1702 splx(s);
1703 if (ujoblist[i] != cb->uuaiocb)
1704 error = EINVAL;
1705 zfree(aiol_zone, ijoblist);
1706 zfree(aiol_zone, ujoblist);
1707 return error;
1708 }
1709 }
1710 }
1711
1712 ki->kaio_flags |= KAIO_WAKEUP;
1713 error = tsleep(p, PRIBIO | PCATCH, "aiospn", timo);
1714 splx(s);
1715
1716 if (error == ERESTART || error == EINTR) {
1717 zfree(aiol_zone, ijoblist);
1718 zfree(aiol_zone, ujoblist);
1719 return EINTR;
1720 } else if (error == EWOULDBLOCK) {
1721 zfree(aiol_zone, ijoblist);
1722 zfree(aiol_zone, ujoblist);
1723 return EAGAIN;
1724 }
1725 }
1726
1727/* NOTREACHED */
1728 return EINVAL;
1729}
1730
1731/*
1732 * aio_cancel cancels any non-physio aio operations not currently in
1733 * progress.
1734 */
1735int
1736aio_cancel(struct thread *td, struct aio_cancel_args *uap)
1737{
1738 struct proc *p = td->td_proc;
1739 struct kaioinfo *ki;
1740 struct aiocblist *cbe, *cbn;
1741 struct file *fp;
1742 struct filedesc *fdp;
1743 struct socket *so;
1744 struct proc *po;
1745 int s, error;
1746 int cancelled = 0;
1747 int notcancelled = 0;
1748 struct vnode *vp;
1749
1750 fdp = p->p_fd;
1751 if ((u_int)uap->fd >= fdp->fd_nfiles ||
1752 (fp = fdp->fd_ofiles[uap->fd]) == NULL)
1753 return (EBADF);
1754
1755 if (fp->f_type == DTYPE_VNODE) {
1756 vp = (struct vnode *)fp->f_data;
1757
1758 if (vn_isdisk(vp,&error)) {
1759 td->td_retval[0] = AIO_NOTCANCELED;
1760 return 0;
1761 }
1762 } else if (fp->f_type == DTYPE_SOCKET) {
1763 so = (struct socket *)fp->f_data;
1764
1765 s = splnet();
1766
1767 for (cbe = TAILQ_FIRST(&so->so_aiojobq); cbe; cbe = cbn) {
1768 cbn = TAILQ_NEXT(cbe, list);
1769 if ((uap->aiocbp == NULL) ||
1770 (uap->aiocbp == cbe->uuaiocb) ) {
1771 po = cbe->userproc;
1772 ki = po->p_aioinfo;
1773 TAILQ_REMOVE(&so->so_aiojobq, cbe, list);
1774 TAILQ_REMOVE(&ki->kaio_sockqueue, cbe, plist);
1775 TAILQ_INSERT_TAIL(&ki->kaio_jobdone, cbe, plist);
1776 if (ki->kaio_flags & KAIO_WAKEUP) {
1777 wakeup(po);
1778 }
1779 cbe->jobstate = JOBST_JOBFINISHED;
1780 cbe->uaiocb._aiocb_private.status = -1;
1781 cbe->uaiocb._aiocb_private.error = ECANCELED;
1782 cancelled++;
1783/* XXX cancelled, knote? */
1784 if (cbe->uaiocb.aio_sigevent.sigev_notify ==
1785 SIGEV_SIGNAL) {
1786 PROC_LOCK(cbe->userproc);
1787 psignal(cbe->userproc, cbe->uaiocb.aio_sigevent.sigev_signo);
1788 PROC_UNLOCK(cbe->userproc);
1789 }
1790 if (uap->aiocbp)
1791 break;
1792 }
1793 }
1794 splx(s);
1795
1796 if ((cancelled) && (uap->aiocbp)) {
1797 td->td_retval[0] = AIO_CANCELED;
1798 return 0;
1799 }
1800 }
1801 ki = p->p_aioinfo;
1802 s = splnet();
1803
1804 for (cbe = TAILQ_FIRST(&ki->kaio_jobqueue); cbe; cbe = cbn) {
1805 cbn = TAILQ_NEXT(cbe, plist);
1806
1807 if ((uap->fd == cbe->uaiocb.aio_fildes) &&
1808 ((uap->aiocbp == NULL ) ||
1809 (uap->aiocbp == cbe->uuaiocb))) {
1810
1811 if (cbe->jobstate == JOBST_JOBQGLOBAL) {
1812 TAILQ_REMOVE(&aio_jobs, cbe, list);
1813 TAILQ_REMOVE(&ki->kaio_jobqueue, cbe, plist);
1814 TAILQ_INSERT_TAIL(&ki->kaio_jobdone, cbe,
1815 plist);
1816 cancelled++;
1817 ki->kaio_queue_finished_count++;
1818 cbe->jobstate = JOBST_JOBFINISHED;
1819 cbe->uaiocb._aiocb_private.status = -1;
1820 cbe->uaiocb._aiocb_private.error = ECANCELED;
1821/* XXX cancelled, knote? */
1822 if (cbe->uaiocb.aio_sigevent.sigev_notify ==
1823 SIGEV_SIGNAL) {
1824 PROC_LOCK(cbe->userproc);
1825 psignal(cbe->userproc, cbe->uaiocb.aio_sigevent.sigev_signo);
1826 PROC_UNLOCK(cbe->userproc);
1827 }
1828 } else {
1829 notcancelled++;
1830 }
1831 }
1832 }
1833 splx(s);
1834
1835 if (notcancelled) {
1836 td->td_retval[0] = AIO_NOTCANCELED;
1837 return 0;
1838 }
1839 if (cancelled) {
1840 td->td_retval[0] = AIO_CANCELED;
1841 return 0;
1842 }
1843 td->td_retval[0] = AIO_ALLDONE;
1844
1845 return 0;
1846}
1847
1848/*
1849 * aio_error is implemented in the kernel for compatibility purposes only.
1850 * For a userland asynchronous I/O implementation, it would be best to handle
1851 * this in a userland subroutine.
1852 */
1853int
1854aio_error(struct thread *td, struct aio_error_args *uap)
1855{
1856 struct proc *p = td->td_proc;
1857 int s;
1858 struct aiocblist *cb;
1859 struct kaioinfo *ki;
1860 int jobref;
1861
1862 ki = p->p_aioinfo;
1863 if (ki == NULL)
1864 return EINVAL;
1865
1866 jobref = fuword(&uap->aiocbp->_aiocb_private.kernelinfo);
1867 if ((jobref == -1) || (jobref == 0))
1868 return EINVAL;
1869
1870 TAILQ_FOREACH(cb, &ki->kaio_jobdone, plist) {
1871 if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
1872 jobref) {
1873 td->td_retval[0] = cb->uaiocb._aiocb_private.error;
1874 return 0;
1875 }
1876 }
1877
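/*
 * Not on the done queue; check the in-progress job and socket queues, and
 * then the physio buffer queues, before giving up.
 */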
1878 s = splnet();
1879
1880 for (cb = TAILQ_FIRST(&ki->kaio_jobqueue); cb; cb = TAILQ_NEXT(cb,
1881 plist)) {
1882 if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
1883 jobref) {
1884 td->td_retval[0] = EINPROGRESS;
1885 splx(s);
1886 return 0;
1887 }
1888 }
1889
1890 for (cb = TAILQ_FIRST(&ki->kaio_sockqueue); cb; cb = TAILQ_NEXT(cb,
1891 plist)) {
1892 if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
1893 jobref) {
1894 td->td_retval[0] = EINPROGRESS;
1895 splx(s);
1896 return 0;
1897 }
1898 }
1899 splx(s);
1900
1901 s = splbio();
1902 for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb = TAILQ_NEXT(cb,
1903 plist)) {
1904 if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
1905 jobref) {
1906 td->td_retval[0] = cb->uaiocb._aiocb_private.error;
1907 splx(s);
1908 return 0;
1909 }
1910 }
1911
1912 for (cb = TAILQ_FIRST(&ki->kaio_bufqueue); cb; cb = TAILQ_NEXT(cb,
1913 plist)) {
1914 if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
1915 jobref) {
1916 td->td_retval[0] = EINPROGRESS;
1917 splx(s);
1918 return 0;
1919 }
1920 }
1921 splx(s);
1922
1923#if (0)
1924 /*
1925 * Hack for lio.
1926 */
1927 status = fuword(&uap->aiocbp->_aiocb_private.status);
1928 if (status == -1)
1929 return fuword(&uap->aiocbp->_aiocb_private.error);
1930#endif
1931 return EINVAL;
1932}
1933
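/*
 * The aio_read() and aio_write() system calls are thin wrappers: each simply
 * queues the request via aio_aqueue() with the corresponding LIO opcode.
 */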
1934int
1935aio_read(struct thread *td, struct aio_read_args *uap)
1936{
1937
1938 return aio_aqueue(td, uap->aiocbp, LIO_READ);
1939}
1940
1941int
1942aio_write(struct thread *td, struct aio_write_args *uap)
1943{
1944
1945 return aio_aqueue(td, uap->aiocbp, LIO_WRITE);
1946}
1947
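/*
 * lio_listio: queue a list of I/O requests.  With LIO_NOWAIT the call returns
 * as soon as the requests have been queued; with LIO_WAIT it does not return
 * until every successfully queued request has completed.
 */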
1948int
1949lio_listio(struct thread *td, struct lio_listio_args *uap)
1950{
1951 struct proc *p = td->td_proc;
1952 int nent, nentqueued;
1953 struct aiocb *iocb, * const *cbptr;
1954 struct aiocblist *cb;
1955 struct kaioinfo *ki;
1956 struct aio_liojob *lj;
1957 int error, runningcode;
1958 int nerror;
1959 int i;
1960 int s;
1961
1962 if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
1963 return EINVAL;
1964
1965 nent = uap->nent;
1966 if (nent > AIO_LISTIO_MAX)
1967 return EINVAL;
1968
1969 if (p->p_aioinfo == NULL)
1970 aio_init_aioinfo(p);
1971
1972 if ((nent + num_queue_count) > max_queue_count)
1973 return EAGAIN;
1974
1975 ki = p->p_aioinfo;
1976 if ((nent + ki->kaio_queue_count) > ki->kaio_qallowed_count)
1977 return EAGAIN;
1978
1979 lj = zalloc(aiolio_zone);
1980 if (!lj)
1981 return EAGAIN;
1982
1983 lj->lioj_flags = 0;
1984 lj->lioj_buffer_count = 0;
1985 lj->lioj_buffer_finished_count = 0;
1986 lj->lioj_queue_count = 0;
1987 lj->lioj_queue_finished_count = 0;
1988 lj->lioj_ki = ki;
1989
1990 /*
1991 * Setup signal.
1992 */
1993 if (uap->sig && (uap->mode == LIO_NOWAIT)) {
1994 error = copyin(uap->sig, &lj->lioj_signal,
1995 sizeof(lj->lioj_signal));
1996 if (error) {
1997 zfree(aiolio_zone, lj);
1998 return error;
1999 }
2000 if (!_SIG_VALID(lj->lioj_signal.sigev_signo)) {
2001 zfree(aiolio_zone, lj);
2002 return EINVAL;
2003 }
2004 lj->lioj_flags |= LIOJ_SIGNAL;
2005 lj->lioj_flags &= ~LIOJ_SIGNAL_POSTED;
2006 } else
2007 lj->lioj_flags &= ~LIOJ_SIGNAL;
2008
2009 TAILQ_INSERT_TAIL(&ki->kaio_liojoblist, lj, lioj_list);
2010 /*
2011 * Get pointers to the list of I/O requests.
2012 */
2013 nerror = 0;
2014 nentqueued = 0;
2015 cbptr = uap->acb_list;
2016 for (i = 0; i < uap->nent; i++) {
2017 iocb = (struct aiocb *)(intptr_t)fuword((caddr_t)&cbptr[i]);
2018 if (((intptr_t)iocb != -1) && ((intptr_t)iocb != 0)) {
2019 error = _aio_aqueue(td, iocb, lj, 0);
2020 if (error == 0)
2021 nentqueued++;
2022 else
2023 nerror++;
2024 }
2025 }
2026
2027 /*
2028 * If we haven't queued any, then just return without waiting.
2029 */
2030 if (nentqueued == 0)
2031 return 0;
2032
2033 /*
2034 * Calculate the appropriate error return.
2035 */
2036 runningcode = 0;
2037 if (nerror)
2038 runningcode = EIO;
2039
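/*
 * For LIO_WAIT, repeatedly walk the submitted control blocks, checking the
 * done queues for each one, and sleep until every queued request has been
 * disposed of.
 */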
2040 if (uap->mode == LIO_WAIT) {
2041 int command, found, jobref;
2042
2043 for (;;) {
2044 found = 0;
2045 for (i = 0; i < uap->nent; i++) {
2046 /*
2047 * Fetch the pointer to the control buf from the
2048 * user-space list.
2049 */
2050 iocb = (struct aiocb *)(intptr_t)fuword((caddr_t)&cbptr[i]);
2051 if (((intptr_t)iocb == -1) || ((intptr_t)iocb
2052 == 0))
2053 continue;
2054
2055 /*
2056 * Fetch the associated command from user space.
2057 */
2058 command = fuword(&iocb->aio_lio_opcode);
2059 if (command == LIO_NOP) {
2060 found++;
2061 continue;
2062 }
2063
2064 jobref = fuword(&iocb->_aiocb_private.kernelinfo);
2065
2066 TAILQ_FOREACH(cb, &ki->kaio_jobdone, plist) {
2067 if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo)
2068 == jobref) {
2069 if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
2070 p->p_stats->p_ru.ru_oublock += cb->outputcharge;
2071 cb->outputcharge = 0;
2072 } else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
2073 p->p_stats->p_ru.ru_inblock += cb->inputcharge;
2074 cb->inputcharge = 0;
2075 }
2081 found++;
2082 break;
2083 }
2084 }
2085
2086 s = splbio();
2087 TAILQ_FOREACH(cb, &ki->kaio_bufdone, plist) {
2088 if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo)
2089 == jobref) {
2090 found++;
2091 break;
2092 }
2093 }
2094 splx(s);
2095 }
2096
2097 /*
2098 * If all I/Os have been disposed of, then we can
2099 * return.
2100 */
2101 if (found == nentqueued)
2102 return runningcode;
2103
2104 ki->kaio_flags |= KAIO_WAKEUP;
2105 error = tsleep(p, PRIBIO | PCATCH, "aiospn", 0);
2106
2107 if (error == EINTR)
2108 return EINTR;
2109 else if (error == EWOULDBLOCK)
2110 return EAGAIN;
2111 }
2112 }
2113
2114 return runningcode;
2115}
2116
2117/*
2118 * This is a weird hack so that we can post a signal. It is safe to do so from
2119 * a timeout routine, but *not* from an interrupt routine.
2120 */
2121static void
2122process_signal(void *aioj)
2123{
2124 struct aiocblist *aiocbe = aioj;
2125 struct aio_liojob *lj = aiocbe->lio;
2126 struct aiocb *cb = &aiocbe->uaiocb;
2127
2128 if ((lj) && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL) &&
2129 (lj->lioj_queue_count == lj->lioj_queue_finished_count)) {
2130 PROC_LOCK(lj->lioj_ki->kaio_p);
2131 psignal(lj->lioj_ki->kaio_p, lj->lioj_signal.sigev_signo);
2132 PROC_UNLOCK(lj->lioj_ki->kaio_p);
2133 lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
2134 }
2135
2136 if (cb->aio_sigevent.sigev_notify == SIGEV_SIGNAL) {
2137 PROC_LOCK(aiocbe->userproc);
2138 psignal(aiocbe->userproc, cb->aio_sigevent.sigev_signo);
2139 PROC_UNLOCK(aiocbe->userproc);
2140 }
2141}
2142
2143/*
2144 * Interrupt handler for physio; it performs the necessary process wakeups
2145 * and signals.
2146 */
2147static void
2148aio_physwakeup(struct buf *bp)
2149{
2150 struct aiocblist *aiocbe;
2151 struct proc *p;
2152 struct kaioinfo *ki;
2153 struct aio_liojob *lj;
2154
2155 wakeup(bp);
2156
2157 aiocbe = (struct aiocblist *)bp->b_spc;
2158 if (aiocbe) {
2159 p = bp->b_caller1;
2160
2161 aiocbe->jobstate = JOBST_JOBBFINISHED;
2162 aiocbe->uaiocb._aiocb_private.status -= bp->b_resid;
2163 aiocbe->uaiocb._aiocb_private.error = 0;
2164 aiocbe->jobflags |= AIOCBLIST_DONE;
2165
2166 if (bp->b_ioflags & BIO_ERROR)
2167 aiocbe->uaiocb._aiocb_private.error = bp->b_error;
2168
2169 lj = aiocbe->lio;
2170 if (lj) {
2171 lj->lioj_buffer_finished_count++;
2172
2173 /*
2174 * wakeup/signal if all of the interrupt jobs are done.
2175 */
2176 if (lj->lioj_buffer_finished_count ==
2177 lj->lioj_buffer_count) {
2178 /*
2179 * Post a signal if it is called for.
2180 */
2181 if ((lj->lioj_flags &
2182 (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) ==
2183 LIOJ_SIGNAL) {
2184 lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
2185 aiocbe->timeouthandle =
2186 timeout(process_signal,
2187 aiocbe, 0);
2188 }
2189 }
2190 }
2191
2192 ki = p->p_aioinfo;
2193 if (ki) {
2194 ki->kaio_buffer_finished_count++;
2195 TAILQ_REMOVE(&aio_bufjobs, aiocbe, list);
2196 TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist);
2197 TAILQ_INSERT_TAIL(&ki->kaio_bufdone, aiocbe, plist);
2198
2199 KNOTE(&aiocbe->klist, 0);
2200 /* Do the wakeup. */
2201 if (ki->kaio_flags & (KAIO_RUNDOWN|KAIO_WAKEUP)) {
2202 ki->kaio_flags &= ~KAIO_WAKEUP;
2203 wakeup(p);
2204 }
2205 }
2206
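/*
 * Defer signal delivery to a timeout handler; psignal() cannot safely be
 * called from an interrupt routine (see process_signal() above).
 */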
2207 if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL)
2208 aiocbe->timeouthandle =
2209 timeout(process_signal, aiocbe, 0);
2210 }
2211}
2212
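/*
 * aio_waitcomplete: wait for the next request of this process to complete,
 * copy the user-visible aiocb pointer out through *aiocbp, and return the
 * job's error status (with its status value passed back in td_retval[0]).
 * Completed jobs may come from either the ordinary done queue or the physio
 * done queue.
 */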
2213int
2214aio_waitcomplete(struct thread *td, struct aio_waitcomplete_args *uap)
2215{
2216 struct proc *p = td->td_proc;
2217 struct timeval atv;
2218 struct timespec ts;
2219 struct aiocb **cbptr;
2220 struct kaioinfo *ki;
2221 struct aiocblist *cb = NULL;
2222 int error, s, timo;
2223
2224 suword(uap->aiocbp, (long)NULL);
2225
2226 timo = 0;
2227 if (uap->timeout) {
2228 /* Get timespec struct. */
2229 error = copyin(uap->timeout, &ts, sizeof(ts));
2230 if (error)
2231 return error;
2232
2233 if ((ts.tv_nsec < 0) || (ts.tv_nsec >= 1000000000))
2234 return (EINVAL);
2235
2236 TIMESPEC_TO_TIMEVAL(&atv, &ts);
2237 if (itimerfix(&atv))
2238 return (EINVAL);
2239 timo = tvtohz(&atv);
2240 }
2241
2242 ki = p->p_aioinfo;
2243 if (ki == NULL)
2244 return EAGAIN;
2245
2246 cbptr = uap->aiocbp;
2247
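/*
 * Check the done queues first; if nothing has completed yet, sleep (with the
 * caller's timeout, if any) and look again on wakeup.
 */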
2248 for (;;) {
2249 if ((cb = TAILQ_FIRST(&ki->kaio_jobdone)) != 0) {
2250 suword(uap->aiocbp, (uintptr_t)cb->uuaiocb);
2251 td->td_retval[0] = cb->uaiocb._aiocb_private.status;
2252 if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
2253 p->p_stats->p_ru.ru_oublock += cb->outputcharge;
2254 cb->outputcharge = 0;
2255 } else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
2256 p->p_stats->p_ru.ru_inblock += cb->inputcharge;
2257 cb->inputcharge = 0;
2258 }
2259 error = cb->uaiocb._aiocb_private.error;
2260 aio_free_entry(cb);
2261 return error;
2262 }
2263
2264 s = splbio();
2265 if ((cb = TAILQ_FIRST(&ki->kaio_bufdone)) != 0) {
2266 splx(s);
2267 suword(uap->aiocbp, (uintptr_t)cb->uuaiocb);
2268 td->td_retval[0] = cb->uaiocb._aiocb_private.status;
2269 error = cb->uaiocb._aiocb_private.error;
2270 aio_free_entry(cb);
2271 return error;
2272 }
2272
2273 ki->kaio_flags |= KAIO_WAKEUP;
2274 error = tsleep(p, PRIBIO | PCATCH, "aiowc", timo);
2275 splx(s);
2276
2277 if (error == ERESTART)
2278 return EINTR;
2279 else if (error < 0)
2280 return error;
2281 else if (error == EINTR)
2282 return EINTR;
2283 else if (error == EWOULDBLOCK)
2284 return EAGAIN;
2285 }
2286}
2287
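/*
 * Kqueue filter routines for EVFILT_AIO.  Attaching links the knote onto the
 * aio control block's knote list; only the kernel may register these knotes,
 * which is why EV_FLAG1 is required.
 */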
2288static int
2289filt_aioattach(struct knote *kn)
2290{
2291 struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_id;
2292
2293 /*
2294 * The aiocbe pointer must be validated before using it, so
2295 * registration is restricted to the kernel; the user cannot
2296 * set EV_FLAG1.
2297 */
2298 if ((kn->kn_flags & EV_FLAG1) == 0)
2299 return (EPERM);
2300 kn->kn_flags &= ~EV_FLAG1;
2301
2302 SLIST_INSERT_HEAD(&aiocbe->klist, kn, kn_selnext);
2303
2304 return (0);
2305}
2306
2307static void
2308filt_aiodetach(struct knote *kn)
2309{
2310 struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_id;
2311
2312 SLIST_REMOVE(&aiocbe->klist, kn, knote, kn_selnext);
2313}
2314
2315/*ARGSUSED*/
2316static int
2317filt_aio(struct knote *kn, long hint)
2318{
2319 struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_id;
2320
2321 kn->kn_data = aiocbe->uaiocb._aiocb_private.error;
2322 if (aiocbe->jobstate != JOBST_JOBFINISHED &&
2323 aiocbe->jobstate != JOBST_JOBBFINISHED)
2324 return (0);
2325 kn->kn_flags |= EV_EOF;
2326 return (1);
2327}