Deleted Added
vfs_aio.c (72082) vfs_aio.c (72200)
1/*
2 * Copyright (c) 1997 John S. Dyson. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. John S. Dyson's name may not be used to endorse or promote products
10 * derived from this software without specific prior written permission.
11 *
12 * DISCLAIMER: This code isn't warranted to do anything useful. Anything
13 * bad that happens because of using this software isn't the responsibility
14 * of the author. This software is distributed AS-IS.
15 *
16 * $FreeBSD: head/sys/kern/vfs_aio.c 72082 2001-02-06 09:25:10Z asmodai $
16 * $FreeBSD: head/sys/kern/vfs_aio.c 72200 2001-02-09 06:11:45Z bmilekic $
17 */
18
19/*
20 * This file contains support for the POSIX 1003.1B AIO/LIO facility.
21 */
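For orientation, a minimal userland sketch of the facility implemented below; it assumes an already-open descriptor and a caller-supplied buffer, and it simply polls aio_error() instead of requesting signal or kevent notification.

#include <sys/types.h>
#include <aio.h>
#include <errno.h>
#include <string.h>

static ssize_t
read_async(int fd, void *buf, size_t len, off_t off)
{
	struct aiocb cb;

	memset(&cb, 0, sizeof(cb));
	cb.aio_fildes = fd;
	cb.aio_buf = buf;
	cb.aio_nbytes = len;
	cb.aio_offset = off;

	if (aio_read(&cb) == -1)		/* queue the request */
		return (-1);
	while (aio_error(&cb) == EINPROGRESS)	/* poll until the kernel finishes it */
		;
	return (aio_return(&cb));		/* reap status and release kernel state */
}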
22
23#include <sys/param.h>
24#include <sys/systm.h>
25#include <sys/bio.h>
26#include <sys/buf.h>
27#include <sys/sysproto.h>
28#include <sys/filedesc.h>
29#include <sys/kernel.h>
30#include <sys/fcntl.h>
31#include <sys/file.h>
32#include <sys/lock.h>
33#include <sys/mutex.h>
34#include <sys/unistd.h>
35#include <sys/proc.h>
36#include <sys/resourcevar.h>
37#include <sys/signalvar.h>
38#include <sys/protosw.h>
39#include <sys/socketvar.h>
40#include <sys/sysctl.h>
41#include <sys/vnode.h>
42#include <sys/conf.h>
43#include <sys/event.h>
44
45#include <vm/vm.h>
46#include <vm/vm_extern.h>
47#include <vm/pmap.h>
48#include <vm/vm_map.h>
49#include <vm/vm_zone.h>
50#include <sys/aio.h>
51
52#include <machine/limits.h>
53
54#include "opt_vfs_aio.h"
55
56#ifdef VFS_AIO
57
58static long jobrefid;
59
60#define JOBST_NULL 0x0
61#define JOBST_JOBQPROC 0x1
62#define JOBST_JOBQGLOBAL 0x2
63#define JOBST_JOBRUNNING 0x3
64#define JOBST_JOBFINISHED 0x4
65#define JOBST_JOBQBUF 0x5
66#define JOBST_JOBBFINISHED 0x6
67
68#ifndef MAX_AIO_PER_PROC
69#define MAX_AIO_PER_PROC 32
70#endif
71
72#ifndef MAX_AIO_QUEUE_PER_PROC
73#define MAX_AIO_QUEUE_PER_PROC 256 /* Bigger than AIO_LISTIO_MAX */
74#endif
75
76#ifndef MAX_AIO_PROCS
77#define MAX_AIO_PROCS 32
78#endif
79
80#ifndef MAX_AIO_QUEUE
81#define MAX_AIO_QUEUE 1024 /* Bigger than AIO_LISTIO_MAX */
82#endif
83
84#ifndef TARGET_AIO_PROCS
85#define TARGET_AIO_PROCS 4
86#endif
87
88#ifndef MAX_BUF_AIO
89#define MAX_BUF_AIO 16
90#endif
91
92#ifndef AIOD_TIMEOUT_DEFAULT
93#define AIOD_TIMEOUT_DEFAULT (10 * hz)
94#endif
95
96#ifndef AIOD_LIFETIME_DEFAULT
97#define AIOD_LIFETIME_DEFAULT (30 * hz)
98#endif
99
100static int max_aio_procs = MAX_AIO_PROCS;
101static int num_aio_procs = 0;
102static int target_aio_procs = TARGET_AIO_PROCS;
103static int max_queue_count = MAX_AIO_QUEUE;
104static int num_queue_count = 0;
105static int num_buf_aio = 0;
106static int num_aio_resv_start = 0;
107static int aiod_timeout;
108static int aiod_lifetime;
109
110static int max_aio_per_proc = MAX_AIO_PER_PROC;
111static int max_aio_queue_per_proc = MAX_AIO_QUEUE_PER_PROC;
112static int max_buf_aio = MAX_BUF_AIO;
113
114SYSCTL_NODE(_vfs, OID_AUTO, aio, CTLFLAG_RW, 0, "AIO mgmt");
115
116SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_per_proc,
117 CTLFLAG_RW, &max_aio_per_proc, 0, "");
118
119SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue_per_proc,
120 CTLFLAG_RW, &max_aio_queue_per_proc, 0, "");
121
122SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_procs,
123 CTLFLAG_RW, &max_aio_procs, 0, "");
124
125SYSCTL_INT(_vfs_aio, OID_AUTO, num_aio_procs,
126 CTLFLAG_RD, &num_aio_procs, 0, "");
127
128SYSCTL_INT(_vfs_aio, OID_AUTO, num_queue_count,
129 CTLFLAG_RD, &num_queue_count, 0, "");
130
131SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue,
132 CTLFLAG_RW, &max_queue_count, 0, "");
133
134SYSCTL_INT(_vfs_aio, OID_AUTO, target_aio_procs,
135 CTLFLAG_RW, &target_aio_procs, 0, "");
136
137SYSCTL_INT(_vfs_aio, OID_AUTO, max_buf_aio,
138 CTLFLAG_RW, &max_buf_aio, 0, "");
139
140SYSCTL_INT(_vfs_aio, OID_AUTO, num_buf_aio,
141 CTLFLAG_RD, &num_buf_aio, 0, "");
142
143SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_lifetime,
144 CTLFLAG_RW, &aiod_lifetime, 0, "");
145
146SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_timeout,
147 CTLFLAG_RW, &aiod_timeout, 0, "");
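The integers above surface as vfs.aio.* sysctls; a hedged userland sketch of reading one counter and raising one limit (the helper function is hypothetical, the sysctl names match the declarations above):

#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

static int
tune_aio(int new_max_procs)
{
	int cur;
	size_t len = sizeof(cur);

	/* read-only count of aio daemons currently running */
	if (sysctlbyname("vfs.aio.num_aio_procs", &cur, &len, NULL, 0) == -1)
		return (-1);
	printf("aio daemons running: %d\n", cur);

	/* writable cap on how many daemons may be created */
	return (sysctlbyname("vfs.aio.max_aio_procs", NULL, NULL,
	    &new_max_procs, sizeof(new_max_procs)));
}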
148
149/*
150 * AIO process info
151 */
152#define AIOP_FREE 0x1 /* proc on free queue */
153#define AIOP_SCHED 0x2 /* proc explicitly scheduled */
154
155struct aioproclist {
156 int aioprocflags; /* AIO proc flags */
157 TAILQ_ENTRY(aioproclist) list; /* List of processes */
158 struct proc *aioproc; /* The AIO thread */
159 TAILQ_HEAD (,aiocblist) jobtorun; /* suggested job to run */
160};
161
162/*
163 * data-structure for lio signal management
164 */
165struct aio_liojob {
166 int lioj_flags;
167 int lioj_buffer_count;
168 int lioj_buffer_finished_count;
169 int lioj_queue_count;
170 int lioj_queue_finished_count;
171 struct sigevent lioj_signal; /* signal on all I/O done */
172 TAILQ_ENTRY (aio_liojob) lioj_list;
173 struct kaioinfo *lioj_ki;
174};
175#define LIOJ_SIGNAL 0x1 /* signal on all done (lio) */
176#define LIOJ_SIGNAL_POSTED 0x2 /* signal has been posted */
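The aio_liojob bookkeeping above is what lets lio_listio(2) deliver a single signal once every request in a list has finished; a rough userland sketch (the wrapper is hypothetical and the two aiocbs are assumed to be otherwise filled in by the caller):

#include <aio.h>
#include <signal.h>
#include <string.h>

static int
submit_pair(struct aiocb *a, struct aiocb *b)
{
	struct aiocb *list[2] = { a, b };
	struct sigevent sev;

	memset(&sev, 0, sizeof(sev));
	sev.sigev_notify = SIGEV_SIGNAL;	/* tracked via LIOJ_SIGNAL above */
	sev.sigev_signo = SIGUSR1;

	a->aio_lio_opcode = LIO_READ;
	b->aio_lio_opcode = LIO_READ;
	return (lio_listio(LIO_NOWAIT, list, 2, &sev));
}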
177
178/*
179 * per process aio data structure
180 */
181struct kaioinfo {
182 int kaio_flags; /* per process kaio flags */
183 int kaio_maxactive_count; /* maximum number of AIOs */
184 int kaio_active_count; /* number of currently used AIOs */
 185	int	kaio_qallowed_count; /* maximum size of AIO queue */
186 int kaio_queue_count; /* size of AIO queue */
187 int kaio_ballowed_count; /* maximum number of buffers */
188 int kaio_queue_finished_count; /* number of daemon jobs finished */
189 int kaio_buffer_count; /* number of physio buffers */
190 int kaio_buffer_finished_count; /* count of I/O done */
191 struct proc *kaio_p; /* process that uses this kaio block */
192 TAILQ_HEAD (,aio_liojob) kaio_liojoblist; /* list of lio jobs */
193 TAILQ_HEAD (,aiocblist) kaio_jobqueue; /* job queue for process */
194 TAILQ_HEAD (,aiocblist) kaio_jobdone; /* done queue for process */
195 TAILQ_HEAD (,aiocblist) kaio_bufqueue; /* buffer job queue for process */
196 TAILQ_HEAD (,aiocblist) kaio_bufdone; /* buffer done queue for process */
197 TAILQ_HEAD (,aiocblist) kaio_sockqueue; /* queue for aios waiting on sockets */
198};
199
200#define KAIO_RUNDOWN 0x1 /* process is being run down */
201#define KAIO_WAKEUP 0x2 /* wakeup process when there is a significant event */
202
203static TAILQ_HEAD(,aioproclist) aio_freeproc, aio_activeproc;
204static TAILQ_HEAD(,aiocblist) aio_jobs; /* Async job list */
205static TAILQ_HEAD(,aiocblist) aio_bufjobs; /* Phys I/O job list */
206static TAILQ_HEAD(,aiocblist) aio_freejobs; /* Pool of free jobs */
207
208static void aio_init_aioinfo(struct proc *p);
209static void aio_onceonly(void *);
210static int aio_free_entry(struct aiocblist *aiocbe);
211static void aio_process(struct aiocblist *aiocbe);
212static int aio_newproc(void);
213static int aio_aqueue(struct proc *p, struct aiocb *job, int type);
214static void aio_physwakeup(struct buf *bp);
215static int aio_fphysio(struct proc *p, struct aiocblist *aiocbe, int type);
216static int aio_qphysio(struct proc *p, struct aiocblist *iocb);
217static void aio_daemon(void *uproc);
218
219SYSINIT(aio, SI_SUB_VFS, SI_ORDER_ANY, aio_onceonly, NULL);
220
221static vm_zone_t kaio_zone = 0, aiop_zone = 0, aiocb_zone = 0, aiol_zone = 0;
222static vm_zone_t aiolio_zone = 0;
223
224/*
225 * Startup initialization
226 */
227void
228aio_onceonly(void *na)
229{
230 TAILQ_INIT(&aio_freeproc);
231 TAILQ_INIT(&aio_activeproc);
232 TAILQ_INIT(&aio_jobs);
233 TAILQ_INIT(&aio_bufjobs);
234 TAILQ_INIT(&aio_freejobs);
235 kaio_zone = zinit("AIO", sizeof (struct kaioinfo), 0, 0, 1);
236 aiop_zone = zinit("AIOP", sizeof (struct aioproclist), 0, 0, 1);
237 aiocb_zone = zinit("AIOCB", sizeof (struct aiocblist), 0, 0, 1);
238 aiol_zone = zinit("AIOL", AIO_LISTIO_MAX * sizeof (int), 0, 0, 1);
239 aiolio_zone = zinit("AIOLIO", AIO_LISTIO_MAX * sizeof (struct
240 aio_liojob), 0, 0, 1);
241 aiod_timeout = AIOD_TIMEOUT_DEFAULT;
242 aiod_lifetime = AIOD_LIFETIME_DEFAULT;
243 jobrefid = 1;
244}
245
246/*
247 * Init the per-process aioinfo structure. The aioinfo limits are set
248 * per-process for user limit (resource) management.
249 */
250void
251aio_init_aioinfo(struct proc *p)
252{
253 struct kaioinfo *ki;
254 if (p->p_aioinfo == NULL) {
255 ki = zalloc(kaio_zone);
256 p->p_aioinfo = ki;
257 ki->kaio_flags = 0;
258 ki->kaio_maxactive_count = max_aio_per_proc;
259 ki->kaio_active_count = 0;
260 ki->kaio_qallowed_count = max_aio_queue_per_proc;
261 ki->kaio_queue_count = 0;
262 ki->kaio_ballowed_count = max_buf_aio;
263 ki->kaio_buffer_count = 0;
264 ki->kaio_buffer_finished_count = 0;
265 ki->kaio_p = p;
266 TAILQ_INIT(&ki->kaio_jobdone);
267 TAILQ_INIT(&ki->kaio_jobqueue);
268 TAILQ_INIT(&ki->kaio_bufdone);
269 TAILQ_INIT(&ki->kaio_bufqueue);
270 TAILQ_INIT(&ki->kaio_liojoblist);
271 TAILQ_INIT(&ki->kaio_sockqueue);
272 }
273
274 while (num_aio_procs < target_aio_procs)
275 aio_newproc();
276}
277
278/*
279 * Free a job entry. Wait for completion if it is currently active, but don't
280 * delay forever. If we delay, we return a flag that says that we have to
281 * restart the queue scan.
282 */
283int
284aio_free_entry(struct aiocblist *aiocbe)
285{
286 struct kaioinfo *ki;
287 struct aioproclist *aiop;
288 struct aio_liojob *lj;
289 struct proc *p;
290 int error;
291 int s;
292
293 if (aiocbe->jobstate == JOBST_NULL)
294 panic("aio_free_entry: freeing already free job");
295
296 p = aiocbe->userproc;
297 ki = p->p_aioinfo;
298 lj = aiocbe->lio;
299 if (ki == NULL)
300 panic("aio_free_entry: missing p->p_aioinfo");
301
302 if (aiocbe->jobstate == JOBST_JOBRUNNING) {
303 if (aiocbe->jobflags & AIOCBLIST_ASYNCFREE)
304 return 0;
305 aiocbe->jobflags |= AIOCBLIST_RUNDOWN;
306 tsleep(aiocbe, PRIBIO|PCATCH, "jobwai", 0);
307 }
308 aiocbe->jobflags &= ~AIOCBLIST_ASYNCFREE;
309
310 if (aiocbe->bp == NULL) {
311 if (ki->kaio_queue_count <= 0)
312 panic("aio_free_entry: process queue size <= 0");
313 if (num_queue_count <= 0)
314 panic("aio_free_entry: system wide queue size <= 0");
315
316 if (lj) {
317 lj->lioj_queue_count--;
318 if (aiocbe->jobflags & AIOCBLIST_DONE)
319 lj->lioj_queue_finished_count--;
320 }
321 ki->kaio_queue_count--;
322 if (aiocbe->jobflags & AIOCBLIST_DONE)
323 ki->kaio_queue_finished_count--;
324 num_queue_count--;
325 } else {
326 if (lj) {
327 lj->lioj_buffer_count--;
328 if (aiocbe->jobflags & AIOCBLIST_DONE)
329 lj->lioj_buffer_finished_count--;
330 }
331 if (aiocbe->jobflags & AIOCBLIST_DONE)
332 ki->kaio_buffer_finished_count--;
333 ki->kaio_buffer_count--;
334 num_buf_aio--;
335 }
336
337 /* aiocbe is going away, we need to destroy any knotes */
338 knote_remove(p, &aiocbe->klist);
339
340 if ((ki->kaio_flags & KAIO_WAKEUP) || ((ki->kaio_flags & KAIO_RUNDOWN)
341 && ((ki->kaio_buffer_count == 0) && (ki->kaio_queue_count == 0)))) {
342 ki->kaio_flags &= ~KAIO_WAKEUP;
343 wakeup(p);
344 }
345
346 if (aiocbe->jobstate == JOBST_JOBQBUF) {
347 if ((error = aio_fphysio(p, aiocbe, 1)) != 0)
348 return error;
349 if (aiocbe->jobstate != JOBST_JOBBFINISHED)
350 panic("aio_free_entry: invalid physio finish-up state");
351 s = splbio();
352 TAILQ_REMOVE(&ki->kaio_bufdone, aiocbe, plist);
353 splx(s);
354 } else if (aiocbe->jobstate == JOBST_JOBQPROC) {
355 aiop = aiocbe->jobaioproc;
356 TAILQ_REMOVE(&aiop->jobtorun, aiocbe, list);
357 } else if (aiocbe->jobstate == JOBST_JOBQGLOBAL)
358 TAILQ_REMOVE(&aio_jobs, aiocbe, list);
359 else if (aiocbe->jobstate == JOBST_JOBFINISHED)
360 TAILQ_REMOVE(&ki->kaio_jobdone, aiocbe, plist);
361 else if (aiocbe->jobstate == JOBST_JOBBFINISHED) {
362 s = splbio();
363 TAILQ_REMOVE(&ki->kaio_bufdone, aiocbe, plist);
364 splx(s);
365 if (aiocbe->bp) {
366 vunmapbuf(aiocbe->bp);
367 relpbuf(aiocbe->bp, NULL);
368 aiocbe->bp = NULL;
369 }
370 }
371 if (lj && (lj->lioj_buffer_count == 0) && (lj->lioj_queue_count == 0)) {
372 TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
373 zfree(aiolio_zone, lj);
374 }
375 TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
376 aiocbe->jobstate = JOBST_NULL;
377 return 0;
378}
379#endif /* VFS_AIO */
380
381/*
382 * Rundown the jobs for a given process.
383 */
384void
385aio_proc_rundown(struct proc *p)
386{
387#ifndef VFS_AIO
388 return;
389#else
390 int s;
391 struct kaioinfo *ki;
392 struct aio_liojob *lj, *ljn;
393 struct aiocblist *aiocbe, *aiocbn;
394 struct file *fp;
395 struct filedesc *fdp;
396 struct socket *so;
397
398 ki = p->p_aioinfo;
399 if (ki == NULL)
400 return;
401
402 ki->kaio_flags |= LIOJ_SIGNAL_POSTED;
403 while ((ki->kaio_active_count > 0) || (ki->kaio_buffer_count >
404 ki->kaio_buffer_finished_count)) {
405 ki->kaio_flags |= KAIO_RUNDOWN;
406 if (tsleep(p, PRIBIO, "kaiowt", aiod_timeout))
407 break;
408 }
409
410 /*
411 * Move any aio ops that are waiting on socket I/O to the normal job
412 * queues so they are cleaned up with any others.
413 */
414 fdp = p->p_fd;
415
416 s = splnet();
417 for (aiocbe = TAILQ_FIRST(&ki->kaio_sockqueue); aiocbe; aiocbe =
418 aiocbn) {
419 aiocbn = TAILQ_NEXT(aiocbe, plist);
420 fp = fdp->fd_ofiles[aiocbe->uaiocb.aio_fildes];
421
422 /*
423 * Under some circumstances, the aio_fildes and the file
424 * structure don't match. This would leave aiocbe's in the
425 * TAILQ associated with the socket and cause a panic later.
426 *
427 * Detect and fix.
428 */
429 if ((fp == NULL) || (fp != aiocbe->fd_file))
430 fp = aiocbe->fd_file;
431 if (fp) {
432 so = (struct socket *)fp->f_data;
433 TAILQ_REMOVE(&so->so_aiojobq, aiocbe, list);
434 if (TAILQ_EMPTY(&so->so_aiojobq)) {
435 so->so_snd.sb_flags &= ~SB_AIO;
436 so->so_rcv.sb_flags &= ~SB_AIO;
437 }
438 }
439 TAILQ_REMOVE(&ki->kaio_sockqueue, aiocbe, plist);
440 TAILQ_INSERT_HEAD(&aio_jobs, aiocbe, list);
441 TAILQ_INSERT_HEAD(&ki->kaio_jobqueue, aiocbe, plist);
442 }
443 splx(s);
444
445restart1:
446 for (aiocbe = TAILQ_FIRST(&ki->kaio_jobdone); aiocbe; aiocbe = aiocbn) {
447 aiocbn = TAILQ_NEXT(aiocbe, plist);
448 if (aio_free_entry(aiocbe))
449 goto restart1;
450 }
451
452restart2:
453 for (aiocbe = TAILQ_FIRST(&ki->kaio_jobqueue); aiocbe; aiocbe =
454 aiocbn) {
455 aiocbn = TAILQ_NEXT(aiocbe, plist);
456 if (aio_free_entry(aiocbe))
457 goto restart2;
458 }
459
460/*
461 * Note the use of lots of splbio here, trying to avoid splbio for long chains
462 * of I/O. Probably unnecessary.
463 */
464restart3:
465 s = splbio();
466 while (TAILQ_FIRST(&ki->kaio_bufqueue)) {
467 ki->kaio_flags |= KAIO_WAKEUP;
468 tsleep(p, PRIBIO, "aioprn", 0);
469 splx(s);
470 goto restart3;
471 }
472 splx(s);
473
474restart4:
475 s = splbio();
476 for (aiocbe = TAILQ_FIRST(&ki->kaio_bufdone); aiocbe; aiocbe = aiocbn) {
477 aiocbn = TAILQ_NEXT(aiocbe, plist);
478 if (aio_free_entry(aiocbe)) {
479 splx(s);
480 goto restart4;
481 }
482 }
483 splx(s);
484
485 for (lj = TAILQ_FIRST(&ki->kaio_liojoblist); lj; lj = ljn) {
486 ljn = TAILQ_NEXT(lj, lioj_list);
487 if ((lj->lioj_buffer_count == 0) && (lj->lioj_queue_count ==
488 0)) {
489 TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
490 zfree(aiolio_zone, lj);
491 } else {
492#ifdef DIAGNOSTIC
493 printf("LIO job not cleaned up: B:%d, BF:%d, Q:%d, "
494 "QF:%d\n", lj->lioj_buffer_count,
495 lj->lioj_buffer_finished_count,
496 lj->lioj_queue_count,
497 lj->lioj_queue_finished_count);
498#endif
499 }
500 }
501
502 zfree(kaio_zone, ki);
503 p->p_aioinfo = NULL;
504#endif /* VFS_AIO */
505}
506
507#ifdef VFS_AIO
508/*
509 * Select a job to run (called by an AIO daemon).
510 */
511static struct aiocblist *
512aio_selectjob(struct aioproclist *aiop)
513{
514 int s;
515 struct aiocblist *aiocbe;
516 struct kaioinfo *ki;
517 struct proc *userp;
518
519 aiocbe = TAILQ_FIRST(&aiop->jobtorun);
520 if (aiocbe) {
521 TAILQ_REMOVE(&aiop->jobtorun, aiocbe, list);
522 return aiocbe;
523 }
524
525 s = splnet();
526 for (aiocbe = TAILQ_FIRST(&aio_jobs); aiocbe; aiocbe =
527 TAILQ_NEXT(aiocbe, list)) {
528 userp = aiocbe->userproc;
529 ki = userp->p_aioinfo;
530
531 if (ki->kaio_active_count < ki->kaio_maxactive_count) {
532 TAILQ_REMOVE(&aio_jobs, aiocbe, list);
533 splx(s);
534 return aiocbe;
535 }
536 }
537 splx(s);
538
539 return NULL;
540}
541
542/*
543 * The AIO processing activity. This is the code that does the I/O request for
544 * the non-physio version of the operations. The normal vn operations are used,
545 * and this code should work in all instances for every type of file, including
546 * pipes, sockets, fifos, and regular files.
547 */
548void
549aio_process(struct aiocblist *aiocbe)
550{
551 struct filedesc *fdp;
552 struct proc *userp, *mycp;
553 struct aiocb *cb;
554 struct file *fp;
555 struct uio auio;
556 struct iovec aiov;
557 unsigned int fd;
558 int cnt;
559 int error;
560 off_t offset;
561 int oublock_st, oublock_end;
562 int inblock_st, inblock_end;
563
564 userp = aiocbe->userproc;
565 cb = &aiocbe->uaiocb;
566
567 mycp = curproc;
568
569 fdp = mycp->p_fd;
570 fd = cb->aio_fildes;
571 fp = fdp->fd_ofiles[fd];
572
573 if ((fp == NULL) || (fp != aiocbe->fd_file)) {
574 cb->_aiocb_private.error = EBADF;
575 cb->_aiocb_private.status = -1;
576 return;
577 }
578
579 aiov.iov_base = (void *)cb->aio_buf;
580 aiov.iov_len = cb->aio_nbytes;
581
582 auio.uio_iov = &aiov;
583 auio.uio_iovcnt = 1;
584 auio.uio_offset = offset = cb->aio_offset;
585 auio.uio_resid = cb->aio_nbytes;
586 cnt = cb->aio_nbytes;
587 auio.uio_segflg = UIO_USERSPACE;
588 auio.uio_procp = mycp;
589
590 inblock_st = mycp->p_stats->p_ru.ru_inblock;
591 oublock_st = mycp->p_stats->p_ru.ru_oublock;
592 /*
593 * Temporarily bump the ref count while reading to avoid the
594 * descriptor being ripped out from under us.
595 */
596 fhold(fp);
597 if (cb->aio_lio_opcode == LIO_READ) {
598 auio.uio_rw = UIO_READ;
599 error = fo_read(fp, &auio, fp->f_cred, FOF_OFFSET, mycp);
600 } else {
601 auio.uio_rw = UIO_WRITE;
602 error = fo_write(fp, &auio, fp->f_cred, FOF_OFFSET, mycp);
603 }
604 fdrop(fp, mycp);
605 inblock_end = mycp->p_stats->p_ru.ru_inblock;
606 oublock_end = mycp->p_stats->p_ru.ru_oublock;
607
608 aiocbe->inputcharge = inblock_end - inblock_st;
609 aiocbe->outputcharge = oublock_end - oublock_st;
610
611 if ((error) && (auio.uio_resid != cnt)) {
612 if (error == ERESTART || error == EINTR || error == EWOULDBLOCK)
613 error = 0;
614 if ((error == EPIPE) && (cb->aio_lio_opcode == LIO_WRITE))
615 psignal(userp, SIGPIPE);
616 }
617
618 cnt -= auio.uio_resid;
619 cb->_aiocb_private.error = error;
620 cb->_aiocb_private.status = cnt;
621
622 return;
623}
624
625/*
 626 * The AIO daemon; most of the actual work is done in aio_process(),
 627 * but the setup (and address space management) is done in this routine.
628 */
629static void
630aio_daemon(void *uproc)
631{
632 int s;
633 struct aio_liojob *lj;
634 struct aiocb *cb;
635 struct aiocblist *aiocbe;
636 struct aioproclist *aiop;
637 struct kaioinfo *ki;
638 struct proc *curcp, *mycp, *userp;
639 struct vmspace *myvm, *tmpvm;
640
641 mtx_enter(&Giant, MTX_DEF);
641 mtx_lock(&Giant);
642 /*
643 * Local copies of curproc (cp) and vmspace (myvm)
644 */
645 mycp = curproc;
646 myvm = mycp->p_vmspace;
647
648 if (mycp->p_textvp) {
649 vrele(mycp->p_textvp);
650 mycp->p_textvp = NULL;
651 }
652
653 /*
654 * Allocate and ready the aio control info. There is one aiop structure
655 * per daemon.
656 */
657 aiop = zalloc(aiop_zone);
658 aiop->aioproc = mycp;
659 aiop->aioprocflags |= AIOP_FREE;
660 TAILQ_INIT(&aiop->jobtorun);
661
662 s = splnet();
663
664 /*
665 * Place thread (lightweight process) onto the AIO free thread list.
666 */
667 if (TAILQ_EMPTY(&aio_freeproc))
668 wakeup(&aio_freeproc);
669 TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);
670
671 splx(s);
672
673 /* Make up a name for the daemon. */
674 strcpy(mycp->p_comm, "aiod");
675
676 /*
677 * Get rid of our current filedescriptors. AIOD's don't need any
678 * filedescriptors, except as temporarily inherited from the client.
679 * Credentials are also cloned, and made equivalent to "root".
680 */
681 fdfree(mycp);
682 mycp->p_fd = NULL;
683 mycp->p_ucred = crcopy(mycp->p_ucred);
684 mycp->p_ucred->cr_uid = 0;
685 uifree(mycp->p_ucred->cr_uidinfo);
686 mycp->p_ucred->cr_uidinfo = uifind(0);
687 mycp->p_ucred->cr_ngroups = 1;
688 mycp->p_ucred->cr_groups[0] = 1;
689
690 /* The daemon resides in its own pgrp. */
691 enterpgrp(mycp, mycp->p_pid, 1);
692
693 /* Mark special process type. */
694 mycp->p_flag |= P_SYSTEM;
695
696 /*
697 * Wakeup parent process. (Parent sleeps to keep from blasting away
 698 * creating too many daemons.)
699 */
700 wakeup(mycp);
701
702 for (;;) {
703 /*
704 * curcp is the current daemon process context.
705 * userp is the current user process context.
706 */
707 curcp = mycp;
708
709 /*
710 * Take daemon off of free queue
711 */
712 if (aiop->aioprocflags & AIOP_FREE) {
713 s = splnet();
714 TAILQ_REMOVE(&aio_freeproc, aiop, list);
715 TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list);
716 aiop->aioprocflags &= ~AIOP_FREE;
717 splx(s);
718 }
719 aiop->aioprocflags &= ~AIOP_SCHED;
720
721 /*
722 * Check for jobs.
723 */
724 while ((aiocbe = aio_selectjob(aiop)) != NULL) {
725 cb = &aiocbe->uaiocb;
726 userp = aiocbe->userproc;
727
728 aiocbe->jobstate = JOBST_JOBRUNNING;
729
730 /*
731 * Connect to process address space for user program.
732 */
733 if (userp != curcp) {
734 /*
735 * Save the current address space that we are
736 * connected to.
737 */
738 tmpvm = mycp->p_vmspace;
739
740 /*
741 * Point to the new user address space, and
742 * refer to it.
743 */
744 mycp->p_vmspace = userp->p_vmspace;
745 mycp->p_vmspace->vm_refcnt++;
746
747 /* Activate the new mapping. */
748 pmap_activate(mycp);
749
750 /*
 751 * If the old address space wasn't the daemon's
752 * own address space, then we need to remove the
753 * daemon's reference from the other process
754 * that it was acting on behalf of.
755 */
756 if (tmpvm != myvm) {
757 vmspace_free(tmpvm);
758 }
759
760 /*
 761 * Disassociate from the previous client's file
 762 * descriptors, and associate to the new client's
 763 * descriptors. Note that the daemon doesn't
 764 * need to worry about its original descriptors,
765 * because they were originally freed.
766 */
767 if (mycp->p_fd)
768 fdfree(mycp);
769 mycp->p_fd = fdshare(userp);
770 curcp = userp;
771 }
772
773 ki = userp->p_aioinfo;
774 lj = aiocbe->lio;
775
776 /* Account for currently active jobs. */
777 ki->kaio_active_count++;
778
779 /* Do the I/O function. */
780 aiocbe->jobaioproc = aiop;
781 aio_process(aiocbe);
782
783 /* Decrement the active job count. */
784 ki->kaio_active_count--;
785
786 /*
787 * Increment the completion count for wakeup/signal
788 * comparisons.
789 */
790 aiocbe->jobflags |= AIOCBLIST_DONE;
791 ki->kaio_queue_finished_count++;
792 if (lj)
793 lj->lioj_queue_finished_count++;
794 if ((ki->kaio_flags & KAIO_WAKEUP) || ((ki->kaio_flags
795 & KAIO_RUNDOWN) && (ki->kaio_active_count == 0))) {
796 ki->kaio_flags &= ~KAIO_WAKEUP;
797 wakeup(userp);
798 }
799
800 s = splbio();
801 if (lj && (lj->lioj_flags &
802 (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) == LIOJ_SIGNAL) {
803 if ((lj->lioj_queue_finished_count ==
804 lj->lioj_queue_count) &&
805 (lj->lioj_buffer_finished_count ==
806 lj->lioj_buffer_count)) {
807 psignal(userp,
808 lj->lioj_signal.sigev_signo);
809 lj->lioj_flags |=
810 LIOJ_SIGNAL_POSTED;
811 }
812 }
813 splx(s);
814
815 aiocbe->jobstate = JOBST_JOBFINISHED;
816
817 /*
818 * If the I/O request should be automatically rundown,
819 * do the needed cleanup. Otherwise, place the queue
820 * entry for the just finished I/O request into the done
821 * queue for the associated client.
822 */
823 s = splnet();
824 if (aiocbe->jobflags & AIOCBLIST_ASYNCFREE) {
825 aiocbe->jobflags &= ~AIOCBLIST_ASYNCFREE;
826 TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
827 } else {
828 TAILQ_REMOVE(&ki->kaio_jobqueue, aiocbe, plist);
829 TAILQ_INSERT_TAIL(&ki->kaio_jobdone, aiocbe,
830 plist);
831 }
832 splx(s);
833 KNOTE(&aiocbe->klist, 0);
834
835 if (aiocbe->jobflags & AIOCBLIST_RUNDOWN) {
836 wakeup(aiocbe);
837 aiocbe->jobflags &= ~AIOCBLIST_RUNDOWN;
838 }
839
840 if (cb->aio_sigevent.sigev_notify == SIGEV_SIGNAL) {
841 psignal(userp, cb->aio_sigevent.sigev_signo);
842 }
843 }
844
845 /*
846 * Disconnect from user address space.
847 */
848 if (curcp != mycp) {
849 /* Get the user address space to disconnect from. */
850 tmpvm = mycp->p_vmspace;
851
852 /* Get original address space for daemon. */
853 mycp->p_vmspace = myvm;
854
855 /* Activate the daemon's address space. */
856 pmap_activate(mycp);
857#ifdef DIAGNOSTIC
858 if (tmpvm == myvm) {
859 printf("AIOD: vmspace problem -- %d\n",
860 mycp->p_pid);
861 }
862#endif
863 /* Remove our vmspace reference. */
864 vmspace_free(tmpvm);
865
866 /*
867 * Disassociate from the user process's file
868 * descriptors.
869 */
870 if (mycp->p_fd)
871 fdfree(mycp);
872 mycp->p_fd = NULL;
873 curcp = mycp;
874 }
875
876 /*
877 * If we are the first to be put onto the free queue, wakeup
878 * anyone waiting for a daemon.
879 */
880 s = splnet();
881 TAILQ_REMOVE(&aio_activeproc, aiop, list);
882 if (TAILQ_EMPTY(&aio_freeproc))
883 wakeup(&aio_freeproc);
884 TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);
885 aiop->aioprocflags |= AIOP_FREE;
886 splx(s);
887
888 /*
889 * If daemon is inactive for a long time, allow it to exit,
890 * thereby freeing resources.
891 */
892 if (((aiop->aioprocflags & AIOP_SCHED) == 0) && tsleep(mycp,
893 PRIBIO, "aiordy", aiod_lifetime)) {
894 s = splnet();
895 if ((TAILQ_FIRST(&aio_jobs) == NULL) &&
896 (TAILQ_FIRST(&aiop->jobtorun) == NULL)) {
897 if ((aiop->aioprocflags & AIOP_FREE) &&
898 (num_aio_procs > target_aio_procs)) {
899 TAILQ_REMOVE(&aio_freeproc, aiop, list);
900 splx(s);
901 zfree(aiop_zone, aiop);
902 num_aio_procs--;
903#ifdef DIAGNOSTIC
904 if (mycp->p_vmspace->vm_refcnt <= 1) {
905 printf("AIOD: bad vm refcnt for"
906 " exiting daemon: %d\n",
907 mycp->p_vmspace->vm_refcnt);
908 }
909#endif
910 exit1(mycp, 0);
911 }
912 }
913 splx(s);
914 }
915 }
916}
917
918/*
919 * Create a new AIO daemon. This is mostly a kernel-thread fork routine. The
920 * AIO daemon modifies its environment itself.
921 */
922static int
923aio_newproc()
924{
925 int error;
926 struct proc *p, *np;
927
928 p = &proc0;
929 error = fork1(p, RFPROC|RFMEM|RFNOWAIT, &np);
930 if (error)
931 return error;
932 cpu_set_fork_handler(np, aio_daemon, curproc);
933
934 /*
935 * Wait until daemon is started, but continue on just in case to
936 * handle error conditions.
937 */
938 error = tsleep(np, PZERO, "aiosta", aiod_timeout);
939 num_aio_procs++;
940
941 return error;
942}
943
944/*
945 * Try the high-performance physio method for eligible VCHR devices. This
 946 * routine doesn't require the use of any additional threads, and thus avoids their overhead.
947 */
948int
949aio_qphysio(struct proc *p, struct aiocblist *aiocbe)
950{
951 int error;
952 struct aiocb *cb;
953 struct file *fp;
954 struct buf *bp;
955 struct vnode *vp;
956 struct kaioinfo *ki;
957 struct filedesc *fdp;
958 struct aio_liojob *lj;
959 int fd;
960 int s;
961 int notify;
962
963 cb = &aiocbe->uaiocb;
964 fdp = p->p_fd;
965 fd = cb->aio_fildes;
966 fp = fdp->fd_ofiles[fd];
967
968 if (fp->f_type != DTYPE_VNODE)
969 return (-1);
970
971 vp = (struct vnode *)fp->f_data;
972
973 /*
 974 * If it's not a disk, we don't want to return a positive error.
975 * It causes the aio code to not fall through to try the thread
976 * way when you're talking to a regular file.
977 */
978 if (!vn_isdisk(vp, &error)) {
979 if (error == ENOTBLK)
980 return (-1);
981 else
982 return (error);
983 }
984
985 if (cb->aio_nbytes % vp->v_rdev->si_bsize_phys)
986 return (-1);
987
988 if (cb->aio_nbytes > MAXPHYS)
989 return (-1);
990
991 ki = p->p_aioinfo;
992 if (ki->kaio_buffer_count >= ki->kaio_ballowed_count)
993 return (-1);
994
995 fhold(fp);
996
997 ki->kaio_buffer_count++;
998
999 lj = aiocbe->lio;
1000 if (lj)
1001 lj->lioj_buffer_count++;
1002
1003 /* Create and build a buffer header for a transfer. */
1004 bp = (struct buf *)getpbuf(NULL);
1005
1006 /*
1007 * Get a copy of the kva from the physical buffer.
1008 */
1009 bp->b_caller1 = p;
1010 bp->b_dev = vp->v_rdev;
1011 error = bp->b_error = 0;
1012
1013 bp->b_bcount = cb->aio_nbytes;
1014 bp->b_bufsize = cb->aio_nbytes;
1015 bp->b_flags = B_PHYS;
1016 bp->b_iodone = aio_physwakeup;
1017 bp->b_saveaddr = bp->b_data;
1018 bp->b_data = (void *)cb->aio_buf;
1019 bp->b_blkno = btodb(cb->aio_offset);
1020
1021 if (cb->aio_lio_opcode == LIO_WRITE) {
1022 bp->b_iocmd = BIO_WRITE;
1023 if (!useracc(bp->b_data, bp->b_bufsize, VM_PROT_READ)) {
1024 error = EFAULT;
1025 goto doerror;
1026 }
1027 } else {
1028 bp->b_iocmd = BIO_READ;
1029 if (!useracc(bp->b_data, bp->b_bufsize, VM_PROT_WRITE)) {
1030 error = EFAULT;
1031 goto doerror;
1032 }
1033 }
1034
1035 /* Bring buffer into kernel space. */
1036 vmapbuf(bp);
1037
1038 s = splbio();
1039 aiocbe->bp = bp;
1040 bp->b_spc = (void *)aiocbe;
1041 TAILQ_INSERT_TAIL(&aio_bufjobs, aiocbe, list);
1042 TAILQ_INSERT_TAIL(&ki->kaio_bufqueue, aiocbe, plist);
1043 aiocbe->jobstate = JOBST_JOBQBUF;
1044 cb->_aiocb_private.status = cb->aio_nbytes;
1045 num_buf_aio++;
1046 bp->b_error = 0;
1047
1048 splx(s);
1049
1050 /* Perform transfer. */
1051 DEV_STRATEGY(bp, 0);
1052
1053 notify = 0;
1054 s = splbio();
1055
1056 /*
1057 * If we had an error invoking the request, or an error in processing
1058 * the request before we have returned, we process it as an error in
1059 * transfer. Note that such an I/O error is not indicated immediately,
1060 * but is returned using the aio_error mechanism. In this case,
1061 * aio_suspend will return immediately.
1062 */
1063 if (bp->b_error || (bp->b_ioflags & BIO_ERROR)) {
1064 struct aiocb *job = aiocbe->uuaiocb;
1065
1066 aiocbe->uaiocb._aiocb_private.status = 0;
1067 suword(&job->_aiocb_private.status, 0);
1068 aiocbe->uaiocb._aiocb_private.error = bp->b_error;
1069 suword(&job->_aiocb_private.error, bp->b_error);
1070
1071 ki->kaio_buffer_finished_count++;
1072
1073 if (aiocbe->jobstate != JOBST_JOBBFINISHED) {
1074 aiocbe->jobstate = JOBST_JOBBFINISHED;
1075 aiocbe->jobflags |= AIOCBLIST_DONE;
1076 TAILQ_REMOVE(&aio_bufjobs, aiocbe, list);
1077 TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist);
1078 TAILQ_INSERT_TAIL(&ki->kaio_bufdone, aiocbe, plist);
1079 notify = 1;
1080 }
1081 }
1082 splx(s);
1083 if (notify)
1084 KNOTE(&aiocbe->klist, 0);
1085 fdrop(fp, p);
1086 return 0;
1087
1088doerror:
1089 ki->kaio_buffer_count--;
1090 if (lj)
1091 lj->lioj_buffer_count--;
1092 aiocbe->bp = NULL;
1093 relpbuf(bp, NULL);
1094 fdrop(fp, p);
1095 return error;
1096}
1097
1098/*
1099 * This waits/tests physio completion.
1100 */
1101int
1102aio_fphysio(struct proc *p, struct aiocblist *iocb, int flgwait)
1103{
1104 int s;
1105 struct buf *bp;
1106 int error;
1107
1108 bp = iocb->bp;
1109
1110 s = splbio();
1111 if (flgwait == 0) {
1112 if ((bp->b_flags & B_DONE) == 0) {
1113 splx(s);
1114 return EINPROGRESS;
1115 }
1116 }
1117
1118 while ((bp->b_flags & B_DONE) == 0) {
1119 if (tsleep((caddr_t)bp, PRIBIO, "physstr", aiod_timeout)) {
1120 if ((bp->b_flags & B_DONE) == 0) {
1121 splx(s);
1122 return EINPROGRESS;
1123 } else
1124 break;
1125 }
1126 }
1127
1128 /* Release mapping into kernel space. */
1129 vunmapbuf(bp);
1130 iocb->bp = 0;
1131
1132 error = 0;
1133
1134 /* Check for an error. */
1135 if (bp->b_ioflags & BIO_ERROR)
1136 error = bp->b_error;
1137
1138 relpbuf(bp, NULL);
1139 return (error);
1140}
1141#endif /* VFS_AIO */
1142
1143/*
1144 * Wake up aio requests that may be serviceable now.
1145 */
1146void
1147aio_swake(struct socket *so, struct sockbuf *sb)
1148{
1149#ifndef VFS_AIO
1150 return;
1151#else
1152 struct aiocblist *cb,*cbn;
1153 struct proc *p;
1154 struct kaioinfo *ki = NULL;
1155 int opcode, wakecount = 0;
1156 struct aioproclist *aiop;
1157
1158 if (sb == &so->so_snd) {
1159 opcode = LIO_WRITE;
1160 so->so_snd.sb_flags &= ~SB_AIO;
1161 } else {
1162 opcode = LIO_READ;
1163 so->so_rcv.sb_flags &= ~SB_AIO;
1164 }
1165
1166 for (cb = TAILQ_FIRST(&so->so_aiojobq); cb; cb = cbn) {
1167 cbn = TAILQ_NEXT(cb, list);
1168 if (opcode == cb->uaiocb.aio_lio_opcode) {
1169 p = cb->userproc;
1170 ki = p->p_aioinfo;
1171 TAILQ_REMOVE(&so->so_aiojobq, cb, list);
1172 TAILQ_REMOVE(&ki->kaio_sockqueue, cb, plist);
1173 TAILQ_INSERT_TAIL(&aio_jobs, cb, list);
1174 TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, cb, plist);
1175 wakecount++;
1176 if (cb->jobstate != JOBST_JOBQGLOBAL)
1177 panic("invalid queue value");
1178 }
1179 }
1180
1181 while (wakecount--) {
1182 if ((aiop = TAILQ_FIRST(&aio_freeproc)) != 0) {
1183 TAILQ_REMOVE(&aio_freeproc, aiop, list);
1184 TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list);
1185 aiop->aioprocflags &= ~AIOP_FREE;
1186 wakeup(aiop->aioproc);
1187 }
1188 }
1189#endif /* VFS_AIO */
1190}
1191
1192#ifdef VFS_AIO
1193/*
1194 * Queue a new AIO request. Choosing either the threaded or direct physio VCHR
1195 * technique is done in this code.
1196 */
1197static int
1198_aio_aqueue(struct proc *p, struct aiocb *job, struct aio_liojob *lj, int type)
1199{
1200 struct filedesc *fdp;
1201 struct file *fp;
1202 unsigned int fd;
1203 struct socket *so;
1204 int s;
1205 int error;
1206 int opcode;
1207 struct aiocblist *aiocbe;
1208 struct aioproclist *aiop;
1209 struct kaioinfo *ki;
1210 struct kevent kev;
1211 struct kqueue *kq;
1212 struct file *kq_fp;
1213
1214 if ((aiocbe = TAILQ_FIRST(&aio_freejobs)) != NULL)
1215 TAILQ_REMOVE(&aio_freejobs, aiocbe, list);
1216 else
1217 aiocbe = zalloc (aiocb_zone);
1218
1219 aiocbe->inputcharge = 0;
1220 aiocbe->outputcharge = 0;
1221 SLIST_INIT(&aiocbe->klist);
1222
1223 suword(&job->_aiocb_private.status, -1);
1224 suword(&job->_aiocb_private.error, 0);
1225 suword(&job->_aiocb_private.kernelinfo, -1);
1226
1227 error = copyin((caddr_t)job, (caddr_t) &aiocbe->uaiocb, sizeof
1228 aiocbe->uaiocb);
1229 if (error) {
1230 suword(&job->_aiocb_private.error, error);
1231
1232 TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
1233 return error;
1234 }
1235
1236 /* Save userspace address of the job info. */
1237 aiocbe->uuaiocb = job;
1238
1239 /* Get the opcode. */
1240 if (type != LIO_NOP)
1241 aiocbe->uaiocb.aio_lio_opcode = type;
1242 opcode = aiocbe->uaiocb.aio_lio_opcode;
1243
1244 /* Get the fd info for process. */
1245 fdp = p->p_fd;
1246
1247 /*
1248 * Range check file descriptor.
1249 */
1250 fd = aiocbe->uaiocb.aio_fildes;
1251 if (fd >= fdp->fd_nfiles) {
1252 TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
1253 if (type == 0)
1254 suword(&job->_aiocb_private.error, EBADF);
1255 return EBADF;
1256 }
1257
1258 fp = aiocbe->fd_file = fdp->fd_ofiles[fd];
1259 if ((fp == NULL) || ((opcode == LIO_WRITE) && ((fp->f_flag & FWRITE) ==
1260 0))) {
1261 TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
1262 if (type == 0)
1263 suword(&job->_aiocb_private.error, EBADF);
1264 return EBADF;
1265 }
1266
1267 if (aiocbe->uaiocb.aio_offset == -1LL) {
1268 TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
1269 if (type == 0)
1270 suword(&job->_aiocb_private.error, EINVAL);
1271 return EINVAL;
1272 }
1273
1274 error = suword(&job->_aiocb_private.kernelinfo, jobrefid);
1275 if (error) {
1276 TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
1277 if (type == 0)
1278 suword(&job->_aiocb_private.error, EINVAL);
1279 return error;
1280 }
1281
1282 aiocbe->uaiocb._aiocb_private.kernelinfo = (void *)(intptr_t)jobrefid;
1283 if (jobrefid == LONG_MAX)
1284 jobrefid = 1;
1285 else
1286 jobrefid++;
1287
1288 if (opcode == LIO_NOP) {
1289 TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
1290 if (type == 0) {
1291 suword(&job->_aiocb_private.error, 0);
1292 suword(&job->_aiocb_private.status, 0);
1293 suword(&job->_aiocb_private.kernelinfo, 0);
1294 }
1295 return 0;
1296 }
1297
1298 if ((opcode != LIO_READ) && (opcode != LIO_WRITE)) {
1299 TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
1300 if (type == 0) {
1301 suword(&job->_aiocb_private.status, 0);
1302 suword(&job->_aiocb_private.error, EINVAL);
1303 }
1304 return EINVAL;
1305 }
1306
1307 fhold(fp);
1308
1309 if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_KEVENT) {
1310 kev.ident = aiocbe->uaiocb.aio_sigevent.sigev_notify_kqueue;
1311 kev.udata = aiocbe->uaiocb.aio_sigevent.sigev_value.sigval_ptr;
1312 }
1313 else {
1314 /*
1315 * This method for requesting kevent-based notification won't
1316 * work on the alpha, since we're passing in a pointer
1317 * via aio_lio_opcode, which is an int. Use the SIGEV_KEVENT-
1318 * based method instead.
1319 */
1320 struct kevent *kevp;
1321
1322 kevp = (struct kevent *)job->aio_lio_opcode;
1323 if (kevp == NULL)
1324 goto no_kqueue;
1325
1326 error = copyin((caddr_t)kevp, (caddr_t)&kev, sizeof(kev));
1327 if (error)
1328 goto aqueue_fail;
1329 }
1330 if ((u_int)kev.ident >= fdp->fd_nfiles ||
1331 (kq_fp = fdp->fd_ofiles[kev.ident]) == NULL ||
1332 (kq_fp->f_type != DTYPE_KQUEUE)) {
1333 error = EBADF;
1334 goto aqueue_fail;
1335 }
1336 kq = (struct kqueue *)kq_fp->f_data;
1337 kev.ident = (uintptr_t)aiocbe;
1338 kev.filter = EVFILT_AIO;
1339 kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1;
1340 error = kqueue_register(kq, &kev, p);
1341aqueue_fail:
1342 if (error) {
1343 TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
1344 if (type == 0)
1345 suword(&job->_aiocb_private.error, error);
1346 goto done;
1347 }
1348no_kqueue:
1349
1350 suword(&job->_aiocb_private.error, EINPROGRESS);
1351 aiocbe->uaiocb._aiocb_private.error = EINPROGRESS;
1352 aiocbe->userproc = p;
1353 aiocbe->jobflags = 0;
1354 aiocbe->lio = lj;
1355 ki = p->p_aioinfo;
1356
1357 if (fp->f_type == DTYPE_SOCKET) {
1358 /*
1359 * Alternate queueing for socket ops: Reach down into the
1360 * descriptor to get the socket data. Then check to see if the
1361 * socket is ready to be read or written (based on the requested
1362 * operation).
1363 *
1364 * If it is not ready for io, then queue the aiocbe on the
1365 * socket, and set the flags so we get a call when sbnotify()
1366 * happens.
1367 */
1368 so = (struct socket *)fp->f_data;
1369 s = splnet();
1370 if (((opcode == LIO_READ) && (!soreadable(so))) || ((opcode ==
1371 LIO_WRITE) && (!sowriteable(so)))) {
1372 TAILQ_INSERT_TAIL(&so->so_aiojobq, aiocbe, list);
1373 TAILQ_INSERT_TAIL(&ki->kaio_sockqueue, aiocbe, plist);
1374 if (opcode == LIO_READ)
1375 so->so_rcv.sb_flags |= SB_AIO;
1376 else
1377 so->so_snd.sb_flags |= SB_AIO;
1378 aiocbe->jobstate = JOBST_JOBQGLOBAL; /* XXX */
1379 ki->kaio_queue_count++;
1380 num_queue_count++;
1381 splx(s);
1382 error = 0;
1383 goto done;
1384 }
1385 splx(s);
1386 }
1387
1388 if ((error = aio_qphysio(p, aiocbe)) == 0)
1389 goto done;
1390 if (error > 0) {
1391 suword(&job->_aiocb_private.status, 0);
1392 aiocbe->uaiocb._aiocb_private.error = error;
1393 suword(&job->_aiocb_private.error, error);
1394 goto done;
1395 }
1396
1397 /* No buffer for daemon I/O. */
1398 aiocbe->bp = NULL;
1399
1400 ki->kaio_queue_count++;
1401 if (lj)
1402 lj->lioj_queue_count++;
1403 s = splnet();
1404 TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, aiocbe, plist);
1405 TAILQ_INSERT_TAIL(&aio_jobs, aiocbe, list);
1406 splx(s);
1407 aiocbe->jobstate = JOBST_JOBQGLOBAL;
1408
1409 num_queue_count++;
1410 error = 0;
1411
1412 /*
1413 * If we don't have a free AIO process, and we are below our quota, then
1414 * start one. Otherwise, depend on the subsequent I/O completions to
1415	 * pick up this job. If we don't successfully create the new process
1416 * (thread) due to resource issues, we return an error for now (EAGAIN),
1417 * which is likely not the correct thing to do.
1418 */
1419retryproc:
1420 s = splnet();
1421 if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) {
1422 TAILQ_REMOVE(&aio_freeproc, aiop, list);
1423 TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list);
1424 aiop->aioprocflags &= ~AIOP_FREE;
1425 wakeup(aiop->aioproc);
1426 } else if (((num_aio_resv_start + num_aio_procs) < max_aio_procs) &&
1427 ((ki->kaio_active_count + num_aio_resv_start) <
1428 ki->kaio_maxactive_count)) {
1429 num_aio_resv_start++;
1430 if ((error = aio_newproc()) == 0) {
1431 num_aio_resv_start--;
1432 p->p_retval[0] = 0;
1433 goto retryproc;
1434 }
1435 num_aio_resv_start--;
1436 }
1437 splx(s);
1438done:
1439 fdrop(fp, p);
1440 return error;
1441}
1442
1443/*
1444 * This routine queues an AIO request, checking for quotas.
1445 */
1446static int
1447aio_aqueue(struct proc *p, struct aiocb *job, int type)
1448{
1449 struct kaioinfo *ki;
1450
1451 if (p->p_aioinfo == NULL)
1452 aio_init_aioinfo(p);
1453
1454 if (num_queue_count >= max_queue_count)
1455 return EAGAIN;
1456
1457 ki = p->p_aioinfo;
1458 if (ki->kaio_queue_count >= ki->kaio_qallowed_count)
1459 return EAGAIN;
1460
1461 return _aio_aqueue(p, job, NULL, type);
1462}
1463#endif /* VFS_AIO */
1464
1465/*
1466	 * Support the aio_return system call; as a side effect, kernel resources are
1467 * released.
1468 */
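/*
 * Usage note (added; not part of the original source): userland is expected
 * to call aio_return() exactly once per request, and only after aio_error()
 * stops reporting EINPROGRESS; the aio_free_entry() calls below are what
 * release the kernel-side bookkeeping for the request.
 */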
1469int
1470aio_return(struct proc *p, struct aio_return_args *uap)
1471{
1472#ifndef VFS_AIO
1473 return ENOSYS;
1474#else
1475 int s;
1476 int jobref;
1477 struct aiocblist *cb, *ncb;
1478 struct aiocb *ujob;
1479 struct kaioinfo *ki;
1480
1481 ki = p->p_aioinfo;
1482 if (ki == NULL)
1483 return EINVAL;
1484
1485 ujob = uap->aiocbp;
1486
1487 jobref = fuword(&ujob->_aiocb_private.kernelinfo);
1488 if (jobref == -1 || jobref == 0)
1489 return EINVAL;
1490
1491 s = splnet();
1492 for (cb = TAILQ_FIRST(&ki->kaio_jobdone); cb; cb = TAILQ_NEXT(cb,
1493 plist)) {
1494 if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) ==
1495 jobref) {
1496 splx(s);
1497 if (ujob == cb->uuaiocb) {
1498 p->p_retval[0] =
1499 cb->uaiocb._aiocb_private.status;
1500 } else
1501 p->p_retval[0] = EFAULT;
1502 if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
1503 curproc->p_stats->p_ru.ru_oublock +=
1504 cb->outputcharge;
1505 cb->outputcharge = 0;
1506 } else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
1507 curproc->p_stats->p_ru.ru_inblock +=
1508 cb->inputcharge;
1509 cb->inputcharge = 0;
1510 }
1511 aio_free_entry(cb);
1512 return 0;
1513 }
1514 }
1515 splx(s);
1516
1517 s = splbio();
1518 for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb = ncb) {
1519 ncb = TAILQ_NEXT(cb, plist);
1520 if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo)
1521 == jobref) {
1522 splx(s);
1523 if (ujob == cb->uuaiocb) {
1524 p->p_retval[0] =
1525 cb->uaiocb._aiocb_private.status;
1526 } else
1527 p->p_retval[0] = EFAULT;
1528 aio_free_entry(cb);
1529 return 0;
1530 }
1531 }
1532 splx(s);
1533
1534 return (EINVAL);
1535#endif /* VFS_AIO */
1536}
1537
1538/*
1539	 * Allow a process to wake up when any of the I/O requests are completed.
1540 */
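/*
 * Illustrative userland call (added; assumes the standard POSIX prototype
 * from <aio.h>), blocking until one of the listed requests completes or the
 * timeout expires:
 *
 *	const struct aiocb *list[1] = { &cb };
 *	struct timespec ts = { 1, 0 };
 *	error = aio_suspend(list, 1, &ts);
 *
 * A timeout surfaces as EAGAIN and an interrupting signal as EINTR, matching
 * the EWOULDBLOCK/ERESTART handling at the bottom of the loop below.
 */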
1541int
1542aio_suspend(struct proc *p, struct aio_suspend_args *uap)
1543{
1544#ifndef VFS_AIO
1545 return ENOSYS;
1546#else
1547 struct timeval atv;
1548 struct timespec ts;
1549 struct aiocb *const *cbptr, *cbp;
1550 struct kaioinfo *ki;
1551 struct aiocblist *cb;
1552 int i;
1553 int njoblist;
1554 int error, s, timo;
1555 int *ijoblist;
1556 struct aiocb **ujoblist;
1557
1558 if (uap->nent >= AIO_LISTIO_MAX)
1559 return EINVAL;
1560
1561 timo = 0;
1562 if (uap->timeout) {
1563 /* Get timespec struct. */
1564 if ((error = copyin(uap->timeout, &ts, sizeof(ts))) != 0)
1565 return error;
1566
1567 if (ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000)
1568 return (EINVAL);
1569
1570 TIMESPEC_TO_TIMEVAL(&atv, &ts);
1571 if (itimerfix(&atv))
1572 return (EINVAL);
1573 timo = tvtohz(&atv);
1574 }
1575
1576 ki = p->p_aioinfo;
1577 if (ki == NULL)
1578 return EAGAIN;
1579
1580 njoblist = 0;
1581 ijoblist = zalloc(aiol_zone);
1582 ujoblist = zalloc(aiol_zone);
1583 cbptr = uap->aiocbp;
1584
1585 for (i = 0; i < uap->nent; i++) {
1586 cbp = (struct aiocb *)(intptr_t)fuword((caddr_t)&cbptr[i]);
1587 if (cbp == 0)
1588 continue;
1589 ujoblist[njoblist] = cbp;
1590 ijoblist[njoblist] = fuword(&cbp->_aiocb_private.kernelinfo);
1591 njoblist++;
1592 }
1593
1594 if (njoblist == 0) {
1595 zfree(aiol_zone, ijoblist);
1596 zfree(aiol_zone, ujoblist);
1597 return 0;
1598 }
1599
1600 error = 0;
1601 for (;;) {
1602 for (cb = TAILQ_FIRST(&ki->kaio_jobdone); cb; cb =
1603 TAILQ_NEXT(cb, plist)) {
1604 for (i = 0; i < njoblist; i++) {
1605 if (((intptr_t)
1606 cb->uaiocb._aiocb_private.kernelinfo) ==
1607 ijoblist[i]) {
1608 if (ujoblist[i] != cb->uuaiocb)
1609 error = EINVAL;
1610 zfree(aiol_zone, ijoblist);
1611 zfree(aiol_zone, ujoblist);
1612 return error;
1613 }
1614 }
1615 }
1616
1617 s = splbio();
1618 for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb =
1619 TAILQ_NEXT(cb, plist)) {
1620 for (i = 0; i < njoblist; i++) {
1621 if (((intptr_t)
1622 cb->uaiocb._aiocb_private.kernelinfo) ==
1623 ijoblist[i]) {
1624 splx(s);
1625 if (ujoblist[i] != cb->uuaiocb)
1626 error = EINVAL;
1627 zfree(aiol_zone, ijoblist);
1628 zfree(aiol_zone, ujoblist);
1629 return error;
1630 }
1631 }
1632 }
1633
1634 ki->kaio_flags |= KAIO_WAKEUP;
1635 error = tsleep(p, PRIBIO | PCATCH, "aiospn", timo);
1636 splx(s);
1637
1638 if (error == ERESTART || error == EINTR) {
1639 zfree(aiol_zone, ijoblist);
1640 zfree(aiol_zone, ujoblist);
1641 return EINTR;
1642 } else if (error == EWOULDBLOCK) {
1643 zfree(aiol_zone, ijoblist);
1644 zfree(aiol_zone, ujoblist);
1645 return EAGAIN;
1646 }
1647 }
1648
1649/* NOTREACHED */
1650 return EINVAL;
1651#endif /* VFS_AIO */
1652}
1653
1654/*
1655 * aio_cancel cancels any non-physio aio operations not currently in
1656 * progress.
1657 */
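/*
 * Added note: p_retval[0] is set below to AIO_CANCELED, AIO_NOTCANCELED or
 * AIO_ALLDONE. Requests already handed to physio are never cancelled here,
 * which is why the DTYPE_VNODE branch returns AIO_NOTCANCELED immediately
 * for disk-backed descriptors.
 */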
1658int
1659aio_cancel(struct proc *p, struct aio_cancel_args *uap)
1660{
1661#ifndef VFS_AIO
1662 return ENOSYS;
1663#else
1664 struct kaioinfo *ki;
1665 struct aiocblist *cbe, *cbn;
1666 struct file *fp;
1667 struct filedesc *fdp;
1668 struct socket *so;
1669 struct proc *po;
1670 int s,error;
1671 int cancelled=0;
1672 int notcancelled=0;
1673 struct vnode *vp;
1674
1675 fdp = p->p_fd;
1676
1677 fp = fdp->fd_ofiles[uap->fd];
1678
1679 if (fp == NULL) {
1680 return EBADF;
1681 }
1682
1683 if (fp->f_type == DTYPE_VNODE) {
1684 vp = (struct vnode *)fp->f_data;
1685
1686 if (vn_isdisk(vp,&error)) {
1687 p->p_retval[0] = AIO_NOTCANCELED;
1688 return 0;
1689 }
1690 } else if (fp->f_type == DTYPE_SOCKET) {
1691 so = (struct socket *)fp->f_data;
1692
1693 s = splnet();
1694
1695 for (cbe = TAILQ_FIRST(&so->so_aiojobq); cbe; cbe = cbn) {
1696 cbn = TAILQ_NEXT(cbe, list);
1697 if ((uap->aiocbp == NULL) ||
1698 (uap->aiocbp == cbe->uuaiocb) ) {
1699 po = cbe->userproc;
1700 ki = po->p_aioinfo;
1701 TAILQ_REMOVE(&so->so_aiojobq, cbe, list);
1702 TAILQ_REMOVE(&ki->kaio_sockqueue, cbe, plist);
1703 TAILQ_INSERT_TAIL(&ki->kaio_jobdone, cbe, plist);
1704 if (ki->kaio_flags & KAIO_WAKEUP) {
1705 wakeup(po);
1706 }
1707 cbe->jobstate = JOBST_JOBFINISHED;
1708 cbe->uaiocb._aiocb_private.status=-1;
1709 cbe->uaiocb._aiocb_private.error=ECANCELED;
1710 cancelled++;
1711/* XXX cancelled, knote? */
1712 if (cbe->uaiocb.aio_sigevent.sigev_notify ==
1713 SIGEV_SIGNAL)
1714 psignal(cbe->userproc, cbe->uaiocb.aio_sigevent.sigev_signo);
1715 if (uap->aiocbp)
1716 break;
1717 }
1718 }
1719
1720 splx(s);
1721
1722 if ((cancelled) && (uap->aiocbp)) {
1723 p->p_retval[0] = AIO_CANCELED;
1724 return 0;
1725 }
1726
1727 }
1728
1729 ki=p->p_aioinfo;
1730
1731 s = splnet();
1732
1733 for (cbe = TAILQ_FIRST(&ki->kaio_jobqueue); cbe; cbe = cbn) {
1734 cbn = TAILQ_NEXT(cbe, plist);
1735
1736 if ((uap->fd == cbe->uaiocb.aio_fildes) &&
1737 ((uap->aiocbp == NULL ) ||
1738 (uap->aiocbp == cbe->uuaiocb))) {
1739
1740 if (cbe->jobstate == JOBST_JOBQGLOBAL) {
1741 TAILQ_REMOVE(&aio_jobs, cbe, list);
1742 TAILQ_REMOVE(&ki->kaio_jobqueue, cbe, plist);
1743 TAILQ_INSERT_TAIL(&ki->kaio_jobdone, cbe,
1744 plist);
1745 cancelled++;
1746 ki->kaio_queue_finished_count++;
1747 cbe->jobstate = JOBST_JOBFINISHED;
1748 cbe->uaiocb._aiocb_private.status = -1;
1749 cbe->uaiocb._aiocb_private.error = ECANCELED;
1750/* XXX cancelled, knote? */
1751 if (cbe->uaiocb.aio_sigevent.sigev_notify ==
1752 SIGEV_SIGNAL)
1753 psignal(cbe->userproc, cbe->uaiocb.aio_sigevent.sigev_signo);
1754 } else {
1755 notcancelled++;
1756 }
1757 }
1758 }
1759
1760 splx(s);
1761
1762
1763 if (notcancelled) {
1764 p->p_retval[0] = AIO_NOTCANCELED;
1765 return 0;
1766 }
1767
1768 if (cancelled) {
1769 p->p_retval[0] = AIO_CANCELED;
1770 return 0;
1771 }
1772
1773 p->p_retval[0] = AIO_ALLDONE;
1774
1775 return 0;
1776#endif /* VFS_AIO */
1777}
1778
1779/*
1780	 * aio_error is implemented at the kernel level for compatibility purposes only.
1781 * For a user mode async implementation, it would be best to do it in a userland
1782 * subroutine.
1783 */
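/*
 * Illustrative userland polling loop (added; assumes the standard POSIX
 * prototypes), matching the EINPROGRESS convention used below:
 *
 *	while ((error = aio_error(&cb)) == EINPROGRESS)
 *		;
 *	if (error == 0)
 *		nbytes = aio_return(&cb);
 */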
1784int
1785aio_error(struct proc *p, struct aio_error_args *uap)
1786{
1787#ifndef VFS_AIO
1788 return ENOSYS;
1789#else
1790 int s;
1791 struct aiocblist *cb;
1792 struct kaioinfo *ki;
1793 int jobref;
1794
1795 ki = p->p_aioinfo;
1796 if (ki == NULL)
1797 return EINVAL;
1798
1799 jobref = fuword(&uap->aiocbp->_aiocb_private.kernelinfo);
1800 if ((jobref == -1) || (jobref == 0))
1801 return EINVAL;
1802
1803 for (cb = TAILQ_FIRST(&ki->kaio_jobdone); cb; cb = TAILQ_NEXT(cb,
1804 plist)) {
1805 if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
1806 jobref) {
1807 p->p_retval[0] = cb->uaiocb._aiocb_private.error;
1808 return 0;
1809 }
1810 }
1811
1812 s = splnet();
1813
1814 for (cb = TAILQ_FIRST(&ki->kaio_jobqueue); cb; cb = TAILQ_NEXT(cb,
1815 plist)) {
1816 if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
1817 jobref) {
1818 p->p_retval[0] = EINPROGRESS;
1819 splx(s);
1820 return 0;
1821 }
1822 }
1823
1824 for (cb = TAILQ_FIRST(&ki->kaio_sockqueue); cb; cb = TAILQ_NEXT(cb,
1825 plist)) {
1826 if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
1827 jobref) {
1828 p->p_retval[0] = EINPROGRESS;
1829 splx(s);
1830 return 0;
1831 }
1832 }
1833 splx(s);
1834
1835 s = splbio();
1836 for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb = TAILQ_NEXT(cb,
1837 plist)) {
1838 if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
1839 jobref) {
1840 p->p_retval[0] = cb->uaiocb._aiocb_private.error;
1841 splx(s);
1842 return 0;
1843 }
1844 }
1845
1846 for (cb = TAILQ_FIRST(&ki->kaio_bufqueue); cb; cb = TAILQ_NEXT(cb,
1847 plist)) {
1848 if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
1849 jobref) {
1850 p->p_retval[0] = EINPROGRESS;
1851 splx(s);
1852 return 0;
1853 }
1854 }
1855 splx(s);
1856
1857#if (0)
1858 /*
1859 * Hack for lio.
1860 */
1861 status = fuword(&uap->aiocbp->_aiocb_private.status);
1862 if (status == -1)
1863 return fuword(&uap->aiocbp->_aiocb_private.error);
1864#endif
1865 return EINVAL;
1866#endif /* VFS_AIO */
1867}
1868
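/*
 * Added note on aio_read(): unless the private AIO_PMODE_SYNC bit is set in
 * _aiocb_private.privatemodes, the request is simply queued via aio_aqueue()
 * and serviced asynchronously; the remainder of the routine is a synchronous
 * fallback that performs the read in the context of the calling process.
 */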
1869int
1870aio_read(struct proc *p, struct aio_read_args *uap)
1871{
1872#ifndef VFS_AIO
1873 return ENOSYS;
1874#else
1875 struct filedesc *fdp;
1876 struct file *fp;
1877 struct uio auio;
1878 struct iovec aiov;
1879 unsigned int fd;
1880 int cnt;
1881 struct aiocb iocb;
1882 int error, pmodes;
1883
1884 pmodes = fuword(&uap->aiocbp->_aiocb_private.privatemodes);
1885 if ((pmodes & AIO_PMODE_SYNC) == 0)
1886 return aio_aqueue(p, (struct aiocb *)uap->aiocbp, LIO_READ);
1887
1888 /* Get control block. */
1889 if ((error = copyin((caddr_t)uap->aiocbp, (caddr_t)&iocb, sizeof iocb))
1890 != 0)
1891 return error;
1892
1893 /* Get the fd info for process. */
1894 fdp = p->p_fd;
1895
1896 /*
1897 * Range check file descriptor.
1898 */
1899 fd = iocb.aio_fildes;
1900 if (fd >= fdp->fd_nfiles)
1901 return EBADF;
1902 fp = fdp->fd_ofiles[fd];
1903 if ((fp == NULL) || ((fp->f_flag & FREAD) == 0))
1904 return EBADF;
1905 if (iocb.aio_offset == -1LL)
1906 return EINVAL;
1907
1908 auio.uio_resid = iocb.aio_nbytes;
1909 if (auio.uio_resid < 0)
1910 return (EINVAL);
1911
1912 /*
1913 * Process sync simply -- queue async request.
1914 */
1915 if ((iocb._aiocb_private.privatemodes & AIO_PMODE_SYNC) == 0)
1916 return aio_aqueue(p, (struct aiocb *)uap->aiocbp, LIO_READ);
1917
1918 aiov.iov_base = (void *)iocb.aio_buf;
1919 aiov.iov_len = iocb.aio_nbytes;
1920
1921 auio.uio_iov = &aiov;
1922 auio.uio_iovcnt = 1;
1923 auio.uio_offset = iocb.aio_offset;
1924 auio.uio_rw = UIO_READ;
1925 auio.uio_segflg = UIO_USERSPACE;
1926 auio.uio_procp = p;
1927
1928 cnt = iocb.aio_nbytes;
1929 /*
1930 * Temporarily bump the ref count while reading to avoid the
1931 * descriptor being ripped out from under us.
1932 */
1933 fhold(fp);
1934 error = fo_read(fp, &auio, fp->f_cred, FOF_OFFSET, p);
1935 fdrop(fp, p);
1936 if (error && (auio.uio_resid != cnt) && (error == ERESTART || error ==
1937 EINTR || error == EWOULDBLOCK))
1938 error = 0;
1939 cnt -= auio.uio_resid;
1940 p->p_retval[0] = cnt;
1941 return error;
1942#endif /* VFS_AIO */
1943}
1944
1945int
1946aio_write(struct proc *p, struct aio_write_args *uap)
1947{
1948#ifndef VFS_AIO
1949 return ENOSYS;
1950#else
1951 struct filedesc *fdp;
1952 struct file *fp;
1953 struct uio auio;
1954 struct iovec aiov;
1955 unsigned int fd;
1956 int cnt;
1957 struct aiocb iocb;
1958 int error;
1959 int pmodes;
1960
1961 /*
1962 * Process sync simply -- queue async request.
1963 */
1964 pmodes = fuword(&uap->aiocbp->_aiocb_private.privatemodes);
1965 if ((pmodes & AIO_PMODE_SYNC) == 0)
1966 return aio_aqueue(p, (struct aiocb *)uap->aiocbp, LIO_WRITE);
1967
1968 if ((error = copyin((caddr_t)uap->aiocbp, (caddr_t)&iocb, sizeof iocb))
1969 != 0)
1970 return error;
1971
1972 /* Get the fd info for process. */
1973 fdp = p->p_fd;
1974
1975 /*
1976 * Range check file descriptor.
1977 */
1978 fd = iocb.aio_fildes;
1979 if (fd >= fdp->fd_nfiles)
1980 return EBADF;
1981 fp = fdp->fd_ofiles[fd];
1982 if ((fp == NULL) || ((fp->f_flag & FWRITE) == 0))
1983 return EBADF;
1984 if (iocb.aio_offset == -1LL)
1985 return EINVAL;
1986
1987 aiov.iov_base = (void *)iocb.aio_buf;
1988 aiov.iov_len = iocb.aio_nbytes;
1989 auio.uio_iov = &aiov;
1990 auio.uio_iovcnt = 1;
1991 auio.uio_offset = iocb.aio_offset;
1992
1993 auio.uio_resid = iocb.aio_nbytes;
1994 if (auio.uio_resid < 0)
1995 return (EINVAL);
1996
1997 auio.uio_rw = UIO_WRITE;
1998 auio.uio_segflg = UIO_USERSPACE;
1999 auio.uio_procp = p;
2000
2001 cnt = iocb.aio_nbytes;
2002 /*
2003 * Temporarily bump the ref count while writing to avoid the
2004 * descriptor being ripped out from under us.
2005 */
2006 fhold(fp);
2007 error = fo_write(fp, &auio, fp->f_cred, FOF_OFFSET, p);
2008 fdrop(fp, p);
2009 if (error) {
2010 if (auio.uio_resid != cnt) {
2011 if (error == ERESTART || error == EINTR || error ==
2012 EWOULDBLOCK)
2013 error = 0;
2014 if (error == EPIPE)
2015 psignal(p, SIGPIPE);
2016 }
2017 }
2018 cnt -= auio.uio_resid;
2019 p->p_retval[0] = cnt;
2020 return error;
2021#endif /* VFS_AIO */
2022}
2023
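/*
 * Illustrative userland call for lio_listio() (added; assumes the standard
 * POSIX prototype), submitting a batch and waiting for all of it:
 *
 *	struct aiocb *list[2] = { &rd, &wr };
 *	error = lio_listio(LIO_WAIT, list, 2, NULL);
 *
 * With LIO_NOWAIT the routine below returns as soon as everything is queued,
 * optionally arming the sigevent passed in uap->sig.
 */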
2024int
2025lio_listio(struct proc *p, struct lio_listio_args *uap)
2026{
2027#ifndef VFS_AIO
2028 return ENOSYS;
2029#else
2030 int nent, nentqueued;
2031 struct aiocb *iocb, * const *cbptr;
2032 struct aiocblist *cb;
2033 struct kaioinfo *ki;
2034 struct aio_liojob *lj;
2035 int error, runningcode;
2036 int nerror;
2037 int i;
2038 int s;
2039
2040 if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
2041 return EINVAL;
2042
2043 nent = uap->nent;
2044 if (nent > AIO_LISTIO_MAX)
2045 return EINVAL;
2046
2047 if (p->p_aioinfo == NULL)
2048 aio_init_aioinfo(p);
2049
2050 if ((nent + num_queue_count) > max_queue_count)
2051 return EAGAIN;
2052
2053 ki = p->p_aioinfo;
2054 if ((nent + ki->kaio_queue_count) > ki->kaio_qallowed_count)
2055 return EAGAIN;
2056
2057 lj = zalloc(aiolio_zone);
2058 if (!lj)
2059 return EAGAIN;
2060
2061 lj->lioj_flags = 0;
2062 lj->lioj_buffer_count = 0;
2063 lj->lioj_buffer_finished_count = 0;
2064 lj->lioj_queue_count = 0;
2065 lj->lioj_queue_finished_count = 0;
2066 lj->lioj_ki = ki;
2067 TAILQ_INSERT_TAIL(&ki->kaio_liojoblist, lj, lioj_list);
2068
2069 /*
2070	 * Set up the signal.
2071 */
2072 if (uap->sig && (uap->mode == LIO_NOWAIT)) {
2073 error = copyin(uap->sig, &lj->lioj_signal,
2074 sizeof(lj->lioj_signal));
2075 if (error)
2076 return error;
2077 lj->lioj_flags |= LIOJ_SIGNAL;
2078 lj->lioj_flags &= ~LIOJ_SIGNAL_POSTED;
2079 } else
2080 lj->lioj_flags &= ~LIOJ_SIGNAL;
2081
2082 /*
2083 * Get pointers to the list of I/O requests.
2084 */
2085 nerror = 0;
2086 nentqueued = 0;
2087 cbptr = uap->acb_list;
2088 for (i = 0; i < uap->nent; i++) {
2089 iocb = (struct aiocb *)(intptr_t)fuword((caddr_t)&cbptr[i]);
2090 if (((intptr_t)iocb != -1) && ((intptr_t)iocb != NULL)) {
2091 error = _aio_aqueue(p, iocb, lj, 0);
2092 if (error == 0)
2093 nentqueued++;
2094 else
2095 nerror++;
2096 }
2097 }
2098
2099 /*
2100	 * If we haven't queued any requests, then just return.
2101 */
2102 if (nentqueued == 0)
2103 return 0;
2104
2105 /*
2106 * Calculate the appropriate error return.
2107 */
2108 runningcode = 0;
2109 if (nerror)
2110 runningcode = EIO;
2111
2112 if (uap->mode == LIO_WAIT) {
2113 int command, found, jobref;
2114
2115 for (;;) {
2116 found = 0;
2117 for (i = 0; i < uap->nent; i++) {
2118 /*
2119 * Fetch address of the control buf pointer in
2120 * user space.
2121 */
2122 iocb = (struct aiocb *)(intptr_t)fuword((caddr_t)&cbptr[i]);
2123 if (((intptr_t)iocb == -1) || ((intptr_t)iocb
2124 == 0))
2125 continue;
2126
2127 /*
2128 * Fetch the associated command from user space.
2129 */
2130 command = fuword(&iocb->aio_lio_opcode);
2131 if (command == LIO_NOP) {
2132 found++;
2133 continue;
2134 }
2135
2136 jobref = fuword(&iocb->_aiocb_private.kernelinfo);
2137
2138 TAILQ_FOREACH(cb, &ki->kaio_jobdone, plist) {
2139 if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo)
2140 == jobref) {
2141 if (cb->uaiocb.aio_lio_opcode
2142 == LIO_WRITE) {
2143 curproc->p_stats->p_ru.ru_oublock
2144 +=
2145 cb->outputcharge;
2146 cb->outputcharge = 0;
2147 } else if (cb->uaiocb.aio_lio_opcode
2148 == LIO_READ) {
2149 curproc->p_stats->p_ru.ru_inblock
2150 += cb->inputcharge;
2151 cb->inputcharge = 0;
2152 }
2153 found++;
2154 break;
2155 }
2156 }
2157
2158 s = splbio();
2159 TAILQ_FOREACH(cb, &ki->kaio_bufdone, plist) {
2160 if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo)
2161 == jobref) {
2162 found++;
2163 break;
2164 }
2165 }
2166 splx(s);
2167 }
2168
2169 /*
2170 * If all I/Os have been disposed of, then we can
2171 * return.
2172 */
2173 if (found == nentqueued)
2174 return runningcode;
2175
2176 ki->kaio_flags |= KAIO_WAKEUP;
2177 error = tsleep(p, PRIBIO | PCATCH, "aiospn", 0);
2178
2179 if (error == EINTR)
2180 return EINTR;
2181 else if (error == EWOULDBLOCK)
2182 return EAGAIN;
2183 }
2184 }
2185
2186 return runningcode;
2187#endif /* VFS_AIO */
2188}
2189
2190#ifdef VFS_AIO
2191/*
2192 * This is a weird hack so that we can post a signal. It is safe to do so from
2193 * a timeout routine, but *not* from an interrupt routine.
2194 */
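/*
 * Added note: process_signal() is only ever scheduled through
 * timeout(process_signal, aiocbe, 0), so the psignal() calls run from the
 * softclock context rather than directly from the physio interrupt path.
 */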
2195static void
2196process_signal(void *aioj)
2197{
2198 struct aiocblist *aiocbe = aioj;
2199 struct aio_liojob *lj = aiocbe->lio;
2200 struct aiocb *cb = &aiocbe->uaiocb;
2201
2202 if ((lj) && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL) &&
2203 (lj->lioj_queue_count == lj->lioj_queue_finished_count)) {
2204 psignal(lj->lioj_ki->kaio_p, lj->lioj_signal.sigev_signo);
2205 lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
2206 }
2207
2208 if (cb->aio_sigevent.sigev_notify == SIGEV_SIGNAL)
2209 psignal(aiocbe->userproc, cb->aio_sigevent.sigev_signo);
2210}
2211
2212/*
2213	 * Interrupt handler for physio; performs the necessary process wakeups and
2214	 * signals.
2215 */
2216static void
2217aio_physwakeup(struct buf *bp)
2218{
2219 struct aiocblist *aiocbe;
2220 struct proc *p;
2221 struct kaioinfo *ki;
2222 struct aio_liojob *lj;
2223
2224 wakeup((caddr_t)bp);
2225
2226 aiocbe = (struct aiocblist *)bp->b_spc;
2227 if (aiocbe) {
2228 p = bp->b_caller1;
2229
2230 aiocbe->jobstate = JOBST_JOBBFINISHED;
2231 aiocbe->uaiocb._aiocb_private.status -= bp->b_resid;
2232 aiocbe->uaiocb._aiocb_private.error = 0;
2233 aiocbe->jobflags |= AIOCBLIST_DONE;
2234
2235 if (bp->b_ioflags & BIO_ERROR)
2236 aiocbe->uaiocb._aiocb_private.error = bp->b_error;
2237
2238 lj = aiocbe->lio;
2239 if (lj) {
2240 lj->lioj_buffer_finished_count++;
2241
2242 /*
2243 * wakeup/signal if all of the interrupt jobs are done.
2244 */
2245 if (lj->lioj_buffer_finished_count ==
2246 lj->lioj_buffer_count) {
2247 /*
2248 * Post a signal if it is called for.
2249 */
2250 if ((lj->lioj_flags &
2251 (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) ==
2252 LIOJ_SIGNAL) {
2253 lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
2254 timeout(process_signal, aiocbe, 0);
2255 }
2256 }
2257 }
2258
2259 ki = p->p_aioinfo;
2260 if (ki) {
2261 ki->kaio_buffer_finished_count++;
2262 TAILQ_REMOVE(&aio_bufjobs, aiocbe, list);
2263 TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist);
2264 TAILQ_INSERT_TAIL(&ki->kaio_bufdone, aiocbe, plist);
2265
2266 KNOTE(&aiocbe->klist, 0);
2267 /* Do the wakeup. */
2268 if (ki->kaio_flags & (KAIO_RUNDOWN|KAIO_WAKEUP)) {
2269 ki->kaio_flags &= ~KAIO_WAKEUP;
2270 wakeup(p);
2271 }
2272 }
2273
2274 if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL)
2275 timeout(process_signal, aiocbe, 0);
2276 }
2277}
2278#endif /* VFS_AIO */
2279
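/*
 * aio_waitcomplete() is a FreeBSD extension: it dequeues the next completed
 * request for the process, stores its aiocb pointer through uap->aiocbp and
 * returns its completion status. Illustrative userland use (added; assumes
 * the prototype aio_waitcomplete(struct aiocb **, struct timespec *)):
 *
 *	struct aiocb *done;
 *	nbytes = aio_waitcomplete(&done, NULL);
 */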
2280int
2281aio_waitcomplete(struct proc *p, struct aio_waitcomplete_args *uap)
2282{
2283#ifndef VFS_AIO
2284 return ENOSYS;
2285#else
2286 struct timeval atv;
2287 struct timespec ts;
2288 struct aiocb **cbptr;
2289 struct kaioinfo *ki;
2290 struct aiocblist *cb = NULL;
2291 int error, s, timo;
2292
2293 suword(uap->aiocbp, (int)NULL);
2294
2295 timo = 0;
2296 if (uap->timeout) {
2297 /* Get timespec struct. */
2298 error = copyin((caddr_t)uap->timeout, (caddr_t)&ts,
2299 sizeof(ts));
2300 if (error)
2301 return error;
2302
2303 if ((ts.tv_nsec < 0) || (ts.tv_nsec >= 1000000000))
2304 return (EINVAL);
2305
2306 TIMESPEC_TO_TIMEVAL(&atv, &ts);
2307 if (itimerfix(&atv))
2308 return (EINVAL);
2309 timo = tvtohz(&atv);
2310 }
2311
2312 ki = p->p_aioinfo;
2313 if (ki == NULL)
2314 return EAGAIN;
2315
2316 cbptr = uap->aiocbp;
2317
2318 for (;;) {
2319 if ((cb = TAILQ_FIRST(&ki->kaio_jobdone)) != 0) {
2320 suword(uap->aiocbp, (int)cb->uuaiocb);
2321 p->p_retval[0] = cb->uaiocb._aiocb_private.status;
2322 if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
2323 curproc->p_stats->p_ru.ru_oublock +=
2324 cb->outputcharge;
2325 cb->outputcharge = 0;
2326 } else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
2327 curproc->p_stats->p_ru.ru_inblock +=
2328 cb->inputcharge;
2329 cb->inputcharge = 0;
2330 }
2331 aio_free_entry(cb);
2332 return cb->uaiocb._aiocb_private.error;
2333 }
2334
2335 s = splbio();
2336 if ((cb = TAILQ_FIRST(&ki->kaio_bufdone)) != 0 ) {
2337 splx(s);
2338 suword(uap->aiocbp, (int)cb->uuaiocb);
2339 p->p_retval[0] = cb->uaiocb._aiocb_private.status;
2340 aio_free_entry(cb);
2341 return cb->uaiocb._aiocb_private.error;
2342 }
2343
2344 ki->kaio_flags |= KAIO_WAKEUP;
2345 error = tsleep(p, PRIBIO | PCATCH, "aiowc", timo);
2346 splx(s);
2347
2348 if (error == ERESTART)
2349 return EINTR;
2350 else if (error < 0)
2351 return error;
2352 else if (error == EINTR)
2353 return EINTR;
2354 else if (error == EWOULDBLOCK)
2355 return EAGAIN;
2356 }
2357#endif /* VFS_AIO */
2358}
2359
2360
2361#ifndef VFS_AIO
2362static int
2363filt_aioattach(struct knote *kn)
2364{
2365
2366 return (ENXIO);
2367}
2368
2369struct filterops aio_filtops =
2370 { 0, filt_aioattach, NULL, NULL };
2371
2372#else
2373static int
2374filt_aioattach(struct knote *kn)
2375{
2376 struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_id;
2377
2378 /*
2379 * The aiocbe pointer must be validated before using it, so
2380 * registration is restricted to the kernel; the user cannot
2381 * set EV_FLAG1.
2382 */
2383 if ((kn->kn_flags & EV_FLAG1) == 0)
2384 return (EPERM);
2385 kn->kn_flags &= ~EV_FLAG1;
2386
2387 SLIST_INSERT_HEAD(&aiocbe->klist, kn, kn_selnext);
2388
2389 return (0);
2390}
2391
2392static void
2393filt_aiodetach(struct knote *kn)
2394{
2395 struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_id;
2396 int s = splhigh(); /* XXX no clue, so overkill */
2397
2398 SLIST_REMOVE(&aiocbe->klist, kn, knote, kn_selnext);
2399 splx(s);
2400}
2401
2402/*ARGSUSED*/
2403static int
2404filt_aio(struct knote *kn, long hint)
2405{
2406 struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_id;
2407
2408 kn->kn_data = 0; /* XXX data returned? */
2409 if (aiocbe->jobstate != JOBST_JOBFINISHED &&
2410 aiocbe->jobstate != JOBST_JOBBFINISHED)
2411 return (0);
2412 kn->kn_flags |= EV_EOF;
2413 return (1);
2414}
2415
2416struct filterops aio_filtops =
2417 { 0, filt_aioattach, filt_aiodetach, filt_aio };
2418#endif /* VFS_AIO */
642 /*
643	 * Local copies of curproc (mycp) and vmspace (myvm).
644 */
645 mycp = curproc;
646 myvm = mycp->p_vmspace;
647
648 if (mycp->p_textvp) {
649 vrele(mycp->p_textvp);
650 mycp->p_textvp = NULL;
651 }
652
653 /*
654 * Allocate and ready the aio control info. There is one aiop structure
655 * per daemon.
656 */
657 aiop = zalloc(aiop_zone);
658 aiop->aioproc = mycp;
659 aiop->aioprocflags |= AIOP_FREE;
660 TAILQ_INIT(&aiop->jobtorun);
661
662 s = splnet();
663
664 /*
665 * Place thread (lightweight process) onto the AIO free thread list.
666 */
667 if (TAILQ_EMPTY(&aio_freeproc))
668 wakeup(&aio_freeproc);
669 TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);
670
671 splx(s);
672
673 /* Make up a name for the daemon. */
674 strcpy(mycp->p_comm, "aiod");
675
676 /*
677	 * Get rid of our current file descriptors. AIODs don't need any
678	 * file descriptors, except as temporarily inherited from the client.
679 * Credentials are also cloned, and made equivalent to "root".
680 */
681 fdfree(mycp);
682 mycp->p_fd = NULL;
683 mycp->p_ucred = crcopy(mycp->p_ucred);
684 mycp->p_ucred->cr_uid = 0;
685 uifree(mycp->p_ucred->cr_uidinfo);
686 mycp->p_ucred->cr_uidinfo = uifind(0);
687 mycp->p_ucred->cr_ngroups = 1;
688 mycp->p_ucred->cr_groups[0] = 1;
689
690 /* The daemon resides in its own pgrp. */
691 enterpgrp(mycp, mycp->p_pid, 1);
692
693 /* Mark special process type. */
694 mycp->p_flag |= P_SYSTEM;
695
696 /*
697	 * Wake up the parent process. (The parent sleeps to keep from blasting away,
698	 * creating too many daemons.)
699 */
700 wakeup(mycp);
701
702 for (;;) {
703 /*
704 * curcp is the current daemon process context.
705 * userp is the current user process context.
706 */
707 curcp = mycp;
708
709 /*
710 * Take daemon off of free queue
711 */
712 if (aiop->aioprocflags & AIOP_FREE) {
713 s = splnet();
714 TAILQ_REMOVE(&aio_freeproc, aiop, list);
715 TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list);
716 aiop->aioprocflags &= ~AIOP_FREE;
717 splx(s);
718 }
719 aiop->aioprocflags &= ~AIOP_SCHED;
720
721 /*
722 * Check for jobs.
723 */
724 while ((aiocbe = aio_selectjob(aiop)) != NULL) {
725 cb = &aiocbe->uaiocb;
726 userp = aiocbe->userproc;
727
728 aiocbe->jobstate = JOBST_JOBRUNNING;
729
730 /*
731 * Connect to process address space for user program.
732 */
733 if (userp != curcp) {
734 /*
735 * Save the current address space that we are
736 * connected to.
737 */
738 tmpvm = mycp->p_vmspace;
739
740 /*
741 * Point to the new user address space, and
742 * refer to it.
743 */
744 mycp->p_vmspace = userp->p_vmspace;
745 mycp->p_vmspace->vm_refcnt++;
746
747 /* Activate the new mapping. */
748 pmap_activate(mycp);
749
750 /*
751	 * If the old address space wasn't the daemon's
752 * own address space, then we need to remove the
753 * daemon's reference from the other process
754 * that it was acting on behalf of.
755 */
756 if (tmpvm != myvm) {
757 vmspace_free(tmpvm);
758 }
759
760 /*
761	 * Disassociate from the previous client's file
762	 * descriptors, and associate to the new client's
763	 * descriptors. Note that the daemon doesn't
764	 * need to worry about its original descriptors,
765	 * because those were freed when it started up.
766 */
767 if (mycp->p_fd)
768 fdfree(mycp);
769 mycp->p_fd = fdshare(userp);
770 curcp = userp;
771 }
772
773 ki = userp->p_aioinfo;
774 lj = aiocbe->lio;
775
776 /* Account for currently active jobs. */
777 ki->kaio_active_count++;
778
779 /* Do the I/O function. */
780 aiocbe->jobaioproc = aiop;
781 aio_process(aiocbe);
782
783 /* Decrement the active job count. */
784 ki->kaio_active_count--;
785
786 /*
787 * Increment the completion count for wakeup/signal
788 * comparisons.
789 */
790 aiocbe->jobflags |= AIOCBLIST_DONE;
791 ki->kaio_queue_finished_count++;
792 if (lj)
793 lj->lioj_queue_finished_count++;
794 if ((ki->kaio_flags & KAIO_WAKEUP) || ((ki->kaio_flags
795 & KAIO_RUNDOWN) && (ki->kaio_active_count == 0))) {
796 ki->kaio_flags &= ~KAIO_WAKEUP;
797 wakeup(userp);
798 }
799
800 s = splbio();
801 if (lj && (lj->lioj_flags &
802 (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) == LIOJ_SIGNAL) {
803 if ((lj->lioj_queue_finished_count ==
804 lj->lioj_queue_count) &&
805 (lj->lioj_buffer_finished_count ==
806 lj->lioj_buffer_count)) {
807 psignal(userp,
808 lj->lioj_signal.sigev_signo);
809 lj->lioj_flags |=
810 LIOJ_SIGNAL_POSTED;
811 }
812 }
813 splx(s);
814
815 aiocbe->jobstate = JOBST_JOBFINISHED;
816
817 /*
818 * If the I/O request should be automatically rundown,
819 * do the needed cleanup. Otherwise, place the queue
820 * entry for the just finished I/O request into the done
821 * queue for the associated client.
822 */
823 s = splnet();
824 if (aiocbe->jobflags & AIOCBLIST_ASYNCFREE) {
825 aiocbe->jobflags &= ~AIOCBLIST_ASYNCFREE;
826 TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
827 } else {
828 TAILQ_REMOVE(&ki->kaio_jobqueue, aiocbe, plist);
829 TAILQ_INSERT_TAIL(&ki->kaio_jobdone, aiocbe,
830 plist);
831 }
832 splx(s);
833 KNOTE(&aiocbe->klist, 0);
834
835 if (aiocbe->jobflags & AIOCBLIST_RUNDOWN) {
836 wakeup(aiocbe);
837 aiocbe->jobflags &= ~AIOCBLIST_RUNDOWN;
838 }
839
840 if (cb->aio_sigevent.sigev_notify == SIGEV_SIGNAL) {
841 psignal(userp, cb->aio_sigevent.sigev_signo);
842 }
843 }
844
845 /*
846 * Disconnect from user address space.
847 */
848 if (curcp != mycp) {
849 /* Get the user address space to disconnect from. */
850 tmpvm = mycp->p_vmspace;
851
852 /* Get original address space for daemon. */
853 mycp->p_vmspace = myvm;
854
855 /* Activate the daemon's address space. */
856 pmap_activate(mycp);
857#ifdef DIAGNOSTIC
858 if (tmpvm == myvm) {
859 printf("AIOD: vmspace problem -- %d\n",
860 mycp->p_pid);
861 }
862#endif
863 /* Remove our vmspace reference. */
864 vmspace_free(tmpvm);
865
866 /*
867 * Disassociate from the user process's file
868 * descriptors.
869 */
870 if (mycp->p_fd)
871 fdfree(mycp);
872 mycp->p_fd = NULL;
873 curcp = mycp;
874 }
875
876 /*
877	 * If we are the first to be put onto the free queue, wake up
878 * anyone waiting for a daemon.
879 */
880 s = splnet();
881 TAILQ_REMOVE(&aio_activeproc, aiop, list);
882 if (TAILQ_EMPTY(&aio_freeproc))
883 wakeup(&aio_freeproc);
884 TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);
885 aiop->aioprocflags |= AIOP_FREE;
886 splx(s);
887
888 /*
889 * If daemon is inactive for a long time, allow it to exit,
890 * thereby freeing resources.
891 */
892 if (((aiop->aioprocflags & AIOP_SCHED) == 0) && tsleep(mycp,
893 PRIBIO, "aiordy", aiod_lifetime)) {
894 s = splnet();
895 if ((TAILQ_FIRST(&aio_jobs) == NULL) &&
896 (TAILQ_FIRST(&aiop->jobtorun) == NULL)) {
897 if ((aiop->aioprocflags & AIOP_FREE) &&
898 (num_aio_procs > target_aio_procs)) {
899 TAILQ_REMOVE(&aio_freeproc, aiop, list);
900 splx(s);
901 zfree(aiop_zone, aiop);
902 num_aio_procs--;
903#ifdef DIAGNOSTIC
904 if (mycp->p_vmspace->vm_refcnt <= 1) {
905 printf("AIOD: bad vm refcnt for"
906 " exiting daemon: %d\n",
907 mycp->p_vmspace->vm_refcnt);
908 }
909#endif
910 exit1(mycp, 0);
911 }
912 }
913 splx(s);
914 }
915 }
916}
917
918/*
919 * Create a new AIO daemon. This is mostly a kernel-thread fork routine. The
920 * AIO daemon modifies its environment itself.
921 */
922static int
923aio_newproc()
924{
925 int error;
926 struct proc *p, *np;
927
928 p = &proc0;
929 error = fork1(p, RFPROC|RFMEM|RFNOWAIT, &np);
930 if (error)
931 return error;
932 cpu_set_fork_handler(np, aio_daemon, curproc);
933
934 /*
935	 * Wait until the daemon has started, but continue anyway in case of
936	 * error conditions.
937 */
938 error = tsleep(np, PZERO, "aiosta", aiod_timeout);
939 num_aio_procs++;
940
941 return error;
942}
943
944/*
945 * Try the high-performance physio method for eligible VCHR devices. This
946	 * routine doesn't require the use of any additional threads, and has low overhead.
947 */
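/*
 * Added summary of the eligibility checks below: the descriptor must be a
 * disk-backed vnode (vn_isdisk()), the transfer must be a multiple of the
 * device's physical block size and no larger than MAXPHYS, and the process
 * must be under its kaio_ballowed_count buffer quota; anything else returns
 * -1 so the caller falls back to the daemon (thread) path.
 */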
948int
949aio_qphysio(struct proc *p, struct aiocblist *aiocbe)
950{
951 int error;
952 struct aiocb *cb;
953 struct file *fp;
954 struct buf *bp;
955 struct vnode *vp;
956 struct kaioinfo *ki;
957 struct filedesc *fdp;
958 struct aio_liojob *lj;
959 int fd;
960 int s;
961 int notify;
962
963 cb = &aiocbe->uaiocb;
964 fdp = p->p_fd;
965 fd = cb->aio_fildes;
966 fp = fdp->fd_ofiles[fd];
967
968 if (fp->f_type != DTYPE_VNODE)
969 return (-1);
970
971 vp = (struct vnode *)fp->f_data;
972
973 /*
974	 * If it's not a disk, we don't want to return a positive error; that
975	 * would keep the aio code from falling through to the thread-based
976	 * path when it is dealing with a regular file.
977 */
978 if (!vn_isdisk(vp, &error)) {
979 if (error == ENOTBLK)
980 return (-1);
981 else
982 return (error);
983 }
984
985 if (cb->aio_nbytes % vp->v_rdev->si_bsize_phys)
986 return (-1);
987
988 if (cb->aio_nbytes > MAXPHYS)
989 return (-1);
990
991 ki = p->p_aioinfo;
992 if (ki->kaio_buffer_count >= ki->kaio_ballowed_count)
993 return (-1);
994
995 fhold(fp);
996
997 ki->kaio_buffer_count++;
998
999 lj = aiocbe->lio;
1000 if (lj)
1001 lj->lioj_buffer_count++;
1002
1003 /* Create and build a buffer header for a transfer. */
1004 bp = (struct buf *)getpbuf(NULL);
1005
1006 /*
1007 * Get a copy of the kva from the physical buffer.
1008 */
1009 bp->b_caller1 = p;
1010 bp->b_dev = vp->v_rdev;
1011 error = bp->b_error = 0;
1012
1013 bp->b_bcount = cb->aio_nbytes;
1014 bp->b_bufsize = cb->aio_nbytes;
1015 bp->b_flags = B_PHYS;
1016 bp->b_iodone = aio_physwakeup;
1017 bp->b_saveaddr = bp->b_data;
1018 bp->b_data = (void *)cb->aio_buf;
1019 bp->b_blkno = btodb(cb->aio_offset);
1020
1021 if (cb->aio_lio_opcode == LIO_WRITE) {
1022 bp->b_iocmd = BIO_WRITE;
1023 if (!useracc(bp->b_data, bp->b_bufsize, VM_PROT_READ)) {
1024 error = EFAULT;
1025 goto doerror;
1026 }
1027 } else {
1028 bp->b_iocmd = BIO_READ;
1029 if (!useracc(bp->b_data, bp->b_bufsize, VM_PROT_WRITE)) {
1030 error = EFAULT;
1031 goto doerror;
1032 }
1033 }
1034
1035 /* Bring buffer into kernel space. */
1036 vmapbuf(bp);
1037
1038 s = splbio();
1039 aiocbe->bp = bp;
1040 bp->b_spc = (void *)aiocbe;
1041 TAILQ_INSERT_TAIL(&aio_bufjobs, aiocbe, list);
1042 TAILQ_INSERT_TAIL(&ki->kaio_bufqueue, aiocbe, plist);
1043 aiocbe->jobstate = JOBST_JOBQBUF;
1044 cb->_aiocb_private.status = cb->aio_nbytes;
1045 num_buf_aio++;
1046 bp->b_error = 0;
1047
1048 splx(s);
1049
1050 /* Perform transfer. */
1051 DEV_STRATEGY(bp, 0);
1052
1053 notify = 0;
1054 s = splbio();
1055
1056 /*
1057 * If we had an error invoking the request, or an error in processing
1058 * the request before we have returned, we process it as an error in
1059 * transfer. Note that such an I/O error is not indicated immediately,
1060 * but is returned using the aio_error mechanism. In this case,
1061 * aio_suspend will return immediately.
1062 */
1063 if (bp->b_error || (bp->b_ioflags & BIO_ERROR)) {
1064 struct aiocb *job = aiocbe->uuaiocb;
1065
1066 aiocbe->uaiocb._aiocb_private.status = 0;
1067 suword(&job->_aiocb_private.status, 0);
1068 aiocbe->uaiocb._aiocb_private.error = bp->b_error;
1069 suword(&job->_aiocb_private.error, bp->b_error);
1070
1071 ki->kaio_buffer_finished_count++;
1072
1073 if (aiocbe->jobstate != JOBST_JOBBFINISHED) {
1074 aiocbe->jobstate = JOBST_JOBBFINISHED;
1075 aiocbe->jobflags |= AIOCBLIST_DONE;
1076 TAILQ_REMOVE(&aio_bufjobs, aiocbe, list);
1077 TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist);
1078 TAILQ_INSERT_TAIL(&ki->kaio_bufdone, aiocbe, plist);
1079 notify = 1;
1080 }
1081 }
1082 splx(s);
1083 if (notify)
1084 KNOTE(&aiocbe->klist, 0);
1085 fdrop(fp, p);
1086 return 0;
1087
1088doerror:
1089 ki->kaio_buffer_count--;
1090 if (lj)
1091 lj->lioj_buffer_count--;
1092 aiocbe->bp = NULL;
1093 relpbuf(bp, NULL);
1094 fdrop(fp, p);
1095 return error;
1096}
1097
1098/*
1099 * This waits/tests physio completion.
1100 */
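/*
 * Added note: with flgwait == 0 this is a poll -- EINPROGRESS is returned
 * immediately if B_DONE is not yet set. Otherwise the routine sleeps in
 * "physstr" (bounded by aiod_timeout) until the buffer completes, then
 * unmaps it and hands back any BIO_ERROR status.
 */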
1101int
1102aio_fphysio(struct proc *p, struct aiocblist *iocb, int flgwait)
1103{
1104 int s;
1105 struct buf *bp;
1106 int error;
1107
1108 bp = iocb->bp;
1109
1110 s = splbio();
1111 if (flgwait == 0) {
1112 if ((bp->b_flags & B_DONE) == 0) {
1113 splx(s);
1114 return EINPROGRESS;
1115 }
1116 }
1117
1118 while ((bp->b_flags & B_DONE) == 0) {
1119 if (tsleep((caddr_t)bp, PRIBIO, "physstr", aiod_timeout)) {
1120 if ((bp->b_flags & B_DONE) == 0) {
1121 splx(s);
1122 return EINPROGRESS;
1123 } else
1124 break;
1125 }
1126 }
1127
1128 /* Release mapping into kernel space. */
1129 vunmapbuf(bp);
1130 iocb->bp = 0;
1131
1132 error = 0;
1133
1134 /* Check for an error. */
1135 if (bp->b_ioflags & BIO_ERROR)
1136 error = bp->b_error;
1137
1138 relpbuf(bp, NULL);
1139 return (error);
1140}
1141#endif /* VFS_AIO */
1142
1143/*
1144 * Wake up aio requests that may be serviceable now.
1145 */
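/*
 * Added note: this is reached from the socket wakeup path when a socket
 * buffer has SB_AIO set (the flag is armed in _aio_aqueue() when a request
 * cannot be satisfied immediately); serviceable jobs are moved back onto the
 * global aio_jobs queue and free daemons are kicked to pick them up.
 */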
1146void
1147aio_swake(struct socket *so, struct sockbuf *sb)
1148{
1149#ifndef VFS_AIO
1150 return;
1151#else
1152 struct aiocblist *cb,*cbn;
1153 struct proc *p;
1154 struct kaioinfo *ki = NULL;
1155 int opcode, wakecount = 0;
1156 struct aioproclist *aiop;
1157
1158 if (sb == &so->so_snd) {
1159 opcode = LIO_WRITE;
1160 so->so_snd.sb_flags &= ~SB_AIO;
1161 } else {
1162 opcode = LIO_READ;
1163 so->so_rcv.sb_flags &= ~SB_AIO;
1164 }
1165
1166 for (cb = TAILQ_FIRST(&so->so_aiojobq); cb; cb = cbn) {
1167 cbn = TAILQ_NEXT(cb, list);
1168 if (opcode == cb->uaiocb.aio_lio_opcode) {
1169 p = cb->userproc;
1170 ki = p->p_aioinfo;
1171 TAILQ_REMOVE(&so->so_aiojobq, cb, list);
1172 TAILQ_REMOVE(&ki->kaio_sockqueue, cb, plist);
1173 TAILQ_INSERT_TAIL(&aio_jobs, cb, list);
1174 TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, cb, plist);
1175 wakecount++;
1176 if (cb->jobstate != JOBST_JOBQGLOBAL)
1177 panic("invalid queue value");
1178 }
1179 }
1180
1181 while (wakecount--) {
1182 if ((aiop = TAILQ_FIRST(&aio_freeproc)) != 0) {
1183 TAILQ_REMOVE(&aio_freeproc, aiop, list);
1184 TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list);
1185 aiop->aioprocflags &= ~AIOP_FREE;
1186 wakeup(aiop->aioproc);
1187 }
1188 }
1189#endif /* VFS_AIO */
1190}
1191
1192#ifdef VFS_AIO
1193/*
1194	 * Queue a new AIO request. The choice between the threaded and the direct
1195	 * physio (VCHR) technique is made in this code.
1196 */
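/*
 * Added overview of the queueing decision below: after validation, the
 * request first tries aio_qphysio() (direct physio for eligible disk
 * devices); socket descriptors that are not yet readable/writable are parked
 * on the socket's so_aiojobq until aio_swake(); everything else goes onto
 * the global aio_jobs queue, waking or creating an aiod as needed.
 */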
1197static int
1198_aio_aqueue(struct proc *p, struct aiocb *job, struct aio_liojob *lj, int type)
1199{
1200 struct filedesc *fdp;
1201 struct file *fp;
1202 unsigned int fd;
1203 struct socket *so;
1204 int s;
1205 int error;
1206 int opcode;
1207 struct aiocblist *aiocbe;
1208 struct aioproclist *aiop;
1209 struct kaioinfo *ki;
1210 struct kevent kev;
1211 struct kqueue *kq;
1212 struct file *kq_fp;
1213
1214 if ((aiocbe = TAILQ_FIRST(&aio_freejobs)) != NULL)
1215 TAILQ_REMOVE(&aio_freejobs, aiocbe, list);
1216 else
1217 aiocbe = zalloc (aiocb_zone);
1218
1219 aiocbe->inputcharge = 0;
1220 aiocbe->outputcharge = 0;
1221 SLIST_INIT(&aiocbe->klist);
1222
1223 suword(&job->_aiocb_private.status, -1);
1224 suword(&job->_aiocb_private.error, 0);
1225 suword(&job->_aiocb_private.kernelinfo, -1);
1226
1227 error = copyin((caddr_t)job, (caddr_t) &aiocbe->uaiocb, sizeof
1228 aiocbe->uaiocb);
1229 if (error) {
1230 suword(&job->_aiocb_private.error, error);
1231
1232 TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
1233 return error;
1234 }
1235
1236 /* Save userspace address of the job info. */
1237 aiocbe->uuaiocb = job;
1238
1239 /* Get the opcode. */
1240 if (type != LIO_NOP)
1241 aiocbe->uaiocb.aio_lio_opcode = type;
1242 opcode = aiocbe->uaiocb.aio_lio_opcode;
1243
1244 /* Get the fd info for process. */
1245 fdp = p->p_fd;
1246
1247 /*
1248 * Range check file descriptor.
1249 */
1250 fd = aiocbe->uaiocb.aio_fildes;
1251 if (fd >= fdp->fd_nfiles) {
1252 TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
1253 if (type == 0)
1254 suword(&job->_aiocb_private.error, EBADF);
1255 return EBADF;
1256 }
1257
1258 fp = aiocbe->fd_file = fdp->fd_ofiles[fd];
1259 if ((fp == NULL) || ((opcode == LIO_WRITE) && ((fp->f_flag & FWRITE) ==
1260 0))) {
1261 TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
1262 if (type == 0)
1263 suword(&job->_aiocb_private.error, EBADF);
1264 return EBADF;
1265 }
1266
1267 if (aiocbe->uaiocb.aio_offset == -1LL) {
1268 TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
1269 if (type == 0)
1270 suword(&job->_aiocb_private.error, EINVAL);
1271 return EINVAL;
1272 }
1273
1274 error = suword(&job->_aiocb_private.kernelinfo, jobrefid);
1275 if (error) {
1276 TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
1277 if (type == 0)
1278 suword(&job->_aiocb_private.error, EINVAL);
1279 return error;
1280 }
1281
1282 aiocbe->uaiocb._aiocb_private.kernelinfo = (void *)(intptr_t)jobrefid;
1283 if (jobrefid == LONG_MAX)
1284 jobrefid = 1;
1285 else
1286 jobrefid++;
1287
1288 if (opcode == LIO_NOP) {
1289 TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
1290 if (type == 0) {
1291 suword(&job->_aiocb_private.error, 0);
1292 suword(&job->_aiocb_private.status, 0);
1293 suword(&job->_aiocb_private.kernelinfo, 0);
1294 }
1295 return 0;
1296 }
1297
1298 if ((opcode != LIO_READ) && (opcode != LIO_WRITE)) {
1299 TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
1300 if (type == 0) {
1301 suword(&job->_aiocb_private.status, 0);
1302 suword(&job->_aiocb_private.error, EINVAL);
1303 }
1304 return EINVAL;
1305 }
1306
1307 fhold(fp);
1308
1309 if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_KEVENT) {
1310 kev.ident = aiocbe->uaiocb.aio_sigevent.sigev_notify_kqueue;
1311 kev.udata = aiocbe->uaiocb.aio_sigevent.sigev_value.sigval_ptr;
1312 }
1313 else {
1314 /*
1315 * This method for requesting kevent-based notification won't
1316 * work on the alpha, since we're passing in a pointer
1317 * via aio_lio_opcode, which is an int. Use the SIGEV_KEVENT-
1318 * based method instead.
1319 */
1320 struct kevent *kevp;
1321
1322 kevp = (struct kevent *)job->aio_lio_opcode;
1323 if (kevp == NULL)
1324 goto no_kqueue;
1325
1326 error = copyin((caddr_t)kevp, (caddr_t)&kev, sizeof(kev));
1327 if (error)
1328 goto aqueue_fail;
1329 }
1330 if ((u_int)kev.ident >= fdp->fd_nfiles ||
1331 (kq_fp = fdp->fd_ofiles[kev.ident]) == NULL ||
1332 (kq_fp->f_type != DTYPE_KQUEUE)) {
1333 error = EBADF;
1334 goto aqueue_fail;
1335 }
1336 kq = (struct kqueue *)kq_fp->f_data;
1337 kev.ident = (uintptr_t)aiocbe;
1338 kev.filter = EVFILT_AIO;
1339 kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1;
1340 error = kqueue_register(kq, &kev, p);
1341aqueue_fail:
1342 if (error) {
1343 TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
1344 if (type == 0)
1345 suword(&job->_aiocb_private.error, error);
1346 goto done;
1347 }
1348no_kqueue:
1349
1350 suword(&job->_aiocb_private.error, EINPROGRESS);
1351 aiocbe->uaiocb._aiocb_private.error = EINPROGRESS;
1352 aiocbe->userproc = p;
1353 aiocbe->jobflags = 0;
1354 aiocbe->lio = lj;
1355 ki = p->p_aioinfo;
1356
1357 if (fp->f_type == DTYPE_SOCKET) {
1358 /*
1359 * Alternate queueing for socket ops: Reach down into the
1360 * descriptor to get the socket data. Then check to see if the
1361 * socket is ready to be read or written (based on the requested
1362 * operation).
1363 *
1364	 * If it is not ready for I/O, then queue the aiocbe on the
1365 * socket, and set the flags so we get a call when sbnotify()
1366 * happens.
1367 */
1368 so = (struct socket *)fp->f_data;
1369 s = splnet();
1370 if (((opcode == LIO_READ) && (!soreadable(so))) || ((opcode ==
1371 LIO_WRITE) && (!sowriteable(so)))) {
1372 TAILQ_INSERT_TAIL(&so->so_aiojobq, aiocbe, list);
1373 TAILQ_INSERT_TAIL(&ki->kaio_sockqueue, aiocbe, plist);
1374 if (opcode == LIO_READ)
1375 so->so_rcv.sb_flags |= SB_AIO;
1376 else
1377 so->so_snd.sb_flags |= SB_AIO;
1378 aiocbe->jobstate = JOBST_JOBQGLOBAL; /* XXX */
1379 ki->kaio_queue_count++;
1380 num_queue_count++;
1381 splx(s);
1382 error = 0;
1383 goto done;
1384 }
1385 splx(s);
1386 }
1387
1388 if ((error = aio_qphysio(p, aiocbe)) == 0)
1389 goto done;
1390 if (error > 0) {
1391 suword(&job->_aiocb_private.status, 0);
1392 aiocbe->uaiocb._aiocb_private.error = error;
1393 suword(&job->_aiocb_private.error, error);
1394 goto done;
1395 }
1396
1397 /* No buffer for daemon I/O. */
1398 aiocbe->bp = NULL;
1399
1400 ki->kaio_queue_count++;
1401 if (lj)
1402 lj->lioj_queue_count++;
1403 s = splnet();
1404 TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, aiocbe, plist);
1405 TAILQ_INSERT_TAIL(&aio_jobs, aiocbe, list);
1406 splx(s);
1407 aiocbe->jobstate = JOBST_JOBQGLOBAL;
1408
1409 num_queue_count++;
1410 error = 0;
1411
1412 /*
1413 * If we don't have a free AIO process, and we are below our quota, then
1414 * start one. Otherwise, depend on the subsequent I/O completions to
1415	 * pick up this job. If we don't successfully create the new process
1416 * (thread) due to resource issues, we return an error for now (EAGAIN),
1417 * which is likely not the correct thing to do.
1418 */
1419retryproc:
1420 s = splnet();
1421 if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) {
1422 TAILQ_REMOVE(&aio_freeproc, aiop, list);
1423 TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list);
1424 aiop->aioprocflags &= ~AIOP_FREE;
1425 wakeup(aiop->aioproc);
1426 } else if (((num_aio_resv_start + num_aio_procs) < max_aio_procs) &&
1427 ((ki->kaio_active_count + num_aio_resv_start) <
1428 ki->kaio_maxactive_count)) {
1429 num_aio_resv_start++;
1430 if ((error = aio_newproc()) == 0) {
1431 num_aio_resv_start--;
1432 p->p_retval[0] = 0;
1433 goto retryproc;
1434 }
1435 num_aio_resv_start--;
1436 }
1437 splx(s);
1438done:
1439 fdrop(fp, p);
1440 return error;
1441}
1442
1443/*
1444 * This routine queues an AIO request, checking for quotas.
1445 */
1446static int
1447aio_aqueue(struct proc *p, struct aiocb *job, int type)
1448{
1449 struct kaioinfo *ki;
1450
1451 if (p->p_aioinfo == NULL)
1452 aio_init_aioinfo(p);
1453
1454 if (num_queue_count >= max_queue_count)
1455 return EAGAIN;
1456
1457 ki = p->p_aioinfo;
1458 if (ki->kaio_queue_count >= ki->kaio_qallowed_count)
1459 return EAGAIN;
1460
1461 return _aio_aqueue(p, job, NULL, type);
1462}
1463#endif /* VFS_AIO */
1464
1465/*
1466	 * Support the aio_return system call; as a side effect, kernel resources are
1467 * released.
1468 */
1469int
1470aio_return(struct proc *p, struct aio_return_args *uap)
1471{
1472#ifndef VFS_AIO
1473 return ENOSYS;
1474#else
1475 int s;
1476 int jobref;
1477 struct aiocblist *cb, *ncb;
1478 struct aiocb *ujob;
1479 struct kaioinfo *ki;
1480
1481 ki = p->p_aioinfo;
1482 if (ki == NULL)
1483 return EINVAL;
1484
1485 ujob = uap->aiocbp;
1486
1487 jobref = fuword(&ujob->_aiocb_private.kernelinfo);
1488 if (jobref == -1 || jobref == 0)
1489 return EINVAL;
1490
1491 s = splnet();
1492 for (cb = TAILQ_FIRST(&ki->kaio_jobdone); cb; cb = TAILQ_NEXT(cb,
1493 plist)) {
1494 if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) ==
1495 jobref) {
1496 splx(s);
1497 if (ujob == cb->uuaiocb) {
1498 p->p_retval[0] =
1499 cb->uaiocb._aiocb_private.status;
1500 } else
1501 p->p_retval[0] = EFAULT;
1502 if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
1503 curproc->p_stats->p_ru.ru_oublock +=
1504 cb->outputcharge;
1505 cb->outputcharge = 0;
1506 } else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
1507 curproc->p_stats->p_ru.ru_inblock +=
1508 cb->inputcharge;
1509 cb->inputcharge = 0;
1510 }
1511 aio_free_entry(cb);
1512 return 0;
1513 }
1514 }
1515 splx(s);
1516
1517 s = splbio();
1518 for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb = ncb) {
1519 ncb = TAILQ_NEXT(cb, plist);
1520 if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo)
1521 == jobref) {
1522 splx(s);
1523 if (ujob == cb->uuaiocb) {
1524 p->p_retval[0] =
1525 cb->uaiocb._aiocb_private.status;
1526 } else
1527 p->p_retval[0] = EFAULT;
1528 aio_free_entry(cb);
1529 return 0;
1530 }
1531 }
1532 splx(s);
1533
1534 return (EINVAL);
1535#endif /* VFS_AIO */
1536}
1537
1538/*
1539	 * Allow a process to wake up when any of the I/O requests are completed.
1540 */
1541int
1542aio_suspend(struct proc *p, struct aio_suspend_args *uap)
1543{
1544#ifndef VFS_AIO
1545 return ENOSYS;
1546#else
1547 struct timeval atv;
1548 struct timespec ts;
1549 struct aiocb *const *cbptr, *cbp;
1550 struct kaioinfo *ki;
1551 struct aiocblist *cb;
1552 int i;
1553 int njoblist;
1554 int error, s, timo;
1555 int *ijoblist;
1556 struct aiocb **ujoblist;
1557
1558 if (uap->nent >= AIO_LISTIO_MAX)
1559 return EINVAL;
1560
1561 timo = 0;
1562 if (uap->timeout) {
1563 /* Get timespec struct. */
1564 if ((error = copyin(uap->timeout, &ts, sizeof(ts))) != 0)
1565 return error;
1566
1567 if (ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000)
1568 return (EINVAL);
1569
1570 TIMESPEC_TO_TIMEVAL(&atv, &ts);
1571 if (itimerfix(&atv))
1572 return (EINVAL);
1573 timo = tvtohz(&atv);
1574 }
1575
1576 ki = p->p_aioinfo;
1577 if (ki == NULL)
1578 return EAGAIN;
1579
1580 njoblist = 0;
1581 ijoblist = zalloc(aiol_zone);
1582 ujoblist = zalloc(aiol_zone);
1583 cbptr = uap->aiocbp;
1584
1585 for (i = 0; i < uap->nent; i++) {
1586 cbp = (struct aiocb *)(intptr_t)fuword((caddr_t)&cbptr[i]);
1587 if (cbp == 0)
1588 continue;
1589 ujoblist[njoblist] = cbp;
1590 ijoblist[njoblist] = fuword(&cbp->_aiocb_private.kernelinfo);
1591 njoblist++;
1592 }
1593
1594 if (njoblist == 0) {
1595 zfree(aiol_zone, ijoblist);
1596 zfree(aiol_zone, ujoblist);
1597 return 0;
1598 }
1599
1600 error = 0;
1601 for (;;) {
1602 for (cb = TAILQ_FIRST(&ki->kaio_jobdone); cb; cb =
1603 TAILQ_NEXT(cb, plist)) {
1604 for (i = 0; i < njoblist; i++) {
1605 if (((intptr_t)
1606 cb->uaiocb._aiocb_private.kernelinfo) ==
1607 ijoblist[i]) {
1608 if (ujoblist[i] != cb->uuaiocb)
1609 error = EINVAL;
1610 zfree(aiol_zone, ijoblist);
1611 zfree(aiol_zone, ujoblist);
1612 return error;
1613 }
1614 }
1615 }
1616
1617 s = splbio();
1618 for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb =
1619 TAILQ_NEXT(cb, plist)) {
1620 for (i = 0; i < njoblist; i++) {
1621 if (((intptr_t)
1622 cb->uaiocb._aiocb_private.kernelinfo) ==
1623 ijoblist[i]) {
1624 splx(s);
1625 if (ujoblist[i] != cb->uuaiocb)
1626 error = EINVAL;
1627 zfree(aiol_zone, ijoblist);
1628 zfree(aiol_zone, ujoblist);
1629 return error;
1630 }
1631 }
1632 }
1633
1634 ki->kaio_flags |= KAIO_WAKEUP;
1635 error = tsleep(p, PRIBIO | PCATCH, "aiospn", timo);
1636 splx(s);
1637
1638 if (error == ERESTART || error == EINTR) {
1639 zfree(aiol_zone, ijoblist);
1640 zfree(aiol_zone, ujoblist);
1641 return EINTR;
1642 } else if (error == EWOULDBLOCK) {
1643 zfree(aiol_zone, ijoblist);
1644 zfree(aiol_zone, ujoblist);
1645 return EAGAIN;
1646 }
1647 }
1648
1649/* NOTREACHED */
1650 return EINVAL;
1651#endif /* VFS_AIO */
1652}
1653
1654/*
1655 * aio_cancel cancels any non-physio aio operations not currently in
1656 * progress.
1657 */
1658int
1659aio_cancel(struct proc *p, struct aio_cancel_args *uap)
1660{
1661#ifndef VFS_AIO
1662 return ENOSYS;
1663#else
1664 struct kaioinfo *ki;
1665 struct aiocblist *cbe, *cbn;
1666 struct file *fp;
1667 struct filedesc *fdp;
1668 struct socket *so;
1669 struct proc *po;
1670 int s,error;
1671 int cancelled=0;
1672 int notcancelled=0;
1673 struct vnode *vp;
1674
1675 fdp = p->p_fd;
1676
1677 fp = fdp->fd_ofiles[uap->fd];
1678
1679 if (fp == NULL) {
1680 return EBADF;
1681 }
1682
1683 if (fp->f_type == DTYPE_VNODE) {
1684 vp = (struct vnode *)fp->f_data;
1685
1686 if (vn_isdisk(vp,&error)) {
1687 p->p_retval[0] = AIO_NOTCANCELED;
1688 return 0;
1689 }
1690 } else if (fp->f_type == DTYPE_SOCKET) {
1691 so = (struct socket *)fp->f_data;
1692
1693 s = splnet();
1694
1695 for (cbe = TAILQ_FIRST(&so->so_aiojobq); cbe; cbe = cbn) {
1696 cbn = TAILQ_NEXT(cbe, list);
1697 if ((uap->aiocbp == NULL) ||
1698 (uap->aiocbp == cbe->uuaiocb) ) {
1699 po = cbe->userproc;
1700 ki = po->p_aioinfo;
1701 TAILQ_REMOVE(&so->so_aiojobq, cbe, list);
1702 TAILQ_REMOVE(&ki->kaio_sockqueue, cbe, plist);
1703 TAILQ_INSERT_TAIL(&ki->kaio_jobdone, cbe, plist);
1704 if (ki->kaio_flags & KAIO_WAKEUP) {
1705 wakeup(po);
1706 }
1707 cbe->jobstate = JOBST_JOBFINISHED;
1708 cbe->uaiocb._aiocb_private.status=-1;
1709 cbe->uaiocb._aiocb_private.error=ECANCELED;
1710 cancelled++;
1711/* XXX cancelled, knote? */
1712 if (cbe->uaiocb.aio_sigevent.sigev_notify ==
1713 SIGEV_SIGNAL)
1714 psignal(cbe->userproc, cbe->uaiocb.aio_sigevent.sigev_signo);
1715 if (uap->aiocbp)
1716 break;
1717 }
1718 }
1719
1720 splx(s);
1721
1722 if ((cancelled) && (uap->aiocbp)) {
1723 p->p_retval[0] = AIO_CANCELED;
1724 return 0;
1725 }
1726
1727 }
1728
1729 ki=p->p_aioinfo;
1730
1731 s = splnet();
1732
1733 for (cbe = TAILQ_FIRST(&ki->kaio_jobqueue); cbe; cbe = cbn) {
1734 cbn = TAILQ_NEXT(cbe, plist);
1735
1736 if ((uap->fd == cbe->uaiocb.aio_fildes) &&
1737 ((uap->aiocbp == NULL ) ||
1738 (uap->aiocbp == cbe->uuaiocb))) {
1739
1740 if (cbe->jobstate == JOBST_JOBQGLOBAL) {
1741 TAILQ_REMOVE(&aio_jobs, cbe, list);
1742 TAILQ_REMOVE(&ki->kaio_jobqueue, cbe, plist);
1743 TAILQ_INSERT_TAIL(&ki->kaio_jobdone, cbe,
1744 plist);
1745 cancelled++;
1746 ki->kaio_queue_finished_count++;
1747 cbe->jobstate = JOBST_JOBFINISHED;
1748 cbe->uaiocb._aiocb_private.status = -1;
1749 cbe->uaiocb._aiocb_private.error = ECANCELED;
1750/* XXX cancelled, knote? */
1751 if (cbe->uaiocb.aio_sigevent.sigev_notify ==
1752 SIGEV_SIGNAL)
1753 psignal(cbe->userproc, cbe->uaiocb.aio_sigevent.sigev_signo);
1754 } else {
1755 notcancelled++;
1756 }
1757 }
1758 }
1759
1760 splx(s);
1761
1762
1763 if (notcancelled) {
1764 p->p_retval[0] = AIO_NOTCANCELED;
1765 return 0;
1766 }
1767
1768 if (cancelled) {
1769 p->p_retval[0] = AIO_CANCELED;
1770 return 0;
1771 }
1772
1773 p->p_retval[0] = AIO_ALLDONE;
1774
1775 return 0;
1776#endif /* VFS_AIO */
1777}
1778
1779/*
1780	 * aio_error is implemented at the kernel level for compatibility purposes only.
1781 * For a user mode async implementation, it would be best to do it in a userland
1782 * subroutine.
1783 */
1784int
1785aio_error(struct proc *p, struct aio_error_args *uap)
1786{
1787#ifndef VFS_AIO
1788 return ENOSYS;
1789#else
1790 int s;
1791 struct aiocblist *cb;
1792 struct kaioinfo *ki;
1793 int jobref;
1794
1795 ki = p->p_aioinfo;
1796 if (ki == NULL)
1797 return EINVAL;
1798
1799 jobref = fuword(&uap->aiocbp->_aiocb_private.kernelinfo);
1800 if ((jobref == -1) || (jobref == 0))
1801 return EINVAL;
1802
1803 for (cb = TAILQ_FIRST(&ki->kaio_jobdone); cb; cb = TAILQ_NEXT(cb,
1804 plist)) {
1805 if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
1806 jobref) {
1807 p->p_retval[0] = cb->uaiocb._aiocb_private.error;
1808 return 0;
1809 }
1810 }
1811
1812 s = splnet();
1813
1814 for (cb = TAILQ_FIRST(&ki->kaio_jobqueue); cb; cb = TAILQ_NEXT(cb,
1815 plist)) {
1816 if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
1817 jobref) {
1818 p->p_retval[0] = EINPROGRESS;
1819 splx(s);
1820 return 0;
1821 }
1822 }
1823
1824 for (cb = TAILQ_FIRST(&ki->kaio_sockqueue); cb; cb = TAILQ_NEXT(cb,
1825 plist)) {
1826 if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
1827 jobref) {
1828 p->p_retval[0] = EINPROGRESS;
1829 splx(s);
1830 return 0;
1831 }
1832 }
1833 splx(s);
1834
1835 s = splbio();
1836 for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb = TAILQ_NEXT(cb,
1837 plist)) {
1838 if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
1839 jobref) {
1840 p->p_retval[0] = cb->uaiocb._aiocb_private.error;
1841 splx(s);
1842 return 0;
1843 }
1844 }
1845
1846 for (cb = TAILQ_FIRST(&ki->kaio_bufqueue); cb; cb = TAILQ_NEXT(cb,
1847 plist)) {
1848 if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
1849 jobref) {
1850 p->p_retval[0] = EINPROGRESS;
1851 splx(s);
1852 return 0;
1853 }
1854 }
1855 splx(s);
1856
1857#if (0)
1858 /*
1859 * Hack for lio.
1860 */
1861 status = fuword(&uap->aiocbp->_aiocb_private.status);
1862 if (status == -1)
1863 return fuword(&uap->aiocbp->_aiocb_private.error);
1864#endif
1865 return EINVAL;
1866#endif /* VFS_AIO */
1867}
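
/*
 * Userland view (illustrative sketch only, not kernel code): the lookups
 * above are normally driven by a polling loop along these lines, assuming
 * the standard <aio.h> interface; "cb" is a previously submitted aiocb and
 * the variable names are hypothetical.
 *
 *	int error;
 *	ssize_t nbytes;
 *
 *	while ((error = aio_error(&cb)) == EINPROGRESS)
 *		;			(or sleep/select between checks)
 *	nbytes = aio_return(&cb);	(reaps the request; error now holds
 *					 0 or the failing errno value)
 */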
1868
1869int
1870aio_read(struct proc *p, struct aio_read_args *uap)
1871{
1872#ifndef VFS_AIO
1873 return ENOSYS;
1874#else
1875 struct filedesc *fdp;
1876 struct file *fp;
1877 struct uio auio;
1878 struct iovec aiov;
1879 unsigned int fd;
1880 int cnt;
1881 struct aiocb iocb;
1882 int error, pmodes;
1883
1884 pmodes = fuword(&uap->aiocbp->_aiocb_private.privatemodes);
1885 if ((pmodes & AIO_PMODE_SYNC) == 0)
1886 return aio_aqueue(p, (struct aiocb *)uap->aiocbp, LIO_READ);
1887
1888 /* Get control block. */
1889 if ((error = copyin((caddr_t)uap->aiocbp, (caddr_t)&iocb, sizeof iocb))
1890 != 0)
1891 return error;
1892
1893 /* Get the fd info for process. */
1894 fdp = p->p_fd;
1895
1896 /*
1897 * Range check file descriptor.
1898 */
1899 fd = iocb.aio_fildes;
1900 if (fd >= fdp->fd_nfiles)
1901 return EBADF;
1902 fp = fdp->fd_ofiles[fd];
1903 if ((fp == NULL) || ((fp->f_flag & FREAD) == 0))
1904 return EBADF;
1905 if (iocb.aio_offset == -1LL)
1906 return EINVAL;
1907
1908 auio.uio_resid = iocb.aio_nbytes;
1909 if (auio.uio_resid < 0)
1910 return (EINVAL);
1911
1912	/*
1913	 * If the request is not synchronous, queue it; otherwise do it inline.
1914	 */
1915 if ((iocb._aiocb_private.privatemodes & AIO_PMODE_SYNC) == 0)
1916 return aio_aqueue(p, (struct aiocb *)uap->aiocbp, LIO_READ);
1917
1918 aiov.iov_base = (void *)iocb.aio_buf;
1919 aiov.iov_len = iocb.aio_nbytes;
1920
1921 auio.uio_iov = &aiov;
1922 auio.uio_iovcnt = 1;
1923 auio.uio_offset = iocb.aio_offset;
1924 auio.uio_rw = UIO_READ;
1925 auio.uio_segflg = UIO_USERSPACE;
1926 auio.uio_procp = p;
1927
1928 cnt = iocb.aio_nbytes;
1929 /*
1930 * Temporarily bump the ref count while reading to avoid the
1931 * descriptor being ripped out from under us.
1932 */
1933 fhold(fp);
1934 error = fo_read(fp, &auio, fp->f_cred, FOF_OFFSET, p);
1935 fdrop(fp, p);
1936 if (error && (auio.uio_resid != cnt) && (error == ERESTART || error ==
1937 EINTR || error == EWOULDBLOCK))
1938 error = 0;
1939 cnt -= auio.uio_resid;
1940 p->p_retval[0] = cnt;
1941 return error;
1942#endif /* VFS_AIO */
1943}
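
/*
 * Userland view (illustrative sketch only, not kernel code): the normal
 * (non-AIO_PMODE_SYNC) path above is reached by a submission along these
 * lines; the names are hypothetical and <aio.h>/<string.h> are assumed.
 *
 *	struct aiocb cb;
 *	char buf[4096];
 *
 *	memset(&cb, 0, sizeof(cb));
 *	cb.aio_fildes = fd;		(descriptor open for reading)
 *	cb.aio_buf = buf;
 *	cb.aio_nbytes = sizeof(buf);
 *	cb.aio_offset = 0;
 *	if (aio_read(&cb) == -1)
 *		... submission failed, e.g. EAGAIN when the AIO queue
 *		    limits are hit ...
 *
 * Completion is then observed via aio_error()/aio_return(),
 * aio_waitcomplete(), or the notification requested in cb.aio_sigevent.
 */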
1944
1945int
1946aio_write(struct proc *p, struct aio_write_args *uap)
1947{
1948#ifndef VFS_AIO
1949 return ENOSYS;
1950#else
1951 struct filedesc *fdp;
1952 struct file *fp;
1953 struct uio auio;
1954 struct iovec aiov;
1955 unsigned int fd;
1956 int cnt;
1957 struct aiocb iocb;
1958 int error;
1959 int pmodes;
1960
1961	/*
1962	 * If the request is not synchronous, queue it; otherwise do it inline.
1963	 */
1964 pmodes = fuword(&uap->aiocbp->_aiocb_private.privatemodes);
1965 if ((pmodes & AIO_PMODE_SYNC) == 0)
1966 return aio_aqueue(p, (struct aiocb *)uap->aiocbp, LIO_WRITE);
1967
1968 if ((error = copyin((caddr_t)uap->aiocbp, (caddr_t)&iocb, sizeof iocb))
1969 != 0)
1970 return error;
1971
1972 /* Get the fd info for process. */
1973 fdp = p->p_fd;
1974
1975 /*
1976 * Range check file descriptor.
1977 */
1978 fd = iocb.aio_fildes;
1979 if (fd >= fdp->fd_nfiles)
1980 return EBADF;
1981 fp = fdp->fd_ofiles[fd];
1982 if ((fp == NULL) || ((fp->f_flag & FWRITE) == 0))
1983 return EBADF;
1984 if (iocb.aio_offset == -1LL)
1985 return EINVAL;
1986
1987 aiov.iov_base = (void *)iocb.aio_buf;
1988 aiov.iov_len = iocb.aio_nbytes;
1989 auio.uio_iov = &aiov;
1990 auio.uio_iovcnt = 1;
1991 auio.uio_offset = iocb.aio_offset;
1992
1993 auio.uio_resid = iocb.aio_nbytes;
1994 if (auio.uio_resid < 0)
1995 return (EINVAL);
1996
1997 auio.uio_rw = UIO_WRITE;
1998 auio.uio_segflg = UIO_USERSPACE;
1999 auio.uio_procp = p;
2000
2001 cnt = iocb.aio_nbytes;
2002 /*
2003 * Temporarily bump the ref count while writing to avoid the
2004 * descriptor being ripped out from under us.
2005 */
2006 fhold(fp);
2007 error = fo_write(fp, &auio, fp->f_cred, FOF_OFFSET, p);
2008 fdrop(fp, p);
2009 if (error) {
2010 if (auio.uio_resid != cnt) {
2011 if (error == ERESTART || error == EINTR || error ==
2012 EWOULDBLOCK)
2013 error = 0;
2014 if (error == EPIPE)
2015 psignal(p, SIGPIPE);
2016 }
2017 }
2018 cnt -= auio.uio_resid;
2019 p->p_retval[0] = cnt;
2020 return error;
2021#endif /* VFS_AIO */
2022}
2023
2024int
2025lio_listio(struct proc *p, struct lio_listio_args *uap)
2026{
2027#ifndef VFS_AIO
2028 return ENOSYS;
2029#else
2030 int nent, nentqueued;
2031 struct aiocb *iocb, * const *cbptr;
2032 struct aiocblist *cb;
2033 struct kaioinfo *ki;
2034 struct aio_liojob *lj;
2035 int error, runningcode;
2036 int nerror;
2037 int i;
2038 int s;
2039
2040 if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
2041 return EINVAL;
2042
2043 nent = uap->nent;
2044 if (nent > AIO_LISTIO_MAX)
2045 return EINVAL;
2046
2047 if (p->p_aioinfo == NULL)
2048 aio_init_aioinfo(p);
2049
2050 if ((nent + num_queue_count) > max_queue_count)
2051 return EAGAIN;
2052
2053 ki = p->p_aioinfo;
2054 if ((nent + ki->kaio_queue_count) > ki->kaio_qallowed_count)
2055 return EAGAIN;
2056
2057 lj = zalloc(aiolio_zone);
2058 if (!lj)
2059 return EAGAIN;
2060
2061 lj->lioj_flags = 0;
2062 lj->lioj_buffer_count = 0;
2063 lj->lioj_buffer_finished_count = 0;
2064 lj->lioj_queue_count = 0;
2065 lj->lioj_queue_finished_count = 0;
2066 lj->lioj_ki = ki;
2067 TAILQ_INSERT_TAIL(&ki->kaio_liojoblist, lj, lioj_list);
2068
2069 /*
2070	 * Set up the completion signal, if one was requested.
2071 */
2072 if (uap->sig && (uap->mode == LIO_NOWAIT)) {
2073 error = copyin(uap->sig, &lj->lioj_signal,
2074 sizeof(lj->lioj_signal));
2075 if (error)
2076 return error;
2077 lj->lioj_flags |= LIOJ_SIGNAL;
2078 lj->lioj_flags &= ~LIOJ_SIGNAL_POSTED;
2079 } else
2080 lj->lioj_flags &= ~LIOJ_SIGNAL;
2081
2082 /*
2083 * Get pointers to the list of I/O requests.
2084 */
2085 nerror = 0;
2086 nentqueued = 0;
2087 cbptr = uap->acb_list;
2088 for (i = 0; i < uap->nent; i++) {
2089 iocb = (struct aiocb *)(intptr_t)fuword((caddr_t)&cbptr[i]);
2090		if (((intptr_t)iocb != -1) && ((intptr_t)iocb != 0)) {
2091 error = _aio_aqueue(p, iocb, lj, 0);
2092 if (error == 0)
2093 nentqueued++;
2094 else
2095 nerror++;
2096 }
2097 }
2098
2099 /*
2100	 * If we haven't queued any, then just return.
2101 */
2102 if (nentqueued == 0)
2103 return 0;
2104
2105 /*
2106 * Calculate the appropriate error return.
2107 */
2108 runningcode = 0;
2109 if (nerror)
2110 runningcode = EIO;
2111
2112 if (uap->mode == LIO_WAIT) {
2113 int command, found, jobref;
2114
2115 for (;;) {
2116 found = 0;
2117 for (i = 0; i < uap->nent; i++) {
2118 /*
2119				 * Fetch the control buf pointer from the
2120				 * user-space list.
2121 */
2122 iocb = (struct aiocb *)(intptr_t)fuword((caddr_t)&cbptr[i]);
2123 if (((intptr_t)iocb == -1) || ((intptr_t)iocb
2124 == 0))
2125 continue;
2126
2127 /*
2128 * Fetch the associated command from user space.
2129 */
2130 command = fuword(&iocb->aio_lio_opcode);
2131 if (command == LIO_NOP) {
2132 found++;
2133 continue;
2134 }
2135
2136 jobref = fuword(&iocb->_aiocb_private.kernelinfo);
2137
2138 TAILQ_FOREACH(cb, &ki->kaio_jobdone, plist) {
2139 if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo)
2140 == jobref) {
2141 if (cb->uaiocb.aio_lio_opcode
2142 == LIO_WRITE) {
2143 curproc->p_stats->p_ru.ru_oublock
2144 +=
2145 cb->outputcharge;
2146 cb->outputcharge = 0;
2147 } else if (cb->uaiocb.aio_lio_opcode
2148 == LIO_READ) {
2149 curproc->p_stats->p_ru.ru_inblock
2150 += cb->inputcharge;
2151 cb->inputcharge = 0;
2152 }
2153 found++;
2154 break;
2155 }
2156 }
2157
2158 s = splbio();
2159 TAILQ_FOREACH(cb, &ki->kaio_bufdone, plist) {
2160 if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo)
2161 == jobref) {
2162 found++;
2163 break;
2164 }
2165 }
2166 splx(s);
2167 }
2168
2169 /*
2170 * If all I/Os have been disposed of, then we can
2171 * return.
2172 */
2173 if (found == nentqueued)
2174 return runningcode;
2175
2176 ki->kaio_flags |= KAIO_WAKEUP;
2177 error = tsleep(p, PRIBIO | PCATCH, "aiospn", 0);
2178
2179 if (error == EINTR)
2180 return EINTR;
2181 else if (error == EWOULDBLOCK)
2182 return EAGAIN;
2183 }
2184 }
2185
2186 return runningcode;
2187#endif /* VFS_AIO */
2188}
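
/*
 * Userland view (illustrative sketch only, not kernel code): the LIO_WAIT
 * path above, which sleeps in tsleep() until every queued entry turns up on
 * a done queue, is typically driven like this; names are hypothetical.
 *
 *	struct aiocb rd, wr, *list[2];
 *
 *	(fill in rd and wr as for aio_read()/aio_write(), and set
 *	 rd.aio_lio_opcode = LIO_READ, wr.aio_lio_opcode = LIO_WRITE)
 *
 *	list[0] = &rd;
 *	list[1] = &wr;
 *	if (lio_listio(LIO_WAIT, list, 2, NULL) == -1)
 *		... at least one entry failed to queue or to complete ...
 *
 * With LIO_NOWAIT the call returns as soon as the entries are queued, and
 * the optional struct sigevent argument (copied in above) may request a
 * signal once the whole list has completed.
 */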
2189
2190#ifdef VFS_AIO
2191/*
2192 * This is a weird hack so that we can post a signal. It is safe to do so from
2193 * a timeout routine, but *not* from an interrupt routine.
2194 */
2195static void
2196process_signal(void *aioj)
2197{
2198 struct aiocblist *aiocbe = aioj;
2199 struct aio_liojob *lj = aiocbe->lio;
2200 struct aiocb *cb = &aiocbe->uaiocb;
2201
2202 if ((lj) && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL) &&
2203 (lj->lioj_queue_count == lj->lioj_queue_finished_count)) {
2204 psignal(lj->lioj_ki->kaio_p, lj->lioj_signal.sigev_signo);
2205 lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
2206 }
2207
2208 if (cb->aio_sigevent.sigev_notify == SIGEV_SIGNAL)
2209 psignal(aiocbe->userproc, cb->aio_sigevent.sigev_signo);
2210}
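
/*
 * Userland side (illustrative note, not kernel code): the signals posted
 * here are requested per request through the aiocb, e.g.
 *
 *	cb.aio_sigevent.sigev_notify = SIGEV_SIGNAL;
 *	cb.aio_sigevent.sigev_signo = SIGUSR1;
 *
 * or, for an entire lio_listio(LIO_NOWAIT, ...) list, through the struct
 * sigevent passed as its last argument.  The signal handler then reaps the
 * finished requests with aio_error()/aio_return().
 */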
2211
2212/*
2213 * Interrupt handler for physio; performs the necessary process wakeups and
2214 * signals.
2215 */
2216static void
2217aio_physwakeup(struct buf *bp)
2218{
2219 struct aiocblist *aiocbe;
2220 struct proc *p;
2221 struct kaioinfo *ki;
2222 struct aio_liojob *lj;
2223
2224 wakeup((caddr_t)bp);
2225
2226 aiocbe = (struct aiocblist *)bp->b_spc;
2227 if (aiocbe) {
2228 p = bp->b_caller1;
2229
2230 aiocbe->jobstate = JOBST_JOBBFINISHED;
2231 aiocbe->uaiocb._aiocb_private.status -= bp->b_resid;
2232 aiocbe->uaiocb._aiocb_private.error = 0;
2233 aiocbe->jobflags |= AIOCBLIST_DONE;
2234
2235 if (bp->b_ioflags & BIO_ERROR)
2236 aiocbe->uaiocb._aiocb_private.error = bp->b_error;
2237
2238 lj = aiocbe->lio;
2239 if (lj) {
2240 lj->lioj_buffer_finished_count++;
2241
2242 /*
2243 * wakeup/signal if all of the interrupt jobs are done.
2244 */
2245 if (lj->lioj_buffer_finished_count ==
2246 lj->lioj_buffer_count) {
2247 /*
2248 * Post a signal if it is called for.
2249 */
2250 if ((lj->lioj_flags &
2251 (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) ==
2252 LIOJ_SIGNAL) {
2253 lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
2254 timeout(process_signal, aiocbe, 0);
2255 }
2256 }
2257 }
2258
2259 ki = p->p_aioinfo;
2260 if (ki) {
2261 ki->kaio_buffer_finished_count++;
2262 TAILQ_REMOVE(&aio_bufjobs, aiocbe, list);
2263 TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist);
2264 TAILQ_INSERT_TAIL(&ki->kaio_bufdone, aiocbe, plist);
2265
2266 KNOTE(&aiocbe->klist, 0);
2267 /* Do the wakeup. */
2268 if (ki->kaio_flags & (KAIO_RUNDOWN|KAIO_WAKEUP)) {
2269 ki->kaio_flags &= ~KAIO_WAKEUP;
2270 wakeup(p);
2271 }
2272 }
2273
2274 if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL)
2275 timeout(process_signal, aiocbe, 0);
2276 }
2277}
2278#endif /* VFS_AIO */
2279
2280int
2281aio_waitcomplete(struct proc *p, struct aio_waitcomplete_args *uap)
2282{
2283#ifndef VFS_AIO
2284 return ENOSYS;
2285#else
2286 struct timeval atv;
2287 struct timespec ts;
2288 struct aiocb **cbptr;
2289 struct kaioinfo *ki;
2290 struct aiocblist *cb = NULL;
2291 int error, s, timo;
2292
2293 suword(uap->aiocbp, (int)NULL);
2294
2295 timo = 0;
2296 if (uap->timeout) {
2297 /* Get timespec struct. */
2298 error = copyin((caddr_t)uap->timeout, (caddr_t)&ts,
2299 sizeof(ts));
2300 if (error)
2301 return error;
2302
2303 if ((ts.tv_nsec < 0) || (ts.tv_nsec >= 1000000000))
2304 return (EINVAL);
2305
2306 TIMESPEC_TO_TIMEVAL(&atv, &ts);
2307 if (itimerfix(&atv))
2308 return (EINVAL);
2309 timo = tvtohz(&atv);
2310 }
2311
2312 ki = p->p_aioinfo;
2313 if (ki == NULL)
2314 return EAGAIN;
2315
2316 cbptr = uap->aiocbp;
2317
2318 for (;;) {
2319 if ((cb = TAILQ_FIRST(&ki->kaio_jobdone)) != 0) {
2320 suword(uap->aiocbp, (int)cb->uuaiocb);
2321 p->p_retval[0] = cb->uaiocb._aiocb_private.status;
2322 if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
2323 curproc->p_stats->p_ru.ru_oublock +=
2324 cb->outputcharge;
2325 cb->outputcharge = 0;
2326 } else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
2327 curproc->p_stats->p_ru.ru_inblock +=
2328 cb->inputcharge;
2329 cb->inputcharge = 0;
2330 }
2331			error = cb->uaiocb._aiocb_private.error;
2332			aio_free_entry(cb); return error;
2333 }
2334
2335 s = splbio();
2336 if ((cb = TAILQ_FIRST(&ki->kaio_bufdone)) != 0 ) {
2337 splx(s);
2338 suword(uap->aiocbp, (int)cb->uuaiocb);
2339 p->p_retval[0] = cb->uaiocb._aiocb_private.status;
2340			error = cb->uaiocb._aiocb_private.error;
2341			aio_free_entry(cb); return error;
2342 }
2343
2344 ki->kaio_flags |= KAIO_WAKEUP;
2345 error = tsleep(p, PRIBIO | PCATCH, "aiowc", timo);
2346 splx(s);
2347
2348 if (error == ERESTART)
2349 return EINTR;
2350 else if (error < 0)
2351 return error;
2352 else if (error == EINTR)
2353 return EINTR;
2354 else if (error == EWOULDBLOCK)
2355 return EAGAIN;
2356 }
2357#endif /* VFS_AIO */
2358}
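
/*
 * Userland view (illustrative sketch only, not kernel code):
 * aio_waitcomplete(2) is FreeBSD-specific and hands back any one completed
 * request, so a consumer can loop on it instead of polling aio_error() on
 * every outstanding aiocb.  Names are hypothetical.
 *
 *	struct aiocb *donecb;
 *	struct timespec ts = { 5, 0 };	(a NULL timeout blocks forever)
 *
 *	if (aio_waitcomplete(&donecb, &ts) == -1)
 *		... as implemented above, errno is EAGAIN when the timeout
 *		    expires and EINTR when a signal interrupts the sleep ...
 *	else
 *		... *donecb identifies the caller's aiocb and the return
 *		    value is that request's completion status ...
 */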
2359
2360
2361#ifndef VFS_AIO
2362static int
2363filt_aioattach(struct knote *kn)
2364{
2365
2366 return (ENXIO);
2367}
2368
2369struct filterops aio_filtops =
2370 { 0, filt_aioattach, NULL, NULL };
2371
2372#else
2373static int
2374filt_aioattach(struct knote *kn)
2375{
2376 struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_id;
2377
2378 /*
2379 * The aiocbe pointer must be validated before using it, so
2380 * registration is restricted to the kernel; the user cannot
2381 * set EV_FLAG1.
2382 */
2383 if ((kn->kn_flags & EV_FLAG1) == 0)
2384 return (EPERM);
2385 kn->kn_flags &= ~EV_FLAG1;
2386
2387 SLIST_INSERT_HEAD(&aiocbe->klist, kn, kn_selnext);
2388
2389 return (0);
2390}
2391
2392static void
2393filt_aiodetach(struct knote *kn)
2394{
2395 struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_id;
2396 int s = splhigh(); /* XXX no clue, so overkill */
2397
2398 SLIST_REMOVE(&aiocbe->klist, kn, knote, kn_selnext);
2399 splx(s);
2400}
2401
2402/*ARGSUSED*/
2403static int
2404filt_aio(struct knote *kn, long hint)
2405{
2406 struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_id;
2407
2408 kn->kn_data = 0; /* XXX data returned? */
2409 if (aiocbe->jobstate != JOBST_JOBFINISHED &&
2410 aiocbe->jobstate != JOBST_JOBBFINISHED)
2411 return (0);
2412 kn->kn_flags |= EV_EOF;
2413 return (1);
2414}
2415
2416struct filterops aio_filtops =
2417 { 0, filt_aioattach, filt_aiodetach, filt_aio };
2418#endif /* VFS_AIO */