17 */ 18 19/* 20 * This file contains support for the POSIX 1003.1B AIO/LIO facility. 21 */ 22 23#include <sys/param.h> 24#include <sys/systm.h> 25#include <sys/bio.h> 26#include <sys/buf.h> 27#include <sys/sysproto.h> 28#include <sys/filedesc.h> 29#include <sys/kernel.h> 30#include <sys/fcntl.h> 31#include <sys/file.h> 32#include <sys/lock.h> 33#include <sys/mutex.h> 34#include <sys/unistd.h> 35#include <sys/proc.h> 36#include <sys/resourcevar.h> 37#include <sys/signalvar.h> 38#include <sys/protosw.h> 39#include <sys/socketvar.h> 40#include <sys/sysctl.h> 41#include <sys/vnode.h> 42#include <sys/conf.h> 43#include <sys/event.h> 44 45#include <vm/vm.h> 46#include <vm/vm_extern.h> 47#include <vm/pmap.h> 48#include <vm/vm_map.h> 49#include <vm/vm_zone.h> 50#include <sys/aio.h> 51 52#include <machine/limits.h> 53 54#include "opt_vfs_aio.h" 55 56#ifdef VFS_AIO 57 58static long jobrefid; 59 60#define JOBST_NULL 0x0 61#define JOBST_JOBQPROC 0x1 62#define JOBST_JOBQGLOBAL 0x2 63#define JOBST_JOBRUNNING 0x3 64#define JOBST_JOBFINISHED 0x4 65#define JOBST_JOBQBUF 0x5 66#define JOBST_JOBBFINISHED 0x6 67 68#ifndef MAX_AIO_PER_PROC 69#define MAX_AIO_PER_PROC 32 70#endif 71 72#ifndef MAX_AIO_QUEUE_PER_PROC 73#define MAX_AIO_QUEUE_PER_PROC 256 /* Bigger than AIO_LISTIO_MAX */ 74#endif 75 76#ifndef MAX_AIO_PROCS 77#define MAX_AIO_PROCS 32 78#endif 79 80#ifndef MAX_AIO_QUEUE 81#define MAX_AIO_QUEUE 1024 /* Bigger than AIO_LISTIO_MAX */ 82#endif 83 84#ifndef TARGET_AIO_PROCS 85#define TARGET_AIO_PROCS 4 86#endif 87 88#ifndef MAX_BUF_AIO 89#define MAX_BUF_AIO 16 90#endif 91 92#ifndef AIOD_TIMEOUT_DEFAULT 93#define AIOD_TIMEOUT_DEFAULT (10 * hz) 94#endif 95 96#ifndef AIOD_LIFETIME_DEFAULT 97#define AIOD_LIFETIME_DEFAULT (30 * hz) 98#endif 99 100static int max_aio_procs = MAX_AIO_PROCS; 101static int num_aio_procs = 0; 102static int target_aio_procs = TARGET_AIO_PROCS; 103static int max_queue_count = MAX_AIO_QUEUE; 104static int num_queue_count = 0; 105static int num_buf_aio = 0; 
static int num_aio_resv_start = 0;
static int aiod_timeout;
static int aiod_lifetime;

static int max_aio_per_proc = MAX_AIO_PER_PROC;
static int max_aio_queue_per_proc = MAX_AIO_QUEUE_PER_PROC;
static int max_buf_aio = MAX_BUF_AIO;

/* Tunables exported under vfs.aio; the num_* counters are read-only. */
SYSCTL_NODE(_vfs, OID_AUTO, aio, CTLFLAG_RW, 0, "AIO mgmt");

SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_per_proc,
	CTLFLAG_RW, &max_aio_per_proc, 0, "");

SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue_per_proc,
	CTLFLAG_RW, &max_aio_queue_per_proc, 0, "");

SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_procs,
	CTLFLAG_RW, &max_aio_procs, 0, "");

SYSCTL_INT(_vfs_aio, OID_AUTO, num_aio_procs,
	CTLFLAG_RD, &num_aio_procs, 0, "");

SYSCTL_INT(_vfs_aio, OID_AUTO, num_queue_count,
	CTLFLAG_RD, &num_queue_count, 0, "");

SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue,
	CTLFLAG_RW, &max_queue_count, 0, "");

SYSCTL_INT(_vfs_aio, OID_AUTO, target_aio_procs,
	CTLFLAG_RW, &target_aio_procs, 0, "");

SYSCTL_INT(_vfs_aio, OID_AUTO, max_buf_aio,
	CTLFLAG_RW, &max_buf_aio, 0, "");

SYSCTL_INT(_vfs_aio, OID_AUTO, num_buf_aio,
	CTLFLAG_RD, &num_buf_aio, 0, "");

SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_lifetime,
	CTLFLAG_RW, &aiod_lifetime, 0, "");

SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_timeout,
	CTLFLAG_RW, &aiod_timeout, 0, "");

/*
 * AIO process info
 */
#define AIOP_FREE	0x1			/* proc on free queue */
#define AIOP_SCHED	0x2			/* proc explicitly scheduled */

/* Per AIO-daemon state. */
struct aioproclist {
	int aioprocflags;			/* AIO proc flags */
	TAILQ_ENTRY(aioproclist) list;		/* List of processes */
	struct proc *aioproc;			/* The AIO thread */
	TAILQ_HEAD (,aiocblist) jobtorun;	/* suggested job to run */
};

/*
 * data-structure for lio signal management
 */
struct aio_liojob {
	int lioj_flags;				/* LIOJ_* flags below */
	int lioj_buffer_count;			/* physio jobs in this lio */
	int lioj_buffer_finished_count;		/* ... of those, completed */
	int lioj_queue_count;			/* daemon-queued jobs */
	int lioj_queue_finished_count;		/* ... of those, completed */
	struct sigevent lioj_signal;		/* signal on all I/O done */
	TAILQ_ENTRY (aio_liojob) lioj_list;	/* entry on kaio_liojoblist */
	struct kaioinfo *lioj_ki;		/* owning per-process state */
};
#define	LIOJ_SIGNAL		0x1	/* signal on all done (lio) */
#define	LIOJ_SIGNAL_POSTED	0x2	/* signal has been posted */

/*
 * per process aio data structure
 */
struct kaioinfo {
	int	kaio_flags;		/* per process kaio flags */
	int	kaio_maxactive_count;	/* maximum number of AIOs */
	int	kaio_active_count;	/* number of currently used AIOs */
	int	kaio_qallowed_count;	/* maximum size of AIO queue */
	int	kaio_queue_count;	/* size of AIO queue */
	int	kaio_ballowed_count;	/* maximum number of buffers */
	int	kaio_queue_finished_count; /* number of daemon jobs finished */
	int	kaio_buffer_count;	/* number of physio buffers */
	int	kaio_buffer_finished_count; /* count of I/O done */
	struct	proc *kaio_p;		/* process that uses this kaio block */
	TAILQ_HEAD (,aio_liojob) kaio_liojoblist; /* list of lio jobs */
	TAILQ_HEAD (,aiocblist) kaio_jobqueue;	/* job queue for process */
	TAILQ_HEAD (,aiocblist) kaio_jobdone;	/* done queue for process */
	TAILQ_HEAD (,aiocblist) kaio_bufqueue;	/* buffer job queue for process */
	TAILQ_HEAD (,aiocblist) kaio_bufdone;	/* buffer done queue for process */
	TAILQ_HEAD (,aiocblist) kaio_sockqueue;	/* queue for aios waiting on sockets */
};

#define KAIO_RUNDOWN	0x1	/* process is being run down */
#define KAIO_WAKEUP	0x2	/* wakeup process when there is a significant event */

/* Global daemon and job lists; initialized in aio_onceonly(). */
static TAILQ_HEAD(,aioproclist) aio_freeproc, aio_activeproc;
static TAILQ_HEAD(,aiocblist) aio_jobs;			/* Async job list */
static TAILQ_HEAD(,aiocblist) aio_bufjobs;		/* Phys I/O job list */
static TAILQ_HEAD(,aiocblist) aio_freejobs;		/* Pool of free jobs */

static void	aio_init_aioinfo(struct proc *p);
static void	aio_onceonly(void *);
static int	aio_free_entry(struct aiocblist *aiocbe);
static void	aio_process(struct aiocblist
*aiocbe); 212static int aio_newproc(void); 213static int aio_aqueue(struct proc *p, struct aiocb *job, int type); 214static void aio_physwakeup(struct buf *bp); 215static int aio_fphysio(struct proc *p, struct aiocblist *aiocbe, int type); 216static int aio_qphysio(struct proc *p, struct aiocblist *iocb); 217static void aio_daemon(void *uproc); 218 219SYSINIT(aio, SI_SUB_VFS, SI_ORDER_ANY, aio_onceonly, NULL); 220 221static vm_zone_t kaio_zone = 0, aiop_zone = 0, aiocb_zone = 0, aiol_zone = 0; 222static vm_zone_t aiolio_zone = 0; 223 224/* 225 * Startup initialization 226 */ 227void 228aio_onceonly(void *na) 229{ 230 TAILQ_INIT(&aio_freeproc); 231 TAILQ_INIT(&aio_activeproc); 232 TAILQ_INIT(&aio_jobs); 233 TAILQ_INIT(&aio_bufjobs); 234 TAILQ_INIT(&aio_freejobs); 235 kaio_zone = zinit("AIO", sizeof (struct kaioinfo), 0, 0, 1); 236 aiop_zone = zinit("AIOP", sizeof (struct aioproclist), 0, 0, 1); 237 aiocb_zone = zinit("AIOCB", sizeof (struct aiocblist), 0, 0, 1); 238 aiol_zone = zinit("AIOL", AIO_LISTIO_MAX * sizeof (int), 0, 0, 1); 239 aiolio_zone = zinit("AIOLIO", AIO_LISTIO_MAX * sizeof (struct 240 aio_liojob), 0, 0, 1); 241 aiod_timeout = AIOD_TIMEOUT_DEFAULT; 242 aiod_lifetime = AIOD_LIFETIME_DEFAULT; 243 jobrefid = 1; 244} 245 246/* 247 * Init the per-process aioinfo structure. The aioinfo limits are set 248 * per-process for user limit (resource) management. 
249 */ 250void 251aio_init_aioinfo(struct proc *p) 252{ 253 struct kaioinfo *ki; 254 if (p->p_aioinfo == NULL) { 255 ki = zalloc(kaio_zone); 256 p->p_aioinfo = ki; 257 ki->kaio_flags = 0; 258 ki->kaio_maxactive_count = max_aio_per_proc; 259 ki->kaio_active_count = 0; 260 ki->kaio_qallowed_count = max_aio_queue_per_proc; 261 ki->kaio_queue_count = 0; 262 ki->kaio_ballowed_count = max_buf_aio; 263 ki->kaio_buffer_count = 0; 264 ki->kaio_buffer_finished_count = 0; 265 ki->kaio_p = p; 266 TAILQ_INIT(&ki->kaio_jobdone); 267 TAILQ_INIT(&ki->kaio_jobqueue); 268 TAILQ_INIT(&ki->kaio_bufdone); 269 TAILQ_INIT(&ki->kaio_bufqueue); 270 TAILQ_INIT(&ki->kaio_liojoblist); 271 TAILQ_INIT(&ki->kaio_sockqueue); 272 } 273 274 while (num_aio_procs < target_aio_procs) 275 aio_newproc(); 276} 277 278/* 279 * Free a job entry. Wait for completion if it is currently active, but don't 280 * delay forever. If we delay, we return a flag that says that we have to 281 * restart the queue scan. 282 */ 283int 284aio_free_entry(struct aiocblist *aiocbe) 285{ 286 struct kaioinfo *ki; 287 struct aioproclist *aiop; 288 struct aio_liojob *lj; 289 struct proc *p; 290 int error; 291 int s; 292 293 if (aiocbe->jobstate == JOBST_NULL) 294 panic("aio_free_entry: freeing already free job"); 295 296 p = aiocbe->userproc; 297 ki = p->p_aioinfo; 298 lj = aiocbe->lio; 299 if (ki == NULL) 300 panic("aio_free_entry: missing p->p_aioinfo"); 301 302 if (aiocbe->jobstate == JOBST_JOBRUNNING) { 303 if (aiocbe->jobflags & AIOCBLIST_ASYNCFREE) 304 return 0; 305 aiocbe->jobflags |= AIOCBLIST_RUNDOWN; 306 tsleep(aiocbe, PRIBIO|PCATCH, "jobwai", 0); 307 } 308 aiocbe->jobflags &= ~AIOCBLIST_ASYNCFREE; 309 310 if (aiocbe->bp == NULL) { 311 if (ki->kaio_queue_count <= 0) 312 panic("aio_free_entry: process queue size <= 0"); 313 if (num_queue_count <= 0) 314 panic("aio_free_entry: system wide queue size <= 0"); 315 316 if (lj) { 317 lj->lioj_queue_count--; 318 if (aiocbe->jobflags & AIOCBLIST_DONE) 319 
lj->lioj_queue_finished_count--; 320 } 321 ki->kaio_queue_count--; 322 if (aiocbe->jobflags & AIOCBLIST_DONE) 323 ki->kaio_queue_finished_count--; 324 num_queue_count--; 325 } else { 326 if (lj) { 327 lj->lioj_buffer_count--; 328 if (aiocbe->jobflags & AIOCBLIST_DONE) 329 lj->lioj_buffer_finished_count--; 330 } 331 if (aiocbe->jobflags & AIOCBLIST_DONE) 332 ki->kaio_buffer_finished_count--; 333 ki->kaio_buffer_count--; 334 num_buf_aio--; 335 } 336 337 /* aiocbe is going away, we need to destroy any knotes */ 338 knote_remove(p, &aiocbe->klist); 339 340 if ((ki->kaio_flags & KAIO_WAKEUP) || ((ki->kaio_flags & KAIO_RUNDOWN) 341 && ((ki->kaio_buffer_count == 0) && (ki->kaio_queue_count == 0)))) { 342 ki->kaio_flags &= ~KAIO_WAKEUP; 343 wakeup(p); 344 } 345 346 if (aiocbe->jobstate == JOBST_JOBQBUF) { 347 if ((error = aio_fphysio(p, aiocbe, 1)) != 0) 348 return error; 349 if (aiocbe->jobstate != JOBST_JOBBFINISHED) 350 panic("aio_free_entry: invalid physio finish-up state"); 351 s = splbio(); 352 TAILQ_REMOVE(&ki->kaio_bufdone, aiocbe, plist); 353 splx(s); 354 } else if (aiocbe->jobstate == JOBST_JOBQPROC) { 355 aiop = aiocbe->jobaioproc; 356 TAILQ_REMOVE(&aiop->jobtorun, aiocbe, list); 357 } else if (aiocbe->jobstate == JOBST_JOBQGLOBAL) 358 TAILQ_REMOVE(&aio_jobs, aiocbe, list); 359 else if (aiocbe->jobstate == JOBST_JOBFINISHED) 360 TAILQ_REMOVE(&ki->kaio_jobdone, aiocbe, plist); 361 else if (aiocbe->jobstate == JOBST_JOBBFINISHED) { 362 s = splbio(); 363 TAILQ_REMOVE(&ki->kaio_bufdone, aiocbe, plist); 364 splx(s); 365 if (aiocbe->bp) { 366 vunmapbuf(aiocbe->bp); 367 relpbuf(aiocbe->bp, NULL); 368 aiocbe->bp = NULL; 369 } 370 } 371 if (lj && (lj->lioj_buffer_count == 0) && (lj->lioj_queue_count == 0)) { 372 TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list); 373 zfree(aiolio_zone, lj); 374 } 375 TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list); 376 aiocbe->jobstate = JOBST_NULL; 377 return 0; 378} 379#endif /* VFS_AIO */ 380 381/* 382 * Rundown the jobs for a given 
process. 383 */ 384void 385aio_proc_rundown(struct proc *p) 386{ 387#ifndef VFS_AIO 388 return; 389#else 390 int s; 391 struct kaioinfo *ki; 392 struct aio_liojob *lj, *ljn; 393 struct aiocblist *aiocbe, *aiocbn; 394 struct file *fp; 395 struct filedesc *fdp; 396 struct socket *so; 397 398 ki = p->p_aioinfo; 399 if (ki == NULL) 400 return; 401 402 ki->kaio_flags |= LIOJ_SIGNAL_POSTED; 403 while ((ki->kaio_active_count > 0) || (ki->kaio_buffer_count > 404 ki->kaio_buffer_finished_count)) { 405 ki->kaio_flags |= KAIO_RUNDOWN; 406 if (tsleep(p, PRIBIO, "kaiowt", aiod_timeout)) 407 break; 408 } 409 410 /* 411 * Move any aio ops that are waiting on socket I/O to the normal job 412 * queues so they are cleaned up with any others. 413 */ 414 fdp = p->p_fd; 415 416 s = splnet(); 417 for (aiocbe = TAILQ_FIRST(&ki->kaio_sockqueue); aiocbe; aiocbe = 418 aiocbn) { 419 aiocbn = TAILQ_NEXT(aiocbe, plist); 420 fp = fdp->fd_ofiles[aiocbe->uaiocb.aio_fildes]; 421 422 /* 423 * Under some circumstances, the aio_fildes and the file 424 * structure don't match. This would leave aiocbe's in the 425 * TAILQ associated with the socket and cause a panic later. 426 * 427 * Detect and fix. 
428 */ 429 if ((fp == NULL) || (fp != aiocbe->fd_file)) 430 fp = aiocbe->fd_file; 431 if (fp) { 432 so = (struct socket *)fp->f_data; 433 TAILQ_REMOVE(&so->so_aiojobq, aiocbe, list); 434 if (TAILQ_EMPTY(&so->so_aiojobq)) { 435 so->so_snd.sb_flags &= ~SB_AIO; 436 so->so_rcv.sb_flags &= ~SB_AIO; 437 } 438 } 439 TAILQ_REMOVE(&ki->kaio_sockqueue, aiocbe, plist); 440 TAILQ_INSERT_HEAD(&aio_jobs, aiocbe, list); 441 TAILQ_INSERT_HEAD(&ki->kaio_jobqueue, aiocbe, plist); 442 } 443 splx(s); 444 445restart1: 446 for (aiocbe = TAILQ_FIRST(&ki->kaio_jobdone); aiocbe; aiocbe = aiocbn) { 447 aiocbn = TAILQ_NEXT(aiocbe, plist); 448 if (aio_free_entry(aiocbe)) 449 goto restart1; 450 } 451 452restart2: 453 for (aiocbe = TAILQ_FIRST(&ki->kaio_jobqueue); aiocbe; aiocbe = 454 aiocbn) { 455 aiocbn = TAILQ_NEXT(aiocbe, plist); 456 if (aio_free_entry(aiocbe)) 457 goto restart2; 458 } 459 460/* 461 * Note the use of lots of splbio here, trying to avoid splbio for long chains 462 * of I/O. Probably unnecessary. 463 */ 464restart3: 465 s = splbio(); 466 while (TAILQ_FIRST(&ki->kaio_bufqueue)) { 467 ki->kaio_flags |= KAIO_WAKEUP; 468 tsleep(p, PRIBIO, "aioprn", 0); 469 splx(s); 470 goto restart3; 471 } 472 splx(s); 473 474restart4: 475 s = splbio(); 476 for (aiocbe = TAILQ_FIRST(&ki->kaio_bufdone); aiocbe; aiocbe = aiocbn) { 477 aiocbn = TAILQ_NEXT(aiocbe, plist); 478 if (aio_free_entry(aiocbe)) { 479 splx(s); 480 goto restart4; 481 } 482 } 483 splx(s); 484 485 for (lj = TAILQ_FIRST(&ki->kaio_liojoblist); lj; lj = ljn) { 486 ljn = TAILQ_NEXT(lj, lioj_list); 487 if ((lj->lioj_buffer_count == 0) && (lj->lioj_queue_count == 488 0)) { 489 TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list); 490 zfree(aiolio_zone, lj); 491 } else { 492#ifdef DIAGNOSTIC 493 printf("LIO job not cleaned up: B:%d, BF:%d, Q:%d, " 494 "QF:%d\n", lj->lioj_buffer_count, 495 lj->lioj_buffer_finished_count, 496 lj->lioj_queue_count, 497 lj->lioj_queue_finished_count); 498#endif 499 } 500 } 501 502 zfree(kaio_zone, ki); 503 
p->p_aioinfo = NULL; 504#endif /* VFS_AIO */ 505} 506 507#ifdef VFS_AIO 508/* 509 * Select a job to run (called by an AIO daemon). 510 */ 511static struct aiocblist * 512aio_selectjob(struct aioproclist *aiop) 513{ 514 int s; 515 struct aiocblist *aiocbe; 516 struct kaioinfo *ki; 517 struct proc *userp; 518 519 aiocbe = TAILQ_FIRST(&aiop->jobtorun); 520 if (aiocbe) { 521 TAILQ_REMOVE(&aiop->jobtorun, aiocbe, list); 522 return aiocbe; 523 } 524 525 s = splnet(); 526 for (aiocbe = TAILQ_FIRST(&aio_jobs); aiocbe; aiocbe = 527 TAILQ_NEXT(aiocbe, list)) { 528 userp = aiocbe->userproc; 529 ki = userp->p_aioinfo; 530 531 if (ki->kaio_active_count < ki->kaio_maxactive_count) { 532 TAILQ_REMOVE(&aio_jobs, aiocbe, list); 533 splx(s); 534 return aiocbe; 535 } 536 } 537 splx(s); 538 539 return NULL; 540} 541 542/* 543 * The AIO processing activity. This is the code that does the I/O request for 544 * the non-physio version of the operations. The normal vn operations are used, 545 * and this code should work in all instances for every type of file, including 546 * pipes, sockets, fifos, and regular files. 
547 */ 548void 549aio_process(struct aiocblist *aiocbe) 550{ 551 struct filedesc *fdp; 552 struct proc *userp, *mycp; 553 struct aiocb *cb; 554 struct file *fp; 555 struct uio auio; 556 struct iovec aiov; 557 unsigned int fd; 558 int cnt; 559 int error; 560 off_t offset; 561 int oublock_st, oublock_end; 562 int inblock_st, inblock_end; 563 564 userp = aiocbe->userproc; 565 cb = &aiocbe->uaiocb; 566 567 mycp = curproc; 568 569 fdp = mycp->p_fd; 570 fd = cb->aio_fildes; 571 fp = fdp->fd_ofiles[fd]; 572 573 if ((fp == NULL) || (fp != aiocbe->fd_file)) { 574 cb->_aiocb_private.error = EBADF; 575 cb->_aiocb_private.status = -1; 576 return; 577 } 578 579 aiov.iov_base = (void *)cb->aio_buf; 580 aiov.iov_len = cb->aio_nbytes; 581 582 auio.uio_iov = &aiov; 583 auio.uio_iovcnt = 1; 584 auio.uio_offset = offset = cb->aio_offset; 585 auio.uio_resid = cb->aio_nbytes; 586 cnt = cb->aio_nbytes; 587 auio.uio_segflg = UIO_USERSPACE; 588 auio.uio_procp = mycp; 589 590 inblock_st = mycp->p_stats->p_ru.ru_inblock; 591 oublock_st = mycp->p_stats->p_ru.ru_oublock; 592 /* 593 * Temporarily bump the ref count while reading to avoid the 594 * descriptor being ripped out from under us. 
595 */ 596 fhold(fp); 597 if (cb->aio_lio_opcode == LIO_READ) { 598 auio.uio_rw = UIO_READ; 599 error = fo_read(fp, &auio, fp->f_cred, FOF_OFFSET, mycp); 600 } else { 601 auio.uio_rw = UIO_WRITE; 602 error = fo_write(fp, &auio, fp->f_cred, FOF_OFFSET, mycp); 603 } 604 fdrop(fp, mycp); 605 inblock_end = mycp->p_stats->p_ru.ru_inblock; 606 oublock_end = mycp->p_stats->p_ru.ru_oublock; 607 608 aiocbe->inputcharge = inblock_end - inblock_st; 609 aiocbe->outputcharge = oublock_end - oublock_st; 610 611 if ((error) && (auio.uio_resid != cnt)) { 612 if (error == ERESTART || error == EINTR || error == EWOULDBLOCK) 613 error = 0; 614 if ((error == EPIPE) && (cb->aio_lio_opcode == LIO_WRITE)) 615 psignal(userp, SIGPIPE); 616 } 617 618 cnt -= auio.uio_resid; 619 cb->_aiocb_private.error = error; 620 cb->_aiocb_private.status = cnt; 621 622 return; 623} 624 625/* 626 * The AIO daemon, most of the actual work is done in aio_process, 627 * but the setup (and address space mgmt) is done in this routine. 628 */ 629static void 630aio_daemon(void *uproc) 631{ 632 int s; 633 struct aio_liojob *lj; 634 struct aiocb *cb; 635 struct aiocblist *aiocbe; 636 struct aioproclist *aiop; 637 struct kaioinfo *ki; 638 struct proc *curcp, *mycp, *userp; 639 struct vmspace *myvm, *tmpvm; 640
| 17 */ 18 19/* 20 * This file contains support for the POSIX 1003.1B AIO/LIO facility. 21 */ 22 23#include <sys/param.h> 24#include <sys/systm.h> 25#include <sys/bio.h> 26#include <sys/buf.h> 27#include <sys/sysproto.h> 28#include <sys/filedesc.h> 29#include <sys/kernel.h> 30#include <sys/fcntl.h> 31#include <sys/file.h> 32#include <sys/lock.h> 33#include <sys/mutex.h> 34#include <sys/unistd.h> 35#include <sys/proc.h> 36#include <sys/resourcevar.h> 37#include <sys/signalvar.h> 38#include <sys/protosw.h> 39#include <sys/socketvar.h> 40#include <sys/sysctl.h> 41#include <sys/vnode.h> 42#include <sys/conf.h> 43#include <sys/event.h> 44 45#include <vm/vm.h> 46#include <vm/vm_extern.h> 47#include <vm/pmap.h> 48#include <vm/vm_map.h> 49#include <vm/vm_zone.h> 50#include <sys/aio.h> 51 52#include <machine/limits.h> 53 54#include "opt_vfs_aio.h" 55 56#ifdef VFS_AIO 57 58static long jobrefid; 59 60#define JOBST_NULL 0x0 61#define JOBST_JOBQPROC 0x1 62#define JOBST_JOBQGLOBAL 0x2 63#define JOBST_JOBRUNNING 0x3 64#define JOBST_JOBFINISHED 0x4 65#define JOBST_JOBQBUF 0x5 66#define JOBST_JOBBFINISHED 0x6 67 68#ifndef MAX_AIO_PER_PROC 69#define MAX_AIO_PER_PROC 32 70#endif 71 72#ifndef MAX_AIO_QUEUE_PER_PROC 73#define MAX_AIO_QUEUE_PER_PROC 256 /* Bigger than AIO_LISTIO_MAX */ 74#endif 75 76#ifndef MAX_AIO_PROCS 77#define MAX_AIO_PROCS 32 78#endif 79 80#ifndef MAX_AIO_QUEUE 81#define MAX_AIO_QUEUE 1024 /* Bigger than AIO_LISTIO_MAX */ 82#endif 83 84#ifndef TARGET_AIO_PROCS 85#define TARGET_AIO_PROCS 4 86#endif 87 88#ifndef MAX_BUF_AIO 89#define MAX_BUF_AIO 16 90#endif 91 92#ifndef AIOD_TIMEOUT_DEFAULT 93#define AIOD_TIMEOUT_DEFAULT (10 * hz) 94#endif 95 96#ifndef AIOD_LIFETIME_DEFAULT 97#define AIOD_LIFETIME_DEFAULT (30 * hz) 98#endif 99 100static int max_aio_procs = MAX_AIO_PROCS; 101static int num_aio_procs = 0; 102static int target_aio_procs = TARGET_AIO_PROCS; 103static int max_queue_count = MAX_AIO_QUEUE; 104static int num_queue_count = 0; 105static int num_buf_aio = 0; 
106static int num_aio_resv_start = 0; 107static int aiod_timeout; 108static int aiod_lifetime; 109 110static int max_aio_per_proc = MAX_AIO_PER_PROC; 111static int max_aio_queue_per_proc = MAX_AIO_QUEUE_PER_PROC; 112static int max_buf_aio = MAX_BUF_AIO; 113 114SYSCTL_NODE(_vfs, OID_AUTO, aio, CTLFLAG_RW, 0, "AIO mgmt"); 115 116SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_per_proc, 117 CTLFLAG_RW, &max_aio_per_proc, 0, ""); 118 119SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue_per_proc, 120 CTLFLAG_RW, &max_aio_queue_per_proc, 0, ""); 121 122SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_procs, 123 CTLFLAG_RW, &max_aio_procs, 0, ""); 124 125SYSCTL_INT(_vfs_aio, OID_AUTO, num_aio_procs, 126 CTLFLAG_RD, &num_aio_procs, 0, ""); 127 128SYSCTL_INT(_vfs_aio, OID_AUTO, num_queue_count, 129 CTLFLAG_RD, &num_queue_count, 0, ""); 130 131SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue, 132 CTLFLAG_RW, &max_queue_count, 0, ""); 133 134SYSCTL_INT(_vfs_aio, OID_AUTO, target_aio_procs, 135 CTLFLAG_RW, &target_aio_procs, 0, ""); 136 137SYSCTL_INT(_vfs_aio, OID_AUTO, max_buf_aio, 138 CTLFLAG_RW, &max_buf_aio, 0, ""); 139 140SYSCTL_INT(_vfs_aio, OID_AUTO, num_buf_aio, 141 CTLFLAG_RD, &num_buf_aio, 0, ""); 142 143SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_lifetime, 144 CTLFLAG_RW, &aiod_lifetime, 0, ""); 145 146SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_timeout, 147 CTLFLAG_RW, &aiod_timeout, 0, ""); 148 149/* 150 * AIO process info 151 */ 152#define AIOP_FREE 0x1 /* proc on free queue */ 153#define AIOP_SCHED 0x2 /* proc explicitly scheduled */ 154 155struct aioproclist { 156 int aioprocflags; /* AIO proc flags */ 157 TAILQ_ENTRY(aioproclist) list; /* List of processes */ 158 struct proc *aioproc; /* The AIO thread */ 159 TAILQ_HEAD (,aiocblist) jobtorun; /* suggested job to run */ 160}; 161 162/* 163 * data-structure for lio signal management 164 */ 165struct aio_liojob { 166 int lioj_flags; 167 int lioj_buffer_count; 168 int lioj_buffer_finished_count; 169 int lioj_queue_count; 170 int lioj_queue_finished_count; 
171 struct sigevent lioj_signal; /* signal on all I/O done */ 172 TAILQ_ENTRY (aio_liojob) lioj_list; 173 struct kaioinfo *lioj_ki; 174}; 175#define LIOJ_SIGNAL 0x1 /* signal on all done (lio) */ 176#define LIOJ_SIGNAL_POSTED 0x2 /* signal has been posted */ 177 178/* 179 * per process aio data structure 180 */ 181struct kaioinfo { 182 int kaio_flags; /* per process kaio flags */ 183 int kaio_maxactive_count; /* maximum number of AIOs */ 184 int kaio_active_count; /* number of currently used AIOs */ 185 int kaio_qallowed_count; /* maxiumu size of AIO queue */ 186 int kaio_queue_count; /* size of AIO queue */ 187 int kaio_ballowed_count; /* maximum number of buffers */ 188 int kaio_queue_finished_count; /* number of daemon jobs finished */ 189 int kaio_buffer_count; /* number of physio buffers */ 190 int kaio_buffer_finished_count; /* count of I/O done */ 191 struct proc *kaio_p; /* process that uses this kaio block */ 192 TAILQ_HEAD (,aio_liojob) kaio_liojoblist; /* list of lio jobs */ 193 TAILQ_HEAD (,aiocblist) kaio_jobqueue; /* job queue for process */ 194 TAILQ_HEAD (,aiocblist) kaio_jobdone; /* done queue for process */ 195 TAILQ_HEAD (,aiocblist) kaio_bufqueue; /* buffer job queue for process */ 196 TAILQ_HEAD (,aiocblist) kaio_bufdone; /* buffer done queue for process */ 197 TAILQ_HEAD (,aiocblist) kaio_sockqueue; /* queue for aios waiting on sockets */ 198}; 199 200#define KAIO_RUNDOWN 0x1 /* process is being run down */ 201#define KAIO_WAKEUP 0x2 /* wakeup process when there is a significant event */ 202 203static TAILQ_HEAD(,aioproclist) aio_freeproc, aio_activeproc; 204static TAILQ_HEAD(,aiocblist) aio_jobs; /* Async job list */ 205static TAILQ_HEAD(,aiocblist) aio_bufjobs; /* Phys I/O job list */ 206static TAILQ_HEAD(,aiocblist) aio_freejobs; /* Pool of free jobs */ 207 208static void aio_init_aioinfo(struct proc *p); 209static void aio_onceonly(void *); 210static int aio_free_entry(struct aiocblist *aiocbe); 211static void aio_process(struct aiocblist 
*aiocbe); 212static int aio_newproc(void); 213static int aio_aqueue(struct proc *p, struct aiocb *job, int type); 214static void aio_physwakeup(struct buf *bp); 215static int aio_fphysio(struct proc *p, struct aiocblist *aiocbe, int type); 216static int aio_qphysio(struct proc *p, struct aiocblist *iocb); 217static void aio_daemon(void *uproc); 218 219SYSINIT(aio, SI_SUB_VFS, SI_ORDER_ANY, aio_onceonly, NULL); 220 221static vm_zone_t kaio_zone = 0, aiop_zone = 0, aiocb_zone = 0, aiol_zone = 0; 222static vm_zone_t aiolio_zone = 0; 223 224/* 225 * Startup initialization 226 */ 227void 228aio_onceonly(void *na) 229{ 230 TAILQ_INIT(&aio_freeproc); 231 TAILQ_INIT(&aio_activeproc); 232 TAILQ_INIT(&aio_jobs); 233 TAILQ_INIT(&aio_bufjobs); 234 TAILQ_INIT(&aio_freejobs); 235 kaio_zone = zinit("AIO", sizeof (struct kaioinfo), 0, 0, 1); 236 aiop_zone = zinit("AIOP", sizeof (struct aioproclist), 0, 0, 1); 237 aiocb_zone = zinit("AIOCB", sizeof (struct aiocblist), 0, 0, 1); 238 aiol_zone = zinit("AIOL", AIO_LISTIO_MAX * sizeof (int), 0, 0, 1); 239 aiolio_zone = zinit("AIOLIO", AIO_LISTIO_MAX * sizeof (struct 240 aio_liojob), 0, 0, 1); 241 aiod_timeout = AIOD_TIMEOUT_DEFAULT; 242 aiod_lifetime = AIOD_LIFETIME_DEFAULT; 243 jobrefid = 1; 244} 245 246/* 247 * Init the per-process aioinfo structure. The aioinfo limits are set 248 * per-process for user limit (resource) management. 
249 */ 250void 251aio_init_aioinfo(struct proc *p) 252{ 253 struct kaioinfo *ki; 254 if (p->p_aioinfo == NULL) { 255 ki = zalloc(kaio_zone); 256 p->p_aioinfo = ki; 257 ki->kaio_flags = 0; 258 ki->kaio_maxactive_count = max_aio_per_proc; 259 ki->kaio_active_count = 0; 260 ki->kaio_qallowed_count = max_aio_queue_per_proc; 261 ki->kaio_queue_count = 0; 262 ki->kaio_ballowed_count = max_buf_aio; 263 ki->kaio_buffer_count = 0; 264 ki->kaio_buffer_finished_count = 0; 265 ki->kaio_p = p; 266 TAILQ_INIT(&ki->kaio_jobdone); 267 TAILQ_INIT(&ki->kaio_jobqueue); 268 TAILQ_INIT(&ki->kaio_bufdone); 269 TAILQ_INIT(&ki->kaio_bufqueue); 270 TAILQ_INIT(&ki->kaio_liojoblist); 271 TAILQ_INIT(&ki->kaio_sockqueue); 272 } 273 274 while (num_aio_procs < target_aio_procs) 275 aio_newproc(); 276} 277 278/* 279 * Free a job entry. Wait for completion if it is currently active, but don't 280 * delay forever. If we delay, we return a flag that says that we have to 281 * restart the queue scan. 282 */ 283int 284aio_free_entry(struct aiocblist *aiocbe) 285{ 286 struct kaioinfo *ki; 287 struct aioproclist *aiop; 288 struct aio_liojob *lj; 289 struct proc *p; 290 int error; 291 int s; 292 293 if (aiocbe->jobstate == JOBST_NULL) 294 panic("aio_free_entry: freeing already free job"); 295 296 p = aiocbe->userproc; 297 ki = p->p_aioinfo; 298 lj = aiocbe->lio; 299 if (ki == NULL) 300 panic("aio_free_entry: missing p->p_aioinfo"); 301 302 if (aiocbe->jobstate == JOBST_JOBRUNNING) { 303 if (aiocbe->jobflags & AIOCBLIST_ASYNCFREE) 304 return 0; 305 aiocbe->jobflags |= AIOCBLIST_RUNDOWN; 306 tsleep(aiocbe, PRIBIO|PCATCH, "jobwai", 0); 307 } 308 aiocbe->jobflags &= ~AIOCBLIST_ASYNCFREE; 309 310 if (aiocbe->bp == NULL) { 311 if (ki->kaio_queue_count <= 0) 312 panic("aio_free_entry: process queue size <= 0"); 313 if (num_queue_count <= 0) 314 panic("aio_free_entry: system wide queue size <= 0"); 315 316 if (lj) { 317 lj->lioj_queue_count--; 318 if (aiocbe->jobflags & AIOCBLIST_DONE) 319 
lj->lioj_queue_finished_count--; 320 } 321 ki->kaio_queue_count--; 322 if (aiocbe->jobflags & AIOCBLIST_DONE) 323 ki->kaio_queue_finished_count--; 324 num_queue_count--; 325 } else { 326 if (lj) { 327 lj->lioj_buffer_count--; 328 if (aiocbe->jobflags & AIOCBLIST_DONE) 329 lj->lioj_buffer_finished_count--; 330 } 331 if (aiocbe->jobflags & AIOCBLIST_DONE) 332 ki->kaio_buffer_finished_count--; 333 ki->kaio_buffer_count--; 334 num_buf_aio--; 335 } 336 337 /* aiocbe is going away, we need to destroy any knotes */ 338 knote_remove(p, &aiocbe->klist); 339 340 if ((ki->kaio_flags & KAIO_WAKEUP) || ((ki->kaio_flags & KAIO_RUNDOWN) 341 && ((ki->kaio_buffer_count == 0) && (ki->kaio_queue_count == 0)))) { 342 ki->kaio_flags &= ~KAIO_WAKEUP; 343 wakeup(p); 344 } 345 346 if (aiocbe->jobstate == JOBST_JOBQBUF) { 347 if ((error = aio_fphysio(p, aiocbe, 1)) != 0) 348 return error; 349 if (aiocbe->jobstate != JOBST_JOBBFINISHED) 350 panic("aio_free_entry: invalid physio finish-up state"); 351 s = splbio(); 352 TAILQ_REMOVE(&ki->kaio_bufdone, aiocbe, plist); 353 splx(s); 354 } else if (aiocbe->jobstate == JOBST_JOBQPROC) { 355 aiop = aiocbe->jobaioproc; 356 TAILQ_REMOVE(&aiop->jobtorun, aiocbe, list); 357 } else if (aiocbe->jobstate == JOBST_JOBQGLOBAL) 358 TAILQ_REMOVE(&aio_jobs, aiocbe, list); 359 else if (aiocbe->jobstate == JOBST_JOBFINISHED) 360 TAILQ_REMOVE(&ki->kaio_jobdone, aiocbe, plist); 361 else if (aiocbe->jobstate == JOBST_JOBBFINISHED) { 362 s = splbio(); 363 TAILQ_REMOVE(&ki->kaio_bufdone, aiocbe, plist); 364 splx(s); 365 if (aiocbe->bp) { 366 vunmapbuf(aiocbe->bp); 367 relpbuf(aiocbe->bp, NULL); 368 aiocbe->bp = NULL; 369 } 370 } 371 if (lj && (lj->lioj_buffer_count == 0) && (lj->lioj_queue_count == 0)) { 372 TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list); 373 zfree(aiolio_zone, lj); 374 } 375 TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list); 376 aiocbe->jobstate = JOBST_NULL; 377 return 0; 378} 379#endif /* VFS_AIO */ 380 381/* 382 * Rundown the jobs for a given 
 * process.
 */
void
aio_proc_rundown(struct proc *p)
{
#ifndef VFS_AIO
	return;
#else
	int s;
	struct kaioinfo *ki;
	struct aio_liojob *lj, *ljn;
	struct aiocblist *aiocbe, *aiocbn;
	struct file *fp;
	struct filedesc *fdp;
	struct socket *so;

	/* Nothing to do if the process never issued an AIO request. */
	ki = p->p_aioinfo;
	if (ki == NULL)
		return;

	/*
	 * NOTE(review): LIOJ_SIGNAL_POSTED belongs to the lioj_flags
	 * namespace, not kaio_flags -- this looks like a flag mixup
	 * (KAIO_WAKEUP has the same numeric value?); confirm intent.
	 */
	ki->kaio_flags |= LIOJ_SIGNAL_POSTED;
	/*
	 * Wait (each pass bounded by aiod_timeout) for in-flight daemon and
	 * buffer I/O to drain; a timed-out tsleep abandons the wait.
	 */
	while ((ki->kaio_active_count > 0) || (ki->kaio_buffer_count >
	    ki->kaio_buffer_finished_count)) {
		ki->kaio_flags |= KAIO_RUNDOWN;
		if (tsleep(p, PRIBIO, "kaiowt", aiod_timeout))
			break;
	}

	/*
	 * Move any aio ops that are waiting on socket I/O to the normal job
	 * queues so they are cleaned up with any others.
	 */
	fdp = p->p_fd;

	s = splnet();
	for (aiocbe = TAILQ_FIRST(&ki->kaio_sockqueue); aiocbe; aiocbe =
	    aiocbn) {
		aiocbn = TAILQ_NEXT(aiocbe, plist);
		fp = fdp->fd_ofiles[aiocbe->uaiocb.aio_fildes];

		/*
		 * Under some circumstances, the aio_fildes and the file
		 * structure don't match. This would leave aiocbe's in the
		 * TAILQ associated with the socket and cause a panic later.
		 *
		 * Detect and fix.
		 */
		if ((fp == NULL) || (fp != aiocbe->fd_file))
			fp = aiocbe->fd_file;
		if (fp) {
			so = (struct socket *)fp->f_data;
			TAILQ_REMOVE(&so->so_aiojobq, aiocbe, list);
			/* Last pending job: stop sowakeup() notifications. */
			if (TAILQ_EMPTY(&so->so_aiojobq)) {
				so->so_snd.sb_flags &= ~SB_AIO;
				so->so_rcv.sb_flags &= ~SB_AIO;
			}
		}
		TAILQ_REMOVE(&ki->kaio_sockqueue, aiocbe, plist);
		TAILQ_INSERT_HEAD(&aio_jobs, aiocbe, list);
		TAILQ_INSERT_HEAD(&ki->kaio_jobqueue, aiocbe, plist);
	}
	splx(s);

	/*
	 * aio_free_entry() can sleep/modify the queues, so restart the scan
	 * from the head whenever it reports the list may have changed.
	 */
restart1:
	for (aiocbe = TAILQ_FIRST(&ki->kaio_jobdone); aiocbe; aiocbe = aiocbn) {
		aiocbn = TAILQ_NEXT(aiocbe, plist);
		if (aio_free_entry(aiocbe))
			goto restart1;
	}

restart2:
	for (aiocbe = TAILQ_FIRST(&ki->kaio_jobqueue); aiocbe; aiocbe =
	    aiocbn) {
		aiocbn = TAILQ_NEXT(aiocbe, plist);
		if (aio_free_entry(aiocbe))
			goto restart2;
	}

/*
 * Note the use of lots of splbio here, trying to avoid splbio for long chains
 * of I/O. Probably unnecessary.
 */
restart3:
	s = splbio();
	/* Sleep until all outstanding physio buffers have completed. */
	while (TAILQ_FIRST(&ki->kaio_bufqueue)) {
		ki->kaio_flags |= KAIO_WAKEUP;
		tsleep(p, PRIBIO, "aioprn", 0);
		splx(s);
		goto restart3;
	}
	splx(s);

restart4:
	s = splbio();
	for (aiocbe = TAILQ_FIRST(&ki->kaio_bufdone); aiocbe; aiocbe = aiocbn) {
		aiocbn = TAILQ_NEXT(aiocbe, plist);
		if (aio_free_entry(aiocbe)) {
			splx(s);
			goto restart4;
		}
	}
	splx(s);

	/* Release LIO descriptors whose jobs have all been reclaimed. */
	for (lj = TAILQ_FIRST(&ki->kaio_liojoblist); lj; lj = ljn) {
		ljn = TAILQ_NEXT(lj, lioj_list);
		if ((lj->lioj_buffer_count == 0) && (lj->lioj_queue_count ==
		    0)) {
			TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
			zfree(aiolio_zone, lj);
		} else {
#ifdef DIAGNOSTIC
			printf("LIO job not cleaned up: B:%d, BF:%d, Q:%d, "
			    "QF:%d\n", lj->lioj_buffer_count,
			    lj->lioj_buffer_finished_count,
			    lj->lioj_queue_count,
			    lj->lioj_queue_finished_count);
#endif
		}
	}

	zfree(kaio_zone, ki);
	p->p_aioinfo = NULL;
#endif /* VFS_AIO */
}

#ifdef VFS_AIO
/*
 * Select a job to run (called by an AIO daemon).  A job explicitly handed to
 * this daemon (jobtorun) wins; otherwise take the first global job whose
 * owner is still below its per-process active-job quota.  Returns NULL when
 * no runnable job exists.
 */
static struct aiocblist *
aio_selectjob(struct aioproclist *aiop)
{
	int s;
	struct aiocblist *aiocbe;
	struct kaioinfo *ki;
	struct proc *userp;

	/* Suggested (directly scheduled) jobs take priority. */
	aiocbe = TAILQ_FIRST(&aiop->jobtorun);
	if (aiocbe) {
		TAILQ_REMOVE(&aiop->jobtorun, aiocbe, list);
		return aiocbe;
	}

	s = splnet();
	for (aiocbe = TAILQ_FIRST(&aio_jobs); aiocbe; aiocbe =
	    TAILQ_NEXT(aiocbe, list)) {
		userp = aiocbe->userproc;
		ki = userp->p_aioinfo;

		/* Skip jobs whose owner is already at its active quota. */
		if (ki->kaio_active_count < ki->kaio_maxactive_count) {
			TAILQ_REMOVE(&aio_jobs, aiocbe, list);
			splx(s);
			return aiocbe;
		}
	}
	splx(s);

	return NULL;
}

/*
 * The AIO processing activity.  This is the code that does the I/O request for
 * the non-physio version of the operations.  The normal vn operations are used,
 * and this code should work in all instances for every type of file, including
 * pipes, sockets, fifos, and regular files.
 */
void
aio_process(struct aiocblist *aiocbe)
{
	struct filedesc *fdp;
	struct proc *userp, *mycp;
	struct aiocb *cb;
	struct file *fp;
	struct uio auio;
	struct iovec aiov;
	unsigned int fd;
	int cnt;
	int error;
	off_t offset;
	int oublock_st, oublock_end;
	int inblock_st, inblock_end;

	userp = aiocbe->userproc;
	cb = &aiocbe->uaiocb;

	/* Runs in an AIO daemon thread; fd table was fdshare()d from userp. */
	mycp = curproc;

	fdp = mycp->p_fd;
	fd = cb->aio_fildes;
	fp = fdp->fd_ofiles[fd];

	/*
	 * The descriptor may have been closed/replaced since the request was
	 * queued; fd_file was recorded at queue time, so a mismatch is fatal.
	 */
	if ((fp == NULL) || (fp != aiocbe->fd_file)) {
		cb->_aiocb_private.error = EBADF;
		cb->_aiocb_private.status = -1;
		return;
	}

	/* Build a single-segment uio describing the user buffer. */
	aiov.iov_base = (void *)cb->aio_buf;
	aiov.iov_len = cb->aio_nbytes;

	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = offset = cb->aio_offset;
	auio.uio_resid = cb->aio_nbytes;
	cnt = cb->aio_nbytes;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_procp = mycp;

	/* Snapshot block I/O counters so the charge can be billed to userp. */
	inblock_st = mycp->p_stats->p_ru.ru_inblock;
	oublock_st = mycp->p_stats->p_ru.ru_oublock;
	/*
	 * Temporarily bump the ref count while reading to avoid the
	 * descriptor being ripped out from under us.
	 */
	fhold(fp);
	if (cb->aio_lio_opcode == LIO_READ) {
		auio.uio_rw = UIO_READ;
		error = fo_read(fp, &auio, fp->f_cred, FOF_OFFSET, mycp);
	} else {
		/* Anything that is not LIO_READ is treated as a write here. */
		auio.uio_rw = UIO_WRITE;
		error = fo_write(fp, &auio, fp->f_cred, FOF_OFFSET, mycp);
	}
	fdrop(fp, mycp);
	inblock_end = mycp->p_stats->p_ru.ru_inblock;
	oublock_end = mycp->p_stats->p_ru.ru_oublock;

	/* Charges are credited back to the user process at aio_return(). */
	aiocbe->inputcharge = inblock_end - inblock_st;
	aiocbe->outputcharge = oublock_end - oublock_st;

	/*
	 * A partial transfer with a retryable error reports the short count
	 * instead of the error; EPIPE on write also raises SIGPIPE.
	 */
	if ((error) && (auio.uio_resid != cnt)) {
		if (error == ERESTART || error == EINTR || error == EWOULDBLOCK)
			error = 0;
		if ((error == EPIPE) && (cb->aio_lio_opcode == LIO_WRITE))
			psignal(userp, SIGPIPE);
	}

	/* status = bytes actually transferred. */
	cnt -= auio.uio_resid;
	cb->_aiocb_private.error = error;
	cb->_aiocb_private.status = cnt;

	return;
}

/*
 * The AIO daemon, most of the actual work is done in aio_process,
 * but the setup (and address space mgmt) is done in this routine.
 */
static void
aio_daemon(void *uproc)
{
	int s;
	struct aio_liojob *lj;
	struct aiocb *cb;
	struct aiocblist *aiocbe;
	struct aioproclist *aiop;
	struct kaioinfo *ki;
	struct proc *curcp, *mycp, *userp;
	struct vmspace *myvm, *tmpvm;

	/*
	 * Local copies of curproc (cp) and vmspace (myvm)
	 */
	mycp = curproc;
	myvm = mycp->p_vmspace;

	/* The daemon keeps no text vnode reference. */
	if (mycp->p_textvp) {
		vrele(mycp->p_textvp);
		mycp->p_textvp = NULL;
	}

	/*
	 * Allocate and ready the aio control info. There is one aiop structure
	 * per daemon.
	 */
	aiop = zalloc(aiop_zone);
	aiop->aioproc = mycp;
	aiop->aioprocflags |= AIOP_FREE;
	TAILQ_INIT(&aiop->jobtorun);

	s = splnet();

	/*
	 * Place thread (lightweight process) onto the AIO free thread list.
	 */
	if (TAILQ_EMPTY(&aio_freeproc))
		wakeup(&aio_freeproc);
	TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);

	splx(s);

	/* Make up a name for the daemon. */
	strcpy(mycp->p_comm, "aiod");

	/*
	 * Get rid of our current filedescriptors. AIOD's don't need any
	 * filedescriptors, except as temporarily inherited from the client.
	 * Credentials are also cloned, and made equivalent to "root".
	 */
	fdfree(mycp);
	mycp->p_fd = NULL;
	mycp->p_ucred = crcopy(mycp->p_ucred);
	mycp->p_ucred->cr_uid = 0;
	uifree(mycp->p_ucred->cr_uidinfo);
	mycp->p_ucred->cr_uidinfo = uifind(0);
	mycp->p_ucred->cr_ngroups = 1;
	mycp->p_ucred->cr_groups[0] = 1;

	/* The daemon resides in its own pgrp. */
	enterpgrp(mycp, mycp->p_pid, 1);

	/* Mark special process type. */
	mycp->p_flag |= P_SYSTEM;

	/*
	 * Wakeup parent process. (Parent sleeps to keep from blasting away
	 * creating to many daemons.)
	 */
	wakeup(mycp);

	for (;;) {
		/*
		 * curcp is the current daemon process context.
		 * userp is the current user process context.
		 */
		curcp = mycp;

		/*
		 * Take daemon off of free queue
		 */
		if (aiop->aioprocflags & AIOP_FREE) {
			s = splnet();
			TAILQ_REMOVE(&aio_freeproc, aiop, list);
			TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list);
			aiop->aioprocflags &= ~AIOP_FREE;
			splx(s);
		}
		aiop->aioprocflags &= ~AIOP_SCHED;

		/*
		 * Check for jobs.
		 */
		while ((aiocbe = aio_selectjob(aiop)) != NULL) {
			cb = &aiocbe->uaiocb;
			userp = aiocbe->userproc;

			aiocbe->jobstate = JOBST_JOBRUNNING;

			/*
			 * Connect to process address space for user program.
			 */
			if (userp != curcp) {
				/*
				 * Save the current address space that we are
				 * connected to.
				 */
				tmpvm = mycp->p_vmspace;

				/*
				 * Point to the new user address space, and
				 * refer to it.
				 */
				mycp->p_vmspace = userp->p_vmspace;
				mycp->p_vmspace->vm_refcnt++;

				/* Activate the new mapping. */
				pmap_activate(mycp);

				/*
				 * If the old address space wasn't the daemons
				 * own address space, then we need to remove the
				 * daemon's reference from the other process
				 * that it was acting on behalf of.
				 */
				if (tmpvm != myvm) {
					vmspace_free(tmpvm);
				}

				/*
				 * Disassociate from previous clients file
				 * descriptors, and associate to the new clients
				 * descriptors. Note that the daemon doesn't
				 * need to worry about its orginal descriptors,
				 * because they were originally freed.
				 */
				if (mycp->p_fd)
					fdfree(mycp);
				mycp->p_fd = fdshare(userp);
				curcp = userp;
			}

			ki = userp->p_aioinfo;
			lj = aiocbe->lio;

			/* Account for currently active jobs. */
			ki->kaio_active_count++;

			/* Do the I/O function. */
			aiocbe->jobaioproc = aiop;
			aio_process(aiocbe);

			/* Decrement the active job count. */
			ki->kaio_active_count--;

			/*
			 * Increment the completion count for wakeup/signal
			 * comparisons.
			 */
			aiocbe->jobflags |= AIOCBLIST_DONE;
			ki->kaio_queue_finished_count++;
			if (lj)
				lj->lioj_queue_finished_count++;
			/* Wake a sleeping aio_suspend()/rundown if needed. */
			if ((ki->kaio_flags & KAIO_WAKEUP) || ((ki->kaio_flags
			    & KAIO_RUNDOWN) && (ki->kaio_active_count == 0))) {
				ki->kaio_flags &= ~KAIO_WAKEUP;
				wakeup(userp);
			}

			/* Post the LIO completion signal exactly once. */
			s = splbio();
			if (lj && (lj->lioj_flags &
			    (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) == LIOJ_SIGNAL) {
				if ((lj->lioj_queue_finished_count ==
				    lj->lioj_queue_count) &&
				    (lj->lioj_buffer_finished_count ==
				    lj->lioj_buffer_count)) {
					psignal(userp,
					    lj->lioj_signal.sigev_signo);
					lj->lioj_flags |=
					    LIOJ_SIGNAL_POSTED;
				}
			}
			splx(s);

			aiocbe->jobstate = JOBST_JOBFINISHED;

			/*
			 * If the I/O request should be automatically rundown,
			 * do the needed cleanup. Otherwise, place the queue
			 * entry for the just finished I/O request into the done
			 * queue for the associated client.
			 */
			s = splnet();
			if (aiocbe->jobflags & AIOCBLIST_ASYNCFREE) {
				aiocbe->jobflags &= ~AIOCBLIST_ASYNCFREE;
				TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
			} else {
				TAILQ_REMOVE(&ki->kaio_jobqueue, aiocbe, plist);
				TAILQ_INSERT_TAIL(&ki->kaio_jobdone, aiocbe,
				    plist);
			}
			splx(s);
			/* Fire any registered kevent(2) notification. */
			KNOTE(&aiocbe->klist, 0);

			if (aiocbe->jobflags & AIOCBLIST_RUNDOWN) {
				wakeup(aiocbe);
				aiocbe->jobflags &= ~AIOCBLIST_RUNDOWN;
			}

			if (cb->aio_sigevent.sigev_notify == SIGEV_SIGNAL) {
				psignal(userp, cb->aio_sigevent.sigev_signo);
			}
		}

		/*
		 * Disconnect from user address space.
		 */
		if (curcp != mycp) {
			/* Get the user address space to disconnect from. */
			tmpvm = mycp->p_vmspace;

			/* Get original address space for daemon. */
			mycp->p_vmspace = myvm;

			/* Activate the daemon's address space. */
			pmap_activate(mycp);
#ifdef DIAGNOSTIC
			if (tmpvm == myvm) {
				printf("AIOD: vmspace problem -- %d\n",
				    mycp->p_pid);
			}
#endif
			/* Remove our vmspace reference. */
			vmspace_free(tmpvm);

			/*
			 * Disassociate from the user process's file
			 * descriptors.
			 */
			if (mycp->p_fd)
				fdfree(mycp);
			mycp->p_fd = NULL;
			curcp = mycp;
		}

		/*
		 * If we are the first to be put onto the free queue, wakeup
		 * anyone waiting for a daemon.
		 */
		s = splnet();
		TAILQ_REMOVE(&aio_activeproc, aiop, list);
		if (TAILQ_EMPTY(&aio_freeproc))
			wakeup(&aio_freeproc);
		TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);
		aiop->aioprocflags |= AIOP_FREE;
		splx(s);

		/*
		 * If daemon is inactive for a long time, allow it to exit,
		 * thereby freeing resources.
		 */
		if (((aiop->aioprocflags & AIOP_SCHED) == 0) && tsleep(mycp,
		    PRIBIO, "aiordy", aiod_lifetime)) {
			s = splnet();
			if ((TAILQ_FIRST(&aio_jobs) == NULL) &&
			    (TAILQ_FIRST(&aiop->jobtorun) == NULL)) {
				if ((aiop->aioprocflags & AIOP_FREE) &&
				    (num_aio_procs > target_aio_procs)) {
					TAILQ_REMOVE(&aio_freeproc, aiop, list);
					splx(s);
					zfree(aiop_zone, aiop);
					num_aio_procs--;
#ifdef DIAGNOSTIC
					if (mycp->p_vmspace->vm_refcnt <= 1) {
						printf("AIOD: bad vm refcnt for"
						    " exiting daemon: %d\n",
						    mycp->p_vmspace->vm_refcnt);
					}
#endif
					exit1(mycp, 0);
				}
			}
			splx(s);
		}
	}
}

/*
 * Create a new AIO daemon. This is mostly a kernel-thread fork routine. The
 * AIO daemon modifies its environment itself.
 */
static int
aio_newproc()
{
	int error;
	struct proc *p, *np;

	p = &proc0;
	/* Share VM with proc0; the child re-homes itself in aio_daemon(). */
	error = fork1(p, RFPROC|RFMEM|RFNOWAIT, &np);
	if (error)
		return error;
	cpu_set_fork_handler(np, aio_daemon, curproc);

	/*
	 * Wait until daemon is started, but continue on just in case to
	 * handle error conditions.
	 */
	error = tsleep(np, PZERO, "aiosta", aiod_timeout);
	/*
	 * Counted even when the sleep timed out -- the daemon may still come
	 * up; see the comment above.
	 */
	num_aio_procs++;

	return error;
}

/*
 * Try the high-performance physio method for eligible VCHR devices. This
 * routine doesn't require the use of any additional threads, and have overhead.
 *
 * Returns 0 on successful submission, -1 when the request is not eligible
 * for physio (caller falls back to the daemon path), or a positive errno.
 */
int
aio_qphysio(struct proc *p, struct aiocblist *aiocbe)
{
	int error;
	struct aiocb *cb;
	struct file *fp;
	struct buf *bp;
	struct vnode *vp;
	struct kaioinfo *ki;
	struct filedesc *fdp;
	struct aio_liojob *lj;
	int fd;
	int s;
	int notify;

	cb = &aiocbe->uaiocb;
	fdp = p->p_fd;
	fd = cb->aio_fildes;
	fp = fdp->fd_ofiles[fd];

	if (fp->f_type != DTYPE_VNODE)
		return (-1);

	vp = (struct vnode *)fp->f_data;

	/*
	 * If its not a disk, we don't want to return a positive error.
	 * It causes the aio code to not fall through to try the thread
	 * way when you're talking to a regular file.
	 */
	if (!vn_isdisk(vp, &error)) {
		if (error == ENOTBLK)
			return (-1);
		else
			return (error);
	}

	/* Transfers must be a multiple of the device's physical block size. */
	if (cb->aio_nbytes % vp->v_rdev->si_bsize_phys)
		return (-1);

	if (cb->aio_nbytes > MAXPHYS)
		return (-1);

	/* Respect the per-process outstanding-buffer quota. */
	ki = p->p_aioinfo;
	if (ki->kaio_buffer_count >= ki->kaio_ballowed_count)
		return (-1);

	/* Hold the file across the async transfer; dropped on all exits. */
	fhold(fp);

	ki->kaio_buffer_count++;

	lj = aiocbe->lio;
	if (lj)
		lj->lioj_buffer_count++;

	/* Create and build a buffer header for a transfer. */
	bp = (struct buf *)getpbuf(NULL);

	/*
	 * Get a copy of the kva from the physical buffer.
	 */
	bp->b_caller1 = p;
	bp->b_dev = vp->v_rdev;
	error = bp->b_error = 0;

	bp->b_bcount = cb->aio_nbytes;
	bp->b_bufsize = cb->aio_nbytes;
	bp->b_flags = B_PHYS;
	bp->b_iodone = aio_physwakeup;
	bp->b_saveaddr = bp->b_data;
	bp->b_data = (void *)cb->aio_buf;
	bp->b_blkno = btodb(cb->aio_offset);

	/* Verify the user's buffer is accessible in the needed direction. */
	if (cb->aio_lio_opcode == LIO_WRITE) {
		bp->b_iocmd = BIO_WRITE;
		if (!useracc(bp->b_data, bp->b_bufsize, VM_PROT_READ)) {
			error = EFAULT;
			goto doerror;
		}
	} else {
		bp->b_iocmd = BIO_READ;
		if (!useracc(bp->b_data, bp->b_bufsize, VM_PROT_WRITE)) {
			error = EFAULT;
			goto doerror;
		}
	}

	/* Bring buffer into kernel space. */
	vmapbuf(bp);

	s = splbio();
	aiocbe->bp = bp;
	bp->b_spc = (void *)aiocbe;
	TAILQ_INSERT_TAIL(&aio_bufjobs, aiocbe, list);
	TAILQ_INSERT_TAIL(&ki->kaio_bufqueue, aiocbe, plist);
	aiocbe->jobstate = JOBST_JOBQBUF;
	cb->_aiocb_private.status = cb->aio_nbytes;
	num_buf_aio++;
	bp->b_error = 0;

	splx(s);

	/* Perform transfer. */
	DEV_STRATEGY(bp, 0);

	notify = 0;
	s = splbio();

	/*
	 * If we had an error invoking the request, or an error in processing
	 * the request before we have returned, we process it as an error in
	 * transfer. Note that such an I/O error is not indicated immediately,
	 * but is returned using the aio_error mechanism. In this case,
	 * aio_suspend will return immediately.
	 */
	if (bp->b_error || (bp->b_ioflags & BIO_ERROR)) {
		struct aiocb *job = aiocbe->uuaiocb;

		aiocbe->uaiocb._aiocb_private.status = 0;
		suword(&job->_aiocb_private.status, 0);
		aiocbe->uaiocb._aiocb_private.error = bp->b_error;
		suword(&job->_aiocb_private.error, bp->b_error);

		ki->kaio_buffer_finished_count++;

		/* Guard against aio_physwakeup() having run already. */
		if (aiocbe->jobstate != JOBST_JOBBFINISHED) {
			aiocbe->jobstate = JOBST_JOBBFINISHED;
			aiocbe->jobflags |= AIOCBLIST_DONE;
			TAILQ_REMOVE(&aio_bufjobs, aiocbe, list);
			TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist);
			TAILQ_INSERT_TAIL(&ki->kaio_bufdone, aiocbe, plist);
			notify = 1;
		}
	}
	splx(s);
	if (notify)
		KNOTE(&aiocbe->klist, 0);
	fdrop(fp, p);
	return 0;

doerror:
	/* Undo the accounting done above before reporting the failure. */
	ki->kaio_buffer_count--;
	if (lj)
		lj->lioj_buffer_count--;
	aiocbe->bp = NULL;
	relpbuf(bp, NULL);
	fdrop(fp, p);
	return error;
}

/*
 * This waits/tests physio completion.  With flgwait == 0 it only polls,
 * returning EINPROGRESS if the buffer has not completed; otherwise it sleeps
 * (in aiod_timeout slices) until B_DONE, then unmaps and releases the buffer
 * and returns the transfer's error status.
 */
int
aio_fphysio(struct proc *p, struct aiocblist *iocb, int flgwait)
{
	int s;
	struct buf *bp;
	int error;

	bp = iocb->bp;

	s = splbio();
	if (flgwait == 0) {
		if ((bp->b_flags & B_DONE) == 0) {
			splx(s);
			return EINPROGRESS;
		}
	}

	while ((bp->b_flags & B_DONE) == 0) {
		if (tsleep((caddr_t)bp, PRIBIO, "physstr", aiod_timeout)) {
			/* Timed out: re-check before giving up. */
			if ((bp->b_flags & B_DONE) == 0) {
				splx(s);
				return EINPROGRESS;
			} else
				break;
		}
	}
	/*
	 * NOTE(review): this path leaves splbio() raised through the
	 * vunmapbuf()/relpbuf() below with no matching splx() -- confirm
	 * whether that is intentional.
	 */

	/* Release mapping into kernel space. */
	vunmapbuf(bp);
	iocb->bp = 0;

	error = 0;

	/* Check for an error. */
	if (bp->b_ioflags & BIO_ERROR)
		error = bp->b_error;

	relpbuf(bp, NULL);
	return (error);
}
#endif /* VFS_AIO */

/*
 * Wake up aio requests that may be serviceable now.
1145 */ 1146void 1147aio_swake(struct socket *so, struct sockbuf *sb) 1148{ 1149#ifndef VFS_AIO 1150 return; 1151#else 1152 struct aiocblist *cb,*cbn; 1153 struct proc *p; 1154 struct kaioinfo *ki = NULL; 1155 int opcode, wakecount = 0; 1156 struct aioproclist *aiop; 1157 1158 if (sb == &so->so_snd) { 1159 opcode = LIO_WRITE; 1160 so->so_snd.sb_flags &= ~SB_AIO; 1161 } else { 1162 opcode = LIO_READ; 1163 so->so_rcv.sb_flags &= ~SB_AIO; 1164 } 1165 1166 for (cb = TAILQ_FIRST(&so->so_aiojobq); cb; cb = cbn) { 1167 cbn = TAILQ_NEXT(cb, list); 1168 if (opcode == cb->uaiocb.aio_lio_opcode) { 1169 p = cb->userproc; 1170 ki = p->p_aioinfo; 1171 TAILQ_REMOVE(&so->so_aiojobq, cb, list); 1172 TAILQ_REMOVE(&ki->kaio_sockqueue, cb, plist); 1173 TAILQ_INSERT_TAIL(&aio_jobs, cb, list); 1174 TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, cb, plist); 1175 wakecount++; 1176 if (cb->jobstate != JOBST_JOBQGLOBAL) 1177 panic("invalid queue value"); 1178 } 1179 } 1180 1181 while (wakecount--) { 1182 if ((aiop = TAILQ_FIRST(&aio_freeproc)) != 0) { 1183 TAILQ_REMOVE(&aio_freeproc, aiop, list); 1184 TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list); 1185 aiop->aioprocflags &= ~AIOP_FREE; 1186 wakeup(aiop->aioproc); 1187 } 1188 } 1189#endif /* VFS_AIO */ 1190} 1191 1192#ifdef VFS_AIO 1193/* 1194 * Queue a new AIO request. Choosing either the threaded or direct physio VCHR 1195 * technique is done in this code. 
 */
static int
_aio_aqueue(struct proc *p, struct aiocb *job, struct aio_liojob *lj, int type)
{
	struct filedesc *fdp;
	struct file *fp;
	unsigned int fd;
	struct socket *so;
	int s;
	int error;
	int opcode;
	struct aiocblist *aiocbe;
	struct aioproclist *aiop;
	struct kaioinfo *ki;
	struct kevent kev;
	struct kqueue *kq;
	struct file *kq_fp;

	/* Reuse a free control block if possible, else allocate one. */
	if ((aiocbe = TAILQ_FIRST(&aio_freejobs)) != NULL)
		TAILQ_REMOVE(&aio_freejobs, aiocbe, list);
	else
		aiocbe = zalloc (aiocb_zone);

	aiocbe->inputcharge = 0;
	aiocbe->outputcharge = 0;
	SLIST_INIT(&aiocbe->klist);

	/* Initialize the user-visible private fields pessimistically. */
	suword(&job->_aiocb_private.status, -1);
	suword(&job->_aiocb_private.error, 0);
	suword(&job->_aiocb_private.kernelinfo, -1);

	error = copyin((caddr_t)job, (caddr_t) &aiocbe->uaiocb, sizeof
	    aiocbe->uaiocb);
	if (error) {
		suword(&job->_aiocb_private.error, error);

		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
		return error;
	}

	/* Save userspace address of the job info. */
	aiocbe->uuaiocb = job;

	/* Get the opcode. */
	if (type != LIO_NOP)
		aiocbe->uaiocb.aio_lio_opcode = type;
	opcode = aiocbe->uaiocb.aio_lio_opcode;

	/* Get the fd info for process. */
	fdp = p->p_fd;

	/*
	 * Range check file descriptor.  (type == 0 means a plain aio_read/
	 * aio_write call, whose error is also reported via the user aiocb.)
	 */
	fd = aiocbe->uaiocb.aio_fildes;
	if (fd >= fdp->fd_nfiles) {
		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
		if (type == 0)
			suword(&job->_aiocb_private.error, EBADF);
		return EBADF;
	}

	fp = aiocbe->fd_file = fdp->fd_ofiles[fd];
	if ((fp == NULL) || ((opcode == LIO_WRITE) && ((fp->f_flag & FWRITE) ==
	    0))) {
		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
		if (type == 0)
			suword(&job->_aiocb_private.error, EBADF);
		return EBADF;
	}

	if (aiocbe->uaiocb.aio_offset == -1LL) {
		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
		if (type == 0)
			suword(&job->_aiocb_private.error, EINVAL);
		return EINVAL;
	}

	/* Publish the kernel job id so aio_return/aio_error can find it. */
	error = suword(&job->_aiocb_private.kernelinfo, jobrefid);
	if (error) {
		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
		if (type == 0)
			suword(&job->_aiocb_private.error, EINVAL);
		return error;
	}

	/* Job ids wrap at LONG_MAX back to 1; 0 and -1 are reserved. */
	aiocbe->uaiocb._aiocb_private.kernelinfo = (void *)(intptr_t)jobrefid;
	if (jobrefid == LONG_MAX)
		jobrefid = 1;
	else
		jobrefid++;

	if (opcode == LIO_NOP) {
		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
		if (type == 0) {
			suword(&job->_aiocb_private.error, 0);
			suword(&job->_aiocb_private.status, 0);
			suword(&job->_aiocb_private.kernelinfo, 0);
		}
		return 0;
	}

	if ((opcode != LIO_READ) && (opcode != LIO_WRITE)) {
		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
		if (type == 0) {
			suword(&job->_aiocb_private.status, 0);
			suword(&job->_aiocb_private.error, EINVAL);
		}
		return EINVAL;
	}

	/* Held across queuing; dropped at "done". */
	fhold(fp);

	if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_KEVENT) {
		kev.ident = aiocbe->uaiocb.aio_sigevent.sigev_notify_kqueue;
		kev.udata = aiocbe->uaiocb.aio_sigevent.sigev_value.sigval_ptr;
	}
	else {
		/*
		 * This method for requesting kevent-based notification won't
		 * work on the alpha, since we're passing in a pointer
		 * via aio_lio_opcode, which is an int.  Use the SIGEV_KEVENT-
		 * based method instead.
		 *
		 * NOTE(review): "job" is a userspace pointer; dereferencing
		 * job->aio_lio_opcode directly (instead of fuword() or the
		 * already-copied aiocbe->uaiocb) relies on user memory being
		 * mapped here -- confirm this is safe on all platforms.
		 */
		struct kevent *kevp;

		kevp = (struct kevent *)job->aio_lio_opcode;
		if (kevp == NULL)
			goto no_kqueue;

		error = copyin((caddr_t)kevp, (caddr_t)&kev, sizeof(kev));
		if (error)
			goto aqueue_fail;
	}
	/* Validate the target kqueue descriptor. */
	if ((u_int)kev.ident >= fdp->fd_nfiles ||
	    (kq_fp = fdp->fd_ofiles[kev.ident]) == NULL ||
	    (kq_fp->f_type != DTYPE_KQUEUE)) {
		error = EBADF;
		goto aqueue_fail;
	}
	kq = (struct kqueue *)kq_fp->f_data;
	kev.ident = (uintptr_t)aiocbe;
	kev.filter = EVFILT_AIO;
	kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1;
	error = kqueue_register(kq, &kev, p);
aqueue_fail:
	if (error) {
		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
		if (type == 0)
			suword(&job->_aiocb_private.error, error);
		goto done;
	}
no_kqueue:

	suword(&job->_aiocb_private.error, EINPROGRESS);
	aiocbe->uaiocb._aiocb_private.error = EINPROGRESS;
	aiocbe->userproc = p;
	aiocbe->jobflags = 0;
	aiocbe->lio = lj;
	ki = p->p_aioinfo;

	if (fp->f_type == DTYPE_SOCKET) {
		/*
		 * Alternate queueing for socket ops: Reach down into the
		 * descriptor to get the socket data. Then check to see if the
		 * socket is ready to be read or written (based on the requested
		 * operation).
		 *
		 * If it is not ready for io, then queue the aiocbe on the
		 * socket, and set the flags so we get a call when sbnotify()
		 * happens.
		 */
		so = (struct socket *)fp->f_data;
		s = splnet();
		if (((opcode == LIO_READ) && (!soreadable(so))) || ((opcode ==
		    LIO_WRITE) && (!sowriteable(so)))) {
			TAILQ_INSERT_TAIL(&so->so_aiojobq, aiocbe, list);
			TAILQ_INSERT_TAIL(&ki->kaio_sockqueue, aiocbe, plist);
			if (opcode == LIO_READ)
				so->so_rcv.sb_flags |= SB_AIO;
			else
				so->so_snd.sb_flags |= SB_AIO;
			aiocbe->jobstate = JOBST_JOBQGLOBAL; /* XXX */
			ki->kaio_queue_count++;
			num_queue_count++;
			splx(s);
			error = 0;
			goto done;
		}
		splx(s);
	}

	/* Try the direct physio path first; -1 means "not eligible". */
	if ((error = aio_qphysio(p, aiocbe)) == 0)
		goto done;
	if (error > 0) {
		suword(&job->_aiocb_private.status, 0);
		aiocbe->uaiocb._aiocb_private.error = error;
		suword(&job->_aiocb_private.error, error);
		goto done;
	}

	/* No buffer for daemon I/O. */
	aiocbe->bp = NULL;

	ki->kaio_queue_count++;
	if (lj)
		lj->lioj_queue_count++;
	s = splnet();
	TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, aiocbe, plist);
	TAILQ_INSERT_TAIL(&aio_jobs, aiocbe, list);
	splx(s);
	aiocbe->jobstate = JOBST_JOBQGLOBAL;

	num_queue_count++;
	error = 0;

	/*
	 * If we don't have a free AIO process, and we are below our quota, then
	 * start one. Otherwise, depend on the subsequent I/O completions to
	 * pick-up this job. If we don't sucessfully create the new process
	 * (thread) due to resource issues, we return an error for now (EAGAIN),
	 * which is likely not the correct thing to do.
	 */
retryproc:
	s = splnet();
	if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) {
		TAILQ_REMOVE(&aio_freeproc, aiop, list);
		TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list);
		aiop->aioprocflags &= ~AIOP_FREE;
		wakeup(aiop->aioproc);
	} else if (((num_aio_resv_start + num_aio_procs) < max_aio_procs) &&
	    ((ki->kaio_active_count + num_aio_resv_start) <
	    ki->kaio_maxactive_count)) {
		/* Reserve a slot while aio_newproc() may sleep. */
		num_aio_resv_start++;
		if ((error = aio_newproc()) == 0) {
			num_aio_resv_start--;
			p->p_retval[0] = 0;
			goto retryproc;
		}
		num_aio_resv_start--;
	}
	splx(s);
done:
	fdrop(fp, p);
	return error;
}

/*
 * This routine queues an AIO request, checking for quotas.  Both the global
 * queue limit and the per-process queue limit are enforced here before
 * handing off to _aio_aqueue().
 */
static int
aio_aqueue(struct proc *p, struct aiocb *job, int type)
{
	struct kaioinfo *ki;

	if (p->p_aioinfo == NULL)
		aio_init_aioinfo(p);

	if (num_queue_count >= max_queue_count)
		return EAGAIN;

	ki = p->p_aioinfo;
	if (ki->kaio_queue_count >= ki->kaio_qallowed_count)
		return EAGAIN;

	return _aio_aqueue(p, job, NULL, type);
}
#endif /* VFS_AIO */

/*
 * Support the aio_return system call, as a side-effect, kernel resources are
 * released.
 */
int
aio_return(struct proc *p, struct aio_return_args *uap)
{
#ifndef VFS_AIO
	return ENOSYS;
#else
	int s;
	int jobref;
	struct aiocblist *cb, *ncb;
	struct aiocb *ujob;
	struct kaioinfo *ki;

	ki = p->p_aioinfo;
	if (ki == NULL)
		return EINVAL;

	ujob = uap->aiocbp;

	/* Look the request up by the kernel job id stored in the user aiocb. */
	jobref = fuword(&ujob->_aiocb_private.kernelinfo);
	if (jobref == -1 || jobref == 0)
		return EINVAL;

	/* First search the daemon-completed queue. */
	s = splnet();
	for (cb = TAILQ_FIRST(&ki->kaio_jobdone); cb; cb = TAILQ_NEXT(cb,
	    plist)) {
		if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) ==
		    jobref) {
			splx(s);
			/*
			 * NOTE(review): on a user-pointer mismatch EFAULT is
			 * stored as the return VALUE (p_retval), not returned
			 * as an error -- confirm this is the intended ABI.
			 */
			if (ujob == cb->uuaiocb) {
				p->p_retval[0] =
				    cb->uaiocb._aiocb_private.status;
			} else
				p->p_retval[0] = EFAULT;
			/* Credit the deferred block I/O charges to the caller. */
			if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
				curproc->p_stats->p_ru.ru_oublock +=
				    cb->outputcharge;
				cb->outputcharge = 0;
			} else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
				curproc->p_stats->p_ru.ru_inblock +=
				    cb->inputcharge;
				cb->inputcharge = 0;
			}
			aio_free_entry(cb);
			return 0;
		}
	}
	splx(s);

	/* Then the physio-completed queue. */
	s = splbio();
	for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb = ncb) {
		ncb = TAILQ_NEXT(cb, plist);
		if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo)
		    == jobref) {
			splx(s);
			if (ujob == cb->uuaiocb) {
				p->p_retval[0] =
				    cb->uaiocb._aiocb_private.status;
			} else
				p->p_retval[0] = EFAULT;
			aio_free_entry(cb);
			return 0;
		}
	}
	splx(s);

	/* Not found on either done queue: nothing to return yet. */
	return (EINVAL);
#endif /* VFS_AIO */
}

/*
 * Allow a process to wakeup when any of the I/O requests are completed.
1540 */ 1541int 1542aio_suspend(struct proc *p, struct aio_suspend_args *uap) 1543{ 1544#ifndef VFS_AIO 1545 return ENOSYS; 1546#else 1547 struct timeval atv; 1548 struct timespec ts; 1549 struct aiocb *const *cbptr, *cbp; 1550 struct kaioinfo *ki; 1551 struct aiocblist *cb; 1552 int i; 1553 int njoblist; 1554 int error, s, timo; 1555 int *ijoblist; 1556 struct aiocb **ujoblist; 1557 1558 if (uap->nent >= AIO_LISTIO_MAX) 1559 return EINVAL; 1560 1561 timo = 0; 1562 if (uap->timeout) { 1563 /* Get timespec struct. */ 1564 if ((error = copyin(uap->timeout, &ts, sizeof(ts))) != 0) 1565 return error; 1566 1567 if (ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000) 1568 return (EINVAL); 1569 1570 TIMESPEC_TO_TIMEVAL(&atv, &ts); 1571 if (itimerfix(&atv)) 1572 return (EINVAL); 1573 timo = tvtohz(&atv); 1574 } 1575 1576 ki = p->p_aioinfo; 1577 if (ki == NULL) 1578 return EAGAIN; 1579 1580 njoblist = 0; 1581 ijoblist = zalloc(aiol_zone); 1582 ujoblist = zalloc(aiol_zone); 1583 cbptr = uap->aiocbp; 1584 1585 for (i = 0; i < uap->nent; i++) { 1586 cbp = (struct aiocb *)(intptr_t)fuword((caddr_t)&cbptr[i]); 1587 if (cbp == 0) 1588 continue; 1589 ujoblist[njoblist] = cbp; 1590 ijoblist[njoblist] = fuword(&cbp->_aiocb_private.kernelinfo); 1591 njoblist++; 1592 } 1593 1594 if (njoblist == 0) { 1595 zfree(aiol_zone, ijoblist); 1596 zfree(aiol_zone, ujoblist); 1597 return 0; 1598 } 1599 1600 error = 0; 1601 for (;;) { 1602 for (cb = TAILQ_FIRST(&ki->kaio_jobdone); cb; cb = 1603 TAILQ_NEXT(cb, plist)) { 1604 for (i = 0; i < njoblist; i++) { 1605 if (((intptr_t) 1606 cb->uaiocb._aiocb_private.kernelinfo) == 1607 ijoblist[i]) { 1608 if (ujoblist[i] != cb->uuaiocb) 1609 error = EINVAL; 1610 zfree(aiol_zone, ijoblist); 1611 zfree(aiol_zone, ujoblist); 1612 return error; 1613 } 1614 } 1615 } 1616 1617 s = splbio(); 1618 for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb = 1619 TAILQ_NEXT(cb, plist)) { 1620 for (i = 0; i < njoblist; i++) { 1621 if (((intptr_t) 1622 
cb->uaiocb._aiocb_private.kernelinfo) == 1623 ijoblist[i]) { 1624 splx(s); 1625 if (ujoblist[i] != cb->uuaiocb) 1626 error = EINVAL; 1627 zfree(aiol_zone, ijoblist); 1628 zfree(aiol_zone, ujoblist); 1629 return error; 1630 } 1631 } 1632 } 1633 1634 ki->kaio_flags |= KAIO_WAKEUP; 1635 error = tsleep(p, PRIBIO | PCATCH, "aiospn", timo); 1636 splx(s); 1637 1638 if (error == ERESTART || error == EINTR) { 1639 zfree(aiol_zone, ijoblist); 1640 zfree(aiol_zone, ujoblist); 1641 return EINTR; 1642 } else if (error == EWOULDBLOCK) { 1643 zfree(aiol_zone, ijoblist); 1644 zfree(aiol_zone, ujoblist); 1645 return EAGAIN; 1646 } 1647 } 1648 1649/* NOTREACHED */ 1650 return EINVAL; 1651#endif /* VFS_AIO */ 1652} 1653 1654/* 1655 * aio_cancel cancels any non-physio aio operations not currently in 1656 * progress. 1657 */ 1658int 1659aio_cancel(struct proc *p, struct aio_cancel_args *uap) 1660{ 1661#ifndef VFS_AIO 1662 return ENOSYS; 1663#else 1664 struct kaioinfo *ki; 1665 struct aiocblist *cbe, *cbn; 1666 struct file *fp; 1667 struct filedesc *fdp; 1668 struct socket *so; 1669 struct proc *po; 1670 int s,error; 1671 int cancelled=0; 1672 int notcancelled=0; 1673 struct vnode *vp; 1674 1675 fdp = p->p_fd; 1676 1677 fp = fdp->fd_ofiles[uap->fd]; 1678 1679 if (fp == NULL) { 1680 return EBADF; 1681 } 1682 1683 if (fp->f_type == DTYPE_VNODE) { 1684 vp = (struct vnode *)fp->f_data; 1685 1686 if (vn_isdisk(vp,&error)) { 1687 p->p_retval[0] = AIO_NOTCANCELED; 1688 return 0; 1689 } 1690 } else if (fp->f_type == DTYPE_SOCKET) { 1691 so = (struct socket *)fp->f_data; 1692 1693 s = splnet(); 1694 1695 for (cbe = TAILQ_FIRST(&so->so_aiojobq); cbe; cbe = cbn) { 1696 cbn = TAILQ_NEXT(cbe, list); 1697 if ((uap->aiocbp == NULL) || 1698 (uap->aiocbp == cbe->uuaiocb) ) { 1699 po = cbe->userproc; 1700 ki = po->p_aioinfo; 1701 TAILQ_REMOVE(&so->so_aiojobq, cbe, list); 1702 TAILQ_REMOVE(&ki->kaio_sockqueue, cbe, plist); 1703 TAILQ_INSERT_TAIL(&ki->kaio_jobdone, cbe, plist); 1704 if (ki->kaio_flags & 
KAIO_WAKEUP) { 1705 wakeup(po); 1706 } 1707 cbe->jobstate = JOBST_JOBFINISHED; 1708 cbe->uaiocb._aiocb_private.status=-1; 1709 cbe->uaiocb._aiocb_private.error=ECANCELED; 1710 cancelled++; 1711/* XXX cancelled, knote? */ 1712 if (cbe->uaiocb.aio_sigevent.sigev_notify == 1713 SIGEV_SIGNAL) 1714 psignal(cbe->userproc, cbe->uaiocb.aio_sigevent.sigev_signo); 1715 if (uap->aiocbp) 1716 break; 1717 } 1718 } 1719 1720 splx(s); 1721 1722 if ((cancelled) && (uap->aiocbp)) { 1723 p->p_retval[0] = AIO_CANCELED; 1724 return 0; 1725 } 1726 1727 } 1728 1729 ki=p->p_aioinfo; 1730 1731 s = splnet(); 1732 1733 for (cbe = TAILQ_FIRST(&ki->kaio_jobqueue); cbe; cbe = cbn) { 1734 cbn = TAILQ_NEXT(cbe, plist); 1735 1736 if ((uap->fd == cbe->uaiocb.aio_fildes) && 1737 ((uap->aiocbp == NULL ) || 1738 (uap->aiocbp == cbe->uuaiocb))) { 1739 1740 if (cbe->jobstate == JOBST_JOBQGLOBAL) { 1741 TAILQ_REMOVE(&aio_jobs, cbe, list); 1742 TAILQ_REMOVE(&ki->kaio_jobqueue, cbe, plist); 1743 TAILQ_INSERT_TAIL(&ki->kaio_jobdone, cbe, 1744 plist); 1745 cancelled++; 1746 ki->kaio_queue_finished_count++; 1747 cbe->jobstate = JOBST_JOBFINISHED; 1748 cbe->uaiocb._aiocb_private.status = -1; 1749 cbe->uaiocb._aiocb_private.error = ECANCELED; 1750/* XXX cancelled, knote? */ 1751 if (cbe->uaiocb.aio_sigevent.sigev_notify == 1752 SIGEV_SIGNAL) 1753 psignal(cbe->userproc, cbe->uaiocb.aio_sigevent.sigev_signo); 1754 } else { 1755 notcancelled++; 1756 } 1757 } 1758 } 1759 1760 splx(s); 1761 1762 1763 if (notcancelled) { 1764 p->p_retval[0] = AIO_NOTCANCELED; 1765 return 0; 1766 } 1767 1768 if (cancelled) { 1769 p->p_retval[0] = AIO_CANCELED; 1770 return 0; 1771 } 1772 1773 p->p_retval[0] = AIO_ALLDONE; 1774 1775 return 0; 1776#endif /* VFS_AIO */ 1777} 1778 1779/* 1780 * aio_error is implemented in the kernel level for compatibility purposes only. 1781 * For a user mode async implementation, it would be best to do it in a userland 1782 * subroutine. 
 */
int
aio_error(struct proc *p, struct aio_error_args *uap)
{
#ifndef VFS_AIO
	return ENOSYS;
#else
	int s;
	struct aiocblist *cb;
	struct kaioinfo *ki;
	int jobref;

	ki = p->p_aioinfo;
	if (ki == NULL)
		return EINVAL;

	/*
	 * The kernel job reference was stashed in the user's aiocb when the
	 * request was queued; -1/0 mean it was never queued (or already
	 * reclaimed).
	 */
	jobref = fuword(&uap->aiocbp->_aiocb_private.kernelinfo);
	if ((jobref == -1) || (jobref == 0))
		return EINVAL;

	/* Finished daemon-serviced jobs: report the recorded error. */
	for (cb = TAILQ_FIRST(&ki->kaio_jobdone); cb; cb = TAILQ_NEXT(cb,
	    plist)) {
		if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
		    jobref) {
			p->p_retval[0] = cb->uaiocb._aiocb_private.error;
			return 0;
		}
	}

	s = splnet();

	/* Queued or running in an aio daemon. */
	for (cb = TAILQ_FIRST(&ki->kaio_jobqueue); cb; cb = TAILQ_NEXT(cb,
	    plist)) {
		if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
		    jobref) {
			p->p_retval[0] = EINPROGRESS;
			splx(s);
			return 0;
		}
	}

	/* Parked on a socket waiting for readability/writability. */
	for (cb = TAILQ_FIRST(&ki->kaio_sockqueue); cb; cb = TAILQ_NEXT(cb,
	    plist)) {
		if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
		    jobref) {
			p->p_retval[0] = EINPROGRESS;
			splx(s);
			return 0;
		}
	}
	splx(s);

	s = splbio();
	/* Finished physio (direct-to-disk) jobs. */
	for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb = TAILQ_NEXT(cb,
	    plist)) {
		if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
		    jobref) {
			p->p_retval[0] = cb->uaiocb._aiocb_private.error;
			splx(s);
			return 0;
		}
	}

	/* Physio jobs still in flight. */
	for (cb = TAILQ_FIRST(&ki->kaio_bufqueue); cb; cb = TAILQ_NEXT(cb,
	    plist)) {
		if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
		    jobref) {
			p->p_retval[0] = EINPROGRESS;
			splx(s);
			return 0;
		}
	}
	splx(s);

#if (0)
	/*
	 * Hack for lio.
	 */
	status = fuword(&uap->aiocbp->_aiocb_private.status);
	if (status == -1)
		return fuword(&uap->aiocbp->_aiocb_private.error);
#endif
	/* Job not found on any queue. */
	return EINVAL;
#endif /* VFS_AIO */
}

/*
 * aio_read: queue an asynchronous read, or — for the private
 * AIO_PMODE_SYNC mode — perform the read synchronously in-line.
 */
int
aio_read(struct proc *p, struct aio_read_args *uap)
{
#ifndef VFS_AIO
	return ENOSYS;
#else
	struct filedesc *fdp;
	struct file *fp;
	struct uio auio;
	struct iovec aiov;
	unsigned int fd;
	int cnt;
	struct aiocb iocb;
	int error, pmodes;

	/* Normal (async) path: hand the request to the queueing machinery. */
	pmodes = fuword(&uap->aiocbp->_aiocb_private.privatemodes);
	if ((pmodes & AIO_PMODE_SYNC) == 0)
		return aio_aqueue(p, (struct aiocb *)uap->aiocbp, LIO_READ);

	/* Get control block. */
	if ((error = copyin((caddr_t)uap->aiocbp, (caddr_t)&iocb, sizeof iocb))
	    != 0)
		return error;

	/* Get the fd info for process. */
	fdp = p->p_fd;

	/*
	 * Range check file descriptor.
	 */
	fd = iocb.aio_fildes;
	if (fd >= fdp->fd_nfiles)
		return EBADF;
	fp = fdp->fd_ofiles[fd];
	if ((fp == NULL) || ((fp->f_flag & FREAD) == 0))
		return EBADF;
	if (iocb.aio_offset == -1LL)
		return EINVAL;

	/* uio_resid is signed; an over-large aio_nbytes shows up negative. */
	auio.uio_resid = iocb.aio_nbytes;
	if (auio.uio_resid < 0)
		return (EINVAL);

	/*
	 * Process sync simply -- queue async request.
	 */
	/*
	 * NOTE(review): this re-checks the sync flag from the copied-in
	 * control block (the first check used fuword); redundant unless the
	 * user block changed in between.
	 */
	if ((iocb._aiocb_private.privatemodes & AIO_PMODE_SYNC) == 0)
		return aio_aqueue(p, (struct aiocb *)uap->aiocbp, LIO_READ);

	aiov.iov_base = (void *)iocb.aio_buf;
	aiov.iov_len = iocb.aio_nbytes;

	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = iocb.aio_offset;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_procp = p;

	cnt = iocb.aio_nbytes;
	/*
	 * Temporarily bump the ref count while reading to avoid the
	 * descriptor being ripped out from under us.
	 */
	fhold(fp);
	error = fo_read(fp, &auio, fp->f_cred, FOF_OFFSET, p);
	fdrop(fp, p);
	/* A partial transfer interrupted by a signal still succeeded. */
	if (error && (auio.uio_resid != cnt) && (error == ERESTART || error ==
	    EINTR || error == EWOULDBLOCK))
		error = 0;
	cnt -= auio.uio_resid;
	p->p_retval[0] = cnt;
	return error;
#endif /* VFS_AIO */
}

/*
 * aio_write: queue an asynchronous write, or — for the private
 * AIO_PMODE_SYNC mode — perform the write synchronously in-line.
 */
int
aio_write(struct proc *p, struct aio_write_args *uap)
{
#ifndef VFS_AIO
	return ENOSYS;
#else
	struct filedesc *fdp;
	struct file *fp;
	struct uio auio;
	struct iovec aiov;
	unsigned int fd;
	int cnt;
	struct aiocb iocb;
	int error;
	int pmodes;

	/*
	 * Process sync simply -- queue async request.
	 */
	pmodes = fuword(&uap->aiocbp->_aiocb_private.privatemodes);
	if ((pmodes & AIO_PMODE_SYNC) == 0)
		return aio_aqueue(p, (struct aiocb *)uap->aiocbp, LIO_WRITE);

	if ((error = copyin((caddr_t)uap->aiocbp, (caddr_t)&iocb, sizeof iocb))
	    != 0)
		return error;

	/* Get the fd info for process. */
	fdp = p->p_fd;

	/*
	 * Range check file descriptor.
	 */
	fd = iocb.aio_fildes;
	if (fd >= fdp->fd_nfiles)
		return EBADF;
	fp = fdp->fd_ofiles[fd];
	if ((fp == NULL) || ((fp->f_flag & FWRITE) == 0))
		return EBADF;
	if (iocb.aio_offset == -1LL)
		return EINVAL;

	aiov.iov_base = (void *)iocb.aio_buf;
	aiov.iov_len = iocb.aio_nbytes;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = iocb.aio_offset;

	/* uio_resid is signed; an over-large aio_nbytes shows up negative. */
	auio.uio_resid = iocb.aio_nbytes;
	if (auio.uio_resid < 0)
		return (EINVAL);

	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_procp = p;

	cnt = iocb.aio_nbytes;
	/*
	 * Temporarily bump the ref count while writing to avoid the
	 * descriptor being ripped out from under us.
	 */
	fhold(fp);
	error = fo_write(fp, &auio, fp->f_cred, FOF_OFFSET, p);
	fdrop(fp, p);
	if (error) {
		/* Partial transfers: signal-type errors are not errors. */
		if (auio.uio_resid != cnt) {
			if (error == ERESTART || error == EINTR || error ==
			    EWOULDBLOCK)
				error = 0;
			if (error == EPIPE)
				psignal(p, SIGPIPE);
		}
	}
	cnt -= auio.uio_resid;
	p->p_retval[0] = cnt;
	return error;
#endif /* VFS_AIO */
}

/*
 * lio_listio: submit a list of aio requests in one call.  With
 * LIO_WAIT the call blocks until every queued request has completed;
 * with LIO_NOWAIT an optional sigevent is delivered when all are done.
 */
int
lio_listio(struct proc *p, struct lio_listio_args *uap)
{
#ifndef VFS_AIO
	return ENOSYS;
#else
	int nent, nentqueued;
	struct aiocb *iocb, * const *cbptr;
	struct aiocblist *cb;
	struct kaioinfo *ki;
	struct aio_liojob *lj;
	int error, runningcode;
	int nerror;
	int i;
	int s;

	if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
		return EINVAL;

	nent = uap->nent;
	if (nent > AIO_LISTIO_MAX)
		return EINVAL;

	if (p->p_aioinfo == NULL)
		aio_init_aioinfo(p);

	/* Enforce the global and per-process queue-depth limits. */
	if ((nent + num_queue_count) > max_queue_count)
		return EAGAIN;

	ki = p->p_aioinfo;
	if ((nent + ki->kaio_queue_count) > ki->kaio_qallowed_count)
		return EAGAIN;

	/*
	 * The liojob ties the individual requests together for completion
	 * accounting and the single list-wide signal.
	 */
	lj = zalloc(aiolio_zone);
	if (!lj)
		return EAGAIN;

	lj->lioj_flags = 0;
	lj->lioj_buffer_count = 0;
	lj->lioj_buffer_finished_count = 0;
	lj->lioj_queue_count = 0;
	lj->lioj_queue_finished_count = 0;
	lj->lioj_ki = ki;
	TAILQ_INSERT_TAIL(&ki->kaio_liojoblist, lj, lioj_list);

	/*
	 * Setup signal.
	 */
	if (uap->sig && (uap->mode == LIO_NOWAIT)) {
		error = copyin(uap->sig, &lj->lioj_signal,
		    sizeof(lj->lioj_signal));
		if (error)
			return error;
		lj->lioj_flags |= LIOJ_SIGNAL;
		lj->lioj_flags &= ~LIOJ_SIGNAL_POSTED;
	} else
		lj->lioj_flags &= ~LIOJ_SIGNAL;

	/*
	 * Get pointers to the list of I/O requests.
2084 */ 2085 nerror = 0; 2086 nentqueued = 0; 2087 cbptr = uap->acb_list; 2088 for (i = 0; i < uap->nent; i++) { 2089 iocb = (struct aiocb *)(intptr_t)fuword((caddr_t)&cbptr[i]); 2090 if (((intptr_t)iocb != -1) && ((intptr_t)iocb != NULL)) { 2091 error = _aio_aqueue(p, iocb, lj, 0); 2092 if (error == 0) 2093 nentqueued++; 2094 else 2095 nerror++; 2096 } 2097 } 2098 2099 /* 2100 * If we haven't queued any, then just return error. 2101 */ 2102 if (nentqueued == 0) 2103 return 0; 2104 2105 /* 2106 * Calculate the appropriate error return. 2107 */ 2108 runningcode = 0; 2109 if (nerror) 2110 runningcode = EIO; 2111 2112 if (uap->mode == LIO_WAIT) { 2113 int command, found, jobref; 2114 2115 for (;;) { 2116 found = 0; 2117 for (i = 0; i < uap->nent; i++) { 2118 /* 2119 * Fetch address of the control buf pointer in 2120 * user space. 2121 */ 2122 iocb = (struct aiocb *)(intptr_t)fuword((caddr_t)&cbptr[i]); 2123 if (((intptr_t)iocb == -1) || ((intptr_t)iocb 2124 == 0)) 2125 continue; 2126 2127 /* 2128 * Fetch the associated command from user space. 
2129 */ 2130 command = fuword(&iocb->aio_lio_opcode); 2131 if (command == LIO_NOP) { 2132 found++; 2133 continue; 2134 } 2135 2136 jobref = fuword(&iocb->_aiocb_private.kernelinfo); 2137 2138 TAILQ_FOREACH(cb, &ki->kaio_jobdone, plist) { 2139 if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) 2140 == jobref) { 2141 if (cb->uaiocb.aio_lio_opcode 2142 == LIO_WRITE) { 2143 curproc->p_stats->p_ru.ru_oublock 2144 += 2145 cb->outputcharge; 2146 cb->outputcharge = 0; 2147 } else if (cb->uaiocb.aio_lio_opcode 2148 == LIO_READ) { 2149 curproc->p_stats->p_ru.ru_inblock 2150 += cb->inputcharge; 2151 cb->inputcharge = 0; 2152 } 2153 found++; 2154 break; 2155 } 2156 } 2157 2158 s = splbio(); 2159 TAILQ_FOREACH(cb, &ki->kaio_bufdone, plist) { 2160 if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) 2161 == jobref) { 2162 found++; 2163 break; 2164 } 2165 } 2166 splx(s); 2167 } 2168 2169 /* 2170 * If all I/Os have been disposed of, then we can 2171 * return. 2172 */ 2173 if (found == nentqueued) 2174 return runningcode; 2175 2176 ki->kaio_flags |= KAIO_WAKEUP; 2177 error = tsleep(p, PRIBIO | PCATCH, "aiospn", 0); 2178 2179 if (error == EINTR) 2180 return EINTR; 2181 else if (error == EWOULDBLOCK) 2182 return EAGAIN; 2183 } 2184 } 2185 2186 return runningcode; 2187#endif /* VFS_AIO */ 2188} 2189 2190#ifdef VFS_AIO 2191/* 2192 * This is a weird hack so that we can post a signal. It is safe to do so from 2193 * a timeout routine, but *not* from an interrupt routine. 
2194 */ 2195static void 2196process_signal(void *aioj) 2197{ 2198 struct aiocblist *aiocbe = aioj; 2199 struct aio_liojob *lj = aiocbe->lio; 2200 struct aiocb *cb = &aiocbe->uaiocb; 2201 2202 if ((lj) && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL) && 2203 (lj->lioj_queue_count == lj->lioj_queue_finished_count)) { 2204 psignal(lj->lioj_ki->kaio_p, lj->lioj_signal.sigev_signo); 2205 lj->lioj_flags |= LIOJ_SIGNAL_POSTED; 2206 } 2207 2208 if (cb->aio_sigevent.sigev_notify == SIGEV_SIGNAL) 2209 psignal(aiocbe->userproc, cb->aio_sigevent.sigev_signo); 2210} 2211 2212/* 2213 * Interrupt handler for physio, performs the necessary process wakeups, and 2214 * signals. 2215 */ 2216static void 2217aio_physwakeup(struct buf *bp) 2218{ 2219 struct aiocblist *aiocbe; 2220 struct proc *p; 2221 struct kaioinfo *ki; 2222 struct aio_liojob *lj; 2223 2224 wakeup((caddr_t)bp); 2225 2226 aiocbe = (struct aiocblist *)bp->b_spc; 2227 if (aiocbe) { 2228 p = bp->b_caller1; 2229 2230 aiocbe->jobstate = JOBST_JOBBFINISHED; 2231 aiocbe->uaiocb._aiocb_private.status -= bp->b_resid; 2232 aiocbe->uaiocb._aiocb_private.error = 0; 2233 aiocbe->jobflags |= AIOCBLIST_DONE; 2234 2235 if (bp->b_ioflags & BIO_ERROR) 2236 aiocbe->uaiocb._aiocb_private.error = bp->b_error; 2237 2238 lj = aiocbe->lio; 2239 if (lj) { 2240 lj->lioj_buffer_finished_count++; 2241 2242 /* 2243 * wakeup/signal if all of the interrupt jobs are done. 2244 */ 2245 if (lj->lioj_buffer_finished_count == 2246 lj->lioj_buffer_count) { 2247 /* 2248 * Post a signal if it is called for. 
2249 */ 2250 if ((lj->lioj_flags & 2251 (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) == 2252 LIOJ_SIGNAL) { 2253 lj->lioj_flags |= LIOJ_SIGNAL_POSTED; 2254 timeout(process_signal, aiocbe, 0); 2255 } 2256 } 2257 } 2258 2259 ki = p->p_aioinfo; 2260 if (ki) { 2261 ki->kaio_buffer_finished_count++; 2262 TAILQ_REMOVE(&aio_bufjobs, aiocbe, list); 2263 TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist); 2264 TAILQ_INSERT_TAIL(&ki->kaio_bufdone, aiocbe, plist); 2265 2266 KNOTE(&aiocbe->klist, 0); 2267 /* Do the wakeup. */ 2268 if (ki->kaio_flags & (KAIO_RUNDOWN|KAIO_WAKEUP)) { 2269 ki->kaio_flags &= ~KAIO_WAKEUP; 2270 wakeup(p); 2271 } 2272 } 2273 2274 if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL) 2275 timeout(process_signal, aiocbe, 0); 2276 } 2277} 2278#endif /* VFS_AIO */ 2279 2280int 2281aio_waitcomplete(struct proc *p, struct aio_waitcomplete_args *uap) 2282{ 2283#ifndef VFS_AIO 2284 return ENOSYS; 2285#else 2286 struct timeval atv; 2287 struct timespec ts; 2288 struct aiocb **cbptr; 2289 struct kaioinfo *ki; 2290 struct aiocblist *cb = NULL; 2291 int error, s, timo; 2292 2293 suword(uap->aiocbp, (int)NULL); 2294 2295 timo = 0; 2296 if (uap->timeout) { 2297 /* Get timespec struct. 
*/ 2298 error = copyin((caddr_t)uap->timeout, (caddr_t)&ts, 2299 sizeof(ts)); 2300 if (error) 2301 return error; 2302 2303 if ((ts.tv_nsec < 0) || (ts.tv_nsec >= 1000000000)) 2304 return (EINVAL); 2305 2306 TIMESPEC_TO_TIMEVAL(&atv, &ts); 2307 if (itimerfix(&atv)) 2308 return (EINVAL); 2309 timo = tvtohz(&atv); 2310 } 2311 2312 ki = p->p_aioinfo; 2313 if (ki == NULL) 2314 return EAGAIN; 2315 2316 cbptr = uap->aiocbp; 2317 2318 for (;;) { 2319 if ((cb = TAILQ_FIRST(&ki->kaio_jobdone)) != 0) { 2320 suword(uap->aiocbp, (int)cb->uuaiocb); 2321 p->p_retval[0] = cb->uaiocb._aiocb_private.status; 2322 if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) { 2323 curproc->p_stats->p_ru.ru_oublock += 2324 cb->outputcharge; 2325 cb->outputcharge = 0; 2326 } else if (cb->uaiocb.aio_lio_opcode == LIO_READ) { 2327 curproc->p_stats->p_ru.ru_inblock += 2328 cb->inputcharge; 2329 cb->inputcharge = 0; 2330 } 2331 aio_free_entry(cb); 2332 return cb->uaiocb._aiocb_private.error; 2333 } 2334 2335 s = splbio(); 2336 if ((cb = TAILQ_FIRST(&ki->kaio_bufdone)) != 0 ) { 2337 splx(s); 2338 suword(uap->aiocbp, (int)cb->uuaiocb); 2339 p->p_retval[0] = cb->uaiocb._aiocb_private.status; 2340 aio_free_entry(cb); 2341 return cb->uaiocb._aiocb_private.error; 2342 } 2343 2344 ki->kaio_flags |= KAIO_WAKEUP; 2345 error = tsleep(p, PRIBIO | PCATCH, "aiowc", timo); 2346 splx(s); 2347 2348 if (error == ERESTART) 2349 return EINTR; 2350 else if (error < 0) 2351 return error; 2352 else if (error == EINTR) 2353 return EINTR; 2354 else if (error == EWOULDBLOCK) 2355 return EAGAIN; 2356 } 2357#endif /* VFS_AIO */ 2358} 2359 2360 2361#ifndef VFS_AIO 2362static int 2363filt_aioattach(struct knote *kn) 2364{ 2365 2366 return (ENXIO); 2367} 2368 2369struct filterops aio_filtops = 2370 { 0, filt_aioattach, NULL, NULL }; 2371 2372#else 2373static int 2374filt_aioattach(struct knote *kn) 2375{ 2376 struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_id; 2377 2378 /* 2379 * The aiocbe pointer must be validated before using 
it, so 2380 * registration is restricted to the kernel; the user cannot 2381 * set EV_FLAG1. 2382 */ 2383 if ((kn->kn_flags & EV_FLAG1) == 0) 2384 return (EPERM); 2385 kn->kn_flags &= ~EV_FLAG1; 2386 2387 SLIST_INSERT_HEAD(&aiocbe->klist, kn, kn_selnext); 2388 2389 return (0); 2390} 2391 2392static void 2393filt_aiodetach(struct knote *kn) 2394{ 2395 struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_id; 2396 int s = splhigh(); /* XXX no clue, so overkill */ 2397 2398 SLIST_REMOVE(&aiocbe->klist, kn, knote, kn_selnext); 2399 splx(s); 2400} 2401 2402/*ARGSUSED*/ 2403static int 2404filt_aio(struct knote *kn, long hint) 2405{ 2406 struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_id; 2407 2408 kn->kn_data = 0; /* XXX data returned? */ 2409 if (aiocbe->jobstate != JOBST_JOBFINISHED && 2410 aiocbe->jobstate != JOBST_JOBBFINISHED) 2411 return (0); 2412 kn->kn_flags |= EV_EOF; 2413 return (1); 2414} 2415 2416struct filterops aio_filtops = 2417 { 0, filt_aioattach, filt_aiodetach, filt_aio }; 2418#endif /* VFS_AIO */
| 642 /* 643 * Local copies of curproc (cp) and vmspace (myvm) 644 */ 645 mycp = curproc; 646 myvm = mycp->p_vmspace; 647 648 if (mycp->p_textvp) { 649 vrele(mycp->p_textvp); 650 mycp->p_textvp = NULL; 651 } 652 653 /* 654 * Allocate and ready the aio control info. There is one aiop structure 655 * per daemon. 656 */ 657 aiop = zalloc(aiop_zone); 658 aiop->aioproc = mycp; 659 aiop->aioprocflags |= AIOP_FREE; 660 TAILQ_INIT(&aiop->jobtorun); 661 662 s = splnet(); 663 664 /* 665 * Place thread (lightweight process) onto the AIO free thread list. 666 */ 667 if (TAILQ_EMPTY(&aio_freeproc)) 668 wakeup(&aio_freeproc); 669 TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list); 670 671 splx(s); 672 673 /* Make up a name for the daemon. */ 674 strcpy(mycp->p_comm, "aiod"); 675 676 /* 677 * Get rid of our current filedescriptors. AIOD's don't need any 678 * filedescriptors, except as temporarily inherited from the client. 679 * Credentials are also cloned, and made equivalent to "root". 680 */ 681 fdfree(mycp); 682 mycp->p_fd = NULL; 683 mycp->p_ucred = crcopy(mycp->p_ucred); 684 mycp->p_ucred->cr_uid = 0; 685 uifree(mycp->p_ucred->cr_uidinfo); 686 mycp->p_ucred->cr_uidinfo = uifind(0); 687 mycp->p_ucred->cr_ngroups = 1; 688 mycp->p_ucred->cr_groups[0] = 1; 689 690 /* The daemon resides in its own pgrp. */ 691 enterpgrp(mycp, mycp->p_pid, 1); 692 693 /* Mark special process type. */ 694 mycp->p_flag |= P_SYSTEM; 695 696 /* 697 * Wakeup parent process. (Parent sleeps to keep from blasting away 698 * creating to many daemons.) 699 */ 700 wakeup(mycp); 701 702 for (;;) { 703 /* 704 * curcp is the current daemon process context. 705 * userp is the current user process context. 
706 */ 707 curcp = mycp; 708 709 /* 710 * Take daemon off of free queue 711 */ 712 if (aiop->aioprocflags & AIOP_FREE) { 713 s = splnet(); 714 TAILQ_REMOVE(&aio_freeproc, aiop, list); 715 TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list); 716 aiop->aioprocflags &= ~AIOP_FREE; 717 splx(s); 718 } 719 aiop->aioprocflags &= ~AIOP_SCHED; 720 721 /* 722 * Check for jobs. 723 */ 724 while ((aiocbe = aio_selectjob(aiop)) != NULL) { 725 cb = &aiocbe->uaiocb; 726 userp = aiocbe->userproc; 727 728 aiocbe->jobstate = JOBST_JOBRUNNING; 729 730 /* 731 * Connect to process address space for user program. 732 */ 733 if (userp != curcp) { 734 /* 735 * Save the current address space that we are 736 * connected to. 737 */ 738 tmpvm = mycp->p_vmspace; 739 740 /* 741 * Point to the new user address space, and 742 * refer to it. 743 */ 744 mycp->p_vmspace = userp->p_vmspace; 745 mycp->p_vmspace->vm_refcnt++; 746 747 /* Activate the new mapping. */ 748 pmap_activate(mycp); 749 750 /* 751 * If the old address space wasn't the daemons 752 * own address space, then we need to remove the 753 * daemon's reference from the other process 754 * that it was acting on behalf of. 755 */ 756 if (tmpvm != myvm) { 757 vmspace_free(tmpvm); 758 } 759 760 /* 761 * Disassociate from previous clients file 762 * descriptors, and associate to the new clients 763 * descriptors. Note that the daemon doesn't 764 * need to worry about its orginal descriptors, 765 * because they were originally freed. 766 */ 767 if (mycp->p_fd) 768 fdfree(mycp); 769 mycp->p_fd = fdshare(userp); 770 curcp = userp; 771 } 772 773 ki = userp->p_aioinfo; 774 lj = aiocbe->lio; 775 776 /* Account for currently active jobs. */ 777 ki->kaio_active_count++; 778 779 /* Do the I/O function. */ 780 aiocbe->jobaioproc = aiop; 781 aio_process(aiocbe); 782 783 /* Decrement the active job count. */ 784 ki->kaio_active_count--; 785 786 /* 787 * Increment the completion count for wakeup/signal 788 * comparisons. 
789 */ 790 aiocbe->jobflags |= AIOCBLIST_DONE; 791 ki->kaio_queue_finished_count++; 792 if (lj) 793 lj->lioj_queue_finished_count++; 794 if ((ki->kaio_flags & KAIO_WAKEUP) || ((ki->kaio_flags 795 & KAIO_RUNDOWN) && (ki->kaio_active_count == 0))) { 796 ki->kaio_flags &= ~KAIO_WAKEUP; 797 wakeup(userp); 798 } 799 800 s = splbio(); 801 if (lj && (lj->lioj_flags & 802 (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) == LIOJ_SIGNAL) { 803 if ((lj->lioj_queue_finished_count == 804 lj->lioj_queue_count) && 805 (lj->lioj_buffer_finished_count == 806 lj->lioj_buffer_count)) { 807 psignal(userp, 808 lj->lioj_signal.sigev_signo); 809 lj->lioj_flags |= 810 LIOJ_SIGNAL_POSTED; 811 } 812 } 813 splx(s); 814 815 aiocbe->jobstate = JOBST_JOBFINISHED; 816 817 /* 818 * If the I/O request should be automatically rundown, 819 * do the needed cleanup. Otherwise, place the queue 820 * entry for the just finished I/O request into the done 821 * queue for the associated client. 822 */ 823 s = splnet(); 824 if (aiocbe->jobflags & AIOCBLIST_ASYNCFREE) { 825 aiocbe->jobflags &= ~AIOCBLIST_ASYNCFREE; 826 TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list); 827 } else { 828 TAILQ_REMOVE(&ki->kaio_jobqueue, aiocbe, plist); 829 TAILQ_INSERT_TAIL(&ki->kaio_jobdone, aiocbe, 830 plist); 831 } 832 splx(s); 833 KNOTE(&aiocbe->klist, 0); 834 835 if (aiocbe->jobflags & AIOCBLIST_RUNDOWN) { 836 wakeup(aiocbe); 837 aiocbe->jobflags &= ~AIOCBLIST_RUNDOWN; 838 } 839 840 if (cb->aio_sigevent.sigev_notify == SIGEV_SIGNAL) { 841 psignal(userp, cb->aio_sigevent.sigev_signo); 842 } 843 } 844 845 /* 846 * Disconnect from user address space. 847 */ 848 if (curcp != mycp) { 849 /* Get the user address space to disconnect from. */ 850 tmpvm = mycp->p_vmspace; 851 852 /* Get original address space for daemon. */ 853 mycp->p_vmspace = myvm; 854 855 /* Activate the daemon's address space. 
*/ 856 pmap_activate(mycp); 857#ifdef DIAGNOSTIC 858 if (tmpvm == myvm) { 859 printf("AIOD: vmspace problem -- %d\n", 860 mycp->p_pid); 861 } 862#endif 863 /* Remove our vmspace reference. */ 864 vmspace_free(tmpvm); 865 866 /* 867 * Disassociate from the user process's file 868 * descriptors. 869 */ 870 if (mycp->p_fd) 871 fdfree(mycp); 872 mycp->p_fd = NULL; 873 curcp = mycp; 874 } 875 876 /* 877 * If we are the first to be put onto the free queue, wakeup 878 * anyone waiting for a daemon. 879 */ 880 s = splnet(); 881 TAILQ_REMOVE(&aio_activeproc, aiop, list); 882 if (TAILQ_EMPTY(&aio_freeproc)) 883 wakeup(&aio_freeproc); 884 TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list); 885 aiop->aioprocflags |= AIOP_FREE; 886 splx(s); 887 888 /* 889 * If daemon is inactive for a long time, allow it to exit, 890 * thereby freeing resources. 891 */ 892 if (((aiop->aioprocflags & AIOP_SCHED) == 0) && tsleep(mycp, 893 PRIBIO, "aiordy", aiod_lifetime)) { 894 s = splnet(); 895 if ((TAILQ_FIRST(&aio_jobs) == NULL) && 896 (TAILQ_FIRST(&aiop->jobtorun) == NULL)) { 897 if ((aiop->aioprocflags & AIOP_FREE) && 898 (num_aio_procs > target_aio_procs)) { 899 TAILQ_REMOVE(&aio_freeproc, aiop, list); 900 splx(s); 901 zfree(aiop_zone, aiop); 902 num_aio_procs--; 903#ifdef DIAGNOSTIC 904 if (mycp->p_vmspace->vm_refcnt <= 1) { 905 printf("AIOD: bad vm refcnt for" 906 " exiting daemon: %d\n", 907 mycp->p_vmspace->vm_refcnt); 908 } 909#endif 910 exit1(mycp, 0); 911 } 912 } 913 splx(s); 914 } 915 } 916} 917 918/* 919 * Create a new AIO daemon. This is mostly a kernel-thread fork routine. The 920 * AIO daemon modifies its environment itself. 921 */ 922static int 923aio_newproc() 924{ 925 int error; 926 struct proc *p, *np; 927 928 p = &proc0; 929 error = fork1(p, RFPROC|RFMEM|RFNOWAIT, &np); 930 if (error) 931 return error; 932 cpu_set_fork_handler(np, aio_daemon, curproc); 933 934 /* 935 * Wait until daemon is started, but continue on just in case to 936 * handle error conditions. 
937 */ 938 error = tsleep(np, PZERO, "aiosta", aiod_timeout); 939 num_aio_procs++; 940 941 return error; 942} 943 944/* 945 * Try the high-performance physio method for eligible VCHR devices. This 946 * routine doesn't require the use of any additional threads, and have overhead. 947 */ 948int 949aio_qphysio(struct proc *p, struct aiocblist *aiocbe) 950{ 951 int error; 952 struct aiocb *cb; 953 struct file *fp; 954 struct buf *bp; 955 struct vnode *vp; 956 struct kaioinfo *ki; 957 struct filedesc *fdp; 958 struct aio_liojob *lj; 959 int fd; 960 int s; 961 int notify; 962 963 cb = &aiocbe->uaiocb; 964 fdp = p->p_fd; 965 fd = cb->aio_fildes; 966 fp = fdp->fd_ofiles[fd]; 967 968 if (fp->f_type != DTYPE_VNODE) 969 return (-1); 970 971 vp = (struct vnode *)fp->f_data; 972 973 /* 974 * If its not a disk, we don't want to return a positive error. 975 * It causes the aio code to not fall through to try the thread 976 * way when you're talking to a regular file. 977 */ 978 if (!vn_isdisk(vp, &error)) { 979 if (error == ENOTBLK) 980 return (-1); 981 else 982 return (error); 983 } 984 985 if (cb->aio_nbytes % vp->v_rdev->si_bsize_phys) 986 return (-1); 987 988 if (cb->aio_nbytes > MAXPHYS) 989 return (-1); 990 991 ki = p->p_aioinfo; 992 if (ki->kaio_buffer_count >= ki->kaio_ballowed_count) 993 return (-1); 994 995 fhold(fp); 996 997 ki->kaio_buffer_count++; 998 999 lj = aiocbe->lio; 1000 if (lj) 1001 lj->lioj_buffer_count++; 1002 1003 /* Create and build a buffer header for a transfer. */ 1004 bp = (struct buf *)getpbuf(NULL); 1005 1006 /* 1007 * Get a copy of the kva from the physical buffer. 
1008 */ 1009 bp->b_caller1 = p; 1010 bp->b_dev = vp->v_rdev; 1011 error = bp->b_error = 0; 1012 1013 bp->b_bcount = cb->aio_nbytes; 1014 bp->b_bufsize = cb->aio_nbytes; 1015 bp->b_flags = B_PHYS; 1016 bp->b_iodone = aio_physwakeup; 1017 bp->b_saveaddr = bp->b_data; 1018 bp->b_data = (void *)cb->aio_buf; 1019 bp->b_blkno = btodb(cb->aio_offset); 1020 1021 if (cb->aio_lio_opcode == LIO_WRITE) { 1022 bp->b_iocmd = BIO_WRITE; 1023 if (!useracc(bp->b_data, bp->b_bufsize, VM_PROT_READ)) { 1024 error = EFAULT; 1025 goto doerror; 1026 } 1027 } else { 1028 bp->b_iocmd = BIO_READ; 1029 if (!useracc(bp->b_data, bp->b_bufsize, VM_PROT_WRITE)) { 1030 error = EFAULT; 1031 goto doerror; 1032 } 1033 } 1034 1035 /* Bring buffer into kernel space. */ 1036 vmapbuf(bp); 1037 1038 s = splbio(); 1039 aiocbe->bp = bp; 1040 bp->b_spc = (void *)aiocbe; 1041 TAILQ_INSERT_TAIL(&aio_bufjobs, aiocbe, list); 1042 TAILQ_INSERT_TAIL(&ki->kaio_bufqueue, aiocbe, plist); 1043 aiocbe->jobstate = JOBST_JOBQBUF; 1044 cb->_aiocb_private.status = cb->aio_nbytes; 1045 num_buf_aio++; 1046 bp->b_error = 0; 1047 1048 splx(s); 1049 1050 /* Perform transfer. */ 1051 DEV_STRATEGY(bp, 0); 1052 1053 notify = 0; 1054 s = splbio(); 1055 1056 /* 1057 * If we had an error invoking the request, or an error in processing 1058 * the request before we have returned, we process it as an error in 1059 * transfer. Note that such an I/O error is not indicated immediately, 1060 * but is returned using the aio_error mechanism. In this case, 1061 * aio_suspend will return immediately. 
1062 */ 1063 if (bp->b_error || (bp->b_ioflags & BIO_ERROR)) { 1064 struct aiocb *job = aiocbe->uuaiocb; 1065 1066 aiocbe->uaiocb._aiocb_private.status = 0; 1067 suword(&job->_aiocb_private.status, 0); 1068 aiocbe->uaiocb._aiocb_private.error = bp->b_error; 1069 suword(&job->_aiocb_private.error, bp->b_error); 1070 1071 ki->kaio_buffer_finished_count++; 1072 1073 if (aiocbe->jobstate != JOBST_JOBBFINISHED) { 1074 aiocbe->jobstate = JOBST_JOBBFINISHED; 1075 aiocbe->jobflags |= AIOCBLIST_DONE; 1076 TAILQ_REMOVE(&aio_bufjobs, aiocbe, list); 1077 TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist); 1078 TAILQ_INSERT_TAIL(&ki->kaio_bufdone, aiocbe, plist); 1079 notify = 1; 1080 } 1081 } 1082 splx(s); 1083 if (notify) 1084 KNOTE(&aiocbe->klist, 0); 1085 fdrop(fp, p); 1086 return 0; 1087 1088doerror: 1089 ki->kaio_buffer_count--; 1090 if (lj) 1091 lj->lioj_buffer_count--; 1092 aiocbe->bp = NULL; 1093 relpbuf(bp, NULL); 1094 fdrop(fp, p); 1095 return error; 1096} 1097 1098/* 1099 * This waits/tests physio completion. 1100 */ 1101int 1102aio_fphysio(struct proc *p, struct aiocblist *iocb, int flgwait) 1103{ 1104 int s; 1105 struct buf *bp; 1106 int error; 1107 1108 bp = iocb->bp; 1109 1110 s = splbio(); 1111 if (flgwait == 0) { 1112 if ((bp->b_flags & B_DONE) == 0) { 1113 splx(s); 1114 return EINPROGRESS; 1115 } 1116 } 1117 1118 while ((bp->b_flags & B_DONE) == 0) { 1119 if (tsleep((caddr_t)bp, PRIBIO, "physstr", aiod_timeout)) { 1120 if ((bp->b_flags & B_DONE) == 0) { 1121 splx(s); 1122 return EINPROGRESS; 1123 } else 1124 break; 1125 } 1126 } 1127 1128 /* Release mapping into kernel space. */ 1129 vunmapbuf(bp); 1130 iocb->bp = 0; 1131 1132 error = 0; 1133 1134 /* Check for an error. */ 1135 if (bp->b_ioflags & BIO_ERROR) 1136 error = bp->b_error; 1137 1138 relpbuf(bp, NULL); 1139 return (error); 1140} 1141#endif /* VFS_AIO */ 1142 1143/* 1144 * Wake up aio requests that may be serviceable now. 
1145 */ 1146void 1147aio_swake(struct socket *so, struct sockbuf *sb) 1148{ 1149#ifndef VFS_AIO 1150 return; 1151#else 1152 struct aiocblist *cb,*cbn; 1153 struct proc *p; 1154 struct kaioinfo *ki = NULL; 1155 int opcode, wakecount = 0; 1156 struct aioproclist *aiop; 1157 1158 if (sb == &so->so_snd) { 1159 opcode = LIO_WRITE; 1160 so->so_snd.sb_flags &= ~SB_AIO; 1161 } else { 1162 opcode = LIO_READ; 1163 so->so_rcv.sb_flags &= ~SB_AIO; 1164 } 1165 1166 for (cb = TAILQ_FIRST(&so->so_aiojobq); cb; cb = cbn) { 1167 cbn = TAILQ_NEXT(cb, list); 1168 if (opcode == cb->uaiocb.aio_lio_opcode) { 1169 p = cb->userproc; 1170 ki = p->p_aioinfo; 1171 TAILQ_REMOVE(&so->so_aiojobq, cb, list); 1172 TAILQ_REMOVE(&ki->kaio_sockqueue, cb, plist); 1173 TAILQ_INSERT_TAIL(&aio_jobs, cb, list); 1174 TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, cb, plist); 1175 wakecount++; 1176 if (cb->jobstate != JOBST_JOBQGLOBAL) 1177 panic("invalid queue value"); 1178 } 1179 } 1180 1181 while (wakecount--) { 1182 if ((aiop = TAILQ_FIRST(&aio_freeproc)) != 0) { 1183 TAILQ_REMOVE(&aio_freeproc, aiop, list); 1184 TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list); 1185 aiop->aioprocflags &= ~AIOP_FREE; 1186 wakeup(aiop->aioproc); 1187 } 1188 } 1189#endif /* VFS_AIO */ 1190} 1191 1192#ifdef VFS_AIO 1193/* 1194 * Queue a new AIO request. Choosing either the threaded or direct physio VCHR 1195 * technique is done in this code. 
1196 */ 1197static int 1198_aio_aqueue(struct proc *p, struct aiocb *job, struct aio_liojob *lj, int type) 1199{ 1200 struct filedesc *fdp; 1201 struct file *fp; 1202 unsigned int fd; 1203 struct socket *so; 1204 int s; 1205 int error; 1206 int opcode; 1207 struct aiocblist *aiocbe; 1208 struct aioproclist *aiop; 1209 struct kaioinfo *ki; 1210 struct kevent kev; 1211 struct kqueue *kq; 1212 struct file *kq_fp; 1213 1214 if ((aiocbe = TAILQ_FIRST(&aio_freejobs)) != NULL) 1215 TAILQ_REMOVE(&aio_freejobs, aiocbe, list); 1216 else 1217 aiocbe = zalloc (aiocb_zone); 1218 1219 aiocbe->inputcharge = 0; 1220 aiocbe->outputcharge = 0; 1221 SLIST_INIT(&aiocbe->klist); 1222 1223 suword(&job->_aiocb_private.status, -1); 1224 suword(&job->_aiocb_private.error, 0); 1225 suword(&job->_aiocb_private.kernelinfo, -1); 1226 1227 error = copyin((caddr_t)job, (caddr_t) &aiocbe->uaiocb, sizeof 1228 aiocbe->uaiocb); 1229 if (error) { 1230 suword(&job->_aiocb_private.error, error); 1231 1232 TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list); 1233 return error; 1234 } 1235 1236 /* Save userspace address of the job info. */ 1237 aiocbe->uuaiocb = job; 1238 1239 /* Get the opcode. */ 1240 if (type != LIO_NOP) 1241 aiocbe->uaiocb.aio_lio_opcode = type; 1242 opcode = aiocbe->uaiocb.aio_lio_opcode; 1243 1244 /* Get the fd info for process. */ 1245 fdp = p->p_fd; 1246 1247 /* 1248 * Range check file descriptor. 
 */
	/*
	 * (Tail of _aio_aqueue(), whose head is above this chunk.)
	 * Validate the user-supplied descriptor: it must be in range and,
	 * for writes, opened with FWRITE.  On any early failure the aiocbe
	 * is returned to the free list and, for plain (type == 0) requests,
	 * the error is copied back out to the user's control block.
	 */
	fd = aiocbe->uaiocb.aio_fildes;
	if (fd >= fdp->fd_nfiles) {
		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
		if (type == 0)
			suword(&job->_aiocb_private.error, EBADF);
		return EBADF;
	}

	fp = aiocbe->fd_file = fdp->fd_ofiles[fd];
	if ((fp == NULL) || ((opcode == LIO_WRITE) && ((fp->f_flag & FWRITE) ==
	    0))) {
		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
		if (type == 0)
			suword(&job->_aiocb_private.error, EBADF);
		return EBADF;
	}

	/* An offset of -1 is the in-band "unset" sentinel; reject it. */
	if (aiocbe->uaiocb.aio_offset == -1LL) {
		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
		if (type == 0)
			suword(&job->_aiocb_private.error, EINVAL);
		return EINVAL;
	}

	/*
	 * Publish the kernel job reference into the user's control block so
	 * later aio_error()/aio_return() calls can find this request.
	 */
	error = suword(&job->_aiocb_private.kernelinfo, jobrefid);
	if (error) {
		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
		if (type == 0)
			suword(&job->_aiocb_private.error, EINVAL);
		return error;
	}

	aiocbe->uaiocb._aiocb_private.kernelinfo = (void *)(intptr_t)jobrefid;
	/* Wrap to 1 (not 0: 0 and -1 are reserved sentinels). */
	if (jobrefid == LONG_MAX)
		jobrefid = 1;
	else
		jobrefid++;

	/* LIO_NOP: report immediate success, nothing queued. */
	if (opcode == LIO_NOP) {
		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
		if (type == 0) {
			suword(&job->_aiocb_private.error, 0);
			suword(&job->_aiocb_private.status, 0);
			suword(&job->_aiocb_private.kernelinfo, 0);
		}
		return 0;
	}

	if ((opcode != LIO_READ) && (opcode != LIO_WRITE)) {
		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
		if (type == 0) {
			suword(&job->_aiocb_private.status, 0);
			suword(&job->_aiocb_private.error, EINVAL);
		}
		return EINVAL;
	}

	/* Hold the file across queueing; dropped at "done". */
	fhold(fp);

	if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_KEVENT) {
		kev.ident = aiocbe->uaiocb.aio_sigevent.sigev_notify_kqueue;
		kev.udata = aiocbe->uaiocb.aio_sigevent.sigev_value.sigval_ptr;
	}
	else {
		/*
		 * This method for requesting kevent-based notification won't
		 * work on the alpha, since we're passing in a pointer
		 * via aio_lio_opcode, which is an int.  Use the SIGEV_KEVENT-
		 * based method instead.
		 */
		struct kevent *kevp;

		kevp = (struct kevent *)job->aio_lio_opcode;
		if (kevp == NULL)
			goto no_kqueue;

		error = copyin((caddr_t)kevp, (caddr_t)&kev, sizeof(kev));
		if (error)
			goto aqueue_fail;
	}
	/* The notification target must itself be a valid kqueue descriptor. */
	if ((u_int)kev.ident >= fdp->fd_nfiles ||
	    (kq_fp = fdp->fd_ofiles[kev.ident]) == NULL ||
	    (kq_fp->f_type != DTYPE_KQUEUE)) {
		error = EBADF;
		goto aqueue_fail;
	}
	kq = (struct kqueue *)kq_fp->f_data;
	kev.ident = (uintptr_t)aiocbe;
	kev.filter = EVFILT_AIO;
	/* EV_FLAG1 marks this as a kernel-originated registration. */
	kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1;
	error = kqueue_register(kq, &kev, p);
aqueue_fail:
	if (error) {
		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
		if (type == 0)
			suword(&job->_aiocb_private.error, error);
		goto done;
	}
no_kqueue:

	suword(&job->_aiocb_private.error, EINPROGRESS);
	aiocbe->uaiocb._aiocb_private.error = EINPROGRESS;
	aiocbe->userproc = p;
	aiocbe->jobflags = 0;
	aiocbe->lio = lj;
	ki = p->p_aioinfo;

	if (fp->f_type == DTYPE_SOCKET) {
		/*
		 * Alternate queueing for socket ops: Reach down into the
		 * descriptor to get the socket data.  Then check to see if the
		 * socket is ready to be read or written (based on the requested
		 * operation).
		 *
		 * If it is not ready for io, then queue the aiocbe on the
		 * socket, and set the flags so we get a call when sbnotify()
		 * happens.
		 */
		so = (struct socket *)fp->f_data;
		s = splnet();
		if (((opcode == LIO_READ) && (!soreadable(so))) || ((opcode ==
		    LIO_WRITE) && (!sowriteable(so)))) {
			TAILQ_INSERT_TAIL(&so->so_aiojobq, aiocbe, list);
			TAILQ_INSERT_TAIL(&ki->kaio_sockqueue, aiocbe, plist);
			if (opcode == LIO_READ)
				so->so_rcv.sb_flags |= SB_AIO;
			else
				so->so_snd.sb_flags |= SB_AIO;
			aiocbe->jobstate = JOBST_JOBQGLOBAL; /* XXX */
			ki->kaio_queue_count++;
			num_queue_count++;
			splx(s);
			error = 0;
			goto done;
		}
		splx(s);
	}

	/* Try raw-device physio first; 0 means it was queued as buf I/O. */
	if ((error = aio_qphysio(p, aiocbe)) == 0)
		goto done;
	if (error > 0) {
		suword(&job->_aiocb_private.status, 0);
		aiocbe->uaiocb._aiocb_private.error = error;
		suword(&job->_aiocb_private.error, error);
		goto done;
	}

	/* No buffer for daemon I/O. */
	aiocbe->bp = NULL;

	ki->kaio_queue_count++;
	if (lj)
		lj->lioj_queue_count++;
	s = splnet();
	TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, aiocbe, plist);
	TAILQ_INSERT_TAIL(&aio_jobs, aiocbe, list);
	splx(s);
	aiocbe->jobstate = JOBST_JOBQGLOBAL;

	num_queue_count++;
	error = 0;

	/*
	 * If we don't have a free AIO process, and we are below our quota, then
	 * start one.  Otherwise, depend on the subsequent I/O completions to
	 * pick-up this job.  If we don't sucessfully create the new process
	 * (thread) due to resource issues, we return an error for now (EAGAIN),
	 * which is likely not the correct thing to do.
	 */
retryproc:
	s = splnet();
	if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) {
		TAILQ_REMOVE(&aio_freeproc, aiop, list);
		TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list);
		aiop->aioprocflags &= ~AIOP_FREE;
		wakeup(aiop->aioproc);
	} else if (((num_aio_resv_start + num_aio_procs) < max_aio_procs) &&
	    ((ki->kaio_active_count + num_aio_resv_start) <
	    ki->kaio_maxactive_count)) {
		/* num_aio_resv_start reserves a slot while aio_newproc runs. */
		num_aio_resv_start++;
		if ((error = aio_newproc()) == 0) {
			num_aio_resv_start--;
			p->p_retval[0] = 0;
			goto retryproc;
		}
		num_aio_resv_start--;
	}
	splx(s);
done:
	fdrop(fp, p);
	return error;
}

/*
 * This routine queues an AIO request, checking for quotas.
 * Thin wrapper around _aio_aqueue() that enforces the global and
 * per-process queue-depth limits before queueing.
 */
static int
aio_aqueue(struct proc *p, struct aiocb *job, int type)
{
	struct kaioinfo *ki;

	/* Lazily create this process's AIO bookkeeping. */
	if (p->p_aioinfo == NULL)
		aio_init_aioinfo(p);

	if (num_queue_count >= max_queue_count)
		return EAGAIN;

	ki = p->p_aioinfo;
	if (ki->kaio_queue_count >= ki->kaio_qallowed_count)
		return EAGAIN;

	return _aio_aqueue(p, job, NULL, type);
}
#endif /* VFS_AIO */

/*
 * Support the aio_return system call, as a side-effect, kernel resources are
 * released.
 */
int
aio_return(struct proc *p, struct aio_return_args *uap)
{
#ifndef VFS_AIO
	return ENOSYS;
#else
	int s;
	int jobref;
	struct aiocblist *cb, *ncb;
	struct aiocb *ujob;
	struct kaioinfo *ki;

	ki = p->p_aioinfo;
	if (ki == NULL)
		return EINVAL;

	ujob = uap->aiocbp;

	/*
	 * The job reference was stored into the user's control block by
	 * _aio_aqueue(); 0 and -1 are reserved sentinels (never assigned).
	 */
	jobref = fuword(&ujob->_aiocb_private.kernelinfo);
	if (jobref == -1 || jobref == 0)
		return EINVAL;

	/* First look among daemon-completed jobs (protected by splnet). */
	s = splnet();
	for (cb = TAILQ_FIRST(&ki->kaio_jobdone); cb; cb = TAILQ_NEXT(cb,
	    plist)) {
		if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) ==
		    jobref) {
			splx(s);
			if (ujob == cb->uuaiocb) {
				p->p_retval[0] =
				    cb->uaiocb._aiocb_private.status;
			} else
				p->p_retval[0] = EFAULT;
			/*
			 * Charge deferred block-I/O accounting to the caller
			 * now that the result is being collected.
			 */
			if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
				curproc->p_stats->p_ru.ru_oublock +=
				    cb->outputcharge;
				cb->outputcharge = 0;
			} else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
				curproc->p_stats->p_ru.ru_inblock +=
				    cb->inputcharge;
				cb->inputcharge = 0;
			}
			/* Releases the aiocblist and its kqueue knote. */
			aio_free_entry(cb);
			return 0;
		}
	}
	splx(s);

	/* Then the physio-completed (buf) jobs, protected by splbio. */
	s = splbio();
	for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb = ncb) {
		ncb = TAILQ_NEXT(cb, plist);
		if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo)
		    == jobref) {
			splx(s);
			if (ujob == cb->uuaiocb) {
				p->p_retval[0] =
				    cb->uaiocb._aiocb_private.status;
			} else
				p->p_retval[0] = EFAULT;
			aio_free_entry(cb);
			return 0;
		}
	}
	splx(s);

	/* Job not finished (or unknown): POSIX says this is EINVAL. */
	return (EINVAL);
#endif /* VFS_AIO */
}

/*
 * Allow a process to wakeup when any of the I/O requests are completed.
1540 */ 1541int 1542aio_suspend(struct proc *p, struct aio_suspend_args *uap) 1543{ 1544#ifndef VFS_AIO 1545 return ENOSYS; 1546#else 1547 struct timeval atv; 1548 struct timespec ts; 1549 struct aiocb *const *cbptr, *cbp; 1550 struct kaioinfo *ki; 1551 struct aiocblist *cb; 1552 int i; 1553 int njoblist; 1554 int error, s, timo; 1555 int *ijoblist; 1556 struct aiocb **ujoblist; 1557 1558 if (uap->nent >= AIO_LISTIO_MAX) 1559 return EINVAL; 1560 1561 timo = 0; 1562 if (uap->timeout) { 1563 /* Get timespec struct. */ 1564 if ((error = copyin(uap->timeout, &ts, sizeof(ts))) != 0) 1565 return error; 1566 1567 if (ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000) 1568 return (EINVAL); 1569 1570 TIMESPEC_TO_TIMEVAL(&atv, &ts); 1571 if (itimerfix(&atv)) 1572 return (EINVAL); 1573 timo = tvtohz(&atv); 1574 } 1575 1576 ki = p->p_aioinfo; 1577 if (ki == NULL) 1578 return EAGAIN; 1579 1580 njoblist = 0; 1581 ijoblist = zalloc(aiol_zone); 1582 ujoblist = zalloc(aiol_zone); 1583 cbptr = uap->aiocbp; 1584 1585 for (i = 0; i < uap->nent; i++) { 1586 cbp = (struct aiocb *)(intptr_t)fuword((caddr_t)&cbptr[i]); 1587 if (cbp == 0) 1588 continue; 1589 ujoblist[njoblist] = cbp; 1590 ijoblist[njoblist] = fuword(&cbp->_aiocb_private.kernelinfo); 1591 njoblist++; 1592 } 1593 1594 if (njoblist == 0) { 1595 zfree(aiol_zone, ijoblist); 1596 zfree(aiol_zone, ujoblist); 1597 return 0; 1598 } 1599 1600 error = 0; 1601 for (;;) { 1602 for (cb = TAILQ_FIRST(&ki->kaio_jobdone); cb; cb = 1603 TAILQ_NEXT(cb, plist)) { 1604 for (i = 0; i < njoblist; i++) { 1605 if (((intptr_t) 1606 cb->uaiocb._aiocb_private.kernelinfo) == 1607 ijoblist[i]) { 1608 if (ujoblist[i] != cb->uuaiocb) 1609 error = EINVAL; 1610 zfree(aiol_zone, ijoblist); 1611 zfree(aiol_zone, ujoblist); 1612 return error; 1613 } 1614 } 1615 } 1616 1617 s = splbio(); 1618 for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb = 1619 TAILQ_NEXT(cb, plist)) { 1620 for (i = 0; i < njoblist; i++) { 1621 if (((intptr_t) 1622 
cb->uaiocb._aiocb_private.kernelinfo) == 1623 ijoblist[i]) { 1624 splx(s); 1625 if (ujoblist[i] != cb->uuaiocb) 1626 error = EINVAL; 1627 zfree(aiol_zone, ijoblist); 1628 zfree(aiol_zone, ujoblist); 1629 return error; 1630 } 1631 } 1632 } 1633 1634 ki->kaio_flags |= KAIO_WAKEUP; 1635 error = tsleep(p, PRIBIO | PCATCH, "aiospn", timo); 1636 splx(s); 1637 1638 if (error == ERESTART || error == EINTR) { 1639 zfree(aiol_zone, ijoblist); 1640 zfree(aiol_zone, ujoblist); 1641 return EINTR; 1642 } else if (error == EWOULDBLOCK) { 1643 zfree(aiol_zone, ijoblist); 1644 zfree(aiol_zone, ujoblist); 1645 return EAGAIN; 1646 } 1647 } 1648 1649/* NOTREACHED */ 1650 return EINVAL; 1651#endif /* VFS_AIO */ 1652} 1653 1654/* 1655 * aio_cancel cancels any non-physio aio operations not currently in 1656 * progress. 1657 */ 1658int 1659aio_cancel(struct proc *p, struct aio_cancel_args *uap) 1660{ 1661#ifndef VFS_AIO 1662 return ENOSYS; 1663#else 1664 struct kaioinfo *ki; 1665 struct aiocblist *cbe, *cbn; 1666 struct file *fp; 1667 struct filedesc *fdp; 1668 struct socket *so; 1669 struct proc *po; 1670 int s,error; 1671 int cancelled=0; 1672 int notcancelled=0; 1673 struct vnode *vp; 1674 1675 fdp = p->p_fd; 1676 1677 fp = fdp->fd_ofiles[uap->fd]; 1678 1679 if (fp == NULL) { 1680 return EBADF; 1681 } 1682 1683 if (fp->f_type == DTYPE_VNODE) { 1684 vp = (struct vnode *)fp->f_data; 1685 1686 if (vn_isdisk(vp,&error)) { 1687 p->p_retval[0] = AIO_NOTCANCELED; 1688 return 0; 1689 } 1690 } else if (fp->f_type == DTYPE_SOCKET) { 1691 so = (struct socket *)fp->f_data; 1692 1693 s = splnet(); 1694 1695 for (cbe = TAILQ_FIRST(&so->so_aiojobq); cbe; cbe = cbn) { 1696 cbn = TAILQ_NEXT(cbe, list); 1697 if ((uap->aiocbp == NULL) || 1698 (uap->aiocbp == cbe->uuaiocb) ) { 1699 po = cbe->userproc; 1700 ki = po->p_aioinfo; 1701 TAILQ_REMOVE(&so->so_aiojobq, cbe, list); 1702 TAILQ_REMOVE(&ki->kaio_sockqueue, cbe, plist); 1703 TAILQ_INSERT_TAIL(&ki->kaio_jobdone, cbe, plist); 1704 if (ki->kaio_flags & 
KAIO_WAKEUP) { 1705 wakeup(po); 1706 } 1707 cbe->jobstate = JOBST_JOBFINISHED; 1708 cbe->uaiocb._aiocb_private.status=-1; 1709 cbe->uaiocb._aiocb_private.error=ECANCELED; 1710 cancelled++; 1711/* XXX cancelled, knote? */ 1712 if (cbe->uaiocb.aio_sigevent.sigev_notify == 1713 SIGEV_SIGNAL) 1714 psignal(cbe->userproc, cbe->uaiocb.aio_sigevent.sigev_signo); 1715 if (uap->aiocbp) 1716 break; 1717 } 1718 } 1719 1720 splx(s); 1721 1722 if ((cancelled) && (uap->aiocbp)) { 1723 p->p_retval[0] = AIO_CANCELED; 1724 return 0; 1725 } 1726 1727 } 1728 1729 ki=p->p_aioinfo; 1730 1731 s = splnet(); 1732 1733 for (cbe = TAILQ_FIRST(&ki->kaio_jobqueue); cbe; cbe = cbn) { 1734 cbn = TAILQ_NEXT(cbe, plist); 1735 1736 if ((uap->fd == cbe->uaiocb.aio_fildes) && 1737 ((uap->aiocbp == NULL ) || 1738 (uap->aiocbp == cbe->uuaiocb))) { 1739 1740 if (cbe->jobstate == JOBST_JOBQGLOBAL) { 1741 TAILQ_REMOVE(&aio_jobs, cbe, list); 1742 TAILQ_REMOVE(&ki->kaio_jobqueue, cbe, plist); 1743 TAILQ_INSERT_TAIL(&ki->kaio_jobdone, cbe, 1744 plist); 1745 cancelled++; 1746 ki->kaio_queue_finished_count++; 1747 cbe->jobstate = JOBST_JOBFINISHED; 1748 cbe->uaiocb._aiocb_private.status = -1; 1749 cbe->uaiocb._aiocb_private.error = ECANCELED; 1750/* XXX cancelled, knote? */ 1751 if (cbe->uaiocb.aio_sigevent.sigev_notify == 1752 SIGEV_SIGNAL) 1753 psignal(cbe->userproc, cbe->uaiocb.aio_sigevent.sigev_signo); 1754 } else { 1755 notcancelled++; 1756 } 1757 } 1758 } 1759 1760 splx(s); 1761 1762 1763 if (notcancelled) { 1764 p->p_retval[0] = AIO_NOTCANCELED; 1765 return 0; 1766 } 1767 1768 if (cancelled) { 1769 p->p_retval[0] = AIO_CANCELED; 1770 return 0; 1771 } 1772 1773 p->p_retval[0] = AIO_ALLDONE; 1774 1775 return 0; 1776#endif /* VFS_AIO */ 1777} 1778 1779/* 1780 * aio_error is implemented in the kernel level for compatibility purposes only. 1781 * For a user mode async implementation, it would be best to do it in a userland 1782 * subroutine. 
 */
int
aio_error(struct proc *p, struct aio_error_args *uap)
{
#ifndef VFS_AIO
	return ENOSYS;
#else
	int s;
	struct aiocblist *cb;
	struct kaioinfo *ki;
	int jobref;

	ki = p->p_aioinfo;
	if (ki == NULL)
		return EINVAL;

	jobref = fuword(&uap->aiocbp->_aiocb_private.kernelinfo);
	if ((jobref == -1) || (jobref == 0))
		return EINVAL;

	/*
	 * Finished daemon jobs: report the stored completion error.
	 * NOTE(review): this scan runs before splnet() is taken, unlike the
	 * queue scans below — verify the jobdone list cannot change at this
	 * priority level.
	 */
	for (cb = TAILQ_FIRST(&ki->kaio_jobdone); cb; cb = TAILQ_NEXT(cb,
	    plist)) {
		if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
		    jobref) {
			p->p_retval[0] = cb->uaiocb._aiocb_private.error;
			return 0;
		}
	}

	s = splnet();

	/* Still queued for a daemon: in progress. */
	for (cb = TAILQ_FIRST(&ki->kaio_jobqueue); cb; cb = TAILQ_NEXT(cb,
	    plist)) {
		if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
		    jobref) {
			p->p_retval[0] = EINPROGRESS;
			splx(s);
			return 0;
		}
	}

	/* Parked on a socket waiting for readiness: also in progress. */
	for (cb = TAILQ_FIRST(&ki->kaio_sockqueue); cb; cb = TAILQ_NEXT(cb,
	    plist)) {
		if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
		    jobref) {
			p->p_retval[0] = EINPROGRESS;
			splx(s);
			return 0;
		}
	}
	splx(s);

	/* Physio (buf) jobs, finished then pending, under splbio. */
	s = splbio();
	for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb = TAILQ_NEXT(cb,
	    plist)) {
		if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
		    jobref) {
			p->p_retval[0] = cb->uaiocb._aiocb_private.error;
			splx(s);
			return 0;
		}
	}

	for (cb = TAILQ_FIRST(&ki->kaio_bufqueue); cb; cb = TAILQ_NEXT(cb,
	    plist)) {
		if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
		    jobref) {
			p->p_retval[0] = EINPROGRESS;
			splx(s);
			return 0;
		}
	}
	splx(s);

#if (0)
	/*
	 * Hack for lio.
	 */
	status = fuword(&uap->aiocbp->_aiocb_private.status);
	if (status == -1)
		return fuword(&uap->aiocbp->_aiocb_private.error);
#endif
	return EINVAL;
#endif /* VFS_AIO */
}

/*
 * aio_read: queue an asynchronous read, or — when AIO_PMODE_SYNC is set in
 * the control block's private modes — perform the read synchronously.
 */
int
aio_read(struct proc *p, struct aio_read_args *uap)
{
#ifndef VFS_AIO
	return ENOSYS;
#else
	struct filedesc *fdp;
	struct file *fp;
	struct uio auio;
	struct iovec aiov;
	unsigned int fd;
	int cnt;
	struct aiocb iocb;
	int error, pmodes;

	/* Common case: hand the request to the async queueing machinery. */
	pmodes = fuword(&uap->aiocbp->_aiocb_private.privatemodes);
	if ((pmodes & AIO_PMODE_SYNC) == 0)
		return aio_aqueue(p, (struct aiocb *)uap->aiocbp, LIO_READ);

	/* Get control block. */
	if ((error = copyin((caddr_t)uap->aiocbp, (caddr_t)&iocb, sizeof iocb))
	    != 0)
		return error;

	/* Get the fd info for process. */
	fdp = p->p_fd;

	/*
	 * Range check file descriptor.
	 */
	fd = iocb.aio_fildes;
	if (fd >= fdp->fd_nfiles)
		return EBADF;
	fp = fdp->fd_ofiles[fd];
	if ((fp == NULL) || ((fp->f_flag & FREAD) == 0))
		return EBADF;
	if (iocb.aio_offset == -1LL)
		return EINVAL;

	/* Negative resid means aio_nbytes overflowed the (int) resid. */
	auio.uio_resid = iocb.aio_nbytes;
	if (auio.uio_resid < 0)
		return (EINVAL);

	/*
	 * Process sync simply -- queue async request.
	 * NOTE(review): this re-checks AIO_PMODE_SYNC from the copied-in
	 * block; the early fuword() check above already handled the async
	 * case, so this branch looks redundant — confirm before removing.
	 */
	if ((iocb._aiocb_private.privatemodes & AIO_PMODE_SYNC) == 0)
		return aio_aqueue(p, (struct aiocb *)uap->aiocbp, LIO_READ);

	aiov.iov_base = (void *)iocb.aio_buf;
	aiov.iov_len = iocb.aio_nbytes;

	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = iocb.aio_offset;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_procp = p;

	cnt = iocb.aio_nbytes;
	/*
	 * Temporarily bump the ref count while reading to avoid the
	 * descriptor being ripped out from under us.
	 */
	fhold(fp);
	error = fo_read(fp, &auio, fp->f_cred, FOF_OFFSET, p);
	fdrop(fp, p);
	/* A partial transfer interrupted by signal/restart still succeeds. */
	if (error && (auio.uio_resid != cnt) && (error == ERESTART || error ==
	    EINTR || error == EWOULDBLOCK))
		error = 0;
	cnt -= auio.uio_resid;
	p->p_retval[0] = cnt;
	return error;
#endif /* VFS_AIO */
}

/*
 * aio_write: queue an asynchronous write, or — when AIO_PMODE_SYNC is set —
 * perform the write synchronously.  Mirrors aio_read().
 */
int
aio_write(struct proc *p, struct aio_write_args *uap)
{
#ifndef VFS_AIO
	return ENOSYS;
#else
	struct filedesc *fdp;
	struct file *fp;
	struct uio auio;
	struct iovec aiov;
	unsigned int fd;
	int cnt;
	struct aiocb iocb;
	int error;
	int pmodes;

	/*
	 * Process sync simply -- queue async request.
	 */
	pmodes = fuword(&uap->aiocbp->_aiocb_private.privatemodes);
	if ((pmodes & AIO_PMODE_SYNC) == 0)
		return aio_aqueue(p, (struct aiocb *)uap->aiocbp, LIO_WRITE);

	if ((error = copyin((caddr_t)uap->aiocbp, (caddr_t)&iocb, sizeof iocb))
	    != 0)
		return error;

	/* Get the fd info for process. */
	fdp = p->p_fd;

	/*
	 * Range check file descriptor.
	 */
	fd = iocb.aio_fildes;
	if (fd >= fdp->fd_nfiles)
		return EBADF;
	fp = fdp->fd_ofiles[fd];
	if ((fp == NULL) || ((fp->f_flag & FWRITE) == 0))
		return EBADF;
	if (iocb.aio_offset == -1LL)
		return EINVAL;

	aiov.iov_base = (void *)iocb.aio_buf;
	aiov.iov_len = iocb.aio_nbytes;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = iocb.aio_offset;

	/* Negative resid means aio_nbytes overflowed the (int) resid. */
	auio.uio_resid = iocb.aio_nbytes;
	if (auio.uio_resid < 0)
		return (EINVAL);

	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_procp = p;

	cnt = iocb.aio_nbytes;
	/*
	 * Temporarily bump the ref count while writing to avoid the
	 * descriptor being ripped out from under us.
	 */
	fhold(fp);
	error = fo_write(fp, &auio, fp->f_cred, FOF_OFFSET, p);
	fdrop(fp, p);
	if (error) {
		if (auio.uio_resid != cnt) {
			if (error == ERESTART || error == EINTR || error ==
			    EWOULDBLOCK)
				error = 0;
			if (error == EPIPE)
				psignal(p, SIGPIPE);
		}
	}
	cnt -= auio.uio_resid;
	p->p_retval[0] = cnt;
	return error;
#endif /* VFS_AIO */
}

/*
 * lio_listio: queue a list of AIO requests as one job.  With LIO_WAIT the
 * call blocks until every queued request has completed.
 */
int
lio_listio(struct proc *p, struct lio_listio_args *uap)
{
#ifndef VFS_AIO
	return ENOSYS;
#else
	int nent, nentqueued;
	struct aiocb *iocb, * const *cbptr;
	struct aiocblist *cb;
	struct kaioinfo *ki;
	struct aio_liojob *lj;
	int error, runningcode;
	int nerror;
	int i;
	int s;

	if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
		return EINVAL;

	nent = uap->nent;
	if (nent > AIO_LISTIO_MAX)
		return EINVAL;

	if (p->p_aioinfo == NULL)
		aio_init_aioinfo(p);

	/* Enforce global and per-process queue-depth quotas up front. */
	if ((nent + num_queue_count) > max_queue_count)
		return EAGAIN;

	ki = p->p_aioinfo;
	if ((nent + ki->kaio_queue_count) > ki->kaio_qallowed_count)
		return EAGAIN;

	lj = zalloc(aiolio_zone);
	if (!lj)
		return EAGAIN;

	lj->lioj_flags = 0;
	lj->lioj_buffer_count = 0;
	lj->lioj_buffer_finished_count = 0;
	lj->lioj_queue_count = 0;
	lj->lioj_queue_finished_count = 0;
	lj->lioj_ki = ki;
	/* lj stays on this list until freed at job completion/rundown. */
	TAILQ_INSERT_TAIL(&ki->kaio_liojoblist, lj, lioj_list);

	/*
	 * Setup signal.
	 */
	if (uap->sig && (uap->mode == LIO_NOWAIT)) {
		error = copyin(uap->sig, &lj->lioj_signal,
		    sizeof(lj->lioj_signal));
		if (error)
			return error;
		lj->lioj_flags |= LIOJ_SIGNAL;
		lj->lioj_flags &= ~LIOJ_SIGNAL_POSTED;
	} else
		lj->lioj_flags &= ~LIOJ_SIGNAL;

	/*
	 * Get pointers to the list of I/O requests.
	 */
	nerror = 0;
	nentqueued = 0;
	cbptr = uap->acb_list;
	for (i = 0; i < uap->nent; i++) {
		iocb = (struct aiocb *)(intptr_t)fuword((caddr_t)&cbptr[i]);
		if (((intptr_t)iocb != -1) && ((intptr_t)iocb != NULL)) {
			error = _aio_aqueue(p, iocb, lj, 0);
			if (error == 0)
				nentqueued++;
			else
				nerror++;
		}
	}

	/*
	 * If we haven't queued any, then just return error.
	 * NOTE(review): despite the comment, this returns 0 (success) even
	 * when every request failed to queue (nerror > 0) — arguably it
	 * should return EIO in that case; confirm against POSIX lio_listio.
	 */
	if (nentqueued == 0)
		return 0;

	/*
	 * Calculate the appropriate error return.
	 */
	runningcode = 0;
	if (nerror)
		runningcode = EIO;

	if (uap->mode == LIO_WAIT) {
		int command, found, jobref;

		/* Poll done queues, sleeping between passes, until all done. */
		for (;;) {
			found = 0;
			for (i = 0; i < uap->nent; i++) {
				/*
				 * Fetch address of the control buf pointer in
				 * user space.
				 */
				iocb = (struct aiocb *)(intptr_t)fuword((caddr_t)&cbptr[i]);
				if (((intptr_t)iocb == -1) || ((intptr_t)iocb
				    == 0))
					continue;

				/*
				 * Fetch the associated command from user space.
				 */
				command = fuword(&iocb->aio_lio_opcode);
				if (command == LIO_NOP) {
					found++;
					continue;
				}

				jobref = fuword(&iocb->_aiocb_private.kernelinfo);

				TAILQ_FOREACH(cb, &ki->kaio_jobdone, plist) {
					if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo)
					    == jobref) {
						/* Settle block accounting. */
						if (cb->uaiocb.aio_lio_opcode
						    == LIO_WRITE) {
							curproc->p_stats->p_ru.ru_oublock
							    +=
							    cb->outputcharge;
							cb->outputcharge = 0;
						} else if (cb->uaiocb.aio_lio_opcode
						    == LIO_READ) {
							curproc->p_stats->p_ru.ru_inblock
							    += cb->inputcharge;
							cb->inputcharge = 0;
						}
						found++;
						break;
					}
				}

				s = splbio();
				TAILQ_FOREACH(cb, &ki->kaio_bufdone, plist) {
					if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo)
					    == jobref) {
						found++;
						break;
					}
				}
				splx(s);
			}

			/*
			 * If all I/Os have been disposed of, then we can
			 * return.
			 */
			if (found == nentqueued)
				return runningcode;

			ki->kaio_flags |= KAIO_WAKEUP;
			error = tsleep(p, PRIBIO | PCATCH, "aiospn", 0);

			if (error == EINTR)
				return EINTR;
			else if (error == EWOULDBLOCK)
				return EAGAIN;
		}
	}

	return runningcode;
#endif /* VFS_AIO */
}

#ifdef VFS_AIO
/*
 * This is a weird hack so that we can post a signal.  It is safe to do so from
 * a timeout routine, but *not* from an interrupt routine.
2194 */ 2195static void 2196process_signal(void *aioj) 2197{ 2198 struct aiocblist *aiocbe = aioj; 2199 struct aio_liojob *lj = aiocbe->lio; 2200 struct aiocb *cb = &aiocbe->uaiocb; 2201 2202 if ((lj) && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL) && 2203 (lj->lioj_queue_count == lj->lioj_queue_finished_count)) { 2204 psignal(lj->lioj_ki->kaio_p, lj->lioj_signal.sigev_signo); 2205 lj->lioj_flags |= LIOJ_SIGNAL_POSTED; 2206 } 2207 2208 if (cb->aio_sigevent.sigev_notify == SIGEV_SIGNAL) 2209 psignal(aiocbe->userproc, cb->aio_sigevent.sigev_signo); 2210} 2211 2212/* 2213 * Interrupt handler for physio, performs the necessary process wakeups, and 2214 * signals. 2215 */ 2216static void 2217aio_physwakeup(struct buf *bp) 2218{ 2219 struct aiocblist *aiocbe; 2220 struct proc *p; 2221 struct kaioinfo *ki; 2222 struct aio_liojob *lj; 2223 2224 wakeup((caddr_t)bp); 2225 2226 aiocbe = (struct aiocblist *)bp->b_spc; 2227 if (aiocbe) { 2228 p = bp->b_caller1; 2229 2230 aiocbe->jobstate = JOBST_JOBBFINISHED; 2231 aiocbe->uaiocb._aiocb_private.status -= bp->b_resid; 2232 aiocbe->uaiocb._aiocb_private.error = 0; 2233 aiocbe->jobflags |= AIOCBLIST_DONE; 2234 2235 if (bp->b_ioflags & BIO_ERROR) 2236 aiocbe->uaiocb._aiocb_private.error = bp->b_error; 2237 2238 lj = aiocbe->lio; 2239 if (lj) { 2240 lj->lioj_buffer_finished_count++; 2241 2242 /* 2243 * wakeup/signal if all of the interrupt jobs are done. 2244 */ 2245 if (lj->lioj_buffer_finished_count == 2246 lj->lioj_buffer_count) { 2247 /* 2248 * Post a signal if it is called for. 
2249 */ 2250 if ((lj->lioj_flags & 2251 (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) == 2252 LIOJ_SIGNAL) { 2253 lj->lioj_flags |= LIOJ_SIGNAL_POSTED; 2254 timeout(process_signal, aiocbe, 0); 2255 } 2256 } 2257 } 2258 2259 ki = p->p_aioinfo; 2260 if (ki) { 2261 ki->kaio_buffer_finished_count++; 2262 TAILQ_REMOVE(&aio_bufjobs, aiocbe, list); 2263 TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist); 2264 TAILQ_INSERT_TAIL(&ki->kaio_bufdone, aiocbe, plist); 2265 2266 KNOTE(&aiocbe->klist, 0); 2267 /* Do the wakeup. */ 2268 if (ki->kaio_flags & (KAIO_RUNDOWN|KAIO_WAKEUP)) { 2269 ki->kaio_flags &= ~KAIO_WAKEUP; 2270 wakeup(p); 2271 } 2272 } 2273 2274 if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL) 2275 timeout(process_signal, aiocbe, 0); 2276 } 2277} 2278#endif /* VFS_AIO */ 2279 2280int 2281aio_waitcomplete(struct proc *p, struct aio_waitcomplete_args *uap) 2282{ 2283#ifndef VFS_AIO 2284 return ENOSYS; 2285#else 2286 struct timeval atv; 2287 struct timespec ts; 2288 struct aiocb **cbptr; 2289 struct kaioinfo *ki; 2290 struct aiocblist *cb = NULL; 2291 int error, s, timo; 2292 2293 suword(uap->aiocbp, (int)NULL); 2294 2295 timo = 0; 2296 if (uap->timeout) { 2297 /* Get timespec struct. 
*/ 2298 error = copyin((caddr_t)uap->timeout, (caddr_t)&ts, 2299 sizeof(ts)); 2300 if (error) 2301 return error; 2302 2303 if ((ts.tv_nsec < 0) || (ts.tv_nsec >= 1000000000)) 2304 return (EINVAL); 2305 2306 TIMESPEC_TO_TIMEVAL(&atv, &ts); 2307 if (itimerfix(&atv)) 2308 return (EINVAL); 2309 timo = tvtohz(&atv); 2310 } 2311 2312 ki = p->p_aioinfo; 2313 if (ki == NULL) 2314 return EAGAIN; 2315 2316 cbptr = uap->aiocbp; 2317 2318 for (;;) { 2319 if ((cb = TAILQ_FIRST(&ki->kaio_jobdone)) != 0) { 2320 suword(uap->aiocbp, (int)cb->uuaiocb); 2321 p->p_retval[0] = cb->uaiocb._aiocb_private.status; 2322 if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) { 2323 curproc->p_stats->p_ru.ru_oublock += 2324 cb->outputcharge; 2325 cb->outputcharge = 0; 2326 } else if (cb->uaiocb.aio_lio_opcode == LIO_READ) { 2327 curproc->p_stats->p_ru.ru_inblock += 2328 cb->inputcharge; 2329 cb->inputcharge = 0; 2330 } 2331 aio_free_entry(cb); 2332 return cb->uaiocb._aiocb_private.error; 2333 } 2334 2335 s = splbio(); 2336 if ((cb = TAILQ_FIRST(&ki->kaio_bufdone)) != 0 ) { 2337 splx(s); 2338 suword(uap->aiocbp, (int)cb->uuaiocb); 2339 p->p_retval[0] = cb->uaiocb._aiocb_private.status; 2340 aio_free_entry(cb); 2341 return cb->uaiocb._aiocb_private.error; 2342 } 2343 2344 ki->kaio_flags |= KAIO_WAKEUP; 2345 error = tsleep(p, PRIBIO | PCATCH, "aiowc", timo); 2346 splx(s); 2347 2348 if (error == ERESTART) 2349 return EINTR; 2350 else if (error < 0) 2351 return error; 2352 else if (error == EINTR) 2353 return EINTR; 2354 else if (error == EWOULDBLOCK) 2355 return EAGAIN; 2356 } 2357#endif /* VFS_AIO */ 2358} 2359 2360 2361#ifndef VFS_AIO 2362static int 2363filt_aioattach(struct knote *kn) 2364{ 2365 2366 return (ENXIO); 2367} 2368 2369struct filterops aio_filtops = 2370 { 0, filt_aioattach, NULL, NULL }; 2371 2372#else 2373static int 2374filt_aioattach(struct knote *kn) 2375{ 2376 struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_id; 2377 2378 /* 2379 * The aiocbe pointer must be validated before using 
it, so 2380 * registration is restricted to the kernel; the user cannot 2381 * set EV_FLAG1. 2382 */ 2383 if ((kn->kn_flags & EV_FLAG1) == 0) 2384 return (EPERM); 2385 kn->kn_flags &= ~EV_FLAG1; 2386 2387 SLIST_INSERT_HEAD(&aiocbe->klist, kn, kn_selnext); 2388 2389 return (0); 2390} 2391 2392static void 2393filt_aiodetach(struct knote *kn) 2394{ 2395 struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_id; 2396 int s = splhigh(); /* XXX no clue, so overkill */ 2397 2398 SLIST_REMOVE(&aiocbe->klist, kn, knote, kn_selnext); 2399 splx(s); 2400} 2401 2402/*ARGSUSED*/ 2403static int 2404filt_aio(struct knote *kn, long hint) 2405{ 2406 struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_id; 2407 2408 kn->kn_data = 0; /* XXX data returned? */ 2409 if (aiocbe->jobstate != JOBST_JOBFINISHED && 2410 aiocbe->jobstate != JOBST_JOBBFINISHED) 2411 return (0); 2412 kn->kn_flags |= EV_EOF; 2413 return (1); 2414} 2415 2416struct filterops aio_filtops = 2417 { 0, filt_aioattach, filt_aiodetach, filt_aio }; 2418#endif /* VFS_AIO */
|