vfs_aio.c revision 31473
/*
 * Copyright (c) 1997 John S. Dyson.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. John S. Dyson's name may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * DISCLAIMER:  This code isn't warranted to do anything useful.  Anything
 * bad that happens because of using this software isn't the responsibility
 * of the author.  This software is distributed AS-IS.
 *
 * $Id: vfs_aio.c,v 1.16 1997/11/30 23:21:08 dyson Exp $
 */

/*
 * This file contains support for the POSIX.4 AIO/LIO facility.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/lock.h>
#include <sys/unistd.h>
#include <sys/proc.h>
#include <sys/uio.h>
#include <sys/malloc.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <sys/vnode.h>
#include <sys/conf.h>
#include <miscfs/specfs/specdev.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_zone.h>
#include <sys/aio.h>
#include <sys/shm.h>
#include <sys/user.h>

#include <machine/cpu.h>

static int jobrefid;

#define JOBST_NULL		0x0
#define JOBST_JOBQPROC		0x1
#define JOBST_JOBQGLOBAL	0x2
#define JOBST_JOBRUNNING	0x3
#define JOBST_JOBFINISHED	0x4
#define JOBST_JOBQBUF		0x5
#define JOBST_JOBBFINISHED	0x6

#ifndef MAX_AIO_PER_PROC
#define MAX_AIO_PER_PROC	32
#endif

#ifndef MAX_AIO_QUEUE_PER_PROC
#define MAX_AIO_QUEUE_PER_PROC	256	/* Bigger than AIO_LISTIO_MAX */
#endif

#ifndef MAX_AIO_PROCS
#define MAX_AIO_PROCS		32
#endif

#ifndef MAX_AIO_QUEUE
#define MAX_AIO_QUEUE		1024	/* Bigger than AIO_LISTIO_MAX */
#endif

#ifndef TARGET_AIO_PROCS
#define TARGET_AIO_PROCS	0
#endif

#ifndef MAX_BUF_AIO
#define MAX_BUF_AIO		16
#endif

#ifndef AIOD_TIMEOUT_DEFAULT
#define AIOD_TIMEOUT_DEFAULT	(10 * hz)
#endif

#ifndef AIOD_LIFETIME_DEFAULT
#define AIOD_LIFETIME_DEFAULT	(30 * hz)
#endif

int max_aio_procs = MAX_AIO_PROCS;
int num_aio_procs = 0;
int target_aio_procs = TARGET_AIO_PROCS;
int max_queue_count = MAX_AIO_QUEUE;
int num_queue_count = 0;
int num_buf_aio = 0;
int num_aio_resv_start = 0;
int aiod_timeout;
int aiod_lifetime;

int max_aio_per_proc = MAX_AIO_PER_PROC;
int max_aio_queue_per_proc = MAX_AIO_QUEUE_PER_PROC;

int max_buf_aio = MAX_BUF_AIO;

SYSCTL_NODE(_vfs, OID_AUTO, aio, CTLFLAG_RW, 0, "AIO mgmt");

SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_per_proc,
	CTLFLAG_RW, &max_aio_per_proc, 0, "");

SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue_per_proc,
	CTLFLAG_RW, &max_aio_queue_per_proc, 0, "");

SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_procs,
	CTLFLAG_RW, &max_aio_procs, 0, "");

SYSCTL_INT(_vfs_aio, OID_AUTO, num_aio_procs,
	CTLFLAG_RD, &num_aio_procs, 0, "");

SYSCTL_INT(_vfs_aio, OID_AUTO, num_queue_count,
	CTLFLAG_RD, &num_queue_count, 0, "");

SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue,
	CTLFLAG_RW, &max_queue_count, 0, "");

SYSCTL_INT(_vfs_aio, OID_AUTO, target_aio_procs,
	CTLFLAG_RW, &target_aio_procs, 0, "");

SYSCTL_INT(_vfs_aio, OID_AUTO, max_buf_aio,
	CTLFLAG_RW, &max_buf_aio, 0, "");

SYSCTL_INT(_vfs_aio, OID_AUTO, num_buf_aio,
	CTLFLAG_RD, &num_buf_aio, 0, "");

SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_lifetime,
	CTLFLAG_RW, &aiod_lifetime, 0, "");

SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_timeout,
	CTLFLAG_RW, &aiod_timeout, 0, "");
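/*
 * All of the limits above are runtime-tunable through the vfs.aio sysctl
 * tree declared above, e.g. (illustrative values only):
 *
 *	sysctl -w vfs.aio.max_aio_per_proc=64
 *	sysctl -w vfs.aio.max_buf_aio=32
 *
 * num_aio_procs, num_queue_count, and num_buf_aio are exported read-only
 * (CTLFLAG_RD) as instantaneous usage counters.
 */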
/*
 * Job queue item
 */

#define AIOCBLIST_CANCELLED	0x1
#define AIOCBLIST_RUNDOWN	0x4
#define AIOCBLIST_ASYNCFREE	0x8
#define AIOCBLIST_DONE		0x10

struct aiocblist {
	TAILQ_ENTRY (aiocblist) list;	/* List of jobs */
	TAILQ_ENTRY (aiocblist) plist;	/* List of jobs for proc */
	int jobflags;
	int jobstate;
	int inputcharge, outputcharge;
	struct buf *bp;			/* buffer pointer */
	struct proc *userproc;		/* User process */
	struct aioproclist *jobaioproc;	/* AIO process descriptor */
	struct aio_liojob *lio;		/* optional lio job */
	struct aiocb *uuaiocb;		/* pointer in userspace of aiocb */
	struct aiocb uaiocb;		/* Kernel I/O control block */
};

/*
 * AIO process info
 */
#define AIOP_FREE	0x1		/* proc on free queue */
#define AIOP_SCHED	0x2		/* proc explicitly scheduled */

struct aioproclist {
	int aioprocflags;		/* AIO proc flags */
	TAILQ_ENTRY(aioproclist) list;	/* List of processes */
	struct proc *aioproc;		/* The AIO thread */
	TAILQ_HEAD (,aiocblist) jobtorun; /* suggested job to run */
};

/*
 * data-structure for lio signal management
 */
struct aio_liojob {
	int lioj_flags;
	int lioj_buffer_count;
	int lioj_buffer_finished_count;
	int lioj_queue_count;
	int lioj_queue_finished_count;
	struct sigevent lioj_signal;	/* signal on all I/O done */
	TAILQ_ENTRY (aio_liojob) lioj_list;
	struct kaioinfo *lioj_ki;
};
#define	LIOJ_SIGNAL		0x1	/* signal on all done (lio) */
#define	LIOJ_SIGNAL_POSTED	0x2	/* signal has been posted */

/*
 * per process aio data structure
 */
struct kaioinfo {
	int kaio_flags;			/* per process kaio flags */
	int kaio_maxactive_count;	/* maximum number of AIOs */
	int kaio_active_count;		/* number of currently used AIOs */
	int kaio_qallowed_count;	/* maximum size of AIO queue */
	int kaio_queue_count;		/* size of AIO queue */
	int kaio_ballowed_count;	/* maximum number of buffers */
	int kaio_queue_finished_count;	/* number of daemon jobs finished */
	int kaio_buffer_count;		/* number of physio buffers */
	int kaio_buffer_finished_count;	/* count of I/O done */
	struct proc *kaio_p;		/* process that uses this kaio block */
	TAILQ_HEAD (,aio_liojob) kaio_liojoblist; /* list of lio jobs */
	TAILQ_HEAD (,aiocblist) kaio_jobqueue;	/* job queue for process */
	TAILQ_HEAD (,aiocblist) kaio_jobdone;	/* done queue for process */
	TAILQ_HEAD (,aiocblist) kaio_bufqueue;	/* buffer job queue for process */
	TAILQ_HEAD (,aiocblist) kaio_bufdone;	/* buffer done queue for process */
};

#define KAIO_RUNDOWN	0x1	/* process is being run down */
#define KAIO_WAKEUP	0x2	/* wakeup process when there is a significant
				   event */

TAILQ_HEAD (,aioproclist) aio_freeproc, aio_activeproc;
TAILQ_HEAD(,aiocblist) aio_jobs;	/* Async job list */
TAILQ_HEAD(,aiocblist) aio_bufjobs;	/* Phys I/O job list */
TAILQ_HEAD(,aiocblist) aio_freejobs;	/* Pool of free jobs */
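/*
 * Job flow, as implemented below: a queued request normally starts on
 * the global aio_jobs list (JOBST_JOBQGLOBAL; JOBST_JOBQPROC marks a
 * job parked on a specific daemon's jobtorun list), is marked
 * JOBST_JOBRUNNING while an AIO daemon services it, and finishes on
 * the owner's kaio_jobdone list (JOBST_JOBFINISHED), where aio_return()
 * reaps it.  Requests eligible for raw-device physio bypass the daemons
 * entirely: they wait on kaio_bufqueue (JOBST_JOBQBUF) until
 * aio_physwakeup() moves them to kaio_bufdone (JOBST_JOBBFINISHED).
 */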
static void aio_init_aioinfo(struct proc *p);
static void aio_onceonly(void *);
static int aio_free_entry(struct aiocblist *aiocbe);
static void aio_process(struct aiocblist *aiocbe);
static int aio_newproc(void);
static int aio_aqueue(struct proc *p, struct aiocb *job, int type);
static void aio_physwakeup(struct buf *bp);
static int aio_fphysio(struct proc *p, struct aiocblist *aiocbe, int type);
static int aio_qphysio(struct proc *p, struct aiocblist *iocb);
static void aio_daemon(void *uproc);

SYSINIT(aio, SI_SUB_VFS, SI_ORDER_ANY, aio_onceonly, NULL);

static vm_zone_t kaio_zone = 0, aiop_zone = 0,
	aiocb_zone = 0, aiol_zone = 0, aiolio_zone = 0;

/*
 * Single AIOD vmspace shared amongst all of them
 */
static struct vmspace *aiovmspace = NULL;

/*
 * Startup initialization
 */
void
aio_onceonly(void *na)
{
	TAILQ_INIT(&aio_freeproc);
	TAILQ_INIT(&aio_activeproc);
	TAILQ_INIT(&aio_jobs);
	TAILQ_INIT(&aio_bufjobs);
	TAILQ_INIT(&aio_freejobs);
	kaio_zone = zinit("AIO", sizeof (struct kaioinfo), 0, 0, 1);
	aiop_zone = zinit("AIOP", sizeof (struct aioproclist), 0, 0, 1);
	aiocb_zone = zinit("AIOCB", sizeof (struct aiocblist), 0, 0, 1);
	aiol_zone = zinit("AIOL", AIO_LISTIO_MAX * sizeof (int), 0, 0, 1);
	aiolio_zone = zinit("AIOLIO",
		AIO_LISTIO_MAX * sizeof (struct aio_liojob), 0, 0, 1);
	aiod_timeout = AIOD_TIMEOUT_DEFAULT;
	aiod_lifetime = AIOD_LIFETIME_DEFAULT;
	jobrefid = 1;
}

/*
 * Init the per-process aioinfo structure.
 * The aioinfo limits are set per-process for user limit (resource) management.
 */
void
aio_init_aioinfo(struct proc *p)
{
	struct kaioinfo *ki;

	if (p->p_aioinfo == NULL) {
		ki = zalloc(kaio_zone);
		p->p_aioinfo = ki;
		ki->kaio_flags = 0;
		ki->kaio_maxactive_count = max_aio_per_proc;
		ki->kaio_active_count = 0;
		ki->kaio_qallowed_count = max_aio_queue_per_proc;
		ki->kaio_queue_count = 0;
		ki->kaio_ballowed_count = max_buf_aio;
		ki->kaio_buffer_count = 0;
		ki->kaio_buffer_finished_count = 0;
		ki->kaio_p = p;
		TAILQ_INIT(&ki->kaio_jobdone);
		TAILQ_INIT(&ki->kaio_jobqueue);
		TAILQ_INIT(&ki->kaio_bufdone);
		TAILQ_INIT(&ki->kaio_bufqueue);
		TAILQ_INIT(&ki->kaio_liojoblist);
	}
}
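/*
 * Note that the per-process limits are snapshotted from the sysctl-backed
 * globals at the first AIO request a process makes; changing
 * vfs.aio.max_aio_per_proc and friends afterwards only affects processes
 * that have not yet allocated their kaioinfo block.
 */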
/*
 * Free a job entry.  Wait for completion if it is currently active,
 * but don't delay forever.  If we delay, we return a flag that says
 * that we have to restart the queue scan.
 */
int
aio_free_entry(struct aiocblist *aiocbe)
{
	struct kaioinfo *ki;
	struct aioproclist *aiop;
	struct aio_liojob *lj;
	struct proc *p;
	int error;
	int s;

	if (aiocbe->jobstate == JOBST_NULL)
		panic("aio_free_entry: freeing already free job");

	p = aiocbe->userproc;
	ki = p->p_aioinfo;
	lj = aiocbe->lio;
	if (ki == NULL)
		panic("aio_free_entry: missing p->p_aioinfo");

	if (aiocbe->jobstate == JOBST_JOBRUNNING) {
		if (aiocbe->jobflags & AIOCBLIST_ASYNCFREE)
			return 0;
		aiocbe->jobflags |= AIOCBLIST_RUNDOWN;
		tsleep(aiocbe, PRIBIO|PCATCH, "jobwai", 0);
	}
	aiocbe->jobflags &= ~AIOCBLIST_ASYNCFREE;

	if (aiocbe->bp == NULL) {
		if (ki->kaio_queue_count <= 0)
			panic("aio_free_entry: process queue size <= 0");
		if (num_queue_count <= 0)
			panic("aio_free_entry: system wide queue size <= 0");

		if (lj) {
			lj->lioj_queue_count--;
			if (aiocbe->jobflags & AIOCBLIST_DONE)
				lj->lioj_queue_finished_count--;
		}
		ki->kaio_queue_count--;
		if (aiocbe->jobflags & AIOCBLIST_DONE)
			ki->kaio_queue_finished_count--;
		num_queue_count--;
	} else {
		if (lj) {
			lj->lioj_buffer_count--;
			if (aiocbe->jobflags & AIOCBLIST_DONE)
				lj->lioj_buffer_finished_count--;
		}
		if (aiocbe->jobflags & AIOCBLIST_DONE)
			ki->kaio_buffer_finished_count--;
		ki->kaio_buffer_count--;
		num_buf_aio--;
	}

	if ((ki->kaio_flags & KAIO_WAKEUP) ||
	    ((ki->kaio_flags & KAIO_RUNDOWN) &&
	    ((ki->kaio_buffer_count == 0) && (ki->kaio_queue_count == 0)))) {
		ki->kaio_flags &= ~KAIO_WAKEUP;
		wakeup(p);
	}

	if (aiocbe->jobstate == JOBST_JOBQBUF) {
		if ((error = aio_fphysio(p, aiocbe, 1)) != 0)
			return error;
		if (aiocbe->jobstate != JOBST_JOBBFINISHED)
			panic("aio_free_entry: invalid physio finish-up state");
		s = splbio();
		TAILQ_REMOVE(&ki->kaio_bufdone, aiocbe, plist);
		splx(s);
	} else if (aiocbe->jobstate == JOBST_JOBQPROC) {
		aiop = aiocbe->jobaioproc;
		TAILQ_REMOVE(&aiop->jobtorun, aiocbe, list);
	} else if (aiocbe->jobstate == JOBST_JOBQGLOBAL) {
		TAILQ_REMOVE(&aio_jobs, aiocbe, list);
	} else if (aiocbe->jobstate == JOBST_JOBFINISHED) {
		TAILQ_REMOVE(&ki->kaio_jobdone, aiocbe, plist);
	} else if (aiocbe->jobstate == JOBST_JOBBFINISHED) {
		s = splbio();
		TAILQ_REMOVE(&ki->kaio_bufdone, aiocbe, plist);
		splx(s);
		if (aiocbe->bp) {
			vunmapbuf(aiocbe->bp);
			relpbuf(aiocbe->bp);
			aiocbe->bp = NULL;
		}
	}

	if (lj && (lj->lioj_buffer_count == 0) && (lj->lioj_queue_count == 0)) {
		TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
		zfree(aiolio_zone, lj);
	}
	TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
	aiocbe->jobstate = JOBST_NULL;
	return 0;
}
/*
 * Rundown the jobs for a given process.
 */
void
aio_proc_rundown(struct proc *p)
{
	int s;
	struct kaioinfo *ki;
	struct aio_liojob *lj, *ljn;
	struct aiocblist *aiocbe, *aiocbn;

	ki = p->p_aioinfo;
	if (ki == NULL)
		return;

	ki->kaio_flags |= KAIO_WAKEUP;
	while ((ki->kaio_active_count > 0) ||
		(ki->kaio_buffer_count > ki->kaio_buffer_finished_count)) {
		ki->kaio_flags |= KAIO_RUNDOWN;
		if (tsleep(p, PRIBIO, "kaiowt", aiod_timeout))
			break;
	}

restart1:
	for (aiocbe = TAILQ_FIRST(&ki->kaio_jobdone); aiocbe; aiocbe = aiocbn) {
		aiocbn = TAILQ_NEXT(aiocbe, plist);
		if (aio_free_entry(aiocbe))
			goto restart1;
	}

restart2:
	for (aiocbe = TAILQ_FIRST(&ki->kaio_jobqueue); aiocbe; aiocbe = aiocbn) {
		aiocbn = TAILQ_NEXT(aiocbe, plist);
		if (aio_free_entry(aiocbe))
			goto restart2;
	}

/*
 * Note the use of lots of splbio here, trying to avoid
 * splbio for long chains of I/O.  Probably unnecessary.
 */

restart3:
	s = splbio();
	while (TAILQ_FIRST(&ki->kaio_bufqueue)) {
		ki->kaio_flags |= KAIO_WAKEUP;
		tsleep(p, PRIBIO, "aioprn", 0);
		splx(s);
		goto restart3;
	}
	splx(s);

restart4:
	s = splbio();
	for (aiocbe = TAILQ_FIRST(&ki->kaio_bufdone); aiocbe; aiocbe = aiocbn) {
		aiocbn = TAILQ_NEXT(aiocbe, plist);
		if (aio_free_entry(aiocbe)) {
			splx(s);
			goto restart4;
		}
	}
	splx(s);

	for (lj = TAILQ_FIRST(&ki->kaio_liojoblist); lj; lj = ljn) {
		ljn = TAILQ_NEXT(lj, lioj_list);
		if ((lj->lioj_buffer_count == 0) && (lj->lioj_queue_count == 0)) {
			TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
			zfree(aiolio_zone, lj);
		} else {
#if defined(DIAGNOSTIC)
			printf("LIO job not cleaned up: B:%d, BF:%d, Q:%d, QF:%d\n",
				lj->lioj_buffer_count, lj->lioj_buffer_finished_count,
				lj->lioj_queue_count, lj->lioj_queue_finished_count);
#endif
		}
	}

	zfree(kaio_zone, ki);
	p->p_aioinfo = NULL;
}

/*
 * Select a job to run (called by an AIO daemon)
 */
static struct aiocblist *
aio_selectjob(struct aioproclist *aiop)
{
	struct aiocblist *aiocbe;

	aiocbe = TAILQ_FIRST(&aiop->jobtorun);
	if (aiocbe) {
		TAILQ_REMOVE(&aiop->jobtorun, aiocbe, list);
		return aiocbe;
	}

	for (aiocbe = TAILQ_FIRST(&aio_jobs); aiocbe;
	    aiocbe = TAILQ_NEXT(aiocbe, list)) {
		struct kaioinfo *ki;
		struct proc *userp;

		userp = aiocbe->userproc;
		ki = userp->p_aioinfo;

		if (ki->kaio_active_count < ki->kaio_maxactive_count) {
			TAILQ_REMOVE(&aio_jobs, aiocbe, list);
			return aiocbe;
		}
	}

	return NULL;
}
/*
 * The AIO processing activity.  This is the code that does the
 * I/O request for the non-physio version of the operations.  The
 * normal vn operations are used, and this code should work in
 * all instances for every type of file, including pipes, sockets,
 * fifos, and regular files.
 */
void
aio_process(struct aiocblist *aiocbe)
{
	struct filedesc *fdp;
	struct proc *userp, *mycp;
	struct aiocb *cb;
	struct file *fp;
	struct uio auio;
	struct iovec aiov;
	unsigned int fd;
	int cnt;
	int error;
	off_t offset;
	int oublock_st, oublock_end;
	int inblock_st, inblock_end;

	userp = aiocbe->userproc;
	cb = &aiocbe->uaiocb;

	mycp = curproc;

	fdp = mycp->p_fd;
	fd = cb->aio_fildes;
	fp = fdp->fd_ofiles[fd];

	aiov.iov_base = cb->aio_buf;
	aiov.iov_len = cb->aio_nbytes;

	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = offset = cb->aio_offset;
	auio.uio_resid = cb->aio_nbytes;
	cnt = cb->aio_nbytes;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_procp = mycp;

	inblock_st = mycp->p_stats->p_ru.ru_inblock;
	oublock_st = mycp->p_stats->p_ru.ru_oublock;
	if (cb->aio_lio_opcode == LIO_READ) {
		auio.uio_rw = UIO_READ;
		error = (*fp->f_ops->fo_read)(fp, &auio, fp->f_cred);
	} else {
		auio.uio_rw = UIO_WRITE;
		error = (*fp->f_ops->fo_write)(fp, &auio, fp->f_cred);
	}
	inblock_end = mycp->p_stats->p_ru.ru_inblock;
	oublock_end = mycp->p_stats->p_ru.ru_oublock;

	aiocbe->inputcharge = inblock_end - inblock_st;
	aiocbe->outputcharge = oublock_end - oublock_st;

	if (error) {
		if (auio.uio_resid != cnt) {
			if (error == ERESTART || error == EINTR ||
			    error == EWOULDBLOCK)
				error = 0;
			if ((error == EPIPE) && (cb->aio_lio_opcode == LIO_WRITE))
				psignal(userp, SIGPIPE);
		}
	}

	cnt -= auio.uio_resid;
	cb->_aiocb_private.error = error;
	cb->_aiocb_private.status = cnt;
}
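/*
 * The inblock/outblock deltas recorded above accrue to the daemon doing
 * the I/O; aio_return() and lio_listio() later credit them back to the
 * requesting process, so that block-I/O accounting lands in the right
 * rusage.
 */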
/*
 * The AIO daemon, most of the actual work is done in aio_process,
 * but the setup (and address space mgmt) is done in this routine.
 */
static void
aio_daemon(void *uproc)
{
	int s;
	struct aioproclist *aiop;
	struct vmspace *myvm, *aiovm;
	struct proc *mycp;

	/*
	 * Local copies of curproc (cp) and vmspace (myvm)
	 */
	mycp = curproc;
	myvm = mycp->p_vmspace;

	/*
	 * We manage to create only one VM space for all AIOD processes.
	 * The VM space for the first AIOD created becomes the shared VM
	 * space for all of them.  We add an additional reference count,
	 * even for the first AIOD, so the address space does not go away,
	 * and we continue to use that original VM space even if the first
	 * AIOD exits.
	 */
	if ((aiovm = aiovmspace) == NULL) {
		aiovmspace = myvm;
		myvm->vm_refcnt++;
		/*
		 * Remove userland cruft from address space.
		 */
		if (myvm->vm_shm)
			shmexit(mycp);
		pmap_remove_pages(&myvm->vm_pmap, 0, USRSTACK);
		vm_map_remove(&myvm->vm_map, 0, USRSTACK);
		myvm->vm_tsize = 0;
		myvm->vm_dsize = 0;
		myvm->vm_ssize = 0;
	} else {
		aiovm->vm_refcnt++;
		mycp->p_vmspace = aiovm;
		pmap_activate(mycp);
		vmspace_free(myvm);
		myvm = aiovm;
	}

	if (mycp->p_textvp) {
		vrele(mycp->p_textvp);
		mycp->p_textvp = NULL;
	}

	/*
	 * Allocate and ready the aio control info.  There is one
	 * aiop structure per daemon.
	 */
	aiop = zalloc(aiop_zone);
	aiop->aioproc = mycp;
	aiop->aioprocflags |= AIOP_FREE;
	TAILQ_INIT(&aiop->jobtorun);

	/*
	 * Place thread (lightweight process) onto the AIO free thread list
	 */
	if (TAILQ_EMPTY(&aio_freeproc))
		wakeup(&aio_freeproc);
	TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);

	/*
	 * Make up a name for the daemon
	 */
	strcpy(mycp->p_comm, "aiod");

	/*
	 * Get rid of our current filedescriptors.  AIOD's don't need any
	 * filedescriptors, except as temporarily inherited from the client.
	 * Credentials are also cloned, and made equivalent to "root."
	 */
	fdfree(mycp);
	mycp->p_fd = NULL;
	mycp->p_ucred = crcopy(mycp->p_ucred);
	mycp->p_ucred->cr_uid = 0;
	mycp->p_ucred->cr_ngroups = 1;
	mycp->p_ucred->cr_groups[0] = 1;

	/*
	 * The daemon resides in its own pgrp.
	 */
	enterpgrp(mycp, mycp->p_pid, 1);

	/*
	 * Mark special process type
	 */
	mycp->p_flag |= P_SYSTEM|P_KTHREADP;

	/*
	 * Wakeup parent process.  (Parent sleeps to keep from blasting away
	 * creating too many daemons.)
	 */
	wakeup(mycp);
	while (1) {
		struct proc *curcp;
		struct aiocblist *aiocbe;

		/*
		 * curcp is the current daemon process context.
		 * userp is the current user process context.
		 */
		curcp = mycp;

		/*
		 * Take daemon off of free queue
		 */
		if (aiop->aioprocflags & AIOP_FREE) {
			TAILQ_REMOVE(&aio_freeproc, aiop, list);
			TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list);
			aiop->aioprocflags &= ~AIOP_FREE;
		}
		aiop->aioprocflags &= ~AIOP_SCHED;

		/*
		 * Check for jobs
		 */
		while ((aiocbe = aio_selectjob(aiop)) != NULL) {
			struct proc *userp;
			struct aiocb *cb;
			struct kaioinfo *ki;
			struct aio_liojob *lj;

			cb = &aiocbe->uaiocb;
			userp = aiocbe->userproc;

			aiocbe->jobstate = JOBST_JOBRUNNING;

			/*
			 * Connect to process address space for user program
			 */
			if (userp != curcp) {
				struct vmspace *tmpvm;
				/*
				 * Save the current address space that we are
				 * connected to.
				 */
				tmpvm = mycp->p_vmspace;
				/*
				 * Point to the new user address space, and
				 * refer to it.
				 */
				mycp->p_vmspace = userp->p_vmspace;
				mycp->p_vmspace->vm_refcnt++;
				/*
				 * Activate the new mapping.
				 */
				pmap_activate(mycp);
				/*
				 * If the old address space wasn't the daemon's
				 * own address space, then we need to remove the
				 * daemon's reference from the other process
				 * that it was acting on behalf of.
				 */
				if (tmpvm != myvm) {
					vmspace_free(tmpvm);
				}
				/*
				 * Disassociate from the previous client's file
				 * descriptors, and associate to the new
				 * client's descriptors.  Note that the daemon
				 * doesn't need to worry about its original
				 * descriptors, because they were originally
				 * freed.
				 */
				if (mycp->p_fd)
					fdfree(mycp);
				mycp->p_fd = fdshare(userp);
				curcp = userp;
			}

			ki = userp->p_aioinfo;
			lj = aiocbe->lio;

			/*
			 * Account for currently active jobs
			 */
			ki->kaio_active_count++;

			/*
			 * Do the I/O function
			 */
			aiocbe->jobaioproc = aiop;
			aio_process(aiocbe);

			/*
			 * decrement the active job count
			 */
			ki->kaio_active_count--;

			/*
			 * increment the completion count for wakeup/signal
			 * comparisons
			 */
			aiocbe->jobflags |= AIOCBLIST_DONE;
			ki->kaio_queue_finished_count++;
			if (lj) {
				lj->lioj_queue_finished_count++;
			}
			if ((ki->kaio_flags & KAIO_WAKEUP) ||
			    ((ki->kaio_flags & KAIO_RUNDOWN) &&
			    (ki->kaio_active_count == 0))) {
				ki->kaio_flags &= ~KAIO_WAKEUP;
				wakeup(userp);
			}

			s = splbio();
			if (lj && (lj->lioj_flags &
			    (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) == LIOJ_SIGNAL) {
				if ((lj->lioj_queue_finished_count ==
				    lj->lioj_queue_count) &&
				    (lj->lioj_buffer_finished_count ==
				    lj->lioj_buffer_count)) {
					psignal(userp,
					    lj->lioj_signal.sigev_signo);
					lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
				}
			}
			splx(s);

			aiocbe->jobstate = JOBST_JOBFINISHED;

			/*
			 * If the I/O request should be automatically rundown,
			 * do the needed cleanup.  Otherwise, place the queue
			 * entry for the just finished I/O request into the
			 * done queue for the associated client.
			 */
			if (aiocbe->jobflags & AIOCBLIST_ASYNCFREE) {
				aiocbe->jobflags &= ~AIOCBLIST_ASYNCFREE;
				TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
			} else {
				TAILQ_REMOVE(&ki->kaio_jobqueue, aiocbe, plist);
				TAILQ_INSERT_TAIL(&ki->kaio_jobdone, aiocbe,
				    plist);
			}

			if (aiocbe->jobflags & AIOCBLIST_RUNDOWN) {
				wakeup(aiocbe);
				aiocbe->jobflags &= ~AIOCBLIST_RUNDOWN;
			}

			if (cb->aio_sigevent.sigev_notify == SIGEV_SIGNAL) {
				psignal(userp, cb->aio_sigevent.sigev_signo);
			}
		}

		/*
		 * Disconnect from user address space
		 */
		if (curcp != mycp) {
			struct vmspace *tmpvm;
			/*
			 * Get the user address space to disconnect from.
			 */
			tmpvm = mycp->p_vmspace;
			/*
			 * Get original address space for daemon.
			 */
			mycp->p_vmspace = myvm;
			/*
			 * Activate the daemon's address space.
			 */
			pmap_activate(mycp);
#if defined(DIAGNOSTIC)
			if (tmpvm == myvm)
				printf("AIOD: vmspace problem -- %d\n",
				    mycp->p_pid);
#endif
			/*
			 * remove our vmspace reference.
			 */
			vmspace_free(tmpvm);
			/*
			 * disassociate from the user process's file
			 * descriptors.
			 */
			if (mycp->p_fd)
				fdfree(mycp);
			mycp->p_fd = NULL;
			curcp = mycp;
		}

		/*
		 * If we are the first to be put onto the free queue, wakeup
		 * anyone waiting for a daemon.
		 */
		TAILQ_REMOVE(&aio_activeproc, aiop, list);
		if (TAILQ_EMPTY(&aio_freeproc))
			wakeup(&aio_freeproc);
		TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);
		aiop->aioprocflags |= AIOP_FREE;
		/*
		 * If daemon is inactive for a long time, allow it to exit,
		 * thereby freeing resources.
		 */
		if (((aiop->aioprocflags & AIOP_SCHED) == 0) &&
		    tsleep(mycp, PRIBIO, "aiordy", aiod_lifetime)) {
			if ((TAILQ_FIRST(&aio_jobs) == NULL) &&
			    (TAILQ_FIRST(&aiop->jobtorun) == NULL)) {
				if ((aiop->aioprocflags & AIOP_FREE) &&
				    (num_aio_procs > target_aio_procs)) {
					TAILQ_REMOVE(&aio_freeproc, aiop, list);
					zfree(aiop_zone, aiop);
					num_aio_procs--;
#if defined(DIAGNOSTIC)
					if (mycp->p_vmspace->vm_refcnt <= 1)
						printf("AIOD: bad vm refcnt for exiting daemon: %d\n",
						    mycp->p_vmspace->vm_refcnt);
#endif
					exit1(mycp, 0);
				}
			}
		}
	}
}

/*
 * Create a new AIO daemon.  This is mostly a kernel-thread fork routine.
 * The AIO daemon modifies its environment itself.
 */
static int
aio_newproc()
{
	int error;
	struct rfork_args rfa;
	struct proc *p, *np;

	rfa.flags = RFPROC | RFCFDG;

	p = curproc;
	if ((error = rfork(p, &rfa)) != 0)
		return error;

	np = pfind(p->p_retval[0]);
	cpu_set_fork_handler(np, aio_daemon, p);

	/*
	 * Wait until daemon is started, but continue on just in case (to
	 * handle error conditions).
	 */
	error = tsleep(np, PZERO, "aiosta", aiod_timeout);
	num_aio_procs++;

	return error;
}
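/*
 * On the rfork flags used above: RFPROC creates the new process, and
 * RFCFDG gives it an empty file descriptor table, so a daemon only ever
 * holds descriptors explicitly shared from a client via fdshare().  The
 * tsleep() on np pairs with the wakeup(mycp) that aio_daemon() performs
 * once its setup is complete.
 */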
/*
 * Try the high-performance physio method for eligible VCHR devices.  This
 * routine doesn't require the use of any additional threads, and has low
 * overhead.
 */
int
aio_qphysio(p, aiocbe)
	struct proc *p;
	struct aiocblist *aiocbe;
{
	int error;
	struct aiocb *cb;
	struct file *fp;
	struct buf *bp;
	int bflags;
	struct vnode *vp;
	struct kaioinfo *ki;
	struct filedesc *fdp;
	struct aio_liojob *lj;
	int fd;
	int majordev;
	int s;
	int cnt;
	dev_t dev;
	int rw;
	d_strategy_t *fstrategy;
	struct cdevsw *cdev;
	struct bdevsw *bdev;

	cb = &aiocbe->uaiocb;
	fdp = p->p_fd;
	fd = cb->aio_fildes;
	fp = fdp->fd_ofiles[fd];

	if (fp->f_type != DTYPE_VNODE) {
		return -1;
	}

	vp = (struct vnode *)fp->f_data;
	if (vp->v_type != VCHR || ((cb->aio_nbytes & (DEV_BSIZE - 1)) != 0)) {
		return -1;
	}

	if ((cb->aio_nbytes > MAXPHYS) || (num_buf_aio >= max_buf_aio)) {
		return -1;
	}

	if ((vp->v_specinfo == NULL) || (vp->v_flag & VISTTY)) {
		return -1;
	}

	majordev = major(vp->v_rdev);
	if (majordev == NODEV) {
		return -1;
	}

	cdev = cdevsw[major(vp->v_rdev)];
	if (cdev == NULL) {
		return -1;
	}
	bdev = cdev->d_bdev;
	if (bdev == NULL) {
		return -1;
	}

	ki = p->p_aioinfo;
	if (ki->kaio_buffer_count >= ki->kaio_ballowed_count) {
		return -1;
	}

	cnt = cb->aio_nbytes;
	if (cnt > MAXPHYS) {
		return -1;
	}

	dev = makedev(bdev->d_maj, minor(vp->v_rdev));

	/*
	 * Physical I/O is charged directly to the process, so we don't have
	 * to fake it.
	 */
	aiocbe->inputcharge = 0;
	aiocbe->outputcharge = 0;

	ki->kaio_buffer_count++;

	lj = aiocbe->lio;
	if (lj) {
		lj->lioj_buffer_count++;
	}

	/* create and build a buffer header for a transfer */
	bp = (struct buf *)getpbuf();

	/*
	 * get a copy of the kva from the physical buffer
	 */
	bp->b_proc = p;
	bp->b_dev = dev;
	error = bp->b_error = 0;

	if (cb->aio_lio_opcode == LIO_WRITE) {
		rw = 0;
		bflags = B_WRITE;
	} else {
		rw = 1;
		bflags = B_READ;
	}

	bp->b_bcount = cb->aio_nbytes;
	bp->b_bufsize = cb->aio_nbytes;
	bp->b_flags = B_BUSY | B_PHYS | B_CALL | bflags;
	bp->b_iodone = aio_physwakeup;
	bp->b_saveaddr = bp->b_data;
	bp->b_data = cb->aio_buf;
	bp->b_blkno = btodb(cb->aio_offset);

	if (rw && !useracc(bp->b_data, bp->b_bufsize, B_WRITE)) {
		error = EFAULT;
		goto doerror;
	}
	if (!rw && !useracc(bp->b_data, bp->b_bufsize, B_READ)) {
		error = EFAULT;
		goto doerror;
	}

	/* bring buffer into kernel space */
	vmapbuf(bp);

	s = splbio();
	aiocbe->bp = bp;
	bp->b_spc = (void *)aiocbe;
	TAILQ_INSERT_TAIL(&aio_bufjobs, aiocbe, list);
	TAILQ_INSERT_TAIL(&ki->kaio_bufqueue, aiocbe, plist);
	aiocbe->jobstate = JOBST_JOBQBUF;
	cb->_aiocb_private.status = cb->aio_nbytes;
	num_buf_aio++;
	fstrategy = bdev->d_strategy;
	bp->b_error = 0;

	splx(s);
	/* perform transfer */
	(*fstrategy)(bp);

	s = splbio();
	/*
	 * If we had an error invoking the request, or an error in processing
	 * the request before we have returned, we process it as an error in
	 * transfer.  Note that such an I/O error is not indicated
	 * immediately, but is returned using the aio_error mechanism.  In
	 * this case, aio_suspend will return immediately.
	 */
	if (bp->b_error || (bp->b_flags & B_ERROR)) {
		struct aiocb *job = aiocbe->uuaiocb;

		aiocbe->uaiocb._aiocb_private.status = 0;
		suword(&job->_aiocb_private.status, 0);
		aiocbe->uaiocb._aiocb_private.error = bp->b_error;
		suword(&job->_aiocb_private.error, bp->b_error);

		ki->kaio_buffer_finished_count++;

		if (aiocbe->jobstate != JOBST_JOBBFINISHED) {
			aiocbe->jobstate = JOBST_JOBBFINISHED;
			aiocbe->jobflags |= AIOCBLIST_DONE;
			TAILQ_REMOVE(&aio_bufjobs, aiocbe, list);
			TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist);
			TAILQ_INSERT_TAIL(&ki->kaio_bufdone, aiocbe, plist);
		}
	}
	splx(s);
	return 0;

doerror:
	ki->kaio_buffer_count--;
	if (lj) {
		lj->lioj_buffer_count--;
	}
	aiocbe->bp = NULL;
	relpbuf(bp);
	return error;
}
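/*
 * To illustrate the checks above: an 8192-byte transfer on a raw (VCHR)
 * disk device qualifies for this path, since the length is a multiple of
 * DEV_BSIZE and no larger than MAXPHYS.  Regular files, sockets, ttys,
 * unaligned or oversized transfers, and processes already at their buffer
 * quota all return -1 here, which makes _aio_aqueue() fall back to the
 * daemon-based aio_process() path.
 */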
/*
 * This waits/tests physio completion.
 */
int
aio_fphysio(p, iocb, flgwait)
	struct proc *p;
	struct aiocblist *iocb;
	int flgwait;
{
	int s;
	struct buf *bp;
	int error;

	bp = iocb->bp;

	s = splbio();
	if (flgwait == 0) {
		if ((bp->b_flags & B_DONE) == 0) {
			splx(s);
			return EINPROGRESS;
		}
	}

	while ((bp->b_flags & B_DONE) == 0) {
		if (tsleep((caddr_t)bp, PRIBIO, "physstr", aiod_timeout)) {
			if ((bp->b_flags & B_DONE) == 0) {
				splx(s);
				return EINPROGRESS;
			} else {
				break;
			}
		}
	}
	splx(s);

	/* release mapping into kernel space */
	vunmapbuf(bp);
	iocb->bp = 0;

	error = 0;
	/*
	 * check for an error
	 */
	if (bp->b_flags & B_ERROR) {
		error = bp->b_error;
	}

	relpbuf(bp);
	return (error);
}
/*
 * Queue a new AIO request.  Choosing either the threaded or direct physio
 * VCHR technique is done in this code.
 */
static int
_aio_aqueue(struct proc *p, struct aiocb *job, struct aio_liojob *lj, int type)
{
	struct filedesc *fdp;
	struct file *fp;
	unsigned int fd;

	int error;
	int opcode;
	struct aiocblist *aiocbe;
	struct aioproclist *aiop;
	struct kaioinfo *ki;

	if ((aiocbe = TAILQ_FIRST(&aio_freejobs)) != NULL) {
		TAILQ_REMOVE(&aio_freejobs, aiocbe, list);
	} else {
		aiocbe = zalloc(aiocb_zone);
	}

	aiocbe->inputcharge = 0;
	aiocbe->outputcharge = 0;

	suword(&job->_aiocb_private.status, -1);
	suword(&job->_aiocb_private.error, 0);
	suword(&job->_aiocb_private.kernelinfo, -1);

	error = copyin((caddr_t)job, (caddr_t)&aiocbe->uaiocb,
	    sizeof aiocbe->uaiocb);
	if (error) {
		suword(&job->_aiocb_private.error, error);

		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
		return error;
	}

	/*
	 * Save userspace address of the job info
	 */
	aiocbe->uuaiocb = job;

	/*
	 * Get the opcode
	 */
	if (type != LIO_NOP) {
		aiocbe->uaiocb.aio_lio_opcode = type;
	}
	opcode = aiocbe->uaiocb.aio_lio_opcode;

	/*
	 * Get the fd info for process
	 */
	fdp = p->p_fd;

	/*
	 * Range check file descriptor
	 */
	fd = aiocbe->uaiocb.aio_fildes;
	if (fd >= fdp->fd_nfiles) {
		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
		if (type == 0) {
			suword(&job->_aiocb_private.error, EBADF);
		}
		return EBADF;
	}

	fp = fdp->fd_ofiles[fd];
	if ((fp == NULL) ||
	    ((opcode == LIO_WRITE) && ((fp->f_flag & FWRITE) == 0))) {
		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
		if (type == 0) {
			suword(&job->_aiocb_private.error, EBADF);
		}
		return EBADF;
	}

	if (aiocbe->uaiocb.aio_offset == -1LL) {
		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
		if (type == 0) {
			suword(&job->_aiocb_private.error, EINVAL);
		}
		return EINVAL;
	}

	error = suword(&job->_aiocb_private.kernelinfo, jobrefid);
	if (error) {
		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
		if (type == 0) {
			suword(&job->_aiocb_private.error, EINVAL);
		}
		return error;
	}

	aiocbe->uaiocb._aiocb_private.kernelinfo = (void *)jobrefid;
	if (jobrefid == INT_MAX)
		jobrefid = 1;
	else
		jobrefid++;

	if (opcode == LIO_NOP) {
		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
		if (type == 0) {
			suword(&job->_aiocb_private.error, 0);
			suword(&job->_aiocb_private.status, 0);
			suword(&job->_aiocb_private.kernelinfo, 0);
		}
		return 0;
	}

	if ((opcode != LIO_READ) && (opcode != LIO_WRITE)) {
		TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list);
		if (type == 0) {
			suword(&job->_aiocb_private.status, 0);
			suword(&job->_aiocb_private.error, EINVAL);
		}
		return EINVAL;
	}

	suword(&job->_aiocb_private.error, EINPROGRESS);
	aiocbe->uaiocb._aiocb_private.error = EINPROGRESS;
	aiocbe->userproc = p;
	aiocbe->jobflags = 0;
	aiocbe->lio = lj;
	ki = p->p_aioinfo;

	if ((error = aio_qphysio(p, aiocbe)) == 0) {
		return 0;
	} else if (error > 0) {
		suword(&job->_aiocb_private.status, 0);
		aiocbe->uaiocb._aiocb_private.error = error;
		suword(&job->_aiocb_private.error, error);
		return error;
	}

	/*
	 * No buffer for daemon I/O
	 */
	aiocbe->bp = NULL;

	ki->kaio_queue_count++;
	if (lj) {
		lj->lioj_queue_count++;
	}
	TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, aiocbe, plist);
	TAILQ_INSERT_TAIL(&aio_jobs, aiocbe, list);
	aiocbe->jobstate = JOBST_JOBQGLOBAL;

	num_queue_count++;
	error = 0;

	/*
	 * If we don't have a free AIO process, and we are below our quota,
	 * then start one.  Otherwise, depend on the subsequent I/O
	 * completions to pick up this job.  If we don't successfully create
	 * the new process (thread) due to resource issues, we return an
	 * error for now (EAGAIN), which is likely not the correct thing
	 * to do.
	 */
retryproc:
	if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) {
		TAILQ_REMOVE(&aio_freeproc, aiop, list);
		TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list);
		aiop->aioprocflags &= ~AIOP_FREE;
		wakeup(aiop->aioproc);
	} else if (((num_aio_resv_start + num_aio_procs) < max_aio_procs) &&
	    ((ki->kaio_active_count + num_aio_resv_start) <
	    ki->kaio_maxactive_count)) {
		num_aio_resv_start++;
		if ((error = aio_newproc()) == 0) {
			num_aio_resv_start--;
			goto retryproc;
		}
		num_aio_resv_start--;
	}
	return error;
}

/*
 * This routine queues an AIO request, checking for quotas.
 */
static int
aio_aqueue(struct proc *p, struct aiocb *job, int type)
{
	struct kaioinfo *ki;

	if (p->p_aioinfo == NULL) {
		aio_init_aioinfo(p);
	}

	if (num_queue_count >= max_queue_count)
		return EAGAIN;

	ki = p->p_aioinfo;
	if (ki->kaio_queue_count >= ki->kaio_qallowed_count)
		return EAGAIN;

	return _aio_aqueue(p, job, NULL, type);
}
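/*
 * The kernelinfo handshake above is how jobs are identified later:
 * _aio_aqueue() stores the small integer jobrefid into the caller's
 * aiocb with suword(), and aio_error(), aio_return(), aio_suspend(),
 * and lio_listio() fetch it back with fuword() to match a user aiocb
 * against the kernel's queued aiocblist entries.
 */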
/*
 * Support the aio_return system call.  As a side effect, kernel
 * resources are released.
 */
int
aio_return(struct proc *p, struct aio_return_args *uap)
{
	int s;
	int jobref;
	struct aiocblist *cb, *ncb;
	struct aiocb *ujob;
	struct kaioinfo *ki;

	ki = p->p_aioinfo;
	if (ki == NULL) {
		return EINVAL;
	}

	ujob = uap->aiocbp;

	jobref = fuword(&ujob->_aiocb_private.kernelinfo);
	if (jobref == -1 || jobref == 0)
		return EINVAL;

	for (cb = TAILQ_FIRST(&ki->kaio_jobdone); cb;
	    cb = TAILQ_NEXT(cb, plist)) {
		if (((int)cb->uaiocb._aiocb_private.kernelinfo) == jobref) {
			if (ujob == cb->uuaiocb) {
				p->p_retval[0] =
				    cb->uaiocb._aiocb_private.status;
			} else {
				p->p_retval[0] = EFAULT;
			}
			if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
				curproc->p_stats->p_ru.ru_oublock +=
				    cb->outputcharge;
				cb->outputcharge = 0;
			} else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
				curproc->p_stats->p_ru.ru_inblock +=
				    cb->inputcharge;
				cb->inputcharge = 0;
			}
			aio_free_entry(cb);
			return 0;
		}
	}

	s = splbio();
	for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb = ncb) {
		ncb = TAILQ_NEXT(cb, plist);
		if (((int)cb->uaiocb._aiocb_private.kernelinfo) == jobref) {
			splx(s);
			if (ujob == cb->uuaiocb) {
				p->p_retval[0] =
				    cb->uaiocb._aiocb_private.status;
			} else {
				p->p_retval[0] = EFAULT;
			}
			aio_free_entry(cb);
			return 0;
		}
	}
	splx(s);

	return (EINVAL);
}
/*
 * Allow a process to wakeup when any of the I/O requests are completed.
 */
int
aio_suspend(struct proc *p, struct aio_suspend_args *uap)
{
	struct timeval atv;
	struct timespec ts;
	struct aiocb *const *cbptr, *cbp;
	struct kaioinfo *ki;
	struct aiocblist *cb;
	int i;
	int njoblist;
	int error, s, timo;
	int *ijoblist;
	struct aiocb **ujoblist;

	if (uap->nent >= AIO_LISTIO_MAX)
		return EINVAL;

	timo = 0;
	if (uap->timeout) {
		/*
		 * Get timespec struct
		 */
		error = copyin((caddr_t)uap->timeout, (caddr_t)&ts, sizeof ts);
		if (error)
			return error;

		if (ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000)
			return (EINVAL);

		TIMESPEC_TO_TIMEVAL(&atv, &ts)
		if (itimerfix(&atv))
			return (EINVAL);
		/*
		 * XXX this is not as careful as settimeofday() about
		 * minimising interrupt latency.  The hzto() interface is
		 * inconvenient as usual.
		 */
		s = splclock();
		timevaladd(&atv, &time);
		timo = hzto(&atv);
		splx(s);
		if (timo == 0)
			timo = 1;
	}

	ki = p->p_aioinfo;
	if (ki == NULL)
		return EAGAIN;

	njoblist = 0;
	ijoblist = zalloc(aiol_zone);
	ujoblist = zalloc(aiol_zone);
	cbptr = uap->aiocbp;

	for (i = 0; i < uap->nent; i++) {
		cbp = (struct aiocb *)fuword((caddr_t)&cbptr[i]);
		if (cbp == 0)
			continue;
		ujoblist[njoblist] = cbp;
		ijoblist[njoblist] = fuword(&cbp->_aiocb_private.kernelinfo);
		njoblist++;
	}
	if (njoblist == 0) {
		zfree(aiol_zone, ijoblist);
		zfree(aiol_zone, ujoblist);
		return 0;
	}

	error = 0;
	while (1) {
		for (cb = TAILQ_FIRST(&ki->kaio_jobdone); cb;
		    cb = TAILQ_NEXT(cb, plist)) {
			for (i = 0; i < njoblist; i++) {
				if (((int)cb->uaiocb._aiocb_private.kernelinfo) ==
				    ijoblist[i]) {
					if (ujoblist[i] != cb->uuaiocb)
						error = EINVAL;
					zfree(aiol_zone, ijoblist);
					zfree(aiol_zone, ujoblist);
					return error;
				}
			}
		}

		s = splbio();
		for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb;
		    cb = TAILQ_NEXT(cb, plist)) {
			for (i = 0; i < njoblist; i++) {
				if (((int)cb->uaiocb._aiocb_private.kernelinfo) ==
				    ijoblist[i]) {
					splx(s);
					if (ujoblist[i] != cb->uuaiocb)
						error = EINVAL;
					zfree(aiol_zone, ijoblist);
					zfree(aiol_zone, ujoblist);
					return error;
				}
			}
		}

		ki->kaio_flags |= KAIO_WAKEUP;
		error = tsleep(p, PRIBIO|PCATCH, "aiospn", timo);
		splx(s);

		if (error == EINTR) {
			zfree(aiol_zone, ijoblist);
			zfree(aiol_zone, ujoblist);
			return EINTR;
		} else if (error == EWOULDBLOCK) {
			zfree(aiol_zone, ijoblist);
			zfree(aiol_zone, ujoblist);
			return EAGAIN;
		}
	}

	/* NOTREACHED */
	return EINVAL;
}

/*
 * aio_cancel at the kernel level is a NOOP right now.  It might be
 * possible to support it partially in user mode, or in kernel mode
 * later on.
 */
int
aio_cancel(struct proc *p, struct aio_cancel_args *uap)
{
	return AIO_NOTCANCELLED;
}
/*
 * aio_error is implemented in the kernel level for compatibility
 * purposes only.  For a user mode async implementation, it would be
 * best to do it in a userland subroutine.
 */
int
aio_error(struct proc *p, struct aio_error_args *uap)
{
	int s;
	struct aiocblist *cb;
	struct kaioinfo *ki;
	int jobref;
	int error, status;

	ki = p->p_aioinfo;
	if (ki == NULL)
		return EINVAL;

	jobref = fuword(&uap->aiocbp->_aiocb_private.kernelinfo);
	if ((jobref == -1) || (jobref == 0))
		return EINVAL;

	for (cb = TAILQ_FIRST(&ki->kaio_jobdone); cb;
	    cb = TAILQ_NEXT(cb, plist)) {
		if (((int)cb->uaiocb._aiocb_private.kernelinfo) == jobref) {
			p->p_retval[0] = cb->uaiocb._aiocb_private.error;
			return 0;
		}
	}

	for (cb = TAILQ_FIRST(&ki->kaio_jobqueue); cb;
	    cb = TAILQ_NEXT(cb, plist)) {
		if (((int)cb->uaiocb._aiocb_private.kernelinfo) == jobref) {
			p->p_retval[0] = EINPROGRESS;
			return 0;
		}
	}

	s = splbio();
	for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb;
	    cb = TAILQ_NEXT(cb, plist)) {
		if (((int)cb->uaiocb._aiocb_private.kernelinfo) == jobref) {
			p->p_retval[0] = cb->uaiocb._aiocb_private.error;
			splx(s);
			return 0;
		}
	}

	for (cb = TAILQ_FIRST(&ki->kaio_bufqueue); cb;
	    cb = TAILQ_NEXT(cb, plist)) {
		if (((int)cb->uaiocb._aiocb_private.kernelinfo) == jobref) {
			p->p_retval[0] = EINPROGRESS;
			splx(s);
			return 0;
		}
	}
	splx(s);

	/*
	 * Hack for lio
	 */
/*
	status = fuword(&uap->aiocbp->_aiocb_private.status);
	if (status == -1) {
		return fuword(&uap->aiocbp->_aiocb_private.error);
	}
*/
	return EINVAL;
}
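/*
 * A typical userland sequence served by the entry points in this file
 * (a sketch only; error handling omitted, fd and buf are placeholders):
 *
 *	struct aiocb cb;
 *	int ret;
 *
 *	bzero(&cb, sizeof cb);
 *	cb.aio_fildes = fd;
 *	cb.aio_buf = buf;
 *	cb.aio_nbytes = sizeof buf;
 *	cb.aio_offset = 0;
 *	aio_read(&cb);			(queues via aio_aqueue above)
 *	while (aio_error(&cb) == EINPROGRESS)
 *		;			(or block in aio_suspend)
 *	ret = aio_return(&cb);		(reaps the job, frees kernel state)
 */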
int
aio_read(struct proc *p, struct aio_read_args *uap)
{
	struct filedesc *fdp;
	struct file *fp;
	struct uio auio;
	struct iovec aiov;
	unsigned int fd;
	int cnt;
	struct aiocb iocb;
	int error, pmodes;

	pmodes = fuword(&uap->aiocbp->_aiocb_private.privatemodes);
	if ((pmodes & AIO_PMODE_SYNC) == 0) {
		return aio_aqueue(p, (struct aiocb *)uap->aiocbp, LIO_READ);
	}

	/*
	 * Get control block
	 */
	error = copyin((caddr_t)uap->aiocbp, (caddr_t)&iocb, sizeof iocb);
	if (error)
		return error;

	/*
	 * Get the fd info for process
	 */
	fdp = p->p_fd;

	/*
	 * Range check file descriptor
	 */
	fd = iocb.aio_fildes;
	if (fd >= fdp->fd_nfiles)
		return EBADF;
	fp = fdp->fd_ofiles[fd];
	if ((fp == NULL) || ((fp->f_flag & FREAD) == 0))
		return EBADF;
	if (iocb.aio_offset == -1LL)
		return EINVAL;

	auio.uio_resid = iocb.aio_nbytes;
	if (auio.uio_resid < 0)
		return (EINVAL);

	/*
	 * Process sync simply -- queue async request.
	 */
	if ((iocb._aiocb_private.privatemodes & AIO_PMODE_SYNC) == 0) {
		return aio_aqueue(p, (struct aiocb *)uap->aiocbp, LIO_READ);
	}

	aiov.iov_base = iocb.aio_buf;
	aiov.iov_len = iocb.aio_nbytes;

	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = iocb.aio_offset;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_procp = p;

	cnt = iocb.aio_nbytes;
	error = (*fp->f_ops->fo_read)(fp, &auio, fp->f_cred);
	if (error &&
	    (auio.uio_resid != cnt) &&
	    (error == ERESTART || error == EINTR || error == EWOULDBLOCK))
		error = 0;
	cnt -= auio.uio_resid;
	p->p_retval[0] = cnt;
	return error;
}

int
aio_write(struct proc *p, struct aio_write_args *uap)
{
	struct filedesc *fdp;
	struct file *fp;
	struct uio auio;
	struct iovec aiov;
	unsigned int fd;
	int cnt;
	struct aiocb iocb;
	int error;
	int pmodes;

	/*
	 * Process sync simply -- queue async request.
	 */
	pmodes = fuword(&uap->aiocbp->_aiocb_private.privatemodes);
	if ((pmodes & AIO_PMODE_SYNC) == 0) {
		return aio_aqueue(p, (struct aiocb *)uap->aiocbp, LIO_WRITE);
	}

	error = copyin((caddr_t)uap->aiocbp, (caddr_t)&iocb, sizeof iocb);
	if (error)
		return error;

	/*
	 * Get the fd info for process
	 */
	fdp = p->p_fd;

	/*
	 * Range check file descriptor
	 */
	fd = iocb.aio_fildes;
	if (fd >= fdp->fd_nfiles)
		return EBADF;
	fp = fdp->fd_ofiles[fd];
	if ((fp == NULL) || ((fp->f_flag & FWRITE) == 0))
		return EBADF;
	if (iocb.aio_offset == -1LL)
		return EINVAL;

	aiov.iov_base = iocb.aio_buf;
	aiov.iov_len = iocb.aio_nbytes;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = iocb.aio_offset;

	auio.uio_resid = iocb.aio_nbytes;
	if (auio.uio_resid < 0)
		return (EINVAL);

	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_procp = p;

	cnt = iocb.aio_nbytes;
	error = (*fp->f_ops->fo_write)(fp, &auio, fp->f_cred);
	if (error) {
		if (auio.uio_resid != cnt) {
			if (error == ERESTART || error == EINTR ||
			    error == EWOULDBLOCK)
				error = 0;
			if (error == EPIPE)
				psignal(p, SIGPIPE);
		}
	}
	cnt -= auio.uio_resid;
	p->p_retval[0] = cnt;
	return error;
}

int
lio_listio(struct proc *p, struct lio_listio_args *uap)
{
	int nent, nentqueued;
	struct aiocb *iocb, *const *cbptr;
	struct aiocblist *cb;
	struct kaioinfo *ki;
	struct aio_liojob *lj;
	int error, runningcode;
	int nerror;
	int i;
	int s;

	if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT)) {
		return EINVAL;
	}

	nent = uap->nent;
	if (nent > AIO_LISTIO_MAX) {
		return EINVAL;
	}

	if (p->p_aioinfo == NULL) {
		aio_init_aioinfo(p);
	}

	if ((nent + num_queue_count) > max_queue_count) {
		return EAGAIN;
	}

	ki = p->p_aioinfo;
	if ((nent + ki->kaio_queue_count) > ki->kaio_qallowed_count) {
		return EAGAIN;
	}

	lj = zalloc(aiolio_zone);
	if (!lj) {
		return EAGAIN;
	}

	lj->lioj_flags = 0;
	lj->lioj_buffer_count = 0;
	lj->lioj_buffer_finished_count = 0;
	lj->lioj_queue_count = 0;
	lj->lioj_queue_finished_count = 0;
	lj->lioj_ki = ki;
	TAILQ_INSERT_TAIL(&ki->kaio_liojoblist, lj, lioj_list);
	/*
	 * Setup signal
	 */
	if (uap->sig && (uap->mode == LIO_NOWAIT)) {
		error = copyin(uap->sig, &lj->lioj_signal,
		    sizeof lj->lioj_signal);
		if (error) {
			TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
			zfree(aiolio_zone, lj);
			return error;
		}
		lj->lioj_flags |= LIOJ_SIGNAL;
		lj->lioj_flags &= ~LIOJ_SIGNAL_POSTED;
	} else {
		lj->lioj_flags &= ~LIOJ_SIGNAL;
	}

	/*
	 * get pointers to the list of I/O requests
	 */
	nerror = 0;
	nentqueued = 0;
	cbptr = uap->acb_list;
	for (i = 0; i < uap->nent; i++) {
		iocb = (struct aiocb *)fuword((caddr_t)&cbptr[i]);
		if (((int)iocb != -1) && ((int)iocb != 0)) {
			error = _aio_aqueue(p, iocb, lj, 0);
			if (error == 0) {
				nentqueued++;
			} else {
				nerror++;
			}
		}
	}

	/*
	 * If we haven't queued any, then just return error
	 */
	if (nentqueued == 0) {
		return 0;
	}

	/*
	 * Calculate the appropriate error return
	 */
	runningcode = 0;
	if (nerror)
		runningcode = EIO;

	if (uap->mode == LIO_WAIT) {
		while (1) {
			int found;

			found = 0;
			for (i = 0; i < uap->nent; i++) {
				int jobref, command;

				/*
				 * Fetch address of the control buf pointer
				 * in user space
				 */
				iocb = (struct aiocb *)fuword((caddr_t)&cbptr[i]);
				if (((int)iocb == -1) || ((int)iocb == 0))
					continue;

				/*
				 * Fetch the associated command from user space
				 */
				command = fuword(&iocb->aio_lio_opcode);
				if (command == LIO_NOP) {
					found++;
					continue;
				}

				jobref = fuword(&iocb->_aiocb_private.kernelinfo);

				for (cb = TAILQ_FIRST(&ki->kaio_jobdone); cb;
				    cb = TAILQ_NEXT(cb, plist)) {
					if (((int)cb->uaiocb._aiocb_private.kernelinfo) ==
					    jobref) {
						if (cb->uaiocb.aio_lio_opcode ==
						    LIO_WRITE) {
							curproc->p_stats->p_ru.ru_oublock +=
							    cb->outputcharge;
							cb->outputcharge = 0;
						} else if (cb->uaiocb.aio_lio_opcode ==
						    LIO_READ) {
							curproc->p_stats->p_ru.ru_inblock +=
							    cb->inputcharge;
							cb->inputcharge = 0;
						}
						found++;
						break;
					}
				}

				s = splbio();
				for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb;
				    cb = TAILQ_NEXT(cb, plist)) {
					if (((int)cb->uaiocb._aiocb_private.kernelinfo) ==
					    jobref) {
						found++;
						break;
					}
				}
				splx(s);
			}

			/*
			 * If all I/Os have been disposed of, then we can
			 * return
			 */
			if (found == nentqueued) {
				return runningcode;
			}

			ki->kaio_flags |= KAIO_WAKEUP;
			error = tsleep(p, PRIBIO|PCATCH, "aiospn", 0);

			if (error == EINTR) {
				return EINTR;
			} else if (error == EWOULDBLOCK) {
				return EAGAIN;
			}
		}
	}

	return runningcode;
}

/*
 * This is a weird hack so that we can post a signal.  It is safe to do
 * so from a timeout routine, but *not* from an interrupt routine.
 */
static void
process_signal(void *ljarg)
{
	struct aio_liojob *lj = ljarg;

	if (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL) {
		if (lj->lioj_queue_count == lj->lioj_queue_finished_count) {
			psignal(lj->lioj_ki->kaio_p, lj->lioj_signal.sigev_signo);
			lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
		}
	}
}
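/*
 * process_signal() runs later from the softclock/timeout queue rather
 * than directly from aio_physwakeup(), which is invoked as a b_iodone
 * callback at splbio from the device interrupt path; deferring via
 * timeout() keeps psignal() out of hard interrupt context, as the
 * comment above notes.
 */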
/*
 * Interrupt handler for physio, performs the necessary process wakeups,
 * and signals.
 */
static void
aio_physwakeup(bp)
	struct buf *bp;
{
	struct aiocblist *aiocbe;
	struct proc *p;
	struct kaioinfo *ki;
	struct aio_liojob *lj;
	int s;

	s = splbio();

	wakeup((caddr_t)bp);
	bp->b_flags &= ~B_CALL;
	bp->b_flags |= B_DONE;

	aiocbe = (struct aiocblist *)bp->b_spc;
	if (aiocbe) {
		p = bp->b_proc;

		aiocbe->jobstate = JOBST_JOBBFINISHED;
		aiocbe->uaiocb._aiocb_private.status -= bp->b_resid;
		aiocbe->uaiocb._aiocb_private.error = 0;
		aiocbe->jobflags |= AIOCBLIST_DONE;

		if (bp->b_flags & B_ERROR) {
			aiocbe->uaiocb._aiocb_private.error = bp->b_error;
		}

		lj = aiocbe->lio;
		if (lj) {
			lj->lioj_buffer_finished_count++;
			/*
			 * wakeup/signal if all of the interrupt jobs are done
			 */
			if (lj->lioj_buffer_finished_count ==
			    lj->lioj_buffer_count) {
				/*
				 * post a signal if it is called for
				 */
				if ((lj->lioj_flags &
				    (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) ==
				    LIOJ_SIGNAL) {
					lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
					timeout(process_signal, lj, 0);
				}
			}
		}

		ki = p->p_aioinfo;
		if (ki) {
			ki->kaio_buffer_finished_count++;
			TAILQ_REMOVE(&aio_bufjobs, aiocbe, list);
			TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist);
			TAILQ_INSERT_TAIL(&ki->kaio_bufdone, aiocbe, plist);
			/*
			 * and do the wakeup
			 */
			if (ki->kaio_flags & (KAIO_RUNDOWN|KAIO_WAKEUP)) {
				ki->kaio_flags &= ~KAIO_WAKEUP;
				wakeup(p);
			}
		}
	}
	splx(s);
}