/*
 * Copyright (c) 2003-2014 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */


/*
 * todo:
 * 1) ramesh is looking into how to replace taking a reference on
 *    the user's map (vm_map_reference()) since it is believed that
 *    would not hold the process for us.
 * 2) david is looking into a way for us to set the priority of the
 *    worker threads to match that of the user's thread when the
 *    async IO was queued.
 */


/*
 * This file contains support for the POSIX 1003.1B AIO/LIO facility.
 */

#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/file_internal.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/vnode_internal.h>
#include <sys/malloc.h>
#include <sys/mount_internal.h>
#include <sys/param.h>
#include <sys/proc_internal.h>
#include <sys/sysctl.h>
#include <sys/unistd.h>
#include <sys/user.h>

#include <sys/aio_kern.h>
#include <sys/sysproto.h>

#include <machine/limits.h>

#include <mach/mach_types.h>
#include <kern/kern_types.h>
#include <kern/zalloc.h>
#include <kern/task.h>
#include <kern/sched_prim.h>

#include <vm/vm_map.h>

#include <libkern/OSAtomic.h>

#include <sys/kdebug.h>
#define AIO_work_queued			1
#define AIO_worker_wake			2
#define AIO_completion_sig		3
#define AIO_completion_cleanup_wait	4
#define AIO_completion_cleanup_wake	5
#define AIO_completion_suspend_wake	6
#define AIO_fsync_delay			7
#define AIO_cancel			10
#define AIO_cancel_async_workq		11
#define AIO_cancel_sync_workq		12
#define AIO_cancel_activeq		13
#define AIO_cancel_doneq		14
#define AIO_fsync			20
#define AIO_read			30
#define AIO_write			40
#define AIO_listio			50
#define AIO_error			60
#define AIO_error_val			61
#define AIO_error_activeq		62
#define AIO_error_workq			63
#define AIO_return			70
#define AIO_return_val			71
#define AIO_return_activeq		72
#define AIO_return_workq		73
#define AIO_exec			80
#define AIO_exit			90
#define AIO_exit_sleep			91
#define AIO_close			100
#define AIO_close_sleep			101
#define AIO_suspend			110
#define AIO_suspend_sleep		111
#define AIO_worker_thread		120

#if 0
#undef KERNEL_DEBUG
#define KERNEL_DEBUG KERNEL_DEBUG_CONSTANT
#endif

/*
 * aio requests queue up on the aio_async_workq or lio_sync_workq (for
 * lio_listio LIO_WAIT). Requests then move to the per-process aio_activeq
 * (proc.aio_activeq) when one of our worker threads starts the IO. Finally,
 * requests move to the per-process aio_doneq (proc.aio_doneq) when the IO
 * request completes. The request remains on aio_doneq until the user process
 * calls aio_return or the process exits; either way, that is our trigger to
 * release aio resources.
 */
typedef struct aio_workq {
	TAILQ_HEAD(, aio_workq_entry) 	aioq_entries;
	int				aioq_count;
	lck_mtx_t			aioq_mtx;
	wait_queue_t			aioq_waitq;
} *aio_workq_t;

#define AIO_NUM_WORK_QUEUES 1
struct aio_anchor_cb
{
	volatile int32_t	aio_inflight_count; 	/* entries that have been taken from a workq */
	volatile int32_t	aio_done_count; 	/* entries on all done queues (proc.aio_doneq) */
	volatile int32_t	aio_total_count;	/* total extant entries */

	/* Hash table of queues here */
	int			aio_num_workqs;
	struct aio_workq	aio_async_workqs[AIO_NUM_WORK_QUEUES];
};
typedef struct aio_anchor_cb aio_anchor_cb;

struct aio_lio_context
{
	int		io_waiter;
	int		io_issued;
	int		io_completed;
};
typedef struct aio_lio_context aio_lio_context;
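
/*
 * Illustrative user-space sketch (not compiled here) of the request
 * lifecycle described above: an aiocb is queued by aio_read(), sits on a
 * workq until a worker thread starts it (activeq), and stays on the doneq
 * after completion until aio_return() reaps it. Uses only POSIX <aio.h>
 * calls; error handling abbreviated, and the path is hypothetical.
 *
 *	#include <aio.h>
 *	#include <fcntl.h>
 *	#include <string.h>
 *
 *	char buf[4096];
 *	struct aiocb cb;
 *	memset(&cb, 0, sizeof(cb));
 *	cb.aio_fildes = open("/tmp/data", O_RDONLY);
 *	cb.aio_buf = buf;
 *	cb.aio_nbytes = sizeof(buf);
 *	cb.aio_offset = 0;
 *	aio_read(&cb);				// -> aio_queue_async_request(), onto a workq
 *	while (aio_error(&cb) == EINPROGRESS)	// activeq: a worker owns the IO
 *		;				// (real code would use aio_suspend())
 *	ssize_t nread = aio_return(&cb);	// doneq -> kernel resources released
 */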

/*
 * Notes on aio sleep / wake channels.
 * We currently pick a couple of fields within the proc structure that give us
 * sleep channels that do not collide with any other kernel routines. At this
 * time, for binary compatibility reasons, we cannot create new proc fields.
 */
#define AIO_SUSPEND_SLEEP_CHAN	p_aio_active_count
#define AIO_CLEANUP_SLEEP_CHAN	p_aio_total_count

#define ASSERT_AIO_FROM_PROC(aiop, theproc) 	\
	if ((aiop)->procp != (theproc)) { 	\
		panic("AIO on a proc list that does not belong to that proc.\n"); \
	}

/*
 * LOCAL PROTOTYPES
 */
static void		aio_proc_lock(proc_t procp);
static void		aio_proc_lock_spin(proc_t procp);
static void		aio_proc_unlock(proc_t procp);
static lck_mtx_t*	aio_proc_mutex(proc_t procp);
static void		aio_proc_move_done_locked(proc_t procp, aio_workq_entry *entryp);
static void		aio_proc_remove_done_locked(proc_t procp, aio_workq_entry *entryp);
static int		aio_get_process_count(proc_t procp);
static int		aio_active_requests_for_process(proc_t procp);
static int		aio_proc_active_requests_for_file(proc_t procp, int fd);
static boolean_t	is_already_queued(proc_t procp, user_addr_t aiocbp);
static boolean_t	should_cancel(aio_workq_entry *entryp, user_addr_t aiocbp, int fd);

static void		aio_entry_lock(aio_workq_entry *entryp);
static void		aio_entry_lock_spin(aio_workq_entry *entryp);
static aio_workq_t	aio_entry_workq(aio_workq_entry *entryp);
static lck_mtx_t*	aio_entry_mutex(__unused aio_workq_entry *entryp);
static void		aio_workq_remove_entry_locked(aio_workq_t queue, aio_workq_entry *entryp);
static void		aio_workq_add_entry_locked(aio_workq_t queue, aio_workq_entry *entryp);
static void		aio_entry_ref_locked(aio_workq_entry *entryp);
static void		aio_entry_unref_locked(aio_workq_entry *entryp);
static void		aio_entry_ref(aio_workq_entry *entryp);
static void		aio_entry_unref(aio_workq_entry *entryp);
static void		aio_entry_update_for_cancel(aio_workq_entry *entryp, boolean_t cancelled,
				int wait_for_completion, boolean_t disable_notification);
static int		aio_entry_try_workq_remove(aio_workq_entry *entryp);
static boolean_t	aio_delay_fsync_request(aio_workq_entry *entryp);
static int		aio_free_request(aio_workq_entry *entryp);

static void		aio_workq_init(aio_workq_t wq);
static void		aio_workq_lock_spin(aio_workq_t wq);
static void		aio_workq_unlock(aio_workq_t wq);
static lck_mtx_t*	aio_workq_mutex(aio_workq_t wq);

static void		aio_work_thread(void);
static aio_workq_entry *aio_get_some_work(void);

static int		aio_get_all_queues_count(void);
static int		aio_queue_async_request(proc_t procp, user_addr_t aiocbp, int kindOfIO);
static int		aio_validate(aio_workq_entry *entryp);
static int		aio_increment_total_count(void);
static int		aio_decrement_total_count(void);

static int		do_aio_cancel_locked(proc_t p, int fd, user_addr_t aiocbp, int wait_for_completion, boolean_t disable_notification);
static void		do_aio_completion(aio_workq_entry *entryp);
static int		do_aio_fsync(aio_workq_entry *entryp);
static int		do_aio_read(aio_workq_entry *entryp);
static int		do_aio_write(aio_workq_entry *entryp);
static void		do_munge_aiocb_user32_to_user(struct user32_aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp);
static void		do_munge_aiocb_user64_to_user(struct user64_aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp);
static int		lio_create_entry(proc_t procp,
				user_addr_t aiocbp,
				void *group_tag,
				aio_workq_entry **entrypp);
static aio_workq_entry *aio_create_queue_entry(proc_t procp,
				user_addr_t aiocbp,
				void *group_tag,
				int kindOfIO);
static user_addr_t	*aio_copy_in_list(proc_t procp, user_addr_t aiocblist, int nent);
static void		free_lio_context(aio_lio_context* context);
static void		aio_enqueue_work(proc_t procp, aio_workq_entry *entryp, int proc_locked);

#define ASSERT_AIO_PROC_LOCK_OWNED(p)	lck_mtx_assert(aio_proc_mutex((p)), LCK_MTX_ASSERT_OWNED)
#define ASSERT_AIO_WORKQ_LOCK_OWNED(q)	lck_mtx_assert(aio_workq_mutex((q)), LCK_MTX_ASSERT_OWNED)
#define ASSERT_AIO_ENTRY_LOCK_OWNED(e)	lck_mtx_assert(aio_entry_mutex((e)), LCK_MTX_ASSERT_OWNED)

/*
 * EXTERNAL PROTOTYPES
 */

/* in ...bsd/kern/sys_generic.c */
extern int dofileread(vfs_context_t ctx, struct fileproc *fp,
			user_addr_t bufp, user_size_t nbyte,
			off_t offset, int flags, user_ssize_t *retval);
extern int dofilewrite(vfs_context_t ctx, struct fileproc *fp,
			user_addr_t bufp, user_size_t nbyte, off_t offset,
			int flags, user_ssize_t *retval);
#if DEBUG
static uint32_t lio_contexts_alloced = 0;
#endif /* DEBUG */

/*
 * aio external global variables.
 */
extern int aio_max_requests;			/* AIO_MAX - configurable */
extern int aio_max_requests_per_process;	/* AIO_PROCESS_MAX - configurable */
extern int aio_worker_threads;			/* AIO_THREAD_COUNT - configurable */


/*
 * aio static variables.
 */
static aio_anchor_cb	aio_anchor;
static lck_grp_t	*aio_proc_lock_grp;
static lck_grp_t	*aio_entry_lock_grp;
static lck_grp_t	*aio_queue_lock_grp;
static lck_attr_t	*aio_lock_attr;
static lck_grp_attr_t	*aio_lock_grp_attr;
static struct zone	*aio_workq_zonep;
static lck_mtx_t	aio_entry_mtx;
static lck_mtx_t	aio_proc_mtx;

static void
aio_entry_lock(__unused aio_workq_entry *entryp)
{
	lck_mtx_lock(&aio_entry_mtx);
}

static void
aio_entry_lock_spin(__unused aio_workq_entry *entryp)
{
	lck_mtx_lock_spin(&aio_entry_mtx);
}

static void
aio_entry_unlock(__unused aio_workq_entry *entryp)
{
	lck_mtx_unlock(&aio_entry_mtx);
}

/* Hash */
static aio_workq_t
aio_entry_workq(__unused aio_workq_entry *entryp)
{
	return &aio_anchor.aio_async_workqs[0];
}

static lck_mtx_t*
aio_entry_mutex(__unused aio_workq_entry *entryp)
{
	return &aio_entry_mtx;
}

static void
aio_workq_init(aio_workq_t wq)
{
	TAILQ_INIT(&wq->aioq_entries);
	wq->aioq_count = 0;
	lck_mtx_init(&wq->aioq_mtx, aio_queue_lock_grp, aio_lock_attr);
	wq->aioq_waitq = wait_queue_alloc(SYNC_POLICY_FIFO);
}

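/*
 * A hypothetical sketch of how aio_entry_workq() could spread entries
 * across AIO_NUM_WORK_QUEUES > 1 once the hash table mentioned above
 * exists. Hashing on the owning proc would keep one process's requests on
 * a single queue; this is an assumption, not what the stub above does:
 *
 *	static aio_workq_t
 *	aio_entry_workq_hashed(aio_workq_entry *entryp)
 *	{
 *		unsigned idx = ((uintptr_t)entryp->procp >> 4) % aio_anchor.aio_num_workqs;
 *		return &aio_anchor.aio_async_workqs[idx];
 *	}
 */
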
/*
 * Can be passed a queue which is locked spin.
 */
static void
aio_workq_remove_entry_locked(aio_workq_t queue, aio_workq_entry *entryp)
{
	ASSERT_AIO_WORKQ_LOCK_OWNED(queue);

	if (entryp->aio_workq_link.tqe_prev == NULL) {
		panic("Trying to remove an entry from a work queue, but it is not on a queue\n");
	}

	TAILQ_REMOVE(&queue->aioq_entries, entryp, aio_workq_link);
	queue->aioq_count--;
	entryp->aio_workq_link.tqe_prev = NULL; /* Not on a workq */

	if (queue->aioq_count < 0) {
		panic("Negative count on a queue.\n");
	}
}

static void
aio_workq_add_entry_locked(aio_workq_t queue, aio_workq_entry *entryp)
{
	ASSERT_AIO_WORKQ_LOCK_OWNED(queue);

	TAILQ_INSERT_TAIL(&queue->aioq_entries, entryp, aio_workq_link);
	if (queue->aioq_count < 0) {
		panic("Negative count on a queue.\n");
	}
	queue->aioq_count++;
}

static void
aio_proc_lock(proc_t procp)
{
	lck_mtx_lock(aio_proc_mutex(procp));
}

static void
aio_proc_lock_spin(proc_t procp)
{
	lck_mtx_lock_spin(aio_proc_mutex(procp));
}

static void
aio_proc_move_done_locked(proc_t procp, aio_workq_entry *entryp)
{
	ASSERT_AIO_PROC_LOCK_OWNED(procp);

	TAILQ_REMOVE(&procp->p_aio_activeq, entryp, aio_proc_link);
	TAILQ_INSERT_TAIL(&procp->p_aio_doneq, entryp, aio_proc_link);
	procp->p_aio_active_count--;
	OSIncrementAtomic(&aio_anchor.aio_done_count);
}

static void
aio_proc_remove_done_locked(proc_t procp, aio_workq_entry *entryp)
{
	TAILQ_REMOVE(&procp->p_aio_doneq, entryp, aio_proc_link);
	OSDecrementAtomic(&aio_anchor.aio_done_count);
	aio_decrement_total_count();
	procp->p_aio_total_count--;
}

static void
aio_proc_unlock(proc_t procp)
{
	lck_mtx_unlock(aio_proc_mutex(procp));
}

static lck_mtx_t*
aio_proc_mutex(proc_t procp)
{
	return &procp->p_mlock;
}

static void
aio_entry_ref_locked(aio_workq_entry *entryp)
{
	ASSERT_AIO_ENTRY_LOCK_OWNED(entryp);

	if (entryp->aio_refcount < 0) {
		panic("AIO workq entry with a negative refcount.\n");
	}
	entryp->aio_refcount++;
}


/* Note: does not free the entry; the free happens in aio_entry_unref() */
static void
aio_entry_unref_locked(aio_workq_entry *entryp)
{
	ASSERT_AIO_ENTRY_LOCK_OWNED(entryp);

	entryp->aio_refcount--;
	if (entryp->aio_refcount < 0) {
		panic("AIO workq entry with a negative refcount.\n");
	}
}

static void
aio_entry_ref(aio_workq_entry *entryp)
{
	aio_entry_lock_spin(entryp);
	aio_entry_ref_locked(entryp);
	aio_entry_unlock(entryp);
}
static void
aio_entry_unref(aio_workq_entry *entryp)
{
	aio_entry_lock_spin(entryp);
	aio_entry_unref_locked(entryp);

	if ((entryp->aio_refcount == 0) && ((entryp->flags & AIO_DO_FREE) != 0)) {
		aio_entry_unlock(entryp);
		aio_free_request(entryp);
	} else {
		aio_entry_unlock(entryp);
	}

	return;
}

static void
aio_entry_update_for_cancel(aio_workq_entry *entryp, boolean_t cancelled, int wait_for_completion, boolean_t disable_notification)
{
	aio_entry_lock_spin(entryp);

	if (cancelled) {
		aio_entry_ref_locked(entryp);
		entryp->errorval = ECANCELED;
		entryp->returnval = -1;
	}

	if ( wait_for_completion ) {
		entryp->flags |= wait_for_completion; /* flag for special completion processing */
	}

	if ( disable_notification ) {
		entryp->flags |= AIO_DISABLE; /* Don't want a signal */
	}

	aio_entry_unlock(entryp);
}
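
/*
 * Typical use of the entry refcount, as seen in do_aio_cancel_locked() and
 * aio_work_thread() below: a ref pins the entry across a window where no
 * lock is held, and the final unref frees it only if aio_return() or
 * _aio_exit() already marked it AIO_DO_FREE. A sketch of the pattern:
 *
 *	aio_entry_ref(entryp);		// pin while we work unlocked
 *	do_aio_completion(entryp);	// ... use the entry ...
 *	aio_entry_unref(entryp);	// frees here iff AIO_DO_FREE was set
 */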

static int
aio_entry_try_workq_remove(aio_workq_entry *entryp)
{
	/* Can only be cancelled if it's still on a work queue */
	if (entryp->aio_workq_link.tqe_prev != NULL) {
		aio_workq_t queue;

		/* Will have to check again under the lock */
		queue = aio_entry_workq(entryp);
		aio_workq_lock_spin(queue);
		if (entryp->aio_workq_link.tqe_prev != NULL) {
			aio_workq_remove_entry_locked(queue, entryp);
			aio_workq_unlock(queue);
			return 1;
		} else {
			aio_workq_unlock(queue);
		}
	}

	return 0;
}

static void
aio_workq_lock_spin(aio_workq_t wq)
{
	lck_mtx_lock_spin(aio_workq_mutex(wq));
}

static void
aio_workq_unlock(aio_workq_t wq)
{
	lck_mtx_unlock(aio_workq_mutex(wq));
}

static lck_mtx_t*
aio_workq_mutex(aio_workq_t wq)
{
	return &wq->aioq_mtx;
}

/*
 * aio_cancel - attempt to cancel one or more async IO requests currently
 * outstanding against file descriptor uap->fd. If uap->aiocbp is not
 * NULL then only one specific IO is cancelled (if possible). If uap->aiocbp
 * is NULL then all outstanding async IO requests for the given file
 * descriptor are cancelled (if possible).
 */
int
aio_cancel(proc_t p, struct aio_cancel_args *uap, int *retval )
{
	struct user_aiocb my_aiocb;
	int result;

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel)) | DBG_FUNC_START,
		      (int)p, (int)uap->aiocbp, 0, 0, 0 );

	/* quick check to see if there are any async IO requests queued up */
	if (aio_get_all_queues_count() < 1) {
		result = 0;
		*retval = AIO_ALLDONE;
		goto ExitRoutine;
	}

	*retval = -1;
	if ( uap->aiocbp != USER_ADDR_NULL ) {
		if ( proc_is64bit(p) ) {
			struct user64_aiocb aiocb64;

			result = copyin( uap->aiocbp, &aiocb64, sizeof(aiocb64) );
			if ( result == 0 )
				do_munge_aiocb_user64_to_user(&aiocb64, &my_aiocb);

		} else {
			struct user32_aiocb aiocb32;

			result = copyin( uap->aiocbp, &aiocb32, sizeof(aiocb32) );
			if ( result == 0 )
				do_munge_aiocb_user32_to_user( &aiocb32, &my_aiocb );
		}

		if ( result != 0 ) {
			result = EAGAIN;
			goto ExitRoutine;
		}

		/* NOTE - the POSIX standard says a mismatch between the file */
		/* descriptor passed in and the file descriptor embedded in */
		/* the aiocb causes unspecified results. We return EBADF in */
		/* that situation. */
		if ( uap->fd != my_aiocb.aio_fildes ) {
			result = EBADF;
			goto ExitRoutine;
		}
	}

	aio_proc_lock(p);
	result = do_aio_cancel_locked( p, uap->fd, uap->aiocbp, 0, FALSE );
	ASSERT_AIO_PROC_LOCK_OWNED(p);
	aio_proc_unlock(p);

	if ( result != -1 ) {
		*retval = result;
		result = 0;
		goto ExitRoutine;
	}

	result = EBADF;

ExitRoutine:
	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel)) | DBG_FUNC_END,
		      (int)p, (int)uap->aiocbp, result, 0, 0 );

	return( result );

} /* aio_cancel */
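
/*
 * User-space sketch of the three result modes above (illustrative only;
 * fd is assumed to be an open descriptor with outstanding AIO):
 *
 *	#include <aio.h>
 *	#include <errno.h>
 *
 *	switch (aio_cancel(fd, NULL)) {		// NULL aiocbp: all IO on fd
 *	case AIO_CANCELED:	break;		// everything was still on a workq
 *	case AIO_NOTCANCELED:	break;		// some IO was already in flight
 *	case AIO_ALLDONE:	break;		// nothing left to cancel
 *	case -1:		perror("aio_cancel");	// e.g. EBADF on fd mismatch
 *	}
 */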


/*
 * _aio_close - internal function used to clean up async IO requests for
 * a file descriptor that is closing.
 * THIS MAY BLOCK.
 */
__private_extern__ void
_aio_close(proc_t p, int fd )
{
	int error;

	/* quick check to see if there are any async IO requests queued up */
	if (aio_get_all_queues_count() < 1) {
		return;
	}

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close)) | DBG_FUNC_START,
		      (int)p, fd, 0, 0, 0 );

	/* cancel all async IO requests on our todo queues for this file descriptor */
	aio_proc_lock(p);
	error = do_aio_cancel_locked( p, fd, 0, AIO_CLOSE_WAIT, FALSE );
	ASSERT_AIO_PROC_LOCK_OWNED(p);
	if ( error == AIO_NOTCANCELED ) {
		/*
		 * AIO_NOTCANCELED is returned when we find an aio request for this process
		 * and file descriptor on the active async IO queue. Active requests cannot
		 * be cancelled so we must wait for them to complete. We will get a special
		 * wake up call on our channel used to sleep for ALL active requests to
		 * complete. This sleep channel (proc.AIO_CLEANUP_SLEEP_CHAN) is only used
		 * when we must wait for all active aio requests.
		 */

		KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close_sleep)) | DBG_FUNC_NONE,
			      (int)p, fd, 0, 0, 0 );

		while (aio_proc_active_requests_for_file(p, fd) > 0) {
			msleep(&p->AIO_CLEANUP_SLEEP_CHAN, aio_proc_mutex(p), PRIBIO, "aio_close", 0 );
		}

	}

	aio_proc_unlock(p);

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close)) | DBG_FUNC_END,
		      (int)p, fd, 0, 0, 0 );

	return;

} /* _aio_close */


/*
 * aio_error - return the error status associated with the async IO
 * request referred to by uap->aiocbp. The error status is the errno
 * value that would be set by the corresponding IO request (read, write,
 * fdatasync, or sync).
 */
int
aio_error(proc_t p, struct aio_error_args *uap, int *retval )
{
	aio_workq_entry *entryp;
	int error;

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error)) | DBG_FUNC_START,
		      (int)p, (int)uap->aiocbp, 0, 0, 0 );

	/* see if there are any aios to check */
	if (aio_get_all_queues_count() < 1) {
		return EINVAL;
	}

	aio_proc_lock(p);

	/* look for a match on our queue of async IO requests that have completed */
	TAILQ_FOREACH( entryp, &p->p_aio_doneq, aio_proc_link) {
		if ( entryp->uaiocbp == uap->aiocbp ) {
			ASSERT_AIO_FROM_PROC(entryp, p);

			aio_entry_lock_spin(entryp);
			*retval = entryp->errorval;
			error = 0;
			aio_entry_unlock(entryp);
			KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error_val)) | DBG_FUNC_NONE,
				      (int)p, (int)uap->aiocbp, *retval, 0, 0 );
			goto ExitRoutine;
		}
	}

	/* look for a match on our queue of active async IO requests */
	TAILQ_FOREACH( entryp, &p->p_aio_activeq, aio_proc_link) {
		if ( entryp->uaiocbp == uap->aiocbp ) {
			ASSERT_AIO_FROM_PROC(entryp, p);
			*retval = EINPROGRESS;
			error = 0;
			KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error_activeq)) | DBG_FUNC_NONE,
				      (int)p, (int)uap->aiocbp, *retval, 0, 0 );
			goto ExitRoutine;
		}
	}

	error = EINVAL;

ExitRoutine:
	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error)) | DBG_FUNC_END,
		      (int)p, (int)uap->aiocbp, error, 0, 0 );
	aio_proc_unlock(p);

	return( error );

} /* aio_error */
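
/*
 * User-space sketch of the aio_error() contract implemented above
 * (illustrative only): EINPROGRESS while the request is on the activeq,
 * then the request's errno value once it reaches the doneq. cb is the
 * aiocb from the earlier sketch.
 *
 *	int err;
 *	while ((err = aio_error(&cb)) == EINPROGRESS)
 *		;			// spin; better, wait in aio_suspend()
 *	if (err != 0)
 *		errno = err;		// the IO failed with this errno
 */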


/*
 * aio_fsync - asynchronously force all IO operations associated
 * with the file indicated by the file descriptor (uap->aiocbp->aio_fildes) and
 * queued at the time of the call to the synchronized completion state.
 * NOTE - we do not support op O_DSYNC at this point since we do not support the
 * fdatasync() call.
 */
int
aio_fsync(proc_t p, struct aio_fsync_args *uap, int *retval )
{
	int error;
	int fsync_kind;

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync)) | DBG_FUNC_START,
		      (int)p, (int)uap->aiocbp, uap->op, 0, 0 );

	*retval = 0;
	/* 0 := O_SYNC for binary backward compatibility with Panther */
	if (uap->op == O_SYNC || uap->op == 0)
		fsync_kind = AIO_FSYNC;
	else if ( uap->op == O_DSYNC )
		fsync_kind = AIO_DSYNC;
	else {
		*retval = -1;
		error = EINVAL;
		goto ExitRoutine;
	}

	error = aio_queue_async_request( p, uap->aiocbp, fsync_kind );
	if ( error != 0 )
		*retval = -1;

ExitRoutine:
	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync)) | DBG_FUNC_END,
		      (int)p, (int)uap->aiocbp, error, 0, 0 );

	return( error );

} /* aio_fsync */


/* aio_read - asynchronously read uap->aiocbp->aio_nbytes bytes from the
 * file descriptor (uap->aiocbp->aio_fildes) into the buffer
 * (uap->aiocbp->aio_buf).
 */
int
aio_read(proc_t p, struct aio_read_args *uap, int *retval )
{
	int error;

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_read)) | DBG_FUNC_START,
		      (int)p, (int)uap->aiocbp, 0, 0, 0 );

	*retval = 0;

	error = aio_queue_async_request( p, uap->aiocbp, AIO_READ );
	if ( error != 0 )
		*retval = -1;

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_read)) | DBG_FUNC_END,
		      (int)p, (int)uap->aiocbp, error, 0, 0 );

	return( error );

} /* aio_read */
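
/*
 * User-space sketch of aio_fsync() as handled above (illustrative only;
 * fd is assumed open): O_SYNC, or 0 for Panther compatibility, is the
 * supported op; per the NOTE above, O_DSYNC is not supported at this point.
 *
 *	struct aiocb scb;
 *	memset(&scb, 0, sizeof(scb));
 *	scb.aio_fildes = fd;
 *	if (aio_fsync(O_SYNC, &scb) != 0)
 *		perror("aio_fsync");
 */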


/*
 * aio_return - return the return status associated with the async IO
 * request referred to by uap->aiocbp. The return status is the value
 * that would be returned by the corresponding IO request (read, write,
 * fdatasync, or sync). This is where we release the kernel resources
 * held for the async IO call associated with the given aiocb pointer.
 */
int
aio_return(proc_t p, struct aio_return_args *uap, user_ssize_t *retval )
{
	aio_workq_entry *entryp;
	int error;
	boolean_t proc_lock_held = FALSE;

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return)) | DBG_FUNC_START,
		      (int)p, (int)uap->aiocbp, 0, 0, 0 );

	/* See if there are any entries to check */
	if (aio_get_all_queues_count() < 1) {
		error = EINVAL;
		goto ExitRoutine;
	}

	aio_proc_lock(p);
	proc_lock_held = TRUE;
	*retval = 0;

	/* look for a match on our queue of async IO requests that have completed */
	TAILQ_FOREACH( entryp, &p->p_aio_doneq, aio_proc_link) {
		ASSERT_AIO_FROM_PROC(entryp, p);
		if ( entryp->uaiocbp == uap->aiocbp ) {
			/* Done and valid for aio_return(), pull it off the list */
			aio_proc_remove_done_locked(p, entryp);

			/* Drop the proc lock, but keep the entry locked */
			aio_entry_lock(entryp);
			aio_proc_unlock(p);
			proc_lock_held = FALSE;

			*retval = entryp->returnval;
			error = 0;

			/* No references and off all lists, safe to free */
			if (entryp->aio_refcount == 0) {
				aio_entry_unlock(entryp);
				aio_free_request(entryp);
			}
			else {
				/* Whoever has the refcount will have to free it */
				entryp->flags |= AIO_DO_FREE;
				aio_entry_unlock(entryp);
			}


			KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return_val)) | DBG_FUNC_NONE,
				      (int)p, (int)uap->aiocbp, *retval, 0, 0 );
			goto ExitRoutine;
		}
	}

	/* look for a match on our queue of active async IO requests */
	TAILQ_FOREACH( entryp, &p->p_aio_activeq, aio_proc_link) {
		ASSERT_AIO_FROM_PROC(entryp, p);
		if ( entryp->uaiocbp == uap->aiocbp ) {
			error = EINPROGRESS;
			KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return_activeq)) | DBG_FUNC_NONE,
				      (int)p, (int)uap->aiocbp, *retval, 0, 0 );
			goto ExitRoutine;
		}
	}

	error = EINVAL;

ExitRoutine:
	if (proc_lock_held)
		aio_proc_unlock(p);
	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return)) | DBG_FUNC_END,
		      (int)p, (int)uap->aiocbp, error, 0, 0 );

	return( error );

} /* aio_return */
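
/*
 * User-space sketch of the aio_return() contract implemented above
 * (illustrative only): call it exactly once per request, and only after
 * aio_error() stops returning EINPROGRESS; this call releases the kernel
 * entry. cb is the aiocb from the earlier sketch.
 *
 *	if (aio_error(&cb) != EINPROGRESS) {
 *		ssize_t res = aio_return(&cb);	// byte count, or -1
 *		// calling aio_return(&cb) again would get EINVAL:
 *		// the entry was pulled off the doneq and freed
 *	}
 */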


/*
 * _aio_exec - internal function used to clean up async IO requests for
 * a process that is going away due to exec(). We cancel any async IOs
 * we can and wait for those already active. We also disable signaling
 * for cancelled or active aio requests that complete.
 * This routine MAY block!
 */
__private_extern__ void
_aio_exec(proc_t p )
{

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exec)) | DBG_FUNC_START,
		      (int)p, 0, 0, 0, 0 );

	_aio_exit( p );

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exec)) | DBG_FUNC_END,
		      (int)p, 0, 0, 0, 0 );

	return;

} /* _aio_exec */


/*
 * _aio_exit - internal function used to clean up async IO requests for
 * a process that is terminating (via exit() or exec()). We cancel any async
 * IOs we can and wait for those already active. We also disable signaling
 * for cancelled or active aio requests that complete. This routine MAY block!
 */
__private_extern__ void
_aio_exit(proc_t p )
{
	int error;
	aio_workq_entry *entryp;


	/* quick check to see if there are any async IO requests queued up */
	if (aio_get_all_queues_count() < 1) {
		return;
	}

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit)) | DBG_FUNC_START,
		      (int)p, 0, 0, 0, 0 );

	aio_proc_lock(p);

	/*
	 * cancel async IO requests on the todo work queue and wait for those
	 * already active to complete.
	 */
	error = do_aio_cancel_locked( p, 0, 0, AIO_EXIT_WAIT, TRUE );
	ASSERT_AIO_PROC_LOCK_OWNED(p);
	if ( error == AIO_NOTCANCELED ) {
		/*
		 * AIO_NOTCANCELED is returned when we find an aio request for this process
		 * on the active async IO queue. Active requests cannot be cancelled so we
		 * must wait for them to complete. We will get a special wake up call on
		 * our channel used to sleep for ALL active requests to complete. This sleep
		 * channel (proc.AIO_CLEANUP_SLEEP_CHAN) is only used when we must wait for all
		 * active aio requests.
		 */

		KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit_sleep)) | DBG_FUNC_NONE,
			      (int)p, 0, 0, 0, 0 );

		while (p->p_aio_active_count != 0) {
			msleep(&p->AIO_CLEANUP_SLEEP_CHAN, aio_proc_mutex(p), PRIBIO, "aio_exit", 0 );
		}
	}

	if (p->p_aio_active_count != 0) {
		panic("Exiting process has %d active AIOs after cancellation has completed.\n", p->p_aio_active_count);
	}

	/* release all aio resources used by this process */
	entryp = TAILQ_FIRST( &p->p_aio_doneq );
	while ( entryp != NULL ) {
		ASSERT_AIO_FROM_PROC(entryp, p);
		aio_workq_entry *next_entryp;

		next_entryp = TAILQ_NEXT( entryp, aio_proc_link);
		aio_proc_remove_done_locked(p, entryp);

		/* we cannot free requests that are still completing */
		aio_entry_lock_spin(entryp);
		if (entryp->aio_refcount == 0) {
			aio_proc_unlock(p);
			aio_entry_unlock(entryp);
			aio_free_request(entryp);

			/* need to start over since aio_doneq may have been */
			/* changed while we were away. */
			aio_proc_lock(p);
			entryp = TAILQ_FIRST( &p->p_aio_doneq );
			continue;
		}
		else {
			/* whoever has the reference will have to do the free */
			entryp->flags |= AIO_DO_FREE;
		}

		aio_entry_unlock(entryp);
		entryp = next_entryp;
	}

	aio_proc_unlock(p);

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit)) | DBG_FUNC_END,
		      (int)p, 0, 0, 0, 0 );
	return;

} /* _aio_exit */


static boolean_t
should_cancel(aio_workq_entry *entryp, user_addr_t aiocbp, int fd)
{
	if ( (aiocbp == USER_ADDR_NULL && fd == 0) ||
	     (aiocbp != USER_ADDR_NULL && entryp->uaiocbp == aiocbp) ||
	     (aiocbp == USER_ADDR_NULL && fd == entryp->aiocb.aio_fildes) ) {
		return TRUE;
	}

	return FALSE;
}

/*
 * do_aio_cancel_locked - cancel async IO requests (if possible). We get called by
 * aio_cancel, close, and at exit.
 * There are three modes of operation: 1) cancel all async IOs for a process -
 * fd is 0 and aiocbp is NULL 2) cancel all async IOs for a file descriptor -
 * fd is > 0 and aiocbp is NULL 3) cancel one async IO associated with the
 * given aiocbp.
 * Returns -1 if no matches were found, AIO_CANCELED when we cancelled all
 * target async IO requests, AIO_NOTCANCELED if we could not cancel all
 * target async IO requests, and AIO_ALLDONE if all target async IO requests
 * were already complete.
 * WARNING - do not dereference aiocbp in this routine, it may point to user
 * land data that has not been copied in (when called from aio_cancel())
 *
 * Called with proc locked, and returns the same way.
 */
static int
do_aio_cancel_locked(proc_t p, int fd, user_addr_t aiocbp,
		     int wait_for_completion, boolean_t disable_notification )
{
	ASSERT_AIO_PROC_LOCK_OWNED(p);

	aio_workq_entry *entryp;
	int result;

	result = -1;

	/* look for a match on our queue of async todo work. */
	entryp = TAILQ_FIRST(&p->p_aio_activeq);
	while ( entryp != NULL ) {
		ASSERT_AIO_FROM_PROC(entryp, p);
		aio_workq_entry *next_entryp;

		next_entryp = TAILQ_NEXT( entryp, aio_proc_link);
		if (!should_cancel(entryp, aiocbp, fd)) {
			entryp = next_entryp;
			continue;
		}

		/* Can only be cancelled if it's still on a work queue */
		if (aio_entry_try_workq_remove(entryp) != 0) {
			/* Have removed from workq. Update entry state and take a ref */
			aio_entry_update_for_cancel(entryp, TRUE, 0, disable_notification);

			/* Put on the proc done queue and update counts, then unlock the proc */
			aio_proc_move_done_locked(p, entryp);
			aio_proc_unlock(p);

			/* Now it's officially cancelled. Do the completion */
			result = AIO_CANCELED;
			KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_async_workq)) | DBG_FUNC_NONE,
				      (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );
			do_aio_completion(entryp);

			/* This will free if the aio_return() has already happened ... */
			aio_entry_unref(entryp);
			aio_proc_lock(p);

			if ( aiocbp != USER_ADDR_NULL ) {
				return( result );
			}

			/*
			 * Restart from the head of the proc active queue since it
			 * may have been changed while we were away doing completion
			 * processing.
			 *
			 * Note that if we found an uncancellable AIO before, we will
			 * either find it again or discover that it's been completed,
			 * so resetting the result will not cause us to return success
			 * despite outstanding AIOs.
			 */
			entryp = TAILQ_FIRST(&p->p_aio_activeq);
			result = -1; /* As if beginning anew */
		} else {
			/*
			 * It's been taken off the active queue already, i.e. is in flight.
			 * All we can do is ask for notification.
			 */
			result = AIO_NOTCANCELED;

			KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_activeq)) | DBG_FUNC_NONE,
				      (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );

			/* Mark for waiting and such; will not take a ref if "cancelled" arg is FALSE */
			aio_entry_update_for_cancel(entryp, FALSE, wait_for_completion, disable_notification);

			if ( aiocbp != USER_ADDR_NULL ) {
				return( result );
			}
			entryp = next_entryp;
		}
	} /* while... */

	/*
	 * if we didn't find any matches on the todo or active queues then look for a
	 * match on our queue of async IO requests that have completed and if found
	 * return AIO_ALLDONE result.
	 *
	 * Proc AIO lock is still held.
	 */
	if ( result == -1 ) {
		TAILQ_FOREACH(entryp, &p->p_aio_doneq, aio_proc_link) {
			ASSERT_AIO_FROM_PROC(entryp, p);
			if (should_cancel(entryp, aiocbp, fd)) {
				result = AIO_ALLDONE;
				KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_doneq)) | DBG_FUNC_NONE,
					      (int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );

				if ( aiocbp != USER_ADDR_NULL ) {
					return( result );
				}
			}
		}
	}

	return( result );

} /* do_aio_cancel_locked */


/*
 * aio_suspend - suspend the calling thread until at least one of the async
 * IO operations referenced by uap->aiocblist has completed, until a signal
 * interrupts the function, or uap->timeoutp time interval (optional) has
 * passed.
 * Returns 0 if one or more async IOs have completed else -1 and errno is
 * set appropriately - EAGAIN if timeout elapses or EINTR if an interrupt
 * woke us up.
 */
int
aio_suspend(proc_t p, struct aio_suspend_args *uap, int *retval )
{
	__pthread_testcancel(1);
	return(aio_suspend_nocancel(p, (struct aio_suspend_nocancel_args *)uap, retval));
}


int
aio_suspend_nocancel(proc_t p, struct aio_suspend_nocancel_args *uap, int *retval )
{
	int error;
	int i, count;
	uint64_t abstime;
	struct user_timespec ts;
	aio_workq_entry *entryp;
	user_addr_t *aiocbpp;

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend)) | DBG_FUNC_START,
		      (int)p, uap->nent, 0, 0, 0 );

	*retval = -1;
	abstime = 0;
	aiocbpp = NULL;

	count = aio_get_all_queues_count( );
	if ( count < 1 ) {
		error = EINVAL;
		goto ExitThisRoutine;
	}

	if ( uap->nent < 1 || uap->nent > aio_max_requests_per_process ) {
		error = EINVAL;
		goto ExitThisRoutine;
	}

	if ( uap->timeoutp != USER_ADDR_NULL ) {
		if ( proc_is64bit(p) ) {
			struct user64_timespec temp;
			error = copyin( uap->timeoutp, &temp, sizeof(temp) );
			if ( error == 0 ) {
				ts.tv_sec = temp.tv_sec;
				ts.tv_nsec = temp.tv_nsec;
			}
		}
		else {
			struct user32_timespec temp;
			error = copyin( uap->timeoutp, &temp, sizeof(temp) );
			if ( error == 0 ) {
				ts.tv_sec = temp.tv_sec;
				ts.tv_nsec = temp.tv_nsec;
			}
		}
		if ( error != 0 ) {
			error = EAGAIN;
			goto ExitThisRoutine;
		}

		if ( ts.tv_sec < 0 || ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000 ) {
			error = EINVAL;
			goto ExitThisRoutine;
		}

		nanoseconds_to_absolutetime( (uint64_t)ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec,
					     &abstime );
		clock_absolutetime_interval_to_deadline( abstime, &abstime );
	}

	aiocbpp = aio_copy_in_list(p, uap->aiocblist, uap->nent);
	if ( aiocbpp == NULL ) {
		error = EAGAIN;
		goto ExitThisRoutine;
	}

	/* check list of aio requests to see if any have completed */
check_for_our_aiocbp:
	aio_proc_lock_spin(p);
	for ( i = 0; i < uap->nent; i++ ) {
		user_addr_t aiocbp;

		/* NULL elements are legal so check for 'em */
		aiocbp = *(aiocbpp + i);
		if ( aiocbp == USER_ADDR_NULL )
			continue;

		/* return immediately if any aio request in the list is done */
		TAILQ_FOREACH( entryp, &p->p_aio_doneq, aio_proc_link) {
			ASSERT_AIO_FROM_PROC(entryp, p);
			if ( entryp->uaiocbp == aiocbp ) {
				aio_proc_unlock(p);
				*retval = 0;
				error = 0;
				goto ExitThisRoutine;
			}
		}
	} /* for ( ; i < uap->nent; ) */

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend_sleep)) | DBG_FUNC_NONE,
		      (int)p, uap->nent, 0, 0, 0 );

	/*
	 * wait for an async IO to complete, a signal to fire, or the timeout
	 * to expire. We return EAGAIN (35) when the timeout expires and
	 * EINTR (4) when a signal interrupts us. If an async IO completes
	 * before a signal fires or our timeout expires, we get a wakeup call
	 * from aio_work_thread().
	 */

	error = msleep1(&p->AIO_SUSPEND_SLEEP_CHAN, aio_proc_mutex(p), PCATCH | PWAIT | PDROP, "aio_suspend", abstime); /* XXX better priority? */
	if ( error == 0 ) {
		/*
		 * got our wakeup call from aio_work_thread().
		 * Since we can get a wakeup on this channel from another thread in the
		 * same process we head back up to make sure this is for the correct aiocbp.
		 * If it is the correct aiocbp we will return from where we do the check
		 * (see entryp->uaiocbp == aiocbp after check_for_our_aiocbp label)
		 * else we will fall out and just sleep again.
		 */
		goto check_for_our_aiocbp;
	}
	else if ( error == EWOULDBLOCK ) {
		/* our timeout expired */
		error = EAGAIN;
	}
	else {
		/* we were interrupted */
		error = EINTR;
	}

ExitThisRoutine:
	if ( aiocbpp != NULL )
		FREE( aiocbpp, M_TEMP );

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend)) | DBG_FUNC_END,
		      (int)p, uap->nent, error, 0, 0 );

	return( error );

} /* aio_suspend */


/* aio_write - asynchronously write uap->aiocbp->aio_nbytes bytes to the
 * file descriptor (uap->aiocbp->aio_fildes) from the buffer
 * (uap->aiocbp->aio_buf).
 */

int
aio_write(proc_t p, struct aio_write_args *uap, int *retval )
{
	int error;

	*retval = 0;

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_write)) | DBG_FUNC_START,
		      (int)p, (int)uap->aiocbp, 0, 0, 0 );

	error = aio_queue_async_request( p, uap->aiocbp, AIO_WRITE );
	if ( error != 0 )
		*retval = -1;

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_write)) | DBG_FUNC_END,
		      (int)p, (int)uap->aiocbp, error, 0, 0 );

	return( error );

} /* aio_write */
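
/*
 * User-space sketch of aio_suspend() with a timeout, matching the
 * EAGAIN/EINTR behavior implemented above (illustrative only; cb is the
 * aiocb from the earlier sketch):
 *
 *	const struct aiocb *list[1] = { &cb };
 *	struct timespec ts = { .tv_sec = 5, .tv_nsec = 0 };
 *	if (aio_suspend(list, 1, &ts) != 0) {
 *		if (errno == EAGAIN) { }	// timeout expired
 *		if (errno == EINTR)  { }	// a signal woke us
 *	}
 */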

static user_addr_t *
aio_copy_in_list(proc_t procp, user_addr_t aiocblist, int nent)
{
	user_addr_t *aiocbpp;
	int i, result;

	/* we reserve enough space for the largest possible pointer size */
	MALLOC( aiocbpp, user_addr_t *, (nent * sizeof(user_addr_t)), M_TEMP, M_WAITOK );
	if ( aiocbpp == NULL )
		goto err;

	/* copyin our aiocb pointers from list */
	result = copyin( aiocblist, aiocbpp,
			 proc_is64bit(procp) ? (nent * sizeof(user64_addr_t))
					     : (nent * sizeof(user32_addr_t)) );
	if ( result) {
		FREE( aiocbpp, M_TEMP );
		aiocbpp = NULL;
		goto err;
	}

	/*
	 * We depend on a list of user_addr_t's so we need to
	 * munge and expand when these pointers come from a
	 * 32-bit process
	 */
	if ( !proc_is64bit(procp) ) {
		/* copy from last to first to deal with overlap */
		user32_addr_t *my_ptrp = ((user32_addr_t *)aiocbpp) + (nent - 1);
		user_addr_t *my_addrp = aiocbpp + (nent - 1);

		for (i = 0; i < nent; i++, my_ptrp--, my_addrp--) {
			*my_addrp = (user_addr_t) (*my_ptrp);
		}
	}

err:
	return (aiocbpp);
}


static int
aio_copy_in_sigev(proc_t procp, user_addr_t sigp, struct user_sigevent *sigev)
{
	int result = 0;

	if (sigp == USER_ADDR_NULL)
		goto out;

	/*
	 * We need to munge aio_sigevent since it contains pointers.
	 * Since we do not know if sigev_value is an int or a ptr we do
	 * NOT cast the ptr to a user_addr_t. This means if we send
	 * this info back to user space we need to remember sigev_value
	 * was not expanded for the 32-bit case.
	 *
	 * Notes: This does NOT affect us since we don't support
	 * sigev_value yet in the aio context.
	 */
	if ( proc_is64bit(procp) ) {
		struct user64_sigevent sigevent64;

		result = copyin( sigp, &sigevent64, sizeof(sigevent64) );
		if ( result == 0 ) {
			sigev->sigev_notify = sigevent64.sigev_notify;
			sigev->sigev_signo = sigevent64.sigev_signo;
			sigev->sigev_value.size_equivalent.sival_int = sigevent64.sigev_value.size_equivalent.sival_int;
			sigev->sigev_notify_function = sigevent64.sigev_notify_function;
			sigev->sigev_notify_attributes = sigevent64.sigev_notify_attributes;
		}

	} else {
		struct user32_sigevent sigevent32;

		result = copyin( sigp, &sigevent32, sizeof(sigevent32) );
		if ( result == 0 ) {
			sigev->sigev_notify = sigevent32.sigev_notify;
			sigev->sigev_signo = sigevent32.sigev_signo;
			sigev->sigev_value.size_equivalent.sival_int = sigevent32.sigev_value.sival_int;
			sigev->sigev_notify_function = CAST_USER_ADDR_T(sigevent32.sigev_notify_function);
			sigev->sigev_notify_attributes = CAST_USER_ADDR_T(sigevent32.sigev_notify_attributes);
		}
	}

	if ( result != 0 ) {
		result = EAGAIN;
	}

out:
	return (result);
}

/*
 * aio_enqueue_work
 *
 * Queue up the entry on the aio asynchronous work queue in priority order
 * based on the relative priority of the request. We calculate the relative
 * priority using the nice value of the caller and the value of the
 * request's aio_reqprio (see the disabled insertion-sort code below).
 *
 * Parameters:	procp		Process queueing the I/O
 *		entryp		The work queue entry being queued
 *
 * Returns:	(void)		No failure modes
 *
 * Notes:	This function is used for both lio_listio and aio
 *
 *	XXX:	At some point, we may have to consider thread priority
 *		rather than process priority, but we don't maintain the
 *		adjusted priority for threads the POSIX way.
 *
 *
 * Called with proc locked.
 */
static void
aio_enqueue_work( proc_t procp, aio_workq_entry *entryp, int proc_locked)
{
#if 0
	aio_workq_entry *my_entryp;	/* used for insertion sort */
#endif /* 0 */
	aio_workq_t queue = aio_entry_workq(entryp);

	if (proc_locked == 0) {
		aio_proc_lock(procp);
	}

	ASSERT_AIO_PROC_LOCK_OWNED(procp);

	/* Onto proc queue */
	TAILQ_INSERT_TAIL(&procp->p_aio_activeq, entryp, aio_proc_link);
	procp->p_aio_active_count++;
	procp->p_aio_total_count++;

	/* And work queue */
	aio_workq_lock_spin(queue);
	aio_workq_add_entry_locked(queue, entryp);
	wait_queue_wakeup_one(queue->aioq_waitq, queue, THREAD_AWAKENED, -1);
	aio_workq_unlock(queue);

	if (proc_locked == 0) {
		aio_proc_unlock(procp);
	}

#if 0
	/*
	 * Procedure:
	 *
	 * (1) The nice value is in the range PRIO_MIN..PRIO_MAX [-20..20]
	 * (2) The normalized nice value is in the range 0..((2 * NZERO) - 1)
	 *     which is [0..39], with 0 not being used. In nice values, the
	 *     lower the nice value, the higher the priority.
	 * (3) The normalized scheduling priority is the highest nice value
	 *     minus the current nice value. In I/O scheduling priority, the
	 *     higher the value the lower the priority, so it is the inverse
	 *     of the nice value (the higher the number, the higher the I/O
	 *     priority).
	 * (4) From the normalized scheduling priority, we subtract the
	 *     request priority to get the request priority value number;
	 *     this means that requests are only capable of depressing their
	 *     priority relative to other requests.
	 */
	entryp->priority = (((2 * NZERO) - 1) - procp->p_nice);

	/* only permit depressing the priority */
	if (entryp->aiocb.aio_reqprio < 0)
		entryp->aiocb.aio_reqprio = 0;
	if (entryp->aiocb.aio_reqprio > 0) {
		entryp->priority -= entryp->aiocb.aio_reqprio;
		if (entryp->priority < 0)
			entryp->priority = 0;
	}

	/* Insertion sort the entry; lowest ->priority to highest */
	TAILQ_FOREACH(my_entryp, &aio_anchor.aio_async_workq, aio_workq_link) {
		if ( entryp->priority <= my_entryp->priority) {
			TAILQ_INSERT_BEFORE(my_entryp, entryp, aio_workq_link);
			break;
		}
	}
	if (my_entryp == NULL)
		TAILQ_INSERT_TAIL( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
#endif /* 0 */
}

/*
 * lio_listio - initiate a list of IO requests. We process the list of
 * aiocbs either synchronously (mode == LIO_WAIT) or asynchronously
 * (mode == LIO_NOWAIT).
 *
 * The caller gets error and return status for each aiocb in the list
 * via aio_error and aio_return. We must keep completed requests until
 * released by the aio_return call.
 */
int
lio_listio(proc_t p, struct lio_listio_args *uap, int *retval )
{
	int i;
	int call_result;
	int result;
	int old_count;
	aio_workq_entry **entryp_listp;
	user_addr_t *aiocbpp;
	struct user_sigevent aiosigev;
	aio_lio_context *lio_context;
	boolean_t free_context = FALSE;

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_listio)) | DBG_FUNC_START,
		      (int)p, uap->nent, uap->mode, 0, 0 );

	entryp_listp = NULL;
	lio_context = NULL;
	aiocbpp = NULL;
	call_result = -1;
	*retval = -1;
	if ( !(uap->mode == LIO_NOWAIT || uap->mode == LIO_WAIT) ) {
		call_result = EINVAL;
		goto ExitRoutine;
	}

	if ( uap->nent < 1 || uap->nent > AIO_LISTIO_MAX ) {
		call_result = EINVAL;
		goto ExitRoutine;
	}

	/*
	 * allocate a list of aio_workq_entry pointers that we will use
	 * to queue up all our requests at once while holding our lock.
	 */
	MALLOC( entryp_listp, void *, (uap->nent * sizeof(aio_workq_entry *)), M_TEMP, M_WAITOK );
	if ( entryp_listp == NULL ) {
		call_result = EAGAIN;
		goto ExitRoutine;
	}

	MALLOC( lio_context, aio_lio_context*, sizeof(aio_lio_context), M_TEMP, M_WAITOK );
	if ( lio_context == NULL ) {
		call_result = EAGAIN;
		goto ExitRoutine;
	}

#if DEBUG
	OSIncrementAtomic(&lio_contexts_alloced);
#endif /* DEBUG */

	bzero(lio_context, sizeof(aio_lio_context));

	aiocbpp = aio_copy_in_list(p, uap->aiocblist, uap->nent);
	if ( aiocbpp == NULL ) {
		call_result = EAGAIN;
		goto ExitRoutine;
	}

	/*
	 * Use the sigevent passed in to lio_listio for each of our calls, but
	 * only do completion notification after the last request completes.
	 */
	bzero(&aiosigev, sizeof(aiosigev));
	/* Only copy in a sigev if the user supplied one */
	if (uap->sigp != USER_ADDR_NULL) {
		call_result = aio_copy_in_sigev(p, uap->sigp, &aiosigev);
		if ( call_result)
			goto ExitRoutine;
	}

	/* process list of aio requests */
	lio_context->io_issued = uap->nent;
	lio_context->io_waiter = uap->mode == LIO_WAIT ? 1 : 0; /* Should it be freed by last AIO */
	for ( i = 0; i < uap->nent; i++ ) {
		user_addr_t my_aiocbp;
		aio_workq_entry *entryp;

		*(entryp_listp + i) = NULL;
		my_aiocbp = *(aiocbpp + i);

		/* NULL elements are legal so check for 'em */
		if ( my_aiocbp == USER_ADDR_NULL ) {
			aio_proc_lock_spin(p);
			lio_context->io_issued--;
			aio_proc_unlock(p);
			continue;
		}

		/*
		 * We use lio_context to mark IO requests for delayed completion
		 * processing which means we wait until all IO requests in the
		 * group have completed before we either return to the caller
		 * when mode is LIO_WAIT or signal user when mode is LIO_NOWAIT.
		 *
		 * We use the address of the lio_context for this, since it is
		 * unique in the address space.
		 */
		result = lio_create_entry( p, my_aiocbp, lio_context, (entryp_listp + i) );
		if ( result != 0 && call_result == -1 )
			call_result = result;

		/* NULL elements are legal so check for 'em */
		entryp = *(entryp_listp + i);
		if ( entryp == NULL ) {
			aio_proc_lock_spin(p);
			lio_context->io_issued--;
			aio_proc_unlock(p);
			continue;
		}

		if ( uap->mode == LIO_NOWAIT ) {
			/* Set signal handler, if any */
			entryp->aiocb.aio_sigevent = aiosigev;
		} else {
			/* flag that this thread blocks pending completion */
			entryp->flags |= AIO_LIO_NOTIFY;
		}

		/* check our aio limits to throttle bad or rude user land behavior */
		old_count = aio_increment_total_count();

		aio_proc_lock_spin(p);
		if ( old_count >= aio_max_requests ||
		     aio_get_process_count( entryp->procp ) >= aio_max_requests_per_process ||
		     is_already_queued( entryp->procp, entryp->uaiocbp ) == TRUE ) {

			lio_context->io_issued--;
			aio_proc_unlock(p);

			aio_decrement_total_count();

			if ( call_result == -1 )
				call_result = EAGAIN;
			aio_free_request(entryp);
			entryp_listp[i] = NULL;
			continue;
		}

		lck_mtx_convert_spin(aio_proc_mutex(p));
		aio_enqueue_work(p, entryp, 1);
		aio_proc_unlock(p);

		KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) | DBG_FUNC_NONE,
			      (int)p, (int)entryp->uaiocbp, 0, 0, 0 );
	}

	switch(uap->mode) {
	case LIO_WAIT:
		aio_proc_lock_spin(p);
		while (lio_context->io_completed < lio_context->io_issued) {
			result = msleep(lio_context, aio_proc_mutex(p), PCATCH | PRIBIO | PSPIN, "lio_listio", 0);

			/* If we were interrupted, fail out (even if all finished) */
			if (result != 0) {
				call_result = EINTR;
				lio_context->io_waiter = 0;
				break;
			}
		}

		/* If all IOs have finished, we must free it */
		if (lio_context->io_completed == lio_context->io_issued) {
			free_context = TRUE;
		}

		aio_proc_unlock(p);
		break;

	case LIO_NOWAIT:
		break;
	}

	/* call_result == -1 means we had no trouble queueing up requests */
	if ( call_result == -1 ) {
		call_result = 0;
		*retval = 0;
	}

ExitRoutine:
	if ( entryp_listp != NULL )
		FREE( entryp_listp, M_TEMP );
	if ( aiocbpp != NULL )
		FREE( aiocbpp, M_TEMP );
	if ((lio_context != NULL) && ((lio_context->io_issued == 0) || (free_context == TRUE))) {
		free_lio_context(lio_context);
	}

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_listio)) | DBG_FUNC_END,
		      (int)p, call_result, 0, 0, 0 );

	return( call_result );

} /* lio_listio */
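
/*
 * User-space sketch of both lio_listio() modes handled above (illustrative
 * only, assuming read_cb, write_cb, and sev were set up as in the earlier
 * sketches); NULL entries in the list are legal and skipped:
 *
 *	struct aiocb *iolist[2] = { &read_cb, &write_cb };
 *	// LIO_WAIT: block until the whole group has completed
 *	if (lio_listio(LIO_WAIT, iolist, 2, NULL) != 0)
 *		perror("lio_listio");
 *	// LIO_NOWAIT: return at once; the sigevent fires after the last IO
 *	lio_listio(LIO_NOWAIT, iolist, 2, &sev);
 */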

/*
 * aio worker thread. this is where all the real work gets done.
 * we get a wake up call on our work queue's wait queue after new work
 * is queued up.
 */
static void
aio_work_thread( void )
{
	aio_workq_entry *entryp;
	int error;
	vm_map_t currentmap;
	vm_map_t oldmap = VM_MAP_NULL;
	task_t oldaiotask = TASK_NULL;
	struct uthread *uthreadp = NULL;

	for( ;; ) {
		/*
		 * returns with the entry ref'ed.
		 * sleeps until work is available.
		 */
		entryp = aio_get_some_work();

		KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread)) | DBG_FUNC_START,
			      (int)entryp->procp, (int)entryp->uaiocbp, entryp->flags, 0, 0 );

		/*
		 * Assume the target's address space identity for the duration
		 * of the IO. Note: don't need to have the entryp locked,
		 * because the proc and map don't change until it's freed.
		 */
		currentmap = get_task_map( (current_proc())->task );
		if ( currentmap != entryp->aio_map ) {
			uthreadp = (struct uthread *) get_bsdthread_info(current_thread());
			oldaiotask = uthreadp->uu_aio_task;
			uthreadp->uu_aio_task = entryp->procp->task;
			oldmap = vm_map_switch( entryp->aio_map );
		}

		if ( (entryp->flags & AIO_READ) != 0 ) {
			error = do_aio_read( entryp );
		}
		else if ( (entryp->flags & AIO_WRITE) != 0 ) {
			error = do_aio_write( entryp );
		}
		else if ( (entryp->flags & (AIO_FSYNC | AIO_DSYNC)) != 0 ) {
			error = do_aio_fsync( entryp );
		}
		else {
			printf( "%s - unknown aio request - flags 0x%02X \n",
				__FUNCTION__, entryp->flags );
			error = EINVAL;
		}

		/* Restore old map */
		if ( currentmap != entryp->aio_map ) {
			(void) vm_map_switch( oldmap );
			uthreadp->uu_aio_task = oldaiotask;
		}

		KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread)) | DBG_FUNC_END,
			      (int)entryp->procp, (int)entryp->uaiocbp, entryp->errorval,
			      entryp->returnval, 0 );


		/* XXX COUNTS */
		aio_entry_lock_spin(entryp);
		entryp->errorval = error;
		aio_entry_unlock(entryp);

		/* we're done with the IO request so pop it off the active queue and */
		/* push it on the done queue */
		aio_proc_lock(entryp->procp);
		aio_proc_move_done_locked(entryp->procp, entryp);
		aio_proc_unlock(entryp->procp);

		OSDecrementAtomic(&aio_anchor.aio_inflight_count);

		/* remove our reference to the user land map. */
		if ( VM_MAP_NULL != entryp->aio_map ) {
			vm_map_t my_map;

			my_map = entryp->aio_map;
			entryp->aio_map = VM_MAP_NULL;
			vm_map_deallocate( my_map );
		}

		/* Provide notifications */
		do_aio_completion( entryp );

		/* Will free if needed */
		aio_entry_unref(entryp);

	} /* for ( ;; ) */

	/* NOT REACHED */

} /* aio_work_thread */


/*
 * aio_get_some_work - get the next async IO request that is ready to be executed.
 * aio_fsync complicates matters a bit since we cannot do the fsync until all async
 * IO requests at the time the aio_fsync call came in have completed.
 * NOTE - the work queue lock is taken and dropped internally; the caller
 * need not hold any aio locks.
 */
static aio_workq_entry *
aio_get_some_work( void )
{
	aio_workq_entry *entryp = NULL;
	aio_workq_t queue = NULL;

	/* Just one queue for the moment. In the future there will be many. */
	queue = &aio_anchor.aio_async_workqs[0];
	aio_workq_lock_spin(queue);
	if (queue->aioq_count == 0) {
		goto nowork;
	}

	/*
	 * Hold the queue lock.
	 *
	 * pop some work off the work queue and add to our active queue
	 * Always start with the queue lock held.
	 */
	for(;;) {
		/*
		 * Pull off the work queue. Once it's off, it can't be cancelled,
		 * so we can take our ref once we drop the queue lock.
		 */
		entryp = TAILQ_FIRST(&queue->aioq_entries);

		/*
		 * If there's no work or only fsyncs that need delay, go to sleep
		 * and then start anew from aio_work_thread
		 */
		if (entryp == NULL) {
			goto nowork;
		}

		aio_workq_remove_entry_locked(queue, entryp);

		aio_workq_unlock(queue);

		/*
		 * Check if it's an fsync that must be delayed. No need to lock the entry;
		 * that flag would have been set at initialization.
		 */
		if ( (entryp->flags & AIO_FSYNC) != 0 ) {
			/*
			 * Check for unfinished operations on the same file
			 * in this proc's queue.
			 */
			aio_proc_lock_spin(entryp->procp);
			if ( aio_delay_fsync_request( entryp ) ) {
				/* It needs to be delayed. Put it back on the end of the work queue */
				KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync_delay)) | DBG_FUNC_NONE,
					      (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );

				aio_proc_unlock(entryp->procp);

				aio_workq_lock_spin(queue);
				aio_workq_add_entry_locked(queue, entryp);
				continue;
			}
			aio_proc_unlock(entryp->procp);
		}

		break;
	}

	aio_entry_ref(entryp);

	OSIncrementAtomic(&aio_anchor.aio_inflight_count);
	return( entryp );

nowork:
	/* We will wake up when someone enqueues something */
	wait_queue_assert_wait(queue->aioq_waitq, queue, THREAD_UNINT, 0);
	aio_workq_unlock(queue);
	thread_block( (thread_continue_t)aio_work_thread );

	// notreached
	return NULL;
}

/*
 * aio_delay_fsync_request - look to see if this aio_fsync request should be delayed.
 * A big, simple hammer: only send it off if it's the most recently filed IO which has
 * not been completed.
 */
static boolean_t
aio_delay_fsync_request( aio_workq_entry *entryp )
{
	if (entryp == TAILQ_FIRST(&entryp->procp->p_aio_activeq)) {
		return FALSE;
	}

	return TRUE;
} /* aio_delay_fsync_request */

static aio_workq_entry *
aio_create_queue_entry(proc_t procp, user_addr_t aiocbp, void *group_tag, int kindOfIO)
{
	aio_workq_entry *entryp;
	int result = 0;

	entryp = (aio_workq_entry *) zalloc( aio_workq_zonep );
	if ( entryp == NULL ) {
		result = EAGAIN;
		goto error_exit;
	}

	bzero( entryp, sizeof(*entryp) );

	/* fill in the rest of the aio_workq_entry */
	entryp->procp = procp;
	entryp->uaiocbp = aiocbp;
	entryp->flags |= kindOfIO;
	entryp->group_tag = group_tag;
	entryp->aio_map = VM_MAP_NULL;
	entryp->aio_refcount = 0;

	if ( proc_is64bit(procp) ) {
		struct user64_aiocb aiocb64;

		result = copyin( aiocbp, &aiocb64, sizeof(aiocb64) );
		if ( result == 0 )
			do_munge_aiocb_user64_to_user( &aiocb64, &entryp->aiocb );

	} else {
		struct user32_aiocb aiocb32;

		result = copyin( aiocbp, &aiocb32, sizeof(aiocb32) );
		if ( result == 0 )
			do_munge_aiocb_user32_to_user( &aiocb32, &entryp->aiocb );
	}

	if ( result != 0 ) {
		result = EAGAIN;
		goto error_exit;
	}

	/* get a reference to the user land map in order to keep it around */
	entryp->aio_map = get_task_map( procp->task );
	vm_map_reference( entryp->aio_map );

	/* do some more validation on the aiocb and embedded file descriptor */
	result = aio_validate( entryp );
	if ( result != 0 )
		goto error_exit_with_ref;

	/* get a reference on the current_thread, which is passed in vfs_context. */
	entryp->thread = current_thread();
	thread_reference( entryp->thread );
	return ( entryp );

error_exit_with_ref:
	if ( VM_MAP_NULL != entryp->aio_map ) {
		vm_map_deallocate( entryp->aio_map );
	}
error_exit:
	if ( result && entryp != NULL ) {
		zfree( aio_workq_zonep, entryp );
		entryp = NULL;
	}

	return ( entryp );
}

/*
 * aio_queue_async_request - queue up an async IO request on our work queue then
 * wake up one of our worker threads to do the actual work.  We get a reference
 * to our caller's user land map in order to keep it around while we are
 * processing the request.
 */
static int
aio_queue_async_request(proc_t procp, user_addr_t aiocbp, int kindOfIO )
{
	aio_workq_entry *entryp;
	int result;
	int old_count;

	old_count = aio_increment_total_count();
	if (old_count >= aio_max_requests) {
		result = EAGAIN;
		goto error_noalloc;
	}

	entryp = aio_create_queue_entry( procp, aiocbp, 0, kindOfIO );
	if ( entryp == NULL ) {
		result = EAGAIN;
		goto error_noalloc;
	}

	aio_proc_lock_spin(procp);

	if ( is_already_queued( entryp->procp, entryp->uaiocbp ) == TRUE ) {
		result = EAGAIN;
		goto error_exit;
	}

	/* check our aio limits to throttle bad or rude user land behavior */
	if (aio_get_process_count( procp ) >= aio_max_requests_per_process) {
		printf("aio_queue_async_request(): too many in flight for proc: %d.\n",
		       procp->p_aio_total_count);
		result = EAGAIN;
		goto error_exit;
	}

	/* Add the IO to proc and work queues, wake up threads as appropriate */
	lck_mtx_convert_spin(aio_proc_mutex(procp));
	aio_enqueue_work(procp, entryp, 1);

	aio_proc_unlock(procp);

	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) | DBG_FUNC_NONE,
		     (int)procp, (int)aiocbp, 0, 0, 0 );

	return( 0 );

error_exit:
	/*
	 * This entry has not been queued up so no worries about
	 * unlocked state and aio_map
	 */
	aio_proc_unlock(procp);
	aio_free_request(entryp);

error_noalloc:
	aio_decrement_total_count();

	return( result );

} /* aio_queue_async_request */
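
/*
 * Illustrative sketch (not part of the build) of the lio_listio() usage the
 * entry-creation path below serves.  Names are standard POSIX AIO; fd, buf
 * and len are assumed to exist.
 *
 *	struct aiocb a, b;
 *	memset(&a, 0, sizeof(a));
 *	memset(&b, 0, sizeof(b));
 *	a.aio_fildes = fd;
 *	a.aio_buf = buf;
 *	a.aio_nbytes = len;
 *	a.aio_lio_opcode = LIO_READ;
 *	b.aio_lio_opcode = LIO_NOP;	// accepted, then quietly dropped
 *
 *	struct aiocb *list[2] = { &a, &b };
 *	lio_listio(LIO_WAIT, list, 2, NULL);	// sleeps until 'a' completes
 *
 * The group_tag (an aio_lio_context) ties the non-NOP entries together so
 * completion can be reported for the group as a whole.
 */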

/*
 * lio_create_entry
 *
 * Allocate an aio_workq_entry and fill it in.  If all goes well return 0
 * and pass the aio_workq_entry pointer back to our caller.
 *
 * Parameters:	procp			The process making the request
 *		aiocbp			The aio context buffer pointer
 *		group_tag		The group tag used to indicate a
 *					group of operations has completed
 *		entrypp			Pointer to the pointer to receive the
 *					address of the created aio_workq_entry
 *
 * Returns:	0			Successfully created
 *		EAGAIN			Try again (usually resource shortage)
 *
 * Notes:	We get a reference to our caller's user land map in order
 *		to keep it around while we are processing the request.
 *
 *		lio_listio calls behave differently at completion: they do
 *		completion notification when all async IO requests have
 *		completed.  We use group_tag to tag IO requests that behave
 *		in the delay notification manner.
 *
 *		All synchronous operations are considered to not have a
 *		signal routine associated with them (sigp == USER_ADDR_NULL).
 */
static int
lio_create_entry(proc_t procp, user_addr_t aiocbp, void *group_tag,
		 aio_workq_entry **entrypp )
{
	aio_workq_entry *entryp;
	int result;

	entryp = aio_create_queue_entry( procp, aiocbp, group_tag, AIO_LIO );
	if ( entryp == NULL ) {
		result = EAGAIN;
		goto error_exit;
	}

	/*
	 * Look for lio_listio LIO_NOP requests and ignore them; this is
	 * not really an error, but we need to free our aio_workq_entry.
	 */
	if ( entryp->aiocb.aio_lio_opcode == LIO_NOP ) {
		result = 0;
		goto error_exit;
	}

	*entrypp = entryp;
	return( 0 );

error_exit:

	if ( entryp != NULL ) {
		/*
		 * This entry has not been queued up so no worries about
		 * unlocked state and aio_map
		 */
		aio_free_request(entryp);
	}

	return( result );

} /* lio_create_entry */


/*
 * aio_free_request - remove our reference on the user land map and
 * free the work queue entry resources.  The entry is off all lists
 * and has zero refcount, so no one can have a pointer to it.
 */
static int
aio_free_request(aio_workq_entry *entryp)
{
	/* remove our reference to the user land map. */
	if ( VM_MAP_NULL != entryp->aio_map ) {
		vm_map_deallocate(entryp->aio_map);
	}

	/* remove our reference to the thread which enqueued the request */
	if ( NULL != entryp->thread ) {
		thread_deallocate( entryp->thread );
	}

	entryp->aio_refcount = -1; /* A bit of poisoning in case of bad refcounting. */

	zfree( aio_workq_zonep, entryp );

	return( 0 );

} /* aio_free_request */


/*
 * aio_validate
 *
 * validate the aiocb passed in by one of the aio syscalls.
 */
static int
aio_validate( aio_workq_entry *entryp )
{
	struct fileproc *fp;
	int flag;
	int result;

	result = 0;

	if ( (entryp->flags & AIO_LIO) != 0 ) {
		if ( entryp->aiocb.aio_lio_opcode == LIO_READ )
			entryp->flags |= AIO_READ;
		else if ( entryp->aiocb.aio_lio_opcode == LIO_WRITE )
			entryp->flags |= AIO_WRITE;
		else if ( entryp->aiocb.aio_lio_opcode == LIO_NOP )
			return( 0 );
		else
			return( EINVAL );
	}

	flag = FREAD;
	if ( (entryp->flags & (AIO_WRITE | AIO_FSYNC | AIO_DSYNC)) != 0 ) {
		flag = FWRITE;
	}

	if ( (entryp->flags & (AIO_READ | AIO_WRITE)) != 0 ) {
		if ( entryp->aiocb.aio_nbytes > INT_MAX ||
		     entryp->aiocb.aio_buf == USER_ADDR_NULL ||
		     entryp->aiocb.aio_offset < 0 )
			return( EINVAL );
	}

	/*
	 * validate aiocb.aio_sigevent.  at this point we only support
	 * sigev_notify equal to SIGEV_SIGNAL or SIGEV_NONE.  this means
	 * sigev_value, sigev_notify_function, and sigev_notify_attributes
	 * are ignored, since SIGEV_THREAD is unsupported.  This is consistent
	 * with our lack of support for the [RTS] (Realtime Signals) option
	 * group.
	 */
	switch ( entryp->aiocb.aio_sigevent.sigev_notify ) {
	case SIGEV_SIGNAL:
	    {
		int signum;

		/* make sure we have a valid signal number */
		signum = entryp->aiocb.aio_sigevent.sigev_signo;
		if ( signum <= 0 || signum >= NSIG ||
		     signum == SIGKILL || signum == SIGSTOP )
			return (EINVAL);
	    }
	    break;

	case SIGEV_NONE:
		break;

	case SIGEV_THREAD:
		/* Unsupported [RTS]; fall through to reject. */

	default:
		return (EINVAL);
	}

	/*
	 * validate the file descriptor and that the file was opened
	 * for the appropriate read / write access.
	 */
	proc_fdlock(entryp->procp);

	result = fp_lookup( entryp->procp, entryp->aiocb.aio_fildes, &fp, 1 );
	if ( result == 0 ) {
		if ( (fp->f_fglob->fg_flag & flag) == 0 ) {
			/* we don't have read or write access */
			result = EBADF;
		}
		else if ( FILEGLOB_DTYPE(fp->f_fglob) != DTYPE_VNODE ) {
			/* this is not a file */
			result = ESPIPE;
		} else
			fp->f_flags |= FP_AIOISSUED;

		fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 1);
	}
	else {
		result = EBADF;
	}

	proc_fdunlock(entryp->procp);

	return( result );

} /* aio_validate */
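
/*
 * Illustrative sketch (not part of the build) of the only notification
 * shapes aio_validate() accepts from userspace.  Names are standard POSIX
 * AIO; fd, buf and len are assumed to exist.
 *
 *	struct aiocb cb;
 *	memset(&cb, 0, sizeof(cb));
 *	cb.aio_fildes = fd;
 *	cb.aio_buf = buf;
 *	cb.aio_nbytes = len;
 *
 *	cb.aio_sigevent.sigev_notify = SIGEV_SIGNAL;	// or SIGEV_NONE
 *	cb.aio_sigevent.sigev_signo = SIGUSR1;		// not SIGKILL/SIGSTOP
 *
 *	aio_write(&cb);		// signal is sent from do_aio_completion()
 *
 * SIGEV_THREAD (and anything else) is rejected with EINVAL above.
 */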

static int
aio_increment_total_count(void)
{
	return OSIncrementAtomic(&aio_anchor.aio_total_count);
}

static int
aio_decrement_total_count(void)
{
	int old = OSDecrementAtomic(&aio_anchor.aio_total_count);
	if (old <= 0) {
		panic("Negative total AIO count!\n");
	}

	return old;
}

static int
aio_get_process_count(proc_t procp )
{
	return procp->p_aio_total_count;

} /* aio_get_process_count */

static int
aio_get_all_queues_count( void )
{
	return aio_anchor.aio_total_count;

} /* aio_get_all_queues_count */


/*
 * do_aio_completion.  Handle async IO completion.
 */
static void
do_aio_completion( aio_workq_entry *entryp )
{
	boolean_t lastLioCompleted = FALSE;
	aio_lio_context *lio_context = NULL;
	int waiter = 0;

	lio_context = (aio_lio_context *)entryp->group_tag;

	if (lio_context != NULL) {

		aio_proc_lock_spin(entryp->procp);

		/* Account for this I/O completing. */
		lio_context->io_completed++;

		/* Are we done with this lio context? */
		if (lio_context->io_issued == lio_context->io_completed) {
			lastLioCompleted = TRUE;
		}

		waiter = lio_context->io_waiter;

		/* explicit wakeup of lio_listio() waiting in LIO_WAIT */
		if ((entryp->flags & AIO_LIO_NOTIFY) && (lastLioCompleted) && (waiter != 0)) {
			/* wake up the waiter */
			wakeup(lio_context);
		}

		aio_proc_unlock(entryp->procp);
	}

	if ( entryp->aiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL &&
	     (entryp->flags & AIO_DISABLE) == 0 ) {

		boolean_t performSignal = FALSE;
		if (lio_context == NULL) {
			performSignal = TRUE;
		}
		else {
			/*
			 * If this was the last request in the group and a signal
			 * is desired, send one.
			 */
			performSignal = lastLioCompleted;
		}

		if (performSignal) {

			KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_sig)) | DBG_FUNC_NONE,
				     (int)entryp->procp, (int)entryp->uaiocbp,
				     entryp->aiocb.aio_sigevent.sigev_signo, 0, 0 );

			psignal( entryp->procp, entryp->aiocb.aio_sigevent.sigev_signo );
		}
	}

	if ((entryp->flags & AIO_EXIT_WAIT) && (entryp->flags & AIO_CLOSE_WAIT)) {
		panic("Close and exit flags set at the same time\n");
	}

	/*
	 * We need to handle the case where a process is trying to exit, exec, or
	 * close and is currently waiting for active aio requests to complete.
	 * If AIO_EXIT_WAIT or AIO_CLOSE_WAIT is set then we need to look to see
	 * if there are any other requests in the active queue for this process.
	 * If there are none then wakeup using the AIO_CLEANUP_SLEEP_CHAN tsleep
	 * channel.  If some requests are still active then do nothing - we only
	 * want to wakeup when all active aio requests for the process are
	 * complete.
	 *
	 * Don't need to lock the entry or proc to check the cleanup flag.  It can only be
	 * set for cancellation, while the entryp is still on a proc list; now it's
	 * off, so that flag is already set if it's going to be.
	 */
	if ( (entryp->flags & AIO_EXIT_WAIT) != 0 ) {
		int active_requests;

		KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wait)) | DBG_FUNC_NONE,
			     (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );

		aio_proc_lock_spin(entryp->procp);
		active_requests = aio_active_requests_for_process( entryp->procp );
		if ( active_requests < 1 ) {
			/*
			 * no active aio requests for this process, continue exiting.  In this
			 * case, there should be no one else waiting on the proc in AIO...
			 */
			wakeup_one((caddr_t)&entryp->procp->AIO_CLEANUP_SLEEP_CHAN);
			aio_proc_unlock(entryp->procp);

			KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wake)) | DBG_FUNC_NONE,
				     (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
		} else {
			aio_proc_unlock(entryp->procp);
		}
	}

	if ( (entryp->flags & AIO_CLOSE_WAIT) != 0 ) {
		int active_requests;

		KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wait)) | DBG_FUNC_NONE,
			     (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );

		aio_proc_lock_spin(entryp->procp);
		active_requests = aio_proc_active_requests_for_file( entryp->procp, entryp->aiocb.aio_fildes );
		if ( active_requests < 1 ) {
			/* Can't wakeup_one(); multiple closes might be in progress. */
			wakeup(&entryp->procp->AIO_CLEANUP_SLEEP_CHAN);
			aio_proc_unlock(entryp->procp);

			KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wake)) | DBG_FUNC_NONE,
				     (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
		} else {
			aio_proc_unlock(entryp->procp);
		}
	}

	/*
	 * A thread in aio_suspend() wants to know about completed IOs.  If it checked
	 * the done list before we moved our AIO there, then it already asserted its wait,
	 * and we can wake it up without holding the lock.  If it checked the list after
	 * we did our move, then it has already seen the AIO that we moved.  Either way,
	 * we can do our wakeup without holding the lock.
	 */
	wakeup( (caddr_t) &entryp->procp->AIO_SUSPEND_SLEEP_CHAN );
	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_suspend_wake)) | DBG_FUNC_NONE,
		     (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );

	/*
	 * free the LIO context if the last lio completed and no thread is
	 * waiting
	 */
	if (lastLioCompleted && (waiter == 0))
		free_lio_context (lio_context);

} /* do_aio_completion */


/*
 * do_aio_read
 */
static int
do_aio_read( aio_workq_entry *entryp )
{
	struct fileproc *fp;
	int error;
	struct vfs_context context;

	if ( (error = fp_lookup(entryp->procp, entryp->aiocb.aio_fildes, &fp, 0)) )
		return(error);
	if ( (fp->f_fglob->fg_flag & FREAD) == 0 ) {
		fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
		return(EBADF);
	}

	context.vc_thread = entryp->thread;	/* XXX */
	context.vc_ucred = fp->f_fglob->fg_cred;

	error = dofileread(&context, fp,
			   entryp->aiocb.aio_buf,
			   entryp->aiocb.aio_nbytes,
			   entryp->aiocb.aio_offset, FOF_OFFSET,
			   &entryp->returnval);
	fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);

	return( error );

} /* do_aio_read */


/*
 * do_aio_write
 */
static int
do_aio_write( aio_workq_entry *entryp )
{
	struct fileproc *fp;
	int error, flags;
	struct vfs_context context;

	if ( (error = fp_lookup(entryp->procp, entryp->aiocb.aio_fildes, &fp, 0)) )
		return(error);
	if ( (fp->f_fglob->fg_flag & FWRITE) == 0 ) {
		fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
		return(EBADF);
	}

	flags = FOF_PCRED;
	if ( (fp->f_fglob->fg_flag & O_APPEND) == 0 ) {
		flags |= FOF_OFFSET;
	}

	context.vc_thread = entryp->thread;	/* XXX */
	context.vc_ucred = fp->f_fglob->fg_cred;

	/* NB: tell dofilewrite the offset, and to use the proc cred */
	error = dofilewrite(&context,
			    fp,
			    entryp->aiocb.aio_buf,
			    entryp->aiocb.aio_nbytes,
			    entryp->aiocb.aio_offset,
			    flags,
			    &entryp->returnval);

	if (entryp->returnval)
		fp_drop_written(entryp->procp, entryp->aiocb.aio_fildes, fp);
	else
		fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);

	return( error );

} /* do_aio_write */
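
/*
 * Illustrative note on the FOF_OFFSET choice above (hypothetical userspace
 * view, not part of the build): for a descriptor opened with O_APPEND,
 * do_aio_write() omits FOF_OFFSET, so the data lands at end-of-file and
 * aio_offset is ignored, matching write(2) semantics.  buf and len are
 * assumed to exist.
 *
 *	int fd = open("log.txt", O_WRONLY | O_APPEND);	// hypothetical file
 *	struct aiocb cb;
 *	memset(&cb, 0, sizeof(cb));
 *	cb.aio_fildes = fd;
 *	cb.aio_buf = buf;
 *	cb.aio_nbytes = len;
 *	cb.aio_offset = 0;		// ignored: the write appends
 *	aio_write(&cb);
 */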

/*
 * aio_active_requests_for_process - return number of active async IO
 * requests for the given process.
 */
static int
aio_active_requests_for_process(proc_t procp )
{
	return( procp->p_aio_active_count );

} /* aio_active_requests_for_process */

/*
 * Called with the proc locked.
 */
static int
aio_proc_active_requests_for_file(proc_t procp, int fd)
{
	int count = 0;
	aio_workq_entry *entryp;

	TAILQ_FOREACH(entryp, &procp->p_aio_activeq, aio_proc_link) {
		if (entryp->aiocb.aio_fildes == fd) {
			count++;
		}
	}

	return count;
} /* aio_proc_active_requests_for_file */


/*
 * do_aio_fsync
 */
static int
do_aio_fsync( aio_workq_entry *entryp )
{
	struct vfs_context context;
	struct vnode *vp;
	struct fileproc *fp;
	int sync_flag;
	int error;

	/*
	 * We are never called unless either AIO_FSYNC or AIO_DSYNC is set.
	 *
	 * If AIO_DSYNC is set, we can tell the lower layers that it is OK
	 * to mark for update (rather than force to disk) any metadata that
	 * is not strictly necessary for data retrieval.
	 *
	 * If AIO_FSYNC is set, we must also wait until metadata that is not
	 * strictly necessary for data retrieval (e.g. atime, mtime, ctime)
	 * has been committed to stable storage.
	 *
	 * Metadata necessary for data retrieval must be committed to stable
	 * storage in either case (file length, etc.).
	 */
	if (entryp->flags & AIO_FSYNC)
		sync_flag = MNT_WAIT;
	else
		sync_flag = MNT_DWAIT;

	error = fp_getfvp( entryp->procp, entryp->aiocb.aio_fildes, &fp, &vp );
	if ( error == 0 ) {
		if ( (error = vnode_getwithref(vp)) ) {
			fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
			entryp->returnval = -1;
			return(error);
		}
		context.vc_thread = current_thread();
		context.vc_ucred = fp->f_fglob->fg_cred;

		error = VNOP_FSYNC( vp, sync_flag, &context );

		(void)vnode_put(vp);

		fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
	}
	if ( error != 0 )
		entryp->returnval = -1;

	return( error );

} /* do_aio_fsync */
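
/*
 * Illustrative sketch (not part of the build) of the two sync flavors
 * handled above, from the userspace side.  fd is assumed to exist.
 *
 *	struct aiocb fs;
 *	memset(&fs, 0, sizeof(fs));
 *	fs.aio_fildes = fd;
 *
 *	aio_fsync(O_SYNC, &fs);		// AIO_FSYNC -> VNOP_FSYNC(vp, MNT_WAIT, ...)
 *	aio_fsync(O_DSYNC, &fs);	// AIO_DSYNC -> VNOP_FSYNC(vp, MNT_DWAIT, ...)
 */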

/*
 * is_already_queued - runs through our queues to see if the given
 * aiocbp / process is there.  Returns TRUE if there is a match
 * on any of our aio queues.
 *
 * Called with proc aio lock held (can be held spin)
 */
static boolean_t
is_already_queued(proc_t procp,
		  user_addr_t aiocbp )
{
	aio_workq_entry *entryp;
	boolean_t result;

	result = FALSE;

	/* look for matches on our queue of async IO requests that have completed */
	TAILQ_FOREACH( entryp, &procp->p_aio_doneq, aio_proc_link ) {
		if ( aiocbp == entryp->uaiocbp ) {
			result = TRUE;
			goto ExitThisRoutine;
		}
	}

	/* look for matches on our queue of active async IO requests */
	TAILQ_FOREACH( entryp, &procp->p_aio_activeq, aio_proc_link ) {
		if ( aiocbp == entryp->uaiocbp ) {
			result = TRUE;
			goto ExitThisRoutine;
		}
	}

ExitThisRoutine:
	return( result );

} /* is_already_queued */


static void
free_lio_context(aio_lio_context* context)
{
#if DEBUG
	OSDecrementAtomic(&lio_contexts_alloced);
#endif /* DEBUG */

	FREE( context, M_TEMP );

} /* free_lio_context */


/*
 * aio initialization
 */
__private_extern__ void
aio_init( void )
{
	int i;

	aio_lock_grp_attr = lck_grp_attr_alloc_init();
	aio_proc_lock_grp = lck_grp_alloc_init("aio_proc", aio_lock_grp_attr);
	aio_entry_lock_grp = lck_grp_alloc_init("aio_entry", aio_lock_grp_attr);
	aio_queue_lock_grp = lck_grp_alloc_init("aio_queue", aio_lock_grp_attr);
	aio_lock_attr = lck_attr_alloc_init();

	lck_mtx_init(&aio_entry_mtx, aio_entry_lock_grp, aio_lock_attr);
	lck_mtx_init(&aio_proc_mtx, aio_proc_lock_grp, aio_lock_attr);

	aio_anchor.aio_inflight_count = 0;
	aio_anchor.aio_done_count = 0;
	aio_anchor.aio_total_count = 0;
	aio_anchor.aio_num_workqs = AIO_NUM_WORK_QUEUES;

	for (i = 0; i < AIO_NUM_WORK_QUEUES; i++) {
		aio_workq_init(&aio_anchor.aio_async_workqs[i]);
	}

	i = sizeof( aio_workq_entry );
	aio_workq_zonep = zinit( i, i * aio_max_requests, i * aio_max_requests, "aiowq" );

	_aio_create_worker_threads( aio_worker_threads );

} /* aio_init */


/*
 * aio worker threads created here.
 */
__private_extern__ void
_aio_create_worker_threads( int num )
{
	int i;

	/* create some worker threads to handle the async IO requests */
	for ( i = 0; i < num; i++ ) {
		thread_t myThread;

		if ( KERN_SUCCESS != kernel_thread_start((thread_continue_t)aio_work_thread, NULL, &myThread) ) {
			printf( "%s - failed to create a work thread \n", __FUNCTION__ );
		}
		else
			thread_deallocate(myThread);
	}

	return;

} /* _aio_create_worker_threads */

/*
 * Return the current activation's aio task (set while a worker thread has
 * assumed another task's identity for the duration of an IO).
 */
task_t
get_aiotask(void)
{
	return ((struct uthread *)get_bsdthread_info(current_thread()))->uu_aio_task;
}


/*
 * In the case of an aiocb from a 32-bit process we need to expand some
 * longs and pointers to the correct sizes in order to let downstream code
 * always work on the same type of aiocb (in our case that is a user_aiocb).
 */
static void
do_munge_aiocb_user32_to_user( struct user32_aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp )
{
	the_user_aiocbp->aio_fildes = my_aiocbp->aio_fildes;
	the_user_aiocbp->aio_offset = my_aiocbp->aio_offset;
	the_user_aiocbp->aio_buf = CAST_USER_ADDR_T(my_aiocbp->aio_buf);
	the_user_aiocbp->aio_nbytes = my_aiocbp->aio_nbytes;
	the_user_aiocbp->aio_reqprio = my_aiocbp->aio_reqprio;
	the_user_aiocbp->aio_lio_opcode = my_aiocbp->aio_lio_opcode;

	/*
	 * Special case here.  Since we do not know if sigev_value is an
	 * int or a ptr we do NOT cast the ptr to a user_addr_t.  This
	 * means if we send this info back to user space we need to remember
	 * sigev_value was not expanded for the 32-bit case.
	 * NOTE - this does NOT affect us since we don't support sigev_value
	 * yet in the aio context.
	 */
	//LP64
	the_user_aiocbp->aio_sigevent.sigev_notify = my_aiocbp->aio_sigevent.sigev_notify;
	the_user_aiocbp->aio_sigevent.sigev_signo = my_aiocbp->aio_sigevent.sigev_signo;
	the_user_aiocbp->aio_sigevent.sigev_value.size_equivalent.sival_int =
	    my_aiocbp->aio_sigevent.sigev_value.sival_int;
	the_user_aiocbp->aio_sigevent.sigev_notify_function =
	    CAST_USER_ADDR_T(my_aiocbp->aio_sigevent.sigev_notify_function);
	the_user_aiocbp->aio_sigevent.sigev_notify_attributes =
	    CAST_USER_ADDR_T(my_aiocbp->aio_sigevent.sigev_notify_attributes);
}

/*
 * Similar for a 64-bit user process, so that we don't need to satisfy
 * the alignment constraints of the original user64_aiocb.
 */
static void
do_munge_aiocb_user64_to_user( struct user64_aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp )
{
	the_user_aiocbp->aio_fildes = my_aiocbp->aio_fildes;
	the_user_aiocbp->aio_offset = my_aiocbp->aio_offset;
	the_user_aiocbp->aio_buf = my_aiocbp->aio_buf;
	the_user_aiocbp->aio_nbytes = my_aiocbp->aio_nbytes;
	the_user_aiocbp->aio_reqprio = my_aiocbp->aio_reqprio;
	the_user_aiocbp->aio_lio_opcode = my_aiocbp->aio_lio_opcode;

	the_user_aiocbp->aio_sigevent.sigev_notify = my_aiocbp->aio_sigevent.sigev_notify;
	the_user_aiocbp->aio_sigevent.sigev_signo = my_aiocbp->aio_sigevent.sigev_signo;
	the_user_aiocbp->aio_sigevent.sigev_value.size_equivalent.sival_int =
	    my_aiocbp->aio_sigevent.sigev_value.size_equivalent.sival_int;
	the_user_aiocbp->aio_sigevent.sigev_notify_function =
	    my_aiocbp->aio_sigevent.sigev_notify_function;
	the_user_aiocbp->aio_sigevent.sigev_notify_attributes =
	    my_aiocbp->aio_sigevent.sigev_notify_attributes;
}
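
/*
 * Minimal sketch of the widening above, with hypothetical values
 * (illustrative only, not compiled here):
 *
 *	struct user32_aiocb a32;
 *	struct user_aiocb a;
 *	bzero(&a32, sizeof(a32));
 *	bzero(&a, sizeof(a));
 *	a32.aio_fildes = 3;
 *	a32.aio_buf = 0x1000;		// 32-bit user pointer
 *	a32.aio_nbytes = 512;
 *
 *	do_munge_aiocb_user32_to_user(&a32, &a);
 *	// a.aio_buf == CAST_USER_ADDR_T(0x1000): widened to 64 bits
 *	// a.aio_sigevent.sigev_value: deliberately NOT widened (see note above)
 */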