1/*
2 * Copyright (c) 2003-2014 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29
30/*
31 * todo:
32 *		1) ramesh is looking into how to replace taking a reference on
33 *		   	the user's map (vm_map_reference()) since it is believed that
34 *			would not hold the process for us.
35 *		2) david is looking into a way for us to set the priority of the
36 *		   	worker threads to match that of the user's thread when the
37 *		   	async IO was queued.
38 */
39
40
41/*
42 * This file contains support for the POSIX 1003.1B AIO/LIO facility.
43 */
44
45#include <sys/systm.h>
46#include <sys/fcntl.h>
47#include <sys/file_internal.h>
48#include <sys/filedesc.h>
49#include <sys/kernel.h>
50#include <sys/vnode_internal.h>
51#include <sys/malloc.h>
52#include <sys/mount_internal.h>
53#include <sys/param.h>
54#include <sys/proc_internal.h>
55#include <sys/sysctl.h>
56#include <sys/unistd.h>
57#include <sys/user.h>
58
59#include <sys/aio_kern.h>
60#include <sys/sysproto.h>
61
62#include <machine/limits.h>
63
64#include <mach/mach_types.h>
65#include <kern/kern_types.h>
66#include <kern/zalloc.h>
67#include <kern/task.h>
68#include <kern/sched_prim.h>
69
70#include <vm/vm_map.h>
71
72#include <libkern/OSAtomic.h>
73
74#include <sys/kdebug.h>
75#define AIO_work_queued					1
76#define AIO_worker_wake				 	2
77#define AIO_completion_sig				3
78#define AIO_completion_cleanup_wait		4
79#define AIO_completion_cleanup_wake		5
80#define AIO_completion_suspend_wake 	6
81#define AIO_fsync_delay					7
82#define AIO_cancel 						10
83#define AIO_cancel_async_workq			11
84#define AIO_cancel_sync_workq			12
85#define AIO_cancel_activeq				13
86#define AIO_cancel_doneq				14
87#define AIO_fsync						20
88#define AIO_read						30
89#define AIO_write						40
90#define AIO_listio						50
91#define AIO_error						60
92#define AIO_error_val					61
93#define AIO_error_activeq				62
94#define AIO_error_workq					63
95#define	AIO_return						70
96#define	AIO_return_val					71
97#define	AIO_return_activeq				72
98#define	AIO_return_workq				73
99#define AIO_exec						80
100#define AIO_exit						90
101#define AIO_exit_sleep					91
102#define AIO_close						100
103#define AIO_close_sleep					101
104#define AIO_suspend						110
105#define AIO_suspend_sleep				111
106#define AIO_worker_thread				120
107
108#if 0
109#undef KERNEL_DEBUG
110#define KERNEL_DEBUG KERNEL_DEBUG_CONSTANT
111#endif
112
113/*
114 * aio requests queue up on the aio_async_workq or lio_sync_workq (for
115 * lio_listio LIO_WAIT).  Requests then move to the per process aio_activeq
116 * (proc.aio_activeq) when one of our worker threads start the IO.
117 * And finally, requests move to the per process aio_doneq (proc.aio_doneq)
118 * when the IO request completes.  The request remains on aio_doneq until
119 * user process calls aio_return or the process exits, either way that is our
120 * trigger to release aio resources.
121 */
122typedef struct aio_workq   {
123	TAILQ_HEAD(, aio_workq_entry) 	aioq_entries;
124	int				aioq_count;
125	lck_mtx_t			aioq_mtx;
126	wait_queue_t			aioq_waitq;
127} *aio_workq_t;
128
129#define AIO_NUM_WORK_QUEUES 1
130struct aio_anchor_cb
131{
132	volatile int32_t	aio_inflight_count; 	/* entries that have been taken from a workq */
133	volatile int32_t	aio_done_count; 	/* entries on all done queues (proc.aio_doneq) */
134	volatile int32_t	aio_total_count;	/* total extant entries */
135
136	/* Hash table of queues here */
137	int 			aio_num_workqs;
138	struct aio_workq 	aio_async_workqs[AIO_NUM_WORK_QUEUES];
139};
140typedef struct aio_anchor_cb aio_anchor_cb;
141
142struct aio_lio_context
143{
144	int		io_waiter;
145	int		io_issued;
146	int		io_completed;
147};
148typedef struct aio_lio_context aio_lio_context;
149
150
151/*
152 * Notes on aio sleep / wake channels.
153 * We currently pick a couple fields within the proc structure that will allow
154 * us sleep channels that currently do not collide with any other kernel routines.
155 * At this time, for binary compatibility reasons, we cannot create new proc fields.
156 */
157#define AIO_SUSPEND_SLEEP_CHAN  p_aio_active_count
158#define AIO_CLEANUP_SLEEP_CHAN 	p_aio_total_count
159
160#define ASSERT_AIO_FROM_PROC(aiop, theproc) 	\
161	if ((aiop)->procp != (theproc)) { 	\
162		panic("AIO on a proc list that does not belong to that proc.\n"); \
163	}
164
165/*
166 *  LOCAL PROTOTYPES
167 */
168static void		aio_proc_lock(proc_t procp);
169static void		aio_proc_lock_spin(proc_t procp);
170static void		aio_proc_unlock(proc_t procp);
171static lck_mtx_t*	aio_proc_mutex(proc_t procp);
172static void		aio_proc_move_done_locked(proc_t procp, aio_workq_entry *entryp);
173static void		aio_proc_remove_done_locked(proc_t procp, aio_workq_entry *entryp);
174static int		aio_get_process_count(proc_t procp );
175static int		aio_active_requests_for_process(proc_t procp );
176static int		aio_proc_active_requests_for_file(proc_t procp, int fd);
177static boolean_t	is_already_queued(proc_t procp, user_addr_t aiocbp );
178static boolean_t	should_cancel(aio_workq_entry *entryp, user_addr_t aiocbp, int fd);
179
180static void		aio_entry_lock(aio_workq_entry *entryp);
181static void		aio_entry_lock_spin(aio_workq_entry *entryp);
182static aio_workq_t	aio_entry_workq(aio_workq_entry *entryp);
183static lck_mtx_t*	aio_entry_mutex(__unused aio_workq_entry *entryp);
184static void		aio_workq_remove_entry_locked(aio_workq_t queue, aio_workq_entry *entryp);
185static void		aio_workq_add_entry_locked(aio_workq_t queue, aio_workq_entry *entryp);
186static void		aio_entry_ref_locked(aio_workq_entry *entryp);
187static void		aio_entry_unref_locked(aio_workq_entry *entryp);
188static void		aio_entry_ref(aio_workq_entry *entryp);
189static void		aio_entry_unref(aio_workq_entry *entryp);
190static void		aio_entry_update_for_cancel(aio_workq_entry *entryp, boolean_t cancelled,
191					int wait_for_completion, boolean_t disable_notification);
192static int		aio_entry_try_workq_remove(aio_workq_entry *entryp);
193static boolean_t	aio_delay_fsync_request( aio_workq_entry *entryp );
194static int		aio_free_request(aio_workq_entry *entryp);
195
196static void		aio_workq_init(aio_workq_t wq);
197static void		aio_workq_lock_spin(aio_workq_t wq);
198static void		aio_workq_unlock(aio_workq_t wq);
199static lck_mtx_t*	aio_workq_mutex(aio_workq_t wq);
200
201static void		aio_work_thread( void );
202static aio_workq_entry *aio_get_some_work( void );
203
204static int		aio_get_all_queues_count( void );
205static int		aio_queue_async_request(proc_t procp, user_addr_t aiocbp, int kindOfIO );
206static int		aio_validate( aio_workq_entry *entryp );
207static int 		aio_increment_total_count(void);
208static int 		aio_decrement_total_count(void);
209
210static int		do_aio_cancel_locked(proc_t p, int fd, user_addr_t aiocbp, int wait_for_completion, boolean_t disable_notification );
211static void		do_aio_completion( aio_workq_entry *entryp );
212static int		do_aio_fsync( aio_workq_entry *entryp );
213static int		do_aio_read( aio_workq_entry *entryp );
214static int		do_aio_write( aio_workq_entry *entryp );
215static void 		do_munge_aiocb_user32_to_user( struct user32_aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp );
216static void 		do_munge_aiocb_user64_to_user( struct user64_aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp );
217static int	lio_create_entry(proc_t procp,
218					 user_addr_t aiocbp,
219					 void *group_tag,
220					 aio_workq_entry **entrypp );
221static aio_workq_entry *aio_create_queue_entry(proc_t procp,
222					user_addr_t aiocbp,
223					void *group_tag,
224					int kindOfIO);
225static user_addr_t *aio_copy_in_list(proc_t procp, user_addr_t aiocblist, int nent);
226static void		free_lio_context(aio_lio_context* context);
227static void 		aio_enqueue_work( proc_t procp, aio_workq_entry *entryp, int proc_locked);
228
229#define ASSERT_AIO_PROC_LOCK_OWNED(p)	lck_mtx_assert(aio_proc_mutex((p)), LCK_MTX_ASSERT_OWNED)
230#define ASSERT_AIO_WORKQ_LOCK_OWNED(q)	lck_mtx_assert(aio_workq_mutex((q)), LCK_MTX_ASSERT_OWNED)
231#define ASSERT_AIO_ENTRY_LOCK_OWNED(e)	lck_mtx_assert(aio_entry_mutex((e)), LCK_MTX_ASSERT_OWNED)
232
233/*
234 *  EXTERNAL PROTOTYPES
235 */
236
237/* in ...bsd/kern/sys_generic.c */
238extern int dofileread(vfs_context_t ctx, struct fileproc *fp,
239			user_addr_t bufp, user_size_t nbyte,
240			off_t offset, int flags, user_ssize_t *retval );
241extern int dofilewrite(vfs_context_t ctx, struct fileproc *fp,
242			 user_addr_t bufp, user_size_t nbyte, off_t offset,
243			 int flags, user_ssize_t *retval );
244#if DEBUG
245static uint32_t                         lio_contexts_alloced = 0;
246#endif  /* DEBUG */
247
248/*
249 * aio external global variables.
250 */
251extern int aio_max_requests;  			/* AIO_MAX - configurable */
252extern int aio_max_requests_per_process;	/* AIO_PROCESS_MAX - configurable */
253extern int aio_worker_threads;			/* AIO_THREAD_COUNT - configurable */
254
255
256/*
257 * aio static variables.
258 */
259static aio_anchor_cb	aio_anchor;
260static lck_grp_t	*aio_proc_lock_grp;
261static lck_grp_t	*aio_entry_lock_grp;
262static lck_grp_t	*aio_queue_lock_grp;
263static lck_attr_t	*aio_lock_attr;
264static lck_grp_attr_t	*aio_lock_grp_attr;
265static struct zone  	*aio_workq_zonep;
266static lck_mtx_t	aio_entry_mtx;
267static lck_mtx_t	aio_proc_mtx;
268
269static void
270aio_entry_lock(__unused aio_workq_entry *entryp)
271{
272	lck_mtx_lock(&aio_entry_mtx);
273}
274
275static void
276aio_entry_lock_spin(__unused aio_workq_entry *entryp)
277{
278	lck_mtx_lock_spin(&aio_entry_mtx);
279}
280
281static void
282aio_entry_unlock(__unused aio_workq_entry *entryp)
283{
284	lck_mtx_unlock(&aio_entry_mtx);
285}
286
287/* Hash */
288static aio_workq_t
289aio_entry_workq(__unused aio_workq_entry *entryp)
290{
291	return &aio_anchor.aio_async_workqs[0];
292}
293
294static lck_mtx_t*
295aio_entry_mutex(__unused aio_workq_entry *entryp)
296{
297	return &aio_entry_mtx;
298}
299
300static void
301aio_workq_init(aio_workq_t wq)
302{
303	TAILQ_INIT(&wq->aioq_entries);
304	wq->aioq_count = 0;
305	lck_mtx_init(&wq->aioq_mtx, aio_queue_lock_grp, aio_lock_attr);
306	wq->aioq_waitq = wait_queue_alloc(SYNC_POLICY_FIFO);
307}
308
309
310/*
311 * Can be passed a queue which is locked spin.
312 */
313static void
314aio_workq_remove_entry_locked(aio_workq_t queue, aio_workq_entry *entryp)
315{
316	ASSERT_AIO_WORKQ_LOCK_OWNED(queue);
317
318	if (entryp->aio_workq_link.tqe_prev == NULL) {
319		panic("Trying to remove an entry from a work queue, but it is not on a queue\n");
320	}
321
322	TAILQ_REMOVE(&queue->aioq_entries, entryp, aio_workq_link);
323	queue->aioq_count--;
324	entryp->aio_workq_link.tqe_prev = NULL; /* Not on a workq */
325
326	if (queue->aioq_count  < 0) {
327		panic("Negative count on a queue.\n");
328	}
329}
330
331static void
332aio_workq_add_entry_locked(aio_workq_t queue, aio_workq_entry *entryp)
333{
334	ASSERT_AIO_WORKQ_LOCK_OWNED(queue);
335
336	TAILQ_INSERT_TAIL(&queue->aioq_entries, entryp, aio_workq_link);
337	if (queue->aioq_count  < 0) {
338		panic("Negative count on a queue.\n");
339	}
340	queue->aioq_count++;
341}
342
343static void
344aio_proc_lock(proc_t procp)
345{
346	lck_mtx_lock(aio_proc_mutex(procp));
347}
348
349static void
350aio_proc_lock_spin(proc_t procp)
351{
352	lck_mtx_lock_spin(aio_proc_mutex(procp));
353}
354
355static void
356aio_proc_move_done_locked(proc_t procp, aio_workq_entry *entryp)
357{
358	ASSERT_AIO_PROC_LOCK_OWNED(procp);
359
360	TAILQ_REMOVE(&procp->p_aio_activeq, entryp, aio_proc_link );
361	TAILQ_INSERT_TAIL( &procp->p_aio_doneq, entryp, aio_proc_link);
362	procp->p_aio_active_count--;
363	OSIncrementAtomic(&aio_anchor.aio_done_count);
364}
365
366static void
367aio_proc_remove_done_locked(proc_t procp, aio_workq_entry *entryp)
368{
369	TAILQ_REMOVE(&procp->p_aio_doneq, entryp, aio_proc_link);
370	OSDecrementAtomic(&aio_anchor.aio_done_count);
371	aio_decrement_total_count();
372	procp->p_aio_total_count--;
373}
374
375static void
376aio_proc_unlock(proc_t procp)
377{
378	lck_mtx_unlock(aio_proc_mutex(procp));
379}
380
381static lck_mtx_t*
382aio_proc_mutex(proc_t procp)
383{
384	return &procp->p_mlock;
385}
386
387static void
388aio_entry_ref_locked(aio_workq_entry *entryp)
389{
390	ASSERT_AIO_ENTRY_LOCK_OWNED(entryp);
391
392	if (entryp->aio_refcount < 0) {
393		panic("AIO workq entry with a negative refcount.\n");
394	}
395	entryp->aio_refcount++;
396}
397
398
399/* Return 1 if you've freed it */
400static void
401aio_entry_unref_locked(aio_workq_entry *entryp)
402{
403	ASSERT_AIO_ENTRY_LOCK_OWNED(entryp);
404
405	entryp->aio_refcount--;
406	if (entryp->aio_refcount < 0) {
407		panic("AIO workq entry with a negative refcount.\n");
408	}
409}
410
411static void
412aio_entry_ref(aio_workq_entry *entryp)
413{
414	aio_entry_lock_spin(entryp);
415	aio_entry_ref_locked(entryp);
416	aio_entry_unlock(entryp);
417}
418static void
419aio_entry_unref(aio_workq_entry *entryp)
420{
421	aio_entry_lock_spin(entryp);
422	aio_entry_unref_locked(entryp);
423
424	if ((entryp->aio_refcount == 0) && ((entryp->flags & AIO_DO_FREE) != 0)) {
425		aio_entry_unlock(entryp);
426		aio_free_request(entryp);
427	} else {
428		aio_entry_unlock(entryp);
429	}
430
431	return;
432}
433
434static void
435aio_entry_update_for_cancel(aio_workq_entry *entryp, boolean_t cancelled, int wait_for_completion, boolean_t disable_notification)
436{
437	aio_entry_lock_spin(entryp);
438
439	if (cancelled) {
440		aio_entry_ref_locked(entryp);
441		entryp->errorval = ECANCELED;
442		entryp->returnval = -1;
443	}
444
445	if ( wait_for_completion ) {
446		entryp->flags |= wait_for_completion; /* flag for special completion processing */
447	}
448
449	if ( disable_notification ) {
450		entryp->flags |= AIO_DISABLE; /* Don't want a signal */
451	}
452
453	aio_entry_unlock(entryp);
454}
455
456static int
457aio_entry_try_workq_remove(aio_workq_entry *entryp)
458{
459	/* Can only be cancelled if it's still on a work queue */
460	if (entryp->aio_workq_link.tqe_prev != NULL) {
461		aio_workq_t queue;
462
463		/* Will have to check again under the lock */
464		queue = aio_entry_workq(entryp);
465		aio_workq_lock_spin(queue);
466		if (entryp->aio_workq_link.tqe_prev != NULL) {
467			aio_workq_remove_entry_locked(queue, entryp);
468			aio_workq_unlock(queue);
469			return 1;
470		}  else {
471			aio_workq_unlock(queue);
472		}
473	}
474
475	return 0;
476}
477
478static void
479aio_workq_lock_spin(aio_workq_t wq)
480{
481	lck_mtx_lock_spin(aio_workq_mutex(wq));
482}
483
484static void
485aio_workq_unlock(aio_workq_t wq)
486{
487	lck_mtx_unlock(aio_workq_mutex(wq));
488}
489
490static lck_mtx_t*
491aio_workq_mutex(aio_workq_t wq)
492{
493	return &wq->aioq_mtx;
494}
495
496/*
497 * aio_cancel - attempt to cancel one or more async IO requests currently
498 * outstanding against file descriptor uap->fd.  If uap->aiocbp is not
499 * NULL then only one specific IO is cancelled (if possible).  If uap->aiocbp
500 * is NULL then all outstanding async IO request for the given file
501 * descriptor are cancelled (if possible).
502 */
503int
504aio_cancel(proc_t p, struct aio_cancel_args *uap, int *retval )
505{
506	struct user_aiocb		my_aiocb;
507	int							result;
508
509	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel)) | DBG_FUNC_START,
510		     	  (int)p, (int)uap->aiocbp, 0, 0, 0 );
511
512	/* quick check to see if there are any async IO requests queued up */
513	if (aio_get_all_queues_count() < 1) {
514		result = 0;
515		*retval = AIO_ALLDONE;
516		goto ExitRoutine;
517	}
518
519	*retval = -1;
520	if ( uap->aiocbp != USER_ADDR_NULL ) {
521		if ( proc_is64bit(p) ) {
522			struct user64_aiocb aiocb64;
523
524			result = copyin( uap->aiocbp, &aiocb64, sizeof(aiocb64) );
525			if (result == 0 )
526				do_munge_aiocb_user64_to_user(&aiocb64, &my_aiocb);
527
528		} else {
529			struct user32_aiocb aiocb32;
530
531			result = copyin( uap->aiocbp, &aiocb32, sizeof(aiocb32) );
532			if ( result == 0 )
533				do_munge_aiocb_user32_to_user( &aiocb32, &my_aiocb );
534		}
535
536		if ( result != 0 ) {
537			result = EAGAIN;
538			goto ExitRoutine;
539		}
540
541		/* NOTE - POSIX standard says a mismatch between the file */
542		/* descriptor passed in and the file descriptor embedded in */
543		/* the aiocb causes unspecified results.  We return EBADF in */
544		/* that situation.  */
545		if ( uap->fd != my_aiocb.aio_fildes ) {
546			result = EBADF;
547			goto ExitRoutine;
548		}
549	}
550
551	aio_proc_lock(p);
552	result = do_aio_cancel_locked( p, uap->fd, uap->aiocbp, 0, FALSE );
553	ASSERT_AIO_PROC_LOCK_OWNED(p);
554	aio_proc_unlock(p);
555
556	if ( result != -1 ) {
557		*retval = result;
558		result = 0;
559		goto ExitRoutine;
560	}
561
562	result = EBADF;
563
564ExitRoutine:
565	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel)) | DBG_FUNC_END,
566		     	  (int)p, (int)uap->aiocbp, result, 0, 0 );
567
568	return( result );
569
570} /* aio_cancel */
571
572
573/*
574 * _aio_close - internal function used to clean up async IO requests for
575 * a file descriptor that is closing.
576 * THIS MAY BLOCK.
577 */
578__private_extern__ void
579_aio_close(proc_t p, int fd )
580{
581	int			error;
582
583	/* quick check to see if there are any async IO requests queued up */
584	if (aio_get_all_queues_count() < 1) {
585		return;
586	}
587
588	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close)) | DBG_FUNC_START,
589		     	  (int)p, fd, 0, 0, 0 );
590
591	/* cancel all async IO requests on our todo queues for this file descriptor */
592	aio_proc_lock(p);
593	error = do_aio_cancel_locked( p, fd, 0, AIO_CLOSE_WAIT, FALSE );
594	ASSERT_AIO_PROC_LOCK_OWNED(p);
595	if ( error == AIO_NOTCANCELED ) {
596		/*
597		 * AIO_NOTCANCELED is returned when we find an aio request for this process
598		 * and file descriptor on the active async IO queue.  Active requests cannot
599		 * be cancelled so we must wait for them to complete.  We will get a special
600		 * wake up call on our channel used to sleep for ALL active requests to
601		 * complete.  This sleep channel (proc.AIO_CLEANUP_SLEEP_CHAN) is only used
602		 * when we must wait for all active aio requests.
603		 */
604
605		KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close_sleep)) | DBG_FUNC_NONE,
606		     	 	  (int)p, fd, 0, 0, 0 );
607
608		while (aio_proc_active_requests_for_file(p, fd) > 0) {
609			msleep(&p->AIO_CLEANUP_SLEEP_CHAN, aio_proc_mutex(p), PRIBIO, "aio_close", 0 );
610		}
611
612	}
613
614	aio_proc_unlock(p);
615
616	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_close)) | DBG_FUNC_END,
617		     	  (int)p, fd, 0, 0, 0 );
618
619	return;
620
621} /* _aio_close */
622
623
624/*
625 * aio_error - return the error status associated with the async IO
626 * request referred to by uap->aiocbp.  The error status is the errno
627 * value that would be set by the corresponding IO request (read, wrtie,
628 * fdatasync, or sync).
629 */
630int
631aio_error(proc_t p, struct aio_error_args *uap, int *retval )
632{
633	aio_workq_entry		 		*entryp;
634	int							error;
635
636	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error)) | DBG_FUNC_START,
637		     	  (int)p, (int)uap->aiocbp, 0, 0, 0 );
638
639	/* see if there are any aios to check */
640	if (aio_get_all_queues_count() < 1) {
641		return EINVAL;
642	}
643
644	aio_proc_lock(p);
645
646	/* look for a match on our queue of async IO requests that have completed */
647	TAILQ_FOREACH( entryp, &p->p_aio_doneq, aio_proc_link) {
648		if ( entryp->uaiocbp == uap->aiocbp ) {
649			ASSERT_AIO_FROM_PROC(entryp, p);
650
651			aio_entry_lock_spin(entryp);
652			*retval = entryp->errorval;
653			error = 0;
654			aio_entry_unlock(entryp);
655			KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error_val)) | DBG_FUNC_NONE,
656		     	 		   (int)p, (int)uap->aiocbp, *retval, 0, 0 );
657			goto ExitRoutine;
658		}
659	}
660
661	/* look for a match on our queue of active async IO requests */
662	TAILQ_FOREACH( entryp, &p->p_aio_activeq, aio_proc_link) {
663		if ( entryp->uaiocbp == uap->aiocbp ) {
664			ASSERT_AIO_FROM_PROC(entryp, p);
665			*retval = EINPROGRESS;
666			error = 0;
667			KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error_activeq)) | DBG_FUNC_NONE,
668		     	 		   (int)p, (int)uap->aiocbp, *retval, 0, 0 );
669			goto ExitRoutine;
670		}
671	}
672
673	error = EINVAL;
674
675ExitRoutine:
676	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_error)) | DBG_FUNC_END,
677		     	  (int)p, (int)uap->aiocbp, error, 0, 0 );
678	aio_proc_unlock(p);
679
680	return( error );
681
682} /* aio_error */
683
684
685/*
686 * aio_fsync - asynchronously force all IO operations associated
687 * with the file indicated by the file descriptor (uap->aiocbp->aio_fildes) and
688 * queued at the time of the call to the synchronized completion state.
689 * NOTE - we do not support op O_DSYNC at this point since we do not support the
690 * fdatasync() call.
691 */
692int
693aio_fsync(proc_t p, struct aio_fsync_args *uap, int *retval )
694{
695	int			error;
696	int			fsync_kind;
697
698	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync)) | DBG_FUNC_START,
699		     	  (int)p, (int)uap->aiocbp, uap->op, 0, 0 );
700
701	*retval = 0;
702	/* 0 := O_SYNC for binary backward compatibility with Panther */
703	if (uap->op == O_SYNC || uap->op == 0)
704		fsync_kind = AIO_FSYNC;
705	else if ( uap->op == O_DSYNC )
706		fsync_kind = AIO_DSYNC;
707	else {
708		*retval = -1;
709		error = EINVAL;
710		goto ExitRoutine;
711	}
712
713	error = aio_queue_async_request( p, uap->aiocbp, fsync_kind );
714	if ( error != 0 )
715		*retval = -1;
716
717ExitRoutine:
718	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync)) | DBG_FUNC_END,
719		     	  (int)p, (int)uap->aiocbp, error, 0, 0 );
720
721	return( error );
722
723} /* aio_fsync */
724
725
726/* aio_read - asynchronously read uap->aiocbp->aio_nbytes bytes from the
727 * file descriptor (uap->aiocbp->aio_fildes) into the buffer
728 * (uap->aiocbp->aio_buf).
729 */
730int
731aio_read(proc_t p, struct aio_read_args *uap, int *retval )
732{
733	int			error;
734
735	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_read)) | DBG_FUNC_START,
736		     	  (int)p, (int)uap->aiocbp, 0, 0, 0 );
737
738	*retval = 0;
739
740	error = aio_queue_async_request( p, uap->aiocbp, AIO_READ );
741	if ( error != 0 )
742		*retval = -1;
743
744	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_read)) | DBG_FUNC_END,
745		     	  (int)p, (int)uap->aiocbp, error, 0, 0 );
746
747	return( error );
748
749} /* aio_read */
750
751
752/*
753 * aio_return - return the return status associated with the async IO
754 * request referred to by uap->aiocbp.  The return status is the value
755 * that would be returned by corresponding IO request (read, write,
756 * fdatasync, or sync).  This is where we release kernel resources
757 * held for async IO call associated with the given aiocb pointer.
758 */
759int
760aio_return(proc_t p, struct aio_return_args *uap, user_ssize_t *retval )
761{
762	aio_workq_entry		 		*entryp;
763	int							error;
764	boolean_t					proc_lock_held = FALSE;
765
766	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return)) | DBG_FUNC_START,
767		     	  (int)p, (int)uap->aiocbp, 0, 0, 0 );
768
769	/* See if there are any entries to check */
770	if (aio_get_all_queues_count() < 1) {
771		error = EINVAL;
772		goto ExitRoutine;
773	}
774
775	aio_proc_lock(p);
776	proc_lock_held = TRUE;
777	*retval = 0;
778
779	/* look for a match on our queue of async IO requests that have completed */
780	TAILQ_FOREACH( entryp, &p->p_aio_doneq, aio_proc_link) {
781		ASSERT_AIO_FROM_PROC(entryp, p);
782		if ( entryp->uaiocbp == uap->aiocbp ) {
783			/* Done and valid for aio_return(), pull it off the list */
784			aio_proc_remove_done_locked(p, entryp);
785
786			/* Drop the proc lock, but keep the entry locked */
787			aio_entry_lock(entryp);
788			aio_proc_unlock(p);
789			proc_lock_held = FALSE;
790
791			*retval = entryp->returnval;
792			error = 0;
793
794			/* No references and off all lists, safe to free */
795			if (entryp->aio_refcount == 0) {
796				aio_entry_unlock(entryp);
797				aio_free_request(entryp);
798			}
799			else {
800				/* Whoever has the refcount will have to free it */
801				entryp->flags |= AIO_DO_FREE;
802				aio_entry_unlock(entryp);
803			}
804
805
806			KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return_val)) | DBG_FUNC_NONE,
807		     	 		   (int)p, (int)uap->aiocbp, *retval, 0, 0 );
808			goto ExitRoutine;
809		}
810	}
811
812	/* look for a match on our queue of active async IO requests */
813	TAILQ_FOREACH( entryp, &p->p_aio_activeq, aio_proc_link) {
814		ASSERT_AIO_FROM_PROC(entryp, p);
815		if ( entryp->uaiocbp == uap->aiocbp ) {
816			error = EINPROGRESS;
817			KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return_activeq)) | DBG_FUNC_NONE,
818		     	 		   (int)p, (int)uap->aiocbp, *retval, 0, 0 );
819			goto ExitRoutine;
820		}
821	}
822
823	error = EINVAL;
824
825ExitRoutine:
826	if (proc_lock_held)
827		aio_proc_unlock(p);
828	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_return)) | DBG_FUNC_END,
829		     	  (int)p, (int)uap->aiocbp, error, 0, 0 );
830
831	return( error );
832
833} /* aio_return */
834
835
836/*
837 * _aio_exec - internal function used to clean up async IO requests for
838 * a process that is going away due to exec().  We cancel any async IOs
839 * we can and wait for those already active.  We also disable signaling
840 * for cancelled or active aio requests that complete.
841 * This routine MAY block!
842 */
843__private_extern__ void
844_aio_exec(proc_t p )
845{
846
847	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exec)) | DBG_FUNC_START,
848		     	  (int)p, 0, 0, 0, 0 );
849
850	_aio_exit( p );
851
852	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exec)) | DBG_FUNC_END,
853		     	  (int)p, 0, 0, 0, 0 );
854
855	return;
856
857} /* _aio_exec */
858
859
860/*
861 * _aio_exit - internal function used to clean up async IO requests for
862 * a process that is terminating (via exit() or exec() ).  We cancel any async IOs
863 * we can and wait for those already active.  We also disable signaling
864 * for cancelled or active aio requests that complete.  This routine MAY block!
865 */
866__private_extern__ void
867_aio_exit(proc_t p )
868{
869	int						error;
870	aio_workq_entry 		*entryp;
871
872
873	/* quick check to see if there are any async IO requests queued up */
874	if (aio_get_all_queues_count() < 1) {
875		return;
876	}
877
878	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit)) | DBG_FUNC_START,
879		     	  (int)p, 0, 0, 0, 0 );
880
881	aio_proc_lock(p);
882
883	/*
884	 * cancel async IO requests on the todo work queue and wait for those
885	 * already active to complete.
886	 */
887	error = do_aio_cancel_locked( p, 0, 0, AIO_EXIT_WAIT, TRUE );
888	ASSERT_AIO_PROC_LOCK_OWNED(p);
889	if ( error == AIO_NOTCANCELED ) {
890		/*
891		 * AIO_NOTCANCELED is returned when we find an aio request for this process
892		 * on the active async IO queue.  Active requests cannot be cancelled so we
893		 * must wait for them to complete.  We will get a special wake up call on
894		 * our channel used to sleep for ALL active requests to complete.  This sleep
895		 * channel (proc.AIO_CLEANUP_SLEEP_CHAN) is only used when we must wait for all
896		 * active aio requests.
897		 */
898
899		KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit_sleep)) | DBG_FUNC_NONE,
900		     	 	  (int)p, 0, 0, 0, 0 );
901
902		while (p->p_aio_active_count != 0) {
903			msleep(&p->AIO_CLEANUP_SLEEP_CHAN, aio_proc_mutex(p), PRIBIO, "aio_exit", 0 );
904		}
905	}
906
907	if (p->p_aio_active_count != 0) {
908		panic("Exiting process has %d active AIOs after cancellation has completed.\n", p->p_aio_active_count);
909	}
910
911	/* release all aio resources used by this process */
912	entryp = TAILQ_FIRST( &p->p_aio_doneq );
913	while ( entryp != NULL ) {
914		ASSERT_AIO_FROM_PROC(entryp, p);
915		aio_workq_entry		 	*next_entryp;
916
917		next_entryp = TAILQ_NEXT( entryp, aio_proc_link);
918		aio_proc_remove_done_locked(p, entryp);
919
920		/* we cannot free requests that are still completing */
921		aio_entry_lock_spin(entryp);
922		if (entryp->aio_refcount == 0) {
923			aio_proc_unlock(p);
924			aio_entry_unlock(entryp);
925			aio_free_request(entryp);
926
927			/* need to start over since aio_doneq may have been */
928			/* changed while we were away.  */
929			aio_proc_lock(p);
930			entryp = TAILQ_FIRST( &p->p_aio_doneq );
931			continue;
932		}
933		else {
934			/* whoever has the reference will have to do the free */
935			entryp->flags |= AIO_DO_FREE;
936		}
937
938		aio_entry_unlock(entryp);
939		entryp = next_entryp;
940	}
941
942	aio_proc_unlock(p);
943
944	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_exit)) | DBG_FUNC_END,
945		     	  (int)p, 0, 0, 0, 0 );
946	return;
947
948} /* _aio_exit */
949
950
951static boolean_t
952should_cancel(aio_workq_entry *entryp, user_addr_t aiocbp, int fd)
953{
954	if ( (aiocbp == USER_ADDR_NULL && fd == 0) ||
955			(aiocbp != USER_ADDR_NULL && entryp->uaiocbp == aiocbp) ||
956			(aiocbp == USER_ADDR_NULL && fd == entryp->aiocb.aio_fildes) ) {
957		return TRUE;
958	}
959
960	return FALSE;
961}
962
963/*
964 * do_aio_cancel_locked - cancel async IO requests (if possible).  We get called by
965 * aio_cancel, close, and at exit.
966 * There are three modes of operation: 1) cancel all async IOs for a process -
967 * fd is 0 and aiocbp is NULL 2) cancel all async IOs for file descriptor - fd
968 * is > 0 and aiocbp is NULL 3) cancel one async IO associated with the given
969 * aiocbp.
970 * Returns -1 if no matches were found, AIO_CANCELED when we cancelled all
971 * target async IO requests, AIO_NOTCANCELED if we could not cancel all
972 * target async IO requests, and AIO_ALLDONE if all target async IO requests
973 * were already complete.
974 * WARNING - do not deference aiocbp in this routine, it may point to user
975 * land data that has not been copied in (when called from aio_cancel() )
976 *
977 * Called with proc locked, and returns the same way.
978 */
979static int
980do_aio_cancel_locked(proc_t p, int fd, user_addr_t aiocbp,
981	int wait_for_completion, boolean_t disable_notification )
982{
983	ASSERT_AIO_PROC_LOCK_OWNED(p);
984
985	aio_workq_entry		 	*entryp;
986	int						result;
987
988	result = -1;
989
990	/* look for a match on our queue of async todo work. */
991	entryp = TAILQ_FIRST(&p->p_aio_activeq);
992	while ( entryp != NULL ) {
993		ASSERT_AIO_FROM_PROC(entryp, p);
994		aio_workq_entry		 	*next_entryp;
995
996		next_entryp = TAILQ_NEXT( entryp, aio_proc_link);
997		if (!should_cancel(entryp, aiocbp, fd)) {
998			entryp = next_entryp;
999			continue;
1000		}
1001
1002		/* Can only be cancelled if it's still on a work queue */
1003		if (aio_entry_try_workq_remove(entryp) != 0) {
1004			/* Have removed from workq. Update entry state and take a ref */
1005			aio_entry_update_for_cancel(entryp, TRUE, 0, disable_notification);
1006
1007			/* Put on the proc done queue and update counts, then unlock the proc */
1008			aio_proc_move_done_locked(p, entryp);
1009			aio_proc_unlock(p);
1010
1011			/* Now it's officially cancelled.  Do the completion */
1012			result = AIO_CANCELED;
1013			KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_async_workq)) | DBG_FUNC_NONE,
1014					(int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );
1015			do_aio_completion(entryp);
1016
1017			/* This will free if the aio_return() has already happened ... */
1018			aio_entry_unref(entryp);
1019			aio_proc_lock(p);
1020
1021			if ( aiocbp != USER_ADDR_NULL ) {
1022				return( result );
1023			}
1024
1025			/*
1026			 * Restart from the head of the proc active queue since it
1027			 * may have been changed while we were away doing completion
1028			 * processing.
1029			 *
1030			 * Note that if we found an uncancellable AIO before, we will
1031			 * either find it again or discover that it's been completed,
1032			 * so resetting the result will not cause us to return success
1033			 * despite outstanding AIOs.
1034			 */
1035			entryp = TAILQ_FIRST(&p->p_aio_activeq);
1036			result = -1; /* As if beginning anew */
1037		} else {
1038			/*
1039			 * It's been taken off the active queue already, i.e. is in flight.
1040			 * All we can do is ask for notification.
1041			 */
1042			result = AIO_NOTCANCELED;
1043
1044			KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_activeq)) | DBG_FUNC_NONE,
1045					(int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );
1046
1047			/* Mark for waiting and such; will not take a ref if "cancelled" arg is FALSE */
1048			aio_entry_update_for_cancel(entryp, FALSE, wait_for_completion, disable_notification);
1049
1050			if ( aiocbp != USER_ADDR_NULL ) {
1051				return( result );
1052			}
1053			entryp = next_entryp;
1054		}
1055	} /* while... */
1056
1057	/*
1058	 * if we didn't find any matches on the todo or active queues then look for a
1059	 * match on our queue of async IO requests that have completed and if found
1060	 * return AIO_ALLDONE result.
1061	 *
1062	 * Proc AIO lock is still held.
1063	 */
1064	if ( result == -1 ) {
1065		TAILQ_FOREACH(entryp, &p->p_aio_doneq, aio_proc_link) {
1066			ASSERT_AIO_FROM_PROC(entryp, p);
1067			if (should_cancel(entryp, aiocbp, fd)) {
1068				result = AIO_ALLDONE;
1069				KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_doneq)) | DBG_FUNC_NONE,
1070						(int)entryp->procp, (int)entryp->uaiocbp, fd, 0, 0 );
1071
1072				if ( aiocbp != USER_ADDR_NULL ) {
1073					return( result );
1074				}
1075			}
1076		}
1077	}
1078
1079	return( result );
1080
1081}
1082 /* do_aio_cancel_locked */
1083
1084
1085/*
1086 * aio_suspend - suspend the calling thread until at least one of the async
1087 * IO operations referenced by uap->aiocblist has completed, until a signal
1088 * interrupts the function, or uap->timeoutp time interval (optional) has
1089 * passed.
1090 * Returns 0 if one or more async IOs have completed else -1 and errno is
1091 * set appropriately - EAGAIN if timeout elapses or EINTR if an interrupt
1092 * woke us up.
1093 */
1094int
1095aio_suspend(proc_t p, struct aio_suspend_args *uap, int *retval )
1096{
1097	__pthread_testcancel(1);
1098	return(aio_suspend_nocancel(p, (struct aio_suspend_nocancel_args *)uap, retval));
1099}
1100
1101
1102int
1103aio_suspend_nocancel(proc_t p, struct aio_suspend_nocancel_args *uap, int *retval )
1104{
1105	int					error;
1106	int					i, count;
1107	uint64_t			abstime;
1108	struct user_timespec ts;
1109	aio_workq_entry 	*entryp;
1110	user_addr_t			*aiocbpp;
1111
1112	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend)) | DBG_FUNC_START,
1113		     	  (int)p, uap->nent, 0, 0, 0 );
1114
1115	*retval = -1;
1116	abstime = 0;
1117	aiocbpp = NULL;
1118
1119	count = aio_get_all_queues_count( );
1120	if ( count < 1 ) {
1121		error = EINVAL;
1122		goto ExitThisRoutine;
1123	}
1124
1125	if ( uap->nent < 1 || uap->nent > aio_max_requests_per_process ) {
1126		error = EINVAL;
1127		goto ExitThisRoutine;
1128	}
1129
1130	if ( uap->timeoutp != USER_ADDR_NULL ) {
1131		if ( proc_is64bit(p) ) {
1132			struct user64_timespec temp;
1133			error = copyin( uap->timeoutp, &temp, sizeof(temp) );
1134			if ( error == 0 ) {
1135				ts.tv_sec = temp.tv_sec;
1136				ts.tv_nsec = temp.tv_nsec;
1137			}
1138		}
1139		else {
1140			struct user32_timespec temp;
1141			error = copyin( uap->timeoutp, &temp, sizeof(temp) );
1142			if ( error == 0 ) {
1143				ts.tv_sec = temp.tv_sec;
1144				ts.tv_nsec = temp.tv_nsec;
1145			}
1146		}
1147		if ( error != 0 ) {
1148			error = EAGAIN;
1149			goto ExitThisRoutine;
1150		}
1151
1152		if ( ts.tv_sec < 0 || ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000 ) {
1153			error = EINVAL;
1154			goto ExitThisRoutine;
1155		}
1156
1157		nanoseconds_to_absolutetime( (uint64_t)ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec,
1158									 &abstime );
1159		clock_absolutetime_interval_to_deadline( abstime, &abstime );
1160	}
1161
1162	aiocbpp = aio_copy_in_list(p, uap->aiocblist, uap->nent);
1163	if ( aiocbpp == NULL ) {
1164		error = EAGAIN;
1165		goto ExitThisRoutine;
1166	}
1167
1168	/* check list of aio requests to see if any have completed */
1169check_for_our_aiocbp:
1170	aio_proc_lock_spin(p);
1171	for ( i = 0; i < uap->nent; i++ ) {
1172		user_addr_t	aiocbp;
1173
1174		/* NULL elements are legal so check for 'em */
1175		aiocbp = *(aiocbpp + i);
1176		if ( aiocbp == USER_ADDR_NULL )
1177			continue;
1178
1179		/* return immediately if any aio request in the list is done */
1180		TAILQ_FOREACH( entryp, &p->p_aio_doneq, aio_proc_link) {
1181			ASSERT_AIO_FROM_PROC(entryp, p);
1182			if ( entryp->uaiocbp == aiocbp ) {
1183				aio_proc_unlock(p);
1184				*retval = 0;
1185				error = 0;
1186				goto ExitThisRoutine;
1187			}
1188		}
1189	} /* for ( ; i < uap->nent; ) */
1190
1191	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend_sleep)) | DBG_FUNC_NONE,
1192		     	  (int)p, uap->nent, 0, 0, 0 );
1193
1194	/*
1195	 * wait for an async IO to complete or a signal fires or timeout expires.
1196	 * we return EAGAIN (35) for timeout expiration and EINTR (4) when a signal
1197	 * interrupts us.  If an async IO completes before a signal fires or our
1198	 * timeout expires, we get a wakeup call from aio_work_thread().
1199	 */
1200
1201	error = msleep1(&p->AIO_SUSPEND_SLEEP_CHAN, aio_proc_mutex(p), PCATCH | PWAIT | PDROP, "aio_suspend", abstime); /* XXX better priority? */
1202	if ( error == 0 ) {
1203		/*
1204		 * got our wakeup call from aio_work_thread().
1205		 * Since we can get a wakeup on this channel from another thread in the
1206		 * same process we head back up to make sure this is for the correct aiocbp.
1207		 * If it is the correct aiocbp we will return from where we do the check
1208		 * (see entryp->uaiocbp == aiocbp after check_for_our_aiocbp label)
1209		 * else we will fall out and just sleep again.
1210		 */
1211		goto check_for_our_aiocbp;
1212	}
1213	else if ( error == EWOULDBLOCK ) {
1214		/* our timeout expired */
1215		error = EAGAIN;
1216	}
1217	else {
1218		/* we were interrupted */
1219		error = EINTR;
1220	}
1221
1222ExitThisRoutine:
1223	if ( aiocbpp != NULL )
1224		FREE( aiocbpp, M_TEMP );
1225
1226	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_suspend)) | DBG_FUNC_END,
1227		     	  (int)p, uap->nent, error, 0, 0 );
1228
1229	return( error );
1230
1231} /* aio_suspend */
1232
1233
1234/* aio_write - asynchronously write uap->aiocbp->aio_nbytes bytes to the
1235 * file descriptor (uap->aiocbp->aio_fildes) from the buffer
1236 * (uap->aiocbp->aio_buf).
1237 */
1238
1239int
1240aio_write(proc_t p, struct aio_write_args *uap, int *retval )
1241{
1242	int			error;
1243
1244	*retval = 0;
1245
1246	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_write)) | DBG_FUNC_START,
1247		     	  (int)p, (int)uap->aiocbp, 0, 0, 0 );
1248
1249	error = aio_queue_async_request( p, uap->aiocbp, AIO_WRITE );
1250	if ( error != 0 )
1251		*retval = -1;
1252
1253	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_write)) | DBG_FUNC_END,
1254		     	  (int)p, (int)uap->aiocbp, error, 0, 0 );
1255
1256	return( error );
1257
1258} /* aio_write */
1259
1260
1261static user_addr_t *
1262aio_copy_in_list(proc_t procp, user_addr_t aiocblist, int nent)
1263{
1264	user_addr_t	*aiocbpp;
1265	int		i, result;
1266
1267	/* we reserve enough space for largest possible pointer size */
1268	MALLOC( aiocbpp, user_addr_t *, (nent * sizeof(user_addr_t)), M_TEMP, M_WAITOK );
1269	if ( aiocbpp == NULL )
1270		goto err;
1271
1272	/* copyin our aiocb pointers from list */
1273	result = copyin( aiocblist, aiocbpp,
1274			proc_is64bit(procp) ? (nent * sizeof(user64_addr_t))
1275					    : (nent * sizeof(user32_addr_t)) );
1276	if ( result) {
1277		FREE( aiocbpp, M_TEMP );
1278		aiocbpp = NULL;
1279		goto err;
1280	}
1281
1282	/*
1283	 * We depend on a list of user_addr_t's so we need to
1284	 * munge and expand when these pointers came from a
1285	 * 32-bit process
1286	 */
1287	if ( !proc_is64bit(procp) ) {
1288		/* copy from last to first to deal with overlap */
1289		user32_addr_t *my_ptrp = ((user32_addr_t *)aiocbpp) + (nent - 1);
1290		user_addr_t *my_addrp = aiocbpp + (nent - 1);
1291
1292		for (i = 0; i < nent; i++, my_ptrp--, my_addrp--) {
1293			*my_addrp = (user_addr_t) (*my_ptrp);
1294		}
1295	}
1296
1297err:
1298	return (aiocbpp);
1299}
1300
1301
1302static int
1303aio_copy_in_sigev(proc_t procp, user_addr_t sigp, struct user_sigevent *sigev)
1304{
1305	int	result = 0;
1306
1307	if (sigp == USER_ADDR_NULL)
1308		goto out;
1309
1310	/*
1311	 * We need to munge aio_sigevent since it contains pointers.
1312	 * Since we do not know if sigev_value is an int or a ptr we do
1313	 * NOT cast the ptr to a user_addr_t.   This means if we send
1314	 * this info back to user space we need to remember sigev_value
1315	 * was not expanded for the 32-bit case.
1316	 *
1317	 * Notes:	 This does NOT affect us since we don't support
1318	 *		sigev_value yet in the aio context.
1319	 */
1320	if ( proc_is64bit(procp) ) {
1321		struct user64_sigevent sigevent64;
1322
1323		result = copyin( sigp, &sigevent64, sizeof(sigevent64) );
1324		if ( result == 0 ) {
1325			sigev->sigev_notify = sigevent64.sigev_notify;
1326			sigev->sigev_signo = sigevent64.sigev_signo;
1327			sigev->sigev_value.size_equivalent.sival_int = sigevent64.sigev_value.size_equivalent.sival_int;
1328			sigev->sigev_notify_function = sigevent64.sigev_notify_function;
1329			sigev->sigev_notify_attributes = sigevent64.sigev_notify_attributes;
1330		}
1331
1332	} else {
1333		struct user32_sigevent sigevent32;
1334
1335		result = copyin( sigp, &sigevent32, sizeof(sigevent32) );
1336		if ( result == 0 ) {
1337			sigev->sigev_notify = sigevent32.sigev_notify;
1338			sigev->sigev_signo = sigevent32.sigev_signo;
1339			sigev->sigev_value.size_equivalent.sival_int = sigevent32.sigev_value.sival_int;
1340			sigev->sigev_notify_function = CAST_USER_ADDR_T(sigevent32.sigev_notify_function);
1341			sigev->sigev_notify_attributes = CAST_USER_ADDR_T(sigevent32.sigev_notify_attributes);
1342		}
1343	}
1344
1345	if ( result != 0 ) {
1346		result = EAGAIN;
1347	}
1348
1349out:
1350	return (result);
1351}
1352
1353/*
1354 * aio_enqueue_work
1355 *
1356 * Queue up the entry on the aio asynchronous work queue in priority order
1357 * based on the relative priority of the request.  We calculate the relative
1358 * priority using the nice value of the caller and the value
1359 *
1360 * Parameters:	procp			Process queueing the I/O
1361 *		entryp			The work queue entry being queued
1362 *
1363 * Returns:	(void)			No failure modes
1364 *
1365 * Notes:	This function is used for both lio_listio and aio
1366 *
1367 * XXX:		At some point, we may have to consider thread priority
1368 *		rather than process priority, but we don't maintain the
1369 *		adjusted priority for threads the POSIX way.
1370 *
1371 *
1372 * Called with proc locked.
1373 */
1374static void
1375aio_enqueue_work( proc_t procp, aio_workq_entry *entryp, int proc_locked)
1376{
1377#if 0
1378	aio_workq_entry	*my_entryp;	/* used for insertion sort */
1379#endif /* 0 */
1380	aio_workq_t queue = aio_entry_workq(entryp);
1381
1382	if (proc_locked == 0) {
1383		aio_proc_lock(procp);
1384	}
1385
1386	ASSERT_AIO_PROC_LOCK_OWNED(procp);
1387
1388	/* Onto proc queue */
1389	TAILQ_INSERT_TAIL(&procp->p_aio_activeq, entryp,  aio_proc_link);
1390	procp->p_aio_active_count++;
1391	procp->p_aio_total_count++;
1392
1393	/* And work queue */
1394	aio_workq_lock_spin(queue);
1395	aio_workq_add_entry_locked(queue, entryp);
1396	wait_queue_wakeup_one(queue->aioq_waitq, queue, THREAD_AWAKENED, -1);
1397	aio_workq_unlock(queue);
1398
1399	if (proc_locked == 0) {
1400		aio_proc_unlock(procp);
1401	}
1402
1403#if 0
1404	/*
1405	 * Procedure:
1406	 *
1407	 * (1)	The nice value is in the range PRIO_MIN..PRIO_MAX [-20..20]
1408	 * (2)	The normalized nice value is in the range 0..((2 * NZERO) - 1)
1409	 *	which is [0..39], with 0 not being used.  In nice values, the
1410	 *	lower the nice value, the higher the priority.
1411	 * (3)	The normalized scheduling prioritiy is the highest nice value
1412	 *	minus the current nice value.  In I/O scheduling priority, the
1413	 *	higher the value the lower the priority, so it is the inverse
1414	 *	of the nice value (the higher the number, the higher the I/O
1415	 *	priority).
1416	 * (4)	From the normalized scheduling priority, we subtract the
1417	 *	request priority to get the request priority value number;
1418	 *	this means that requests are only capable of depressing their
1419	 *	priority relative to other requests,
1420	 */
1421	entryp->priority = (((2 * NZERO) - 1) - procp->p_nice);
1422
1423	/* only premit depressing the priority */
1424	if (entryp->aiocb.aio_reqprio < 0)
1425		entryp->aiocb.aio_reqprio = 0;
1426	if (entryp->aiocb.aio_reqprio > 0) {
1427		entryp->priority -= entryp->aiocb.aio_reqprio;
1428		if (entryp->priority < 0)
1429			entryp->priority = 0;
1430	}
1431
1432	/* Insertion sort the entry; lowest ->priority to highest */
1433	TAILQ_FOREACH(my_entryp, &aio_anchor.aio_async_workq, aio_workq_link) {
1434		if ( entryp->priority <= my_entryp->priority) {
1435			TAILQ_INSERT_BEFORE(my_entryp, entryp, aio_workq_link);
1436			break;
1437		}
1438	}
1439	if (my_entryp == NULL)
1440		TAILQ_INSERT_TAIL( &aio_anchor.aio_async_workq, entryp, aio_workq_link );
1441#endif /* 0 */
1442}
1443
1444
1445/*
1446 * lio_listio - initiate a list of IO requests.  We process the list of
1447 * aiocbs either synchronously (mode == LIO_WAIT) or asynchronously
1448 * (mode == LIO_NOWAIT).
1449 *
1450 * The caller gets error and return status for each aiocb in the list
1451 * via aio_error and aio_return.  We must keep completed requests until
1452 * released by the aio_return call.
1453 */
1454int
1455lio_listio(proc_t p, struct lio_listio_args *uap, int *retval )
1456{
1457	int				i;
1458	int				call_result;
1459	int				result;
1460	int				old_count;
1461	aio_workq_entry			**entryp_listp;
1462	user_addr_t			*aiocbpp;
1463	struct user_sigevent		aiosigev;
1464	aio_lio_context		*lio_context;
1465	boolean_t 			free_context = FALSE;
1466
1467	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_listio)) | DBG_FUNC_START,
1468		     	  (int)p, uap->nent, uap->mode, 0, 0 );
1469
1470	entryp_listp = NULL;
1471	lio_context = NULL;
1472	aiocbpp = NULL;
1473	call_result = -1;
1474	*retval = -1;
1475	if ( !(uap->mode == LIO_NOWAIT || uap->mode == LIO_WAIT) ) {
1476		call_result = EINVAL;
1477		goto ExitRoutine;
1478	}
1479
1480	if ( uap->nent < 1 || uap->nent > AIO_LISTIO_MAX ) {
1481		call_result = EINVAL;
1482		goto ExitRoutine;
1483	}
1484
1485	/*
1486	 * allocate a list of aio_workq_entry pointers that we will use
1487	 * to queue up all our requests at once while holding our lock.
1488	 */
1489	MALLOC( entryp_listp, void *, (uap->nent * sizeof(aio_workq_entry *)), M_TEMP, M_WAITOK );
1490	if ( entryp_listp == NULL ) {
1491		call_result = EAGAIN;
1492		goto ExitRoutine;
1493	}
1494
1495	MALLOC( lio_context, aio_lio_context*, sizeof(aio_lio_context), M_TEMP, M_WAITOK );
1496	if ( lio_context == NULL ) {
1497		call_result = EAGAIN;
1498		goto ExitRoutine;
1499	}
1500
1501#if DEBUG
1502	OSIncrementAtomic(&lio_contexts_alloced);
1503#endif /* DEBUG */
1504
1505	bzero(lio_context, sizeof(aio_lio_context));
1506
1507	aiocbpp = aio_copy_in_list(p, uap->aiocblist, uap->nent);
1508	if ( aiocbpp == NULL ) {
1509		call_result = EAGAIN;
1510		goto ExitRoutine;
1511	}
1512
1513	/*
1514	 * Use sigevent passed in to lio_listio for each of our calls, but
1515	 * only do completion notification after the last request completes.
1516	 */
1517	bzero(&aiosigev, sizeof(aiosigev));
1518	/* Only copy in an sigev if the user supplied one */
1519	if (uap->sigp != USER_ADDR_NULL) {
1520		call_result = aio_copy_in_sigev(p, uap->sigp, &aiosigev);
1521		if ( call_result)
1522			goto ExitRoutine;
1523	}
1524
1525	/* process list of aio requests */
1526	lio_context->io_issued = uap->nent;
1527	lio_context->io_waiter = uap->mode == LIO_WAIT ? 1 : 0; /* Should it be freed by last AIO */
1528	for ( i = 0; i < uap->nent; i++ ) {
1529		user_addr_t my_aiocbp;
1530		aio_workq_entry		 		*entryp;
1531
1532		*(entryp_listp + i) = NULL;
1533		my_aiocbp = *(aiocbpp + i);
1534
1535		/* NULL elements are legal so check for 'em */
1536		if ( my_aiocbp == USER_ADDR_NULL ) {
1537			aio_proc_lock_spin(p);
1538			lio_context->io_issued--;
1539			aio_proc_unlock(p);
1540			continue;
1541		}
1542
1543		/*
1544		 * We use lio_context to mark IO requests for delayed completion
1545		 * processing which means we wait until all IO requests in the
1546		 * group have completed before we either return to the caller
1547		 * when mode is LIO_WAIT or signal user when mode is LIO_NOWAIT.
1548		 *
1549		 * We use the address of the lio_context for this, since it is
1550		 * unique in the address space.
1551		 */
1552		result = lio_create_entry( p, my_aiocbp, lio_context, (entryp_listp + i) );
1553		if ( result != 0 && call_result == -1 )
1554			call_result = result;
1555
1556		/* NULL elements are legal so check for 'em */
1557		entryp = *(entryp_listp + i);
1558		if ( entryp == NULL ) {
1559			aio_proc_lock_spin(p);
1560			lio_context->io_issued--;
1561			aio_proc_unlock(p);
1562			continue;
1563		}
1564
1565		if ( uap->mode == LIO_NOWAIT ) {
1566			/* Set signal hander, if any */
1567			entryp->aiocb.aio_sigevent = aiosigev;
1568		} else {
1569			/* flag that this thread blocks pending completion */
1570			entryp->flags |= AIO_LIO_NOTIFY;
1571		}
1572
1573		/* check our aio limits to throttle bad or rude user land behavior */
1574		old_count = aio_increment_total_count();
1575
1576		aio_proc_lock_spin(p);
1577		if ( old_count >= aio_max_requests ||
1578			 aio_get_process_count( entryp->procp ) >= aio_max_requests_per_process ||
1579			 is_already_queued( entryp->procp, entryp->uaiocbp ) == TRUE ) {
1580
1581			lio_context->io_issued--;
1582			aio_proc_unlock(p);
1583
1584			aio_decrement_total_count();
1585
1586			if ( call_result == -1 )
1587				call_result = EAGAIN;
1588			aio_free_request(entryp);
1589			entryp_listp[i] = NULL;
1590			continue;
1591		}
1592
1593		lck_mtx_convert_spin(aio_proc_mutex(p));
1594		aio_enqueue_work(p, entryp, 1);
1595		aio_proc_unlock(p);
1596
1597		KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) | DBG_FUNC_NONE,
1598				  (int)p, (int)entryp->uaiocbp, 0, 0, 0 );
1599	}
1600
1601	switch(uap->mode) {
1602	case LIO_WAIT:
1603		aio_proc_lock_spin(p);
1604		while (lio_context->io_completed < lio_context->io_issued) {
1605			result = msleep(lio_context, aio_proc_mutex(p), PCATCH | PRIBIO | PSPIN, "lio_listio", 0);
1606
1607			/* If we were interrupted, fail out (even if all finished) */
1608			if (result != 0) {
1609				call_result = EINTR;
1610				lio_context->io_waiter = 0;
1611				break;
1612			}
1613		}
1614
1615		/* If all IOs have finished must free it */
1616		if (lio_context->io_completed == lio_context->io_issued) {
1617			free_context = TRUE;
1618		}
1619
1620		aio_proc_unlock(p);
1621		break;
1622
1623	case LIO_NOWAIT:
1624		break;
1625	}
1626
1627	/* call_result == -1 means we had no trouble queueing up requests */
1628	if ( call_result == -1 ) {
1629		call_result = 0;
1630		*retval = 0;
1631	}
1632
1633ExitRoutine:
1634	if ( entryp_listp != NULL )
1635		FREE( entryp_listp, M_TEMP );
1636	if ( aiocbpp != NULL )
1637		FREE( aiocbpp, M_TEMP );
1638	if ((lio_context != NULL) && ((lio_context->io_issued == 0) || (free_context == TRUE))) {
1639		free_lio_context(lio_context);
1640	}
1641
1642	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_listio)) | DBG_FUNC_END,
1643		     	  (int)p, call_result, 0, 0, 0 );
1644
1645	return( call_result );
1646
1647} /* lio_listio */
1648
1649
1650/*
1651 * aio worker thread.  this is where all the real work gets done.
1652 * we get a wake up call on sleep channel &aio_anchor.aio_async_workq
1653 * after new work is queued up.
1654 */
1655static void
1656aio_work_thread( void )
1657{
1658	aio_workq_entry		 	*entryp;
1659	int 			error;
1660	vm_map_t 		currentmap;
1661	vm_map_t 		oldmap = VM_MAP_NULL;
1662	task_t			oldaiotask = TASK_NULL;
1663	struct uthread	*uthreadp = NULL;
1664
1665	for( ;; ) {
1666		/*
1667		 * returns with the entry ref'ed.
1668		 * sleeps until work is available.
1669		 */
1670		entryp = aio_get_some_work();
1671
1672		KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread)) | DBG_FUNC_START,
1673				(int)entryp->procp, (int)entryp->uaiocbp, entryp->flags, 0, 0 );
1674
1675		/*
1676		 * Assume the target's address space identity for the duration
1677		 * of the IO.  Note: don't need to have the entryp locked,
1678		 * because the proc and map don't change until it's freed.
1679		 */
1680		currentmap = get_task_map( (current_proc())->task );
1681		if ( currentmap != entryp->aio_map ) {
1682			uthreadp = (struct uthread *) get_bsdthread_info(current_thread());
1683			oldaiotask = uthreadp->uu_aio_task;
1684			uthreadp->uu_aio_task = entryp->procp->task;
1685			oldmap = vm_map_switch( entryp->aio_map );
1686		}
1687
1688		if ( (entryp->flags & AIO_READ) != 0 ) {
1689			error = do_aio_read( entryp );
1690		}
1691		else if ( (entryp->flags & AIO_WRITE) != 0 ) {
1692			error = do_aio_write( entryp );
1693		}
1694		else if ( (entryp->flags & (AIO_FSYNC | AIO_DSYNC)) != 0 ) {
1695			error = do_aio_fsync( entryp );
1696		}
1697		else {
1698			printf( "%s - unknown aio request - flags 0x%02X \n",
1699					__FUNCTION__, entryp->flags );
1700			error = EINVAL;
1701		}
1702
1703		/* Restore old map */
1704		if ( currentmap != entryp->aio_map ) {
1705			(void) vm_map_switch( oldmap );
1706			uthreadp->uu_aio_task = oldaiotask;
1707		}
1708
1709		KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread)) | DBG_FUNC_END,
1710				(int)entryp->procp, (int)entryp->uaiocbp, entryp->errorval,
1711				entryp->returnval, 0 );
1712
1713
1714		/* XXX COUNTS */
1715		aio_entry_lock_spin(entryp);
1716		entryp->errorval = error;
1717		aio_entry_unlock(entryp);
1718
1719		/* we're done with the IO request so pop it off the active queue and */
1720		/* push it on the done queue */
1721		aio_proc_lock(entryp->procp);
1722		aio_proc_move_done_locked(entryp->procp, entryp);
1723		aio_proc_unlock(entryp->procp);
1724
1725		OSDecrementAtomic(&aio_anchor.aio_inflight_count);
1726
1727		/* remove our reference to the user land map. */
1728		if ( VM_MAP_NULL != entryp->aio_map ) {
1729			vm_map_t 		my_map;
1730
1731			my_map = entryp->aio_map;
1732			entryp->aio_map = VM_MAP_NULL;
1733			vm_map_deallocate( my_map );
1734		}
1735
1736		/* Provide notifications */
1737		do_aio_completion( entryp );
1738
1739		/* Will free if needed */
1740		aio_entry_unref(entryp);
1741
1742	} /* for ( ;; ) */
1743
1744	/* NOT REACHED */
1745
1746} /* aio_work_thread */
1747
1748
1749/*
1750 * aio_get_some_work - get the next async IO request that is ready to be executed.
1751 * aio_fsync complicates matters a bit since we cannot do the fsync until all async
1752 * IO requests at the time the aio_fsync call came in have completed.
1753 * NOTE - AIO_LOCK must be held by caller
1754 */
1755static aio_workq_entry *
1756aio_get_some_work( void )
1757{
1758	aio_workq_entry		 		*entryp = NULL;
1759	aio_workq_t 				queue = NULL;
1760
1761	/* Just one queue for the moment.  In the future there will be many. */
1762	queue = &aio_anchor.aio_async_workqs[0];
1763	aio_workq_lock_spin(queue);
1764	if (queue->aioq_count == 0) {
1765		goto nowork;
1766	}
1767
1768	/*
1769	 * Hold the queue lock.
1770	 *
1771	 * pop some work off the work queue and add to our active queue
1772	 * Always start with the queue lock held.
1773	 */
1774	for(;;) {
1775		/*
1776		 * Pull it off of the work queue.  Once it's off, it can't be cancelled,
1777		 * so we can take our ref once we drop the queue lock.
1778		 */
1779		entryp = TAILQ_FIRST(&queue->aioq_entries);
1780
1781		/*
1782		 * If there's no work or only fsyncs that need delay, go to sleep
1783		 * and then start anew from aio_work_thread
1784		 */
1785		if (entryp == NULL) {
1786			goto nowork;
1787		}
1788
1789		aio_workq_remove_entry_locked(queue, entryp);
1790
1791		aio_workq_unlock(queue);
1792
1793		/*
1794		 * Check if it's an fsync that must be delayed.  No need to lock the entry;
1795		 * that flag would have been set at initialization.
1796		 */
1797		if ( (entryp->flags & AIO_FSYNC) != 0 ) {
1798			/*
1799			 * Check for unfinished operations on the same file
1800			 * in this proc's queue.
1801			 */
1802			aio_proc_lock_spin(entryp->procp);
1803			if ( aio_delay_fsync_request( entryp ) ) {
1804				/* It needs to be delayed.  Put it back on the end of the work queue */
1805				KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync_delay)) | DBG_FUNC_NONE,
1806							  (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
1807
1808				aio_proc_unlock(entryp->procp);
1809
1810				aio_workq_lock_spin(queue);
1811				aio_workq_add_entry_locked(queue, entryp);
1812				continue;
1813			}
1814			aio_proc_unlock(entryp->procp);
1815		}
1816
1817		break;
1818	}
1819
1820	aio_entry_ref(entryp);
1821
1822	OSIncrementAtomic(&aio_anchor.aio_inflight_count);
1823	return( entryp );
1824
1825nowork:
1826	/* We will wake up when someone enqueues something */
1827	wait_queue_assert_wait(queue->aioq_waitq, queue, THREAD_UNINT, 0);
1828	aio_workq_unlock(queue);
1829	thread_block( (thread_continue_t)aio_work_thread );
1830
1831	/* NOTREACHED */
1832	return NULL;
1833}
1834
1835/*
1836 * aio_delay_fsync_request - look to see if this aio_fsync request should be delayed.
1837 * A big, simple hammer: only send it off once it is the oldest IO still outstanding
1838 * for the process, i.e. everything filed ahead of it has completed.
1839 */
1840static boolean_t
1841aio_delay_fsync_request( aio_workq_entry *entryp )
1842{
1843	if (entryp == TAILQ_FIRST(&entryp->procp->p_aio_activeq)) {
1844		return FALSE;
1845	}
1846
1847	return TRUE;
1848} /* aio_delay_fsync_request */
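
/*
 * Seen from user space (a hedged sketch, not taken from this file): because
 * of the delay above, an aio_fsync() queued behind other I/O for the same
 * process is not serviced until that earlier I/O has drained, so its
 * completion implies the earlier writes were already done.  The file name
 * and layout below are placeholders.
 *
 *	#include <aio.h>
 *	#include <errno.h>
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	static char a[512], b[512];
 *
 *	int main(void)
 *	{
 *		int fd = open("/tmp/aio_fsync_demo", O_RDWR | O_CREAT, 0644);
 *		struct aiocb w1 = { .aio_fildes = fd, .aio_buf = a, .aio_nbytes = sizeof a, .aio_offset = 0 };
 *		struct aiocb w2 = { .aio_fildes = fd, .aio_buf = b, .aio_nbytes = sizeof b, .aio_offset = 512 };
 *		struct aiocb fs = { .aio_fildes = fd };
 *
 *		if (fd < 0 || aio_write(&w1) != 0 || aio_write(&w2) != 0)
 *			return 1;
 *		aio_fsync(O_SYNC, &fs);		// held back until w1 and w2 have completed
 *
 *		while (aio_error(&fs) == EINPROGRESS)
 *			usleep(1000);		// fsync done implies the earlier writes are too
 *		aio_return(&fs);
 *		aio_return(&w1);
 *		aio_return(&w2);
 *		close(fd);
 *		return 0;
 *	}
 */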
1849
1850static aio_workq_entry *
1851aio_create_queue_entry(proc_t procp, user_addr_t aiocbp, void *group_tag, int kindOfIO)
1852{
1853	aio_workq_entry	*entryp;
1854	int		result = 0;
1855
1856	entryp = (aio_workq_entry *) zalloc( aio_workq_zonep );
1857	if ( entryp == NULL ) {
1858		result = EAGAIN;
1859		goto error_exit;
1860	}
1861
1862	bzero( entryp, sizeof(*entryp) );
1863
1864	/* fill in the rest of the aio_workq_entry */
1865	entryp->procp = procp;
1866	entryp->uaiocbp = aiocbp;
1867	entryp->flags |= kindOfIO;
1868	entryp->group_tag = group_tag;
1869	entryp->aio_map = VM_MAP_NULL;
1870	entryp->aio_refcount = 0;
1871
1872	if ( proc_is64bit(procp) ) {
1873		struct user64_aiocb aiocb64;
1874
1875		result = copyin( aiocbp, &aiocb64, sizeof(aiocb64) );
1876		if (result == 0 )
1877			do_munge_aiocb_user64_to_user(&aiocb64, &entryp->aiocb);
1878
1879	} else {
1880		struct user32_aiocb aiocb32;
1881
1882		result = copyin( aiocbp, &aiocb32, sizeof(aiocb32) );
1883		if ( result == 0 )
1884			do_munge_aiocb_user32_to_user( &aiocb32, &entryp->aiocb );
1885	}
1886
1887	if ( result != 0 ) {
1888		result = EAGAIN;
1889		goto error_exit;
1890	}
1891
1892	/* get a reference to the user land map in order to keep it around */
1893	entryp->aio_map = get_task_map( procp->task );
1894	vm_map_reference( entryp->aio_map );
1895
1896	/* do some more validation on the aiocb and embedded file descriptor */
1897	result = aio_validate( entryp );
1898	if ( result != 0 )
1899		goto error_exit_with_ref;
1900
1901	/* get a reference on the current_thread, which is passed in vfs_context. */
1902	entryp->thread = current_thread();
1903	thread_reference( entryp->thread );
1904	return ( entryp );
1905
1906error_exit_with_ref:
1907	if ( VM_MAP_NULL != entryp->aio_map ) {
1908		vm_map_deallocate( entryp->aio_map );
1909	}
1910error_exit:
1911	if ( result && entryp != NULL ) {
1912		zfree( aio_workq_zonep, entryp );
1913		entryp = NULL;
1914	}
1915
1916	return ( entryp );
1917}
1918
1919
1920/*
1921 * aio_queue_async_request - queue up an async IO request on our work queue then
1922 * wake up one of our worker threads to do the actual work.  We get a reference
1923 * to our caller's user land map in order to keep it around while we are
1924 * processing the request.
1925 */
1926static int
1927aio_queue_async_request(proc_t procp, user_addr_t aiocbp, int kindOfIO )
1928{
1929	aio_workq_entry	*entryp;
1930	int		result;
1931	int		old_count;
1932
1933	old_count = aio_increment_total_count();
1934	if (old_count >= aio_max_requests) {
1935		result = EAGAIN;
1936		goto error_noalloc;
1937	}
1938
1939	entryp = aio_create_queue_entry( procp, aiocbp, 0, kindOfIO);
1940	if ( entryp == NULL ) {
1941		result = EAGAIN;
1942		goto error_noalloc;
1943	}
1944
1945
1946	aio_proc_lock_spin(procp);
1947
1948	if ( is_already_queued( entryp->procp, entryp->uaiocbp ) == TRUE ) {
1949		result = EAGAIN;
1950		goto error_exit;
1951	}
1952
1953	/* check our aio limits to throttle bad or rude user land behavior */
1954	if (aio_get_process_count( procp ) >= aio_max_requests_per_process) {
1955		printf("aio_queue_async_request(): too many in flight for proc: %d.\n", procp->p_aio_total_count);
1956		result = EAGAIN;
1957		goto error_exit;
1958	}
1959
1960	/* Add the IO to proc and work queues, wake up threads as appropriate */
1961	lck_mtx_convert_spin(aio_proc_mutex(procp));
1962	aio_enqueue_work(procp, entryp, 1);
1963
1964	aio_proc_unlock(procp);
1965
1966	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued)) | DBG_FUNC_NONE,
1967		     	  (int)procp, (int)aiocbp, 0, 0, 0 );
1968
1969	return( 0 );
1970
1971error_exit:
1972	/*
1973	 * This entry has not been queued up so no worries about
1974	 * unlocked state and aio_map
1975	 */
1976	aio_proc_unlock(procp);
1977	aio_free_request(entryp);
1978
1979error_noalloc:
1980	aio_decrement_total_count();
1981
1982	return( result );
1983
1984} /* aio_queue_async_request */
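
/*
 * For reference, what this path looks like from user space (a sketch, not
 * from this file; the helper name and parameters are illustrative): a
 * rejected enqueue - over aio_max_requests / aio_max_requests_per_process,
 * a duplicate aiocb, or an allocation failure - comes back as EAGAIN from
 * the submitting call, while an accepted one completes asynchronously and
 * is reaped with aio_error()/aio_return().
 *
 *	#include <aio.h>
 *	#include <errno.h>
 *	#include <stdio.h>
 *	#include <sys/types.h>
 *	#include <unistd.h>
 *
 *	// Submit one asynchronous read and poll it to completion.
 *	// 'fd' must be open for reading; 'buf' and 'len' are caller supplied.
 *	ssize_t async_read_poll(int fd, void *buf, size_t len, off_t off)
 *	{
 *		struct aiocb cb = { 0 };
 *
 *		cb.aio_fildes = fd;
 *		cb.aio_buf = buf;
 *		cb.aio_nbytes = len;
 *		cb.aio_offset = off;
 *
 *		if (aio_read(&cb) != 0) {
 *			if (errno == EAGAIN)	// queue limits reached, or duplicate aiocb
 *				fprintf(stderr, "aio queue full, try again later\n");
 *			return -1;
 *		}
 *
 *		int err;
 *		while ((err = aio_error(&cb)) == EINPROGRESS)
 *			usleep(1000);
 *		return err == 0 ? aio_return(&cb) : -1;
 *	}
 */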
1985
1986
1987/*
1988 * lio_create_entry
1989 *
1990 * Allocate an aio_workq_entry and fill it in.  If all goes well return 0
1991 * and pass the aio_workq_entry pointer back to our caller.
1992 *
1993 * Parameters:	procp			The process making the request
1994 *		aiocbp			The aio context buffer pointer
1995 *		group_tag		The group tag used to indicate a
1996 *					group of operations has completed
1997 *		entrypp			Pointer to the pointer to receive the
1998 *					address of the created aio_workq_entry
1999 *
2000 * Returns:	0			Successfully created
2001 *		EAGAIN			Try again (usually resource shortage)
2002 *
2003 *
2004 * Notes:	We get a reference to our caller's user land map in order
2005 *		to keep it around while we are processing the request.
2006 *
2007 *		lio_listio calls behave differently at completion: they do
2008 *		completion notification when all async IO requests have
2009 *		completed.  We use group_tag to tag IO requests that behave
2010 *		in the delay notification manner.
2011 *
2012 *		All synchronous operations are considered to not have a
2013 *		signal routine associated with them (sigp == USER_ADDR_NULL).
2014 */
2015static int
2016lio_create_entry(proc_t procp, user_addr_t aiocbp, void *group_tag,
2017		aio_workq_entry **entrypp )
2018{
2019	aio_workq_entry	*entryp;
2020	int		result;
2021
2022	entryp = aio_create_queue_entry( procp, aiocbp, group_tag, AIO_LIO);
2023	if ( entryp == NULL ) {
2024		result = EAGAIN;
2025		goto error_exit;
2026	}
2027
2028	/*
2029	 * Look for lio_listio LIO_NOP requests and ignore them; this is
2030	 * not really an error, but we need to free our aio_workq_entry.
2031	 */
2032	if ( entryp->aiocb.aio_lio_opcode == LIO_NOP ) {
2033		result = 0;
2034		goto error_exit;
2035	}
2036
2037	*entrypp = entryp;
2038	return( 0 );
2039
2040error_exit:
2041
2042	if ( entryp != NULL ) {
2043		/*
2044		 * This entry has not been queued up so no worries about
2045		 * unlocked state and aio_map
2046		 */
2047		aio_free_request(entryp);
2048	}
2049
2050	return( result );
2051
2052} /* lio_create_entry */
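
/*
 * What group_tag ends up modelling, seen from user space (a hedged sketch;
 * the descriptors and buffers are assumed to be set up by the caller):
 * lio_listio() submits several aiocbs at once, LIO_NOP entries are skipped
 * as above, LIO_WAIT blocks until the whole group is done, and LIO_NOWAIT
 * returns immediately and may deliver a single sigevent for the group.
 *
 *	#include <aio.h>
 *	#include <stddef.h>
 *
 *	// Submit one read and one write as a single group and wait for both.
 *	int read_and_write_group(int fd_in, int fd_out, char *inbuf, char *outbuf, size_t len)
 *	{
 *		struct aiocb rd = { 0 }, wr = { 0 };
 *		struct aiocb *list[2] = { &rd, &wr };
 *
 *		rd.aio_fildes = fd_in;
 *		rd.aio_buf = inbuf;
 *		rd.aio_nbytes = len;
 *		rd.aio_lio_opcode = LIO_READ;
 *
 *		wr.aio_fildes = fd_out;
 *		wr.aio_buf = outbuf;
 *		wr.aio_nbytes = len;
 *		wr.aio_lio_opcode = LIO_WRITE;
 *
 *		// LIO_WAIT: lio_listio() does not return until both complete.
 *		return lio_listio(LIO_WAIT, list, 2, NULL);
 *	}
 */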
2053
2054
2055/*
2056 * aio_free_request - remove our reference on the user land map and
2057 * free the work queue entry resources.  The entry is off all lists
2058 * and has zero refcount, so no one can have a pointer to it.
2059 */
2060
2061static int
2062aio_free_request(aio_workq_entry *entryp)
2063{
2064	/* remove our reference to the user land map. */
2065	if ( VM_MAP_NULL != entryp->aio_map) {
2066		vm_map_deallocate(entryp->aio_map);
2067	}
2068
2069	/* remove our reference to thread which enqueued the request */
2070	if ( NULL != entryp->thread ) {
2071		thread_deallocate( entryp->thread );
2072	}
2073
2074	entryp->aio_refcount = -1; /* A bit of poisoning in case of bad refcounting. */
2075
2076	zfree( aio_workq_zonep, entryp );
2077
2078	return( 0 );
2079
2080} /* aio_free_request */
2081
2082
2083/*
2084 * aio_validate
2085 *
2086 * validate the aiocb passed in by one of the aio syscalls.
2087 */
2088static int
2089aio_validate( aio_workq_entry *entryp )
2090{
2091	struct fileproc 				*fp;
2092	int							flag;
2093	int							result;
2094
2095	result = 0;
2096
2097	if ( (entryp->flags & AIO_LIO) != 0 ) {
2098		if ( entryp->aiocb.aio_lio_opcode == LIO_READ )
2099			entryp->flags |= AIO_READ;
2100		else if ( entryp->aiocb.aio_lio_opcode == LIO_WRITE )
2101			entryp->flags |= AIO_WRITE;
2102		else if ( entryp->aiocb.aio_lio_opcode == LIO_NOP )
2103			return( 0 );
2104		else
2105			return( EINVAL );
2106	}
2107
2108	flag = FREAD;
2109	if ( (entryp->flags & (AIO_WRITE | AIO_FSYNC | AIO_DSYNC)) != 0 ) {
2110		flag = FWRITE;
2111	}
2112
2113	if ( (entryp->flags & (AIO_READ | AIO_WRITE)) != 0 ) {
2114		if ( entryp->aiocb.aio_nbytes > INT_MAX		||
2115			 entryp->aiocb.aio_buf == USER_ADDR_NULL ||
2116			 entryp->aiocb.aio_offset < 0 )
2117			return( EINVAL );
2118	}
2119
2120	/*
2121	 * Validate aiocb.aio_sigevent.  At this point we only support
2122	 * sigev_notify equal to SIGEV_SIGNAL or SIGEV_NONE.  This means
2123	 * sigev_value, sigev_notify_function, and sigev_notify_attributes
2124	 * are ignored, since SIGEV_THREAD is unsupported.  This is consistent
2125	 * with not supporting the [RTS] (Realtime Signals) option group.
2126	 */
2127	switch ( entryp->aiocb.aio_sigevent.sigev_notify ) {
2128	case SIGEV_SIGNAL:
2129	    {
2130		int		signum;
2131
2132		/* make sure we have a valid signal number */
2133		signum = entryp->aiocb.aio_sigevent.sigev_signo;
2134		if ( signum <= 0 || signum >= NSIG ||
2135			 signum == SIGKILL || signum == SIGSTOP )
2136			return (EINVAL);
2137	    }
2138	    break;
2139
2140	case SIGEV_NONE:
2141		break;
2142
2143	case SIGEV_THREAD:
2144		/* Unsupported [RTS]; fall through and return EINVAL */
2145
2146	default:
2147		return (EINVAL);
2148	}
2149
2150	/* validate the file descriptor and that the file was opened
2151	 * for the appropriate read / write access.
2152	 */
2153	proc_fdlock(entryp->procp);
2154
2155	result = fp_lookup( entryp->procp, entryp->aiocb.aio_fildes, &fp , 1);
2156	if ( result == 0 ) {
2157		if ( (fp->f_fglob->fg_flag & flag) == 0 ) {
2158			/* we don't have read or write access */
2159			result = EBADF;
2160		}
2161		else if ( FILEGLOB_DTYPE(fp->f_fglob) != DTYPE_VNODE ) {
2162			/* this is not a file */
2163			result = ESPIPE;
2164		} else
2165		        fp->f_flags |= FP_AIOISSUED;
2166
2167		fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp , 1);
2168	}
2169	else {
2170		result = EBADF;
2171	}
2172
2173	proc_fdunlock(entryp->procp);
2174
2175	return( result );
2176
2177} /* aio_validate */
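
/*
 * Restated for user-space callers (a hedged sketch; the path and size are
 * arbitrary): to pass the checks above the descriptor must be a vnode opened
 * with the access the operation needs, aio_nbytes must not exceed INT_MAX,
 * aio_buf must be non-NULL, aio_offset must be non-negative, and sigev_notify
 * must be SIGEV_NONE or SIGEV_SIGNAL with a signal other than SIGKILL/SIGSTOP.
 *
 *	#include <aio.h>
 *	#include <fcntl.h>
 *	#include <signal.h>
 *
 *	static char buf[1024];
 *
 *	// Build a read control block that satisfies the aio_validate() checks
 *	// (assuming the open() succeeds).
 *	struct aiocb make_valid_read_cb(void)
 *	{
 *		struct aiocb cb = { 0 };
 *
 *		cb.aio_fildes = open("/etc/hosts", O_RDONLY);	// a vnode, opened for reading
 *		cb.aio_buf = buf;				// non-NULL buffer
 *		cb.aio_nbytes = sizeof buf;			// <= INT_MAX
 *		cb.aio_offset = 0;				// >= 0
 *		cb.aio_sigevent.sigev_notify = SIGEV_NONE;	// no completion signal
 *		return cb;
 *	}
 */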
2178
2179static int
2180aio_increment_total_count(void)
2181{
2182	return OSIncrementAtomic(&aio_anchor.aio_total_count);
2183}
2184
2185static int
2186aio_decrement_total_count(void)
2187{
2188	int old = OSDecrementAtomic(&aio_anchor.aio_total_count);
2189	if (old <= 0) {
2190		panic("Negative total AIO count!\n");
2191	}
2192
2193	return old;
2194}
2195
2196static int
2197aio_get_process_count(proc_t procp )
2198{
2199	return procp->p_aio_total_count;
2200
2201} /* aio_get_process_count */
2202
2203static int
2204aio_get_all_queues_count( void )
2205{
2206	return aio_anchor.aio_total_count;
2207
2208} /* aio_get_all_queues_count */
2209
2210
2211/*
2212 * do_aio_completion.  Handle async IO completion.
2213 */
2214static void
2215do_aio_completion( aio_workq_entry *entryp )
2216{
2217
2218	boolean_t		lastLioCompleted = FALSE;
2219	aio_lio_context	*lio_context = NULL;
2220	int waiter = 0;
2221
2222	lio_context = (aio_lio_context *)entryp->group_tag;
2223
2224	if (lio_context != NULL) {
2225
2226		aio_proc_lock_spin(entryp->procp);
2227
2228		/* Account for this I/O completing. */
2229	 	lio_context->io_completed++;
2230
2231		/* Are we done with this lio context? */
2232	 	if (lio_context->io_issued == lio_context->io_completed) {
2233	 		lastLioCompleted = TRUE;
2234	 	}
2235
2236		waiter = lio_context->io_waiter;
2237
2238		/* explicit wakeup of lio_listio() waiting in LIO_WAIT */
2239		if ((entryp->flags & AIO_LIO_NOTIFY) && (lastLioCompleted) && (waiter != 0)) {
2240			/* wake up the waiter */
2241			wakeup(lio_context);
2242		}
2243
2244		aio_proc_unlock(entryp->procp);
2245	}
2246
2247	if ( entryp->aiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL &&
2248		 (entryp->flags & AIO_DISABLE) == 0 ) {
2249
2250		boolean_t	performSignal = FALSE;
2251		 if (lio_context == NULL) {
2252		 	performSignal = TRUE;
2253		 }
2254		 else {
2255			/*
2256			 * If this was the last request in the group and a signal
2257			 * is desired, send one.
2258			 */
2259			performSignal = lastLioCompleted;
2260		 }
2261
2262		 if (performSignal) {
2263
2264			KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_sig)) | DBG_FUNC_NONE,
2265				 (int)entryp->procp, (int)entryp->uaiocbp,
2266				 entryp->aiocb.aio_sigevent.sigev_signo, 0, 0 );
2267
2268			psignal( entryp->procp, entryp->aiocb.aio_sigevent.sigev_signo );
2269		}
2270	}
2271
2272	if ((entryp->flags & AIO_EXIT_WAIT) && (entryp->flags & AIO_CLOSE_WAIT)) {
2273		panic("Close and exit flags set at the same time\n");
2274	}
2275
2276	/*
2277	 * need to handle case where a process is trying to exit, exec, or
2278	 * close and is currently waiting for active aio requests to complete.
2279	 * If AIO_EXIT_WAIT or AIO_CLOSE_WAIT is set then we need to look to see if there are any
2280	 * other requests in the active queue for this process.  If there are
2281	 * none then wakeup using the AIO_CLEANUP_SLEEP_CHAN tsleep channel.
2282	 * If there are some still active then do nothing - we only want to
2283	 * wakeup when all active aio requests for the process are complete.
2284	 *
2285	 * Don't need to lock the entry or proc to check the cleanup flag.  It can only be
2286	 * set for cancellation, while the entryp is still on a proc list; now it's
2287	 * off, so that flag is already set if it's going to be.
2288	 */
2289	if ( (entryp->flags & AIO_EXIT_WAIT) != 0 ) {
2290		int		active_requests;
2291
2292		KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wait)) | DBG_FUNC_NONE,
2293					  (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
2294
2295		aio_proc_lock_spin(entryp->procp);
2296		active_requests = aio_active_requests_for_process( entryp->procp );
2297		if ( active_requests < 1 ) {
2298			/*
2299			 * no active aio requests for this process, continue exiting.  In this
2300			 * case, there should be no one else waiting on the proc in AIO...
2301			 */
2302			wakeup_one((caddr_t)&entryp->procp->AIO_CLEANUP_SLEEP_CHAN);
2303			aio_proc_unlock(entryp->procp);
2304
2305			KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wake)) | DBG_FUNC_NONE,
2306					  	  (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
2307		} else {
2308			aio_proc_unlock(entryp->procp);
2309		}
2310	}
2311
2312	if ( (entryp->flags & AIO_CLOSE_WAIT) != 0 ) {
2313		int		active_requests;
2314
2315		KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wait)) | DBG_FUNC_NONE,
2316					  (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
2317
2318		aio_proc_lock_spin(entryp->procp);
2319		active_requests = aio_proc_active_requests_for_file( entryp->procp, entryp->aiocb.aio_fildes);
2320		if ( active_requests < 1 ) {
2321			/* Can't wakeup_one(); multiple closes might be in progress. */
2322			wakeup(&entryp->procp->AIO_CLEANUP_SLEEP_CHAN);
2323			aio_proc_unlock(entryp->procp);
2324
2325			KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wake)) | DBG_FUNC_NONE,
2326					  	  (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
2327		} else {
2328			aio_proc_unlock(entryp->procp);
2329		}
2330	}
2331	/*
2332	 * A thread in aio_suspend() wants to know about completed IOs.  If it checked
2333	 * the done list before we moved our AIO there, then it already asserted its wait,
2334	 * and we can wake it up without holding the lock.  If it checked the list after
2335	 * we did our move, then it has already seen the AIO that we moved.  Either way, we
2336	 * can do our wakeup without holding the lock.
2337	 */
2338	wakeup( (caddr_t) &entryp->procp->AIO_SUSPEND_SLEEP_CHAN );
2339	KERNEL_DEBUG( (BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_suspend_wake)) | DBG_FUNC_NONE,
2340				  (int)entryp->procp, (int)entryp->uaiocbp, 0, 0, 0 );
2341
2342	/*
2343	 * free the LIO context if the last lio completed and no thread is
2344	 * waiting
2345	 */
2346	if (lastLioCompleted && (waiter == 0))
2347		free_lio_context (lio_context);
2348
2349
2350} /* do_aio_completion */
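
/*
 * The user-space half of the SIGEV_SIGNAL notification sent above, as a
 * hedged sketch (the signal number and helper name are arbitrary; per the
 * munge comments later in this file, sigev_value is not passed through, so
 * the handler only learns that something completed):
 *
 *	#include <aio.h>
 *	#include <signal.h>
 *	#include <unistd.h>
 *
 *	static volatile sig_atomic_t io_done;
 *
 *	static void on_aio_done(int signo)
 *	{
 *		(void)signo;
 *		io_done = 1;			// just note the completion; reap outside the handler
 *	}
 *
 *	ssize_t signalled_read(int fd, char *buf, size_t len)
 *	{
 *		struct aiocb cb = { 0 };
 *		sigset_t block, old;
 *
 *		signal(SIGUSR1, on_aio_done);
 *		sigemptyset(&block);
 *		sigaddset(&block, SIGUSR1);
 *		sigprocmask(SIG_BLOCK, &block, &old);	// avoid a lost wakeup before sigsuspend()
 *
 *		cb.aio_fildes = fd;
 *		cb.aio_buf = buf;
 *		cb.aio_nbytes = len;
 *		cb.aio_sigevent.sigev_notify = SIGEV_SIGNAL;
 *		cb.aio_sigevent.sigev_signo = SIGUSR1;	// delivered to the process on completion
 *
 *		if (aio_read(&cb) != 0) {
 *			sigprocmask(SIG_SETMASK, &old, NULL);
 *			return -1;
 *		}
 *		while (!io_done)
 *			sigsuspend(&old);		// atomically unblock SIGUSR1 and sleep
 *		sigprocmask(SIG_SETMASK, &old, NULL);
 *
 *		return aio_error(&cb) == 0 ? aio_return(&cb) : -1;
 *	}
 */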
2351
2352
2353/*
2354 * do_aio_read
2355 */
2356static int
2357do_aio_read( aio_workq_entry *entryp )
2358{
2359	struct fileproc		*fp;
2360	int					error;
2361	struct vfs_context	context;
2362
2363	if ( (error = fp_lookup(entryp->procp, entryp->aiocb.aio_fildes, &fp , 0)) )
2364		return(error);
2365	if ( (fp->f_fglob->fg_flag & FREAD) == 0 ) {
2366		fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2367		return(EBADF);
2368	}
2369
2370	context.vc_thread = entryp->thread;	/* XXX */
2371	context.vc_ucred = fp->f_fglob->fg_cred;
2372
2373	error = dofileread(&context, fp,
2374				entryp->aiocb.aio_buf,
2375				entryp->aiocb.aio_nbytes,
2376				entryp->aiocb.aio_offset, FOF_OFFSET,
2377				&entryp->returnval);
2378	fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2379
2380	return( error );
2381
2382} /* do_aio_read */
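
/*
 * Besides polling aio_error(), a caller may block in aio_suspend(); the
 * wakeup comes from the AIO_SUSPEND_SLEEP_CHAN wakeup issued in
 * do_aio_completion() above.  A hedged sketch (descriptor and buffers are
 * supplied by the caller, error paths simplified):
 *
 *	#include <aio.h>
 *	#include <errno.h>
 *	#include <sys/types.h>
 *
 *	// Start two reads on the same descriptor and sleep until both are done.
 *	int read_two_suspend(int fd, char *b0, char *b1, size_t len)
 *	{
 *		struct aiocb c0 = { .aio_fildes = fd, .aio_buf = b0, .aio_nbytes = len, .aio_offset = 0 };
 *		struct aiocb c1 = { .aio_fildes = fd, .aio_buf = b1, .aio_nbytes = len, .aio_offset = (off_t)len };
 *		const struct aiocb *list[2] = { &c0, &c1 };
 *
 *		if (aio_read(&c0) != 0 || aio_read(&c1) != 0)
 *			return -1;
 *
 *		// aio_suspend() returns as soon as at least one listed request
 *		// has completed (or a signal arrives), so loop until neither is
 *		// still in progress.
 *		while (aio_error(&c0) == EINPROGRESS || aio_error(&c1) == EINPROGRESS)
 *			(void)aio_suspend(list, 2, NULL);
 *
 *		aio_return(&c0);
 *		aio_return(&c1);
 *		return 0;
 *	}
 */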
2383
2384
2385/*
2386 * do_aio_write
2387 */
2388static int
2389do_aio_write( aio_workq_entry *entryp )
2390{
2391	struct fileproc 		*fp;
2392	int				error, flags;
2393	struct vfs_context		context;
2394
2395	if ( (error = fp_lookup(entryp->procp, entryp->aiocb.aio_fildes, &fp , 0)) )
2396		return(error);
2397	if ( (fp->f_fglob->fg_flag & FWRITE) == 0 ) {
2398		fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2399		return(EBADF);
2400	}
2401
2402	flags = FOF_PCRED;
2403	if ( (fp->f_fglob->fg_flag & O_APPEND) == 0 ) {
2404		flags |= FOF_OFFSET;
2405	}
2406
2407	context.vc_thread = entryp->thread;	/* XXX */
2408	context.vc_ucred = fp->f_fglob->fg_cred;
2409
2410	/* NB: tell dofilewrite the offset, and to use the proc cred */
2411	error = dofilewrite(&context,
2412				fp,
2413				entryp->aiocb.aio_buf,
2414				entryp->aiocb.aio_nbytes,
2415				entryp->aiocb.aio_offset,
2416				flags,
2417				&entryp->returnval);
2418
2419	if (entryp->returnval)
2420		fp_drop_written(entryp->procp, entryp->aiocb.aio_fildes, fp);
2421	else
2422		fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2423
2424	return( error );
2425
2426} /* do_aio_write */
2427
2428
2429/*
2430 * aio_active_requests_for_process - return number of active async IO
2431 * requests for the given process.
2432 */
2433static int
2434aio_active_requests_for_process(proc_t procp )
2435{
2436	return( procp->p_aio_active_count );
2437
2438} /* aio_active_requests_for_process */
2439
2440/*
2441 * Called with the proc locked.
2442 */
2443static int
2444aio_proc_active_requests_for_file(proc_t procp, int fd)
2445{
2446	int count = 0;
2447	aio_workq_entry *entryp;
2448	TAILQ_FOREACH(entryp, &procp->p_aio_activeq, aio_proc_link) {
2449		if (entryp->aiocb.aio_fildes == fd) {
2450			count++;
2451		}
2452	}
2453
2454	return count;
2455} /* aio_proc_active_requests_for_file */
2456
2457
2458
2459/*
2460 * do_aio_fsync
2461 */
2462static int
2463do_aio_fsync( aio_workq_entry *entryp )
2464{
2465	struct vfs_context 	context;
2466	struct vnode 		*vp;
2467	struct fileproc		*fp;
2468	int			sync_flag;
2469	int			error;
2470
2471	/*
2472	 * We are never called unless either AIO_FSYNC or AIO_DSYNC are set.
2473	 *
2474	 * If AIO_DSYNC is set, we can tell the lower layers that it is OK
2475	 * to mark for update the metadata not strictly necessary for data
2476	 * retrieval, rather than forcing it to disk.
2477	 *
2478	 * If AIO_FSYNC is set, we also have to wait until metadata not strictly
2479	 * necessary for data retrieval is committed to stable storage (e.g.
2480	 * atime, mtime, ctime, etc.).
2481	 *
2482	 * Metadata necessary for data retrieval must be committed to stable
2483	 * storage in either case (file length, etc.).
2484	 */
2485	if (entryp->flags & AIO_FSYNC)
2486		sync_flag = MNT_WAIT;
2487	else
2488		sync_flag = MNT_DWAIT;
2489
2490	error = fp_getfvp( entryp->procp, entryp->aiocb.aio_fildes, &fp, &vp);
2491	if ( error == 0 ) {
2492		if ( (error = vnode_getwithref(vp)) ) {
2493		        fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2494			entryp->returnval = -1;
2495			return(error);
2496		}
2497		context.vc_thread = current_thread();
2498		context.vc_ucred = fp->f_fglob->fg_cred;
2499
2500		error = VNOP_FSYNC( vp, sync_flag, &context);
2501
2502		(void)vnode_put(vp);
2503
2504		fp_drop(entryp->procp, entryp->aiocb.aio_fildes, fp, 0);
2505	}
2506	if ( error != 0 )
2507		entryp->returnval = -1;
2508
2509	return( error );
2510
2511} /* do_aio_fsync */
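
/*
 * The corresponding user-space call is aio_fsync(op, aiocbp): O_SYNC requests
 * the MNT_WAIT behaviour above and O_DSYNC, where available, the MNT_DWAIT
 * behaviour.  A brief sketch, assuming fd is open for writing as
 * aio_validate() requires:
 *
 *	#include <aio.h>
 *	#include <errno.h>
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	// Queue a sync of everything already written through 'fd' and wait for it.
 *	int flush_async(int fd)
 *	{
 *		struct aiocb cb = { 0 };
 *
 *		cb.aio_fildes = fd;			// only the descriptor matters here
 *		if (aio_fsync(O_SYNC, &cb) != 0)	// O_DSYNC would map to MNT_DWAIT
 *			return -1;
 *		while (aio_error(&cb) == EINPROGRESS)
 *			usleep(1000);
 *		return (int)aio_return(&cb);		// 0 on success, -1 on failure
 *	}
 */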
2512
2513
2514/*
2515 * is_already_queued - runs through our queues to see if the given
2516 * aiocbp / process is there.  Returns TRUE if there is a match
2517 * on any of our aio queues.
2518 *
2519 * Called with proc aio lock held (can be held spin)
2520 */
2521static boolean_t
2522is_already_queued(proc_t procp,
2523					user_addr_t aiocbp )
2524{
2525	aio_workq_entry		 	*entryp;
2526	boolean_t				result;
2527
2528	result = FALSE;
2529
2530	/* look for matches on our queue of async IO requests that have completed */
2531	TAILQ_FOREACH( entryp, &procp->p_aio_doneq, aio_proc_link ) {
2532		if ( aiocbp == entryp->uaiocbp ) {
2533			result = TRUE;
2534			goto ExitThisRoutine;
2535		}
2536	}
2537
2538	/* look for matches on our queue of active async IO requests */
2539	TAILQ_FOREACH( entryp, &procp->p_aio_activeq, aio_proc_link ) {
2540		if ( aiocbp == entryp->uaiocbp ) {
2541			result = TRUE;
2542			goto ExitThisRoutine;
2543		}
2544	}
2545
2546ExitThisRoutine:
2547	return( result );
2548
2549} /* is_already_queued */
2550
2551
2552static void
2553free_lio_context(aio_lio_context* context)
2554{
2555
2556#if DEBUG
2557	OSDecrementAtomic(&lio_contexts_alloced);
2558#endif /* DEBUG */
2559
2560	FREE( context, M_TEMP );
2561
2562} /* free_lio_context */
2563
2564
2565/*
2566 * aio initialization
2567 */
2568__private_extern__ void
2569aio_init( void )
2570{
2571	int			i;
2572
2573	aio_lock_grp_attr = lck_grp_attr_alloc_init();
2574	aio_proc_lock_grp = lck_grp_alloc_init("aio_proc", aio_lock_grp_attr);
2575	aio_entry_lock_grp = lck_grp_alloc_init("aio_entry", aio_lock_grp_attr);
2576	aio_queue_lock_grp = lck_grp_alloc_init("aio_queue", aio_lock_grp_attr);
2577	aio_lock_attr = lck_attr_alloc_init();
2578
2579	lck_mtx_init(&aio_entry_mtx, aio_entry_lock_grp, aio_lock_attr);
2580	lck_mtx_init(&aio_proc_mtx, aio_proc_lock_grp, aio_lock_attr);
2581
2582	aio_anchor.aio_inflight_count = 0;
2583	aio_anchor.aio_done_count = 0;
2584	aio_anchor.aio_total_count = 0;
2585	aio_anchor.aio_num_workqs = AIO_NUM_WORK_QUEUES;
2586
2587	for (i = 0; i < AIO_NUM_WORK_QUEUES; i++) {
2588		aio_workq_init(&aio_anchor.aio_async_workqs[i]);
2589	}
2590
2591
2592	i = sizeof( aio_workq_entry );
2593	aio_workq_zonep = zinit( i, i * aio_max_requests, i * aio_max_requests, "aiowq" );
2594
2595	_aio_create_worker_threads( aio_worker_threads );
2596
2597} /* aio_init */
2598
2599
2600/*
2601 * aio worker threads created here.
2602 */
2603__private_extern__ void
2604_aio_create_worker_threads( int num )
2605{
2606	int			i;
2607
2608	/* create some worker threads to handle the async IO requests */
2609	for ( i = 0; i < num; i++ ) {
2610		thread_t		myThread;
2611
2612		if ( KERN_SUCCESS != kernel_thread_start((thread_continue_t)aio_work_thread, NULL, &myThread) ) {
2613			printf( "%s - failed to create a work thread \n", __FUNCTION__ );
2614		}
2615		else
2616			thread_deallocate(myThread);
2617	}
2618
2619	return;
2620
2621} /* _aio_create_worker_threads */
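
/*
 * The thread count used above (aio_worker_threads) and the request limits
 * checked at submission time are, as far as we know, exposed to user space
 * as the kern.aiothreads, kern.aiomax and kern.aioprocmax sysctls (declared
 * elsewhere in the kernel); a small sketch for inspecting them, with those
 * names assumed rather than taken from this file:
 *
 *	#include <stdio.h>
 *	#include <sys/sysctl.h>
 *
 *	int main(void)
 *	{
 *		const char *names[] = { "kern.aiothreads", "kern.aiomax", "kern.aioprocmax" };
 *
 *		for (int i = 0; i < 3; i++) {
 *			int value = 0;
 *			size_t len = sizeof(value);
 *
 *			if (sysctlbyname(names[i], &value, &len, NULL, 0) == 0)
 *				printf("%s = %d\n", names[i], value);
 *		}
 *		return 0;
 *	}
 */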
2622
2623/*
2624 * Return the aio task associated with the current thread (its uu_aio_task)
2625 */
2626task_t
2627get_aiotask(void)
2628{
2629	return  ((struct uthread *)get_bsdthread_info(current_thread()))->uu_aio_task;
2630}
2631
2632
2633/*
2634 * In the case of an aiocb from a
2635 * 32-bit process we need to expand some longs and pointers to the correct
2636 * sizes in order to let downstream code always work on the same type of
2637 * aiocb (in our case that is a user_aiocb)
2638 */
2639static void
2640do_munge_aiocb_user32_to_user( struct user32_aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp )
2641{
2642	the_user_aiocbp->aio_fildes = my_aiocbp->aio_fildes;
2643	the_user_aiocbp->aio_offset = my_aiocbp->aio_offset;
2644	the_user_aiocbp->aio_buf = CAST_USER_ADDR_T(my_aiocbp->aio_buf);
2645	the_user_aiocbp->aio_nbytes = my_aiocbp->aio_nbytes;
2646	the_user_aiocbp->aio_reqprio = my_aiocbp->aio_reqprio;
2647	the_user_aiocbp->aio_lio_opcode = my_aiocbp->aio_lio_opcode;
2648
2649	/* special case here.  since we do not know if sigev_value is an */
2650	/* int or a ptr we do NOT cast the ptr to a user_addr_t.   This  */
2651	/* means if we send this info back to user space we need to remember */
2652	/* sigev_value was not expanded for the 32-bit case.  */
2653	/* NOTE - this does NOT affect us since we don't support sigev_value */
2654	/* yet in the aio context.  */
2655	//LP64
2656	the_user_aiocbp->aio_sigevent.sigev_notify = my_aiocbp->aio_sigevent.sigev_notify;
2657	the_user_aiocbp->aio_sigevent.sigev_signo = my_aiocbp->aio_sigevent.sigev_signo;
2658	the_user_aiocbp->aio_sigevent.sigev_value.size_equivalent.sival_int =
2659		my_aiocbp->aio_sigevent.sigev_value.sival_int;
2660	the_user_aiocbp->aio_sigevent.sigev_notify_function =
2661		CAST_USER_ADDR_T(my_aiocbp->aio_sigevent.sigev_notify_function);
2662	the_user_aiocbp->aio_sigevent.sigev_notify_attributes =
2663		CAST_USER_ADDR_T(my_aiocbp->aio_sigevent.sigev_notify_attributes);
2664}
2665
2666/* Similar for 64-bit user process, so that we don't need to satisfy
2667 * the alignment constraints of the original user64_aiocb
2668 */
2669static void
2670do_munge_aiocb_user64_to_user( struct user64_aiocb *my_aiocbp, struct user_aiocb *the_user_aiocbp )
2671{
2672	the_user_aiocbp->aio_fildes = my_aiocbp->aio_fildes;
2673	the_user_aiocbp->aio_offset = my_aiocbp->aio_offset;
2674	the_user_aiocbp->aio_buf = my_aiocbp->aio_buf;
2675	the_user_aiocbp->aio_nbytes = my_aiocbp->aio_nbytes;
2676	the_user_aiocbp->aio_reqprio = my_aiocbp->aio_reqprio;
2677	the_user_aiocbp->aio_lio_opcode = my_aiocbp->aio_lio_opcode;
2678
2679	the_user_aiocbp->aio_sigevent.sigev_notify = my_aiocbp->aio_sigevent.sigev_notify;
2680	the_user_aiocbp->aio_sigevent.sigev_signo = my_aiocbp->aio_sigevent.sigev_signo;
2681	the_user_aiocbp->aio_sigevent.sigev_value.size_equivalent.sival_int =
2682		my_aiocbp->aio_sigevent.sigev_value.size_equivalent.sival_int;
2683	the_user_aiocbp->aio_sigevent.sigev_notify_function =
2684		my_aiocbp->aio_sigevent.sigev_notify_function;
2685	the_user_aiocbp->aio_sigevent.sigev_notify_attributes =
2686		my_aiocbp->aio_sigevent.sigev_notify_attributes;
2687}
2688