aio.c revision 4123:e5cb484f034e
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27#pragma ident	"%Z%%M%	%I%	%E% SMI"
28
29/*
30 * Kernel asynchronous I/O.
31 * This is only for raw devices now (as of Nov. 1993).
32 */
33
34#include <sys/types.h>
35#include <sys/errno.h>
36#include <sys/conf.h>
37#include <sys/file.h>
38#include <sys/fs/snode.h>
39#include <sys/unistd.h>
40#include <sys/cmn_err.h>
41#include <vm/as.h>
42#include <vm/faultcode.h>
43#include <sys/sysmacros.h>
44#include <sys/procfs.h>
45#include <sys/kmem.h>
46#include <sys/autoconf.h>
47#include <sys/ddi_impldefs.h>
48#include <sys/sunddi.h>
49#include <sys/aio_impl.h>
50#include <sys/debug.h>
51#include <sys/param.h>
52#include <sys/systm.h>
53#include <sys/vmsystm.h>
54#include <sys/fs/pxfs_ki.h>
55#include <sys/contract/process_impl.h>
56
57/*
58 * external entry point.
59 */
60#ifdef _LP64
61static int64_t kaioc(long, long, long, long, long, long);
62#endif
63static int kaio(ulong_t *, rval_t *);
64
65
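/*
 * run modes passed to the shared helper routines to identify which
 * aiocb flavor the caller handed in: a native aiocb_t (AIO_64), an
 * ILP32 aiocb32_t (AIO_32), or the largefile aiocb64_32_t
 * (AIO_LARGEFILE).
 */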
66#define	AIO_64	0
67#define	AIO_32	1
68#define	AIO_LARGEFILE	2
69
70/*
71 * implementation specific functions (private)
72 */
73#ifdef _LP64
74static int alio(int, aiocb_t **, int, struct sigevent *);
75#endif
76static int aionotify(void);
77static int aioinit(void);
78static int aiostart(void);
79static void alio_cleanup(aio_t *, aiocb_t **, int, int);
80static int (*check_vp(struct vnode *, int))(vnode_t *, struct aio_req *,
81    cred_t *);
82static void lio_set_error(aio_req_t *);
83static aio_t *aio_aiop_alloc();
84static int aio_req_alloc(aio_req_t **, aio_result_t *);
85static int aio_lio_alloc(aio_lio_t **);
86static aio_req_t *aio_req_done(void *);
87static aio_req_t *aio_req_remove(aio_req_t *);
88static int aio_req_find(aio_result_t *, aio_req_t **);
89static int aio_hash_insert(struct aio_req_t *, aio_t *);
90static int aio_req_setup(aio_req_t **, aio_t *, aiocb_t *,
91    aio_result_t *, vnode_t *);
92static int aio_cleanup_thread(aio_t *);
93static aio_lio_t *aio_list_get(aio_result_t *);
94static void lio_set_uerror(void *, int);
95extern void aio_zerolen(aio_req_t *);
96static int aiowait(struct timeval *, int, long	*);
97static int aiowaitn(void *, uint_t, uint_t *, timespec_t *);
98static int aio_unlock_requests(caddr_t iocblist, int iocb_index,
99    aio_req_t *reqlist, aio_t *aiop, model_t model);
100static int aio_reqlist_concat(aio_t *aiop, aio_req_t **reqlist, int max);
101static int aiosuspend(void *, int, struct  timespec *, int,
102    long	*, int);
103static int aliowait(int, void *, int, void *, int);
104static int aioerror(void *, int);
105static int aio_cancel(int, void *, long	*, int);
106static int arw(int, int, char *, int, offset_t, aio_result_t *, int);
107static int aiorw(int, void *, int, int);
108
109static int alioLF(int, void *, int, void *);
110static int aio_req_setupLF(aio_req_t **, aio_t *, aiocb64_32_t *,
111    aio_result_t *, vnode_t *);
112static int alio32(int, void *, int, void *);
113static int driver_aio_write(vnode_t *vp, struct aio_req *aio, cred_t *cred_p);
114static int driver_aio_read(vnode_t *vp, struct aio_req *aio, cred_t *cred_p);
115
116#ifdef  _SYSCALL32_IMPL
117static void aiocb_LFton(aiocb64_32_t *, aiocb_t *);
118void	aiocb_32ton(aiocb32_t *, aiocb_t *);
119#endif /* _SYSCALL32_IMPL */
120
121/*
122 * implementation specific functions (external)
123 */
124void aio_req_free(aio_t *, aio_req_t *);
125
126/*
127 * Event Port framework
128 */
129
130void aio_req_free_port(aio_t *, aio_req_t *);
131static int aio_port_callback(void *, int *, pid_t, int, void *);
132
133/*
134 * This is the loadable module wrapper.
135 */
136#include <sys/modctl.h>
137#include <sys/syscall.h>
138
139#ifdef _LP64
140
141static struct sysent kaio_sysent = {
142	6,
143	SE_NOUNLOAD | SE_64RVAL | SE_ARGC,
144	(int (*)())kaioc
145};
146
147#ifdef _SYSCALL32_IMPL
148static struct sysent kaio_sysent32 = {
149	7,
150	SE_NOUNLOAD | SE_64RVAL,
151	kaio
152};
153#endif  /* _SYSCALL32_IMPL */
154
155#else   /* _LP64 */
156
157static struct sysent kaio_sysent = {
158	7,
159	SE_NOUNLOAD | SE_32RVAL1,
160	kaio
161};
162
163#endif  /* _LP64 */
164
165/*
166 * Module linkage information for the kernel.
167 */
168
169static struct modlsys modlsys = {
170	&mod_syscallops,
171	"kernel Async I/O",
172	&kaio_sysent
173};
174
175#ifdef  _SYSCALL32_IMPL
176static struct modlsys modlsys32 = {
177	&mod_syscallops32,
178	"kernel Async I/O for 32 bit compatibility",
179	&kaio_sysent32
180};
181#endif  /* _SYSCALL32_IMPL */
182
183
184static struct modlinkage modlinkage = {
185	MODREV_1,
186	&modlsys,
187#ifdef  _SYSCALL32_IMPL
188	&modlsys32,
189#endif
190	NULL
191};
192
193int
194_init(void)
195{
196	int retval;
197
198	if ((retval = mod_install(&modlinkage)) != 0)
199		return (retval);
200
201	return (0);
202}
203
204int
205_fini(void)
206{
207	int retval;
208
209	retval = mod_remove(&modlinkage);
210
211	return (retval);
212}
213
214int
215_info(struct modinfo *modinfop)
216{
217	return (mod_info(&modlinkage, modinfop));
218}
219
220#ifdef	_LP64
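/*
 * native syscall entry point on a 64 bit kernel.  the command in a0
 * (minus the AIO_POLL_BIT) selects the handler; on failure errno is
 * set and -1 is returned, otherwise the handler's rval is returned.
 */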
221static int64_t
222kaioc(
223	long	a0,
224	long	a1,
225	long	a2,
226	long	a3,
227	long	a4,
228	long	a5)
229{
230	int	error;
231	long	rval = 0;
232
233	switch ((int)a0 & ~AIO_POLL_BIT) {
234	case AIOREAD:
235		error = arw((int)a0, (int)a1, (char *)a2, (int)a3,
236		    (offset_t)a4, (aio_result_t *)a5, FREAD);
237		break;
238	case AIOWRITE:
239		error = arw((int)a0, (int)a1, (char *)a2, (int)a3,
240		    (offset_t)a4, (aio_result_t *)a5, FWRITE);
241		break;
242	case AIOWAIT:
243		error = aiowait((struct timeval *)a1, (int)a2, &rval);
244		break;
245	case AIOWAITN:
246		error = aiowaitn((void *)a1, (uint_t)a2, (uint_t *)a3,
247		    (timespec_t *)a4);
248		break;
249	case AIONOTIFY:
250		error = aionotify();
251		break;
252	case AIOINIT:
253		error = aioinit();
254		break;
255	case AIOSTART:
256		error = aiostart();
257		break;
258	case AIOLIO:
259		error = alio((int)a1, (aiocb_t **)a2, (int)a3,
260		    (struct sigevent *)a4);
261		break;
262	case AIOLIOWAIT:
263		error = aliowait((int)a1, (void *)a2, (int)a3,
264		    (struct sigevent *)a4, AIO_64);
265		break;
266	case AIOSUSPEND:
267		error = aiosuspend((void *)a1, (int)a2, (timespec_t *)a3,
268		    (int)a4, &rval, AIO_64);
269		break;
270	case AIOERROR:
271		error = aioerror((void *)a1, AIO_64);
272		break;
273	case AIOAREAD:
274		error = aiorw((int)a0, (void *)a1, FREAD, AIO_64);
275		break;
276	case AIOAWRITE:
277		error = aiorw((int)a0, (void *)a1, FWRITE, AIO_64);
278		break;
279	case AIOCANCEL:
280		error = aio_cancel((int)a1, (void *)a2, &rval, AIO_64);
281		break;
282
283	/*
284	 * The large file related stuff is valid only for
285	 * the 32 bit kernel and not for the 64 bit kernel.
286	 * On the 64 bit kernel we convert large file calls
287	 * to regular 64 bit calls.
288	 */
289
290	default:
291		error = EINVAL;
292	}
293	if (error)
294		return ((int64_t)set_errno(error));
295	return (rval);
296}
297#endif
298
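/*
 * entry point used for 32 bit callers (and the only entry point on a
 * 32 bit kernel).  the 64 bit file offset is reassembled from the two
 * 32 bit words in uap[4] and uap[5] before dispatching.
 */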
299static int
300kaio(
301	ulong_t *uap,
302	rval_t *rvp)
303{
304	long rval = 0;
305	int	error = 0;
306	offset_t	off;
307
308
309	rvp->r_vals = 0;
310#if defined(_LITTLE_ENDIAN)
311	off = ((u_offset_t)uap[5] << 32) | (u_offset_t)uap[4];
312#else
313	off = ((u_offset_t)uap[4] << 32) | (u_offset_t)uap[5];
314#endif
315
316	switch (uap[0] & ~AIO_POLL_BIT) {
317	/*
318	 * It must be the 32 bit system call on 64 bit kernel
319	 */
320	case AIOREAD:
321		return (arw((int)uap[0], (int)uap[1], (char *)uap[2],
322		    (int)uap[3], off, (aio_result_t *)uap[6], FREAD));
323	case AIOWRITE:
324		return (arw((int)uap[0], (int)uap[1], (char *)uap[2],
325		    (int)uap[3], off, (aio_result_t *)uap[6], FWRITE));
326	case AIOWAIT:
327		error = aiowait((struct timeval *)uap[1], (int)uap[2],
328		    &rval);
329		break;
330	case AIOWAITN:
331		error = aiowaitn((void *)uap[1], (uint_t)uap[2],
332		    (uint_t *)uap[3], (timespec_t *)uap[4]);
333		break;
334	case AIONOTIFY:
335		return (aionotify());
336	case AIOINIT:
337		return (aioinit());
338	case AIOSTART:
339		return (aiostart());
340	case AIOLIO:
341		return (alio32((int)uap[1], (void *)uap[2], (int)uap[3],
342		    (void *)uap[4]));
343	case AIOLIOWAIT:
344		return (aliowait((int)uap[1], (void *)uap[2],
345		    (int)uap[3], (struct sigevent *)uap[4], AIO_32));
346	case AIOSUSPEND:
347		error = aiosuspend((void *)uap[1], (int)uap[2],
348		    (timespec_t *)uap[3], (int)uap[4],
349		    &rval, AIO_32);
350		break;
351	case AIOERROR:
352		return (aioerror((void *)uap[1], AIO_32));
353	case AIOAREAD:
354		return (aiorw((int)uap[0], (void *)uap[1],
355		    FREAD, AIO_32));
356	case AIOAWRITE:
357		return (aiorw((int)uap[0], (void *)uap[1],
358		    FWRITE, AIO_32));
359	case AIOCANCEL:
360		error = (aio_cancel((int)uap[1], (void *)uap[2], &rval,
361		    AIO_32));
362		break;
363	case AIOLIO64:
364		return (alioLF((int)uap[1], (void *)uap[2],
365		    (int)uap[3], (void *)uap[4]));
366	case AIOLIOWAIT64:
367		return (aliowait((int)uap[1], (void *)uap[2],
368		    (int)uap[3], (void *)uap[4], AIO_LARGEFILE));
369	case AIOSUSPEND64:
370		error = aiosuspend((void *)uap[1], (int)uap[2],
371		    (timespec_t *)uap[3], (int)uap[4], &rval,
372		    AIO_LARGEFILE);
373		break;
374	case AIOERROR64:
375		return (aioerror((void *)uap[1], AIO_LARGEFILE));
376	case AIOAREAD64:
377		return (aiorw((int)uap[0], (void *)uap[1], FREAD,
378		    AIO_LARGEFILE));
379	case AIOAWRITE64:
380		return (aiorw((int)uap[0], (void *)uap[1], FWRITE,
381		    AIO_LARGEFILE));
382	case AIOCANCEL64:
383		error = (aio_cancel((int)uap[1], (void *)uap[2],
384		    &rval, AIO_LARGEFILE));
385		break;
386	default:
387		return (EINVAL);
388	}
389
390	rvp->r_val1 = rval;
391	return (error);
392}
393
394/*
395 * wake up LWPs in this process that are sleeping in
396 * aiowait().
397 */
398static int
399aionotify(void)
400{
401	aio_t	*aiop;
402
403	aiop = curproc->p_aio;
404	if (aiop == NULL)
405		return (0);
406
407	mutex_enter(&aiop->aio_mutex);
408	aiop->aio_notifycnt++;
409	cv_broadcast(&aiop->aio_waitcv);
410	mutex_exit(&aiop->aio_mutex);
411
412	return (0);
413}
414
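/*
 * convert a user-supplied timeval into a relative timestruc_t.
 * a NULL pointer means wait indefinitely, a pointer value of -1 or a
 * zero timeval means don't wait at all; otherwise *rqtp points at the
 * validated relative time and *blocking is set.
 */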
415static int
416timeval2reltime(struct timeval *timout, timestruc_t *rqtime,
417	timestruc_t **rqtp, int *blocking)
418{
419#ifdef	_SYSCALL32_IMPL
420	struct timeval32 wait_time_32;
421#endif
422	struct timeval wait_time;
423	model_t	model = get_udatamodel();
424
425	*rqtp = NULL;
426	if (timout == NULL) {		/* wait indefinitely */
427		*blocking = 1;
428		return (0);
429	}
430
431	/*
432	 * Need to correctly compare with the -1 passed in for a user
433	 * address pointer, with both 32 bit and 64 bit apps.
434	 */
435	if (model == DATAMODEL_NATIVE) {
436		if ((intptr_t)timout == (intptr_t)-1) {	/* don't wait */
437			*blocking = 0;
438			return (0);
439		}
440
441		if (copyin(timout, &wait_time, sizeof (wait_time)))
442			return (EFAULT);
443	}
444#ifdef	_SYSCALL32_IMPL
445	else {
446		/*
447		 * -1 from a 32bit app. It will not get sign extended.
448		 * A -1 from a 32 bit app will not get sign extended.
449		 * don't wait if it is -1.
450		if ((intptr_t)timout == (intptr_t)((uint32_t)-1)) {
451			*blocking = 0;
452			return (0);
453		}
454
455		if (copyin(timout, &wait_time_32, sizeof (wait_time_32)))
456			return (EFAULT);
457		TIMEVAL32_TO_TIMEVAL(&wait_time, &wait_time_32);
458	}
459#endif  /* _SYSCALL32_IMPL */
460
461	if (wait_time.tv_sec == 0 && wait_time.tv_usec == 0) {	/* don't wait */
462		*blocking = 0;
463		return (0);
464	}
465
466	if (wait_time.tv_sec < 0 ||
467	    wait_time.tv_usec < 0 || wait_time.tv_usec >= MICROSEC)
468		return (EINVAL);
469
470	rqtime->tv_sec = wait_time.tv_sec;
471	rqtime->tv_nsec = wait_time.tv_usec * 1000;
472	*rqtp = rqtime;
473	*blocking = 1;
474
475	return (0);
476}
477
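/*
 * same as timeval2reltime() but for a user-supplied timespec; a NULL
 * pointer means wait indefinitely and a zero timespec means don't
 * wait (there is no -1 special case here).
 */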
478static int
479timespec2reltime(timespec_t *timout, timestruc_t *rqtime,
480	timestruc_t **rqtp, int *blocking)
481{
482#ifdef	_SYSCALL32_IMPL
483	timespec32_t wait_time_32;
484#endif
485	model_t	model = get_udatamodel();
486
487	*rqtp = NULL;
488	if (timout == NULL) {
489		*blocking = 1;
490		return (0);
491	}
492
493	if (model == DATAMODEL_NATIVE) {
494		if (copyin(timout, rqtime, sizeof (*rqtime)))
495			return (EFAULT);
496	}
497#ifdef	_SYSCALL32_IMPL
498	else {
499		if (copyin(timout, &wait_time_32, sizeof (wait_time_32)))
500			return (EFAULT);
501		TIMESPEC32_TO_TIMESPEC(rqtime, &wait_time_32);
502	}
503#endif  /* _SYSCALL32_IMPL */
504
505	if (rqtime->tv_sec == 0 && rqtime->tv_nsec == 0) {
506		*blocking = 0;
507		return (0);
508	}
509
510	if (rqtime->tv_sec < 0 ||
511	    rqtime->tv_nsec < 0 || rqtime->tv_nsec >= NANOSEC)
512		return (EINVAL);
513
514	*rqtp = rqtime;
515	*blocking = 1;
516
517	return (0);
518}
519
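/*
 * wait for a single asynchronous request to complete and return its
 * result pointer in *rval.  the timeout semantics come from
 * timeval2reltime(); EINVAL is returned when dontblockflg is set and
 * there is no outstanding aio.
 */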
520/*ARGSUSED*/
521static int
522aiowait(
523	struct timeval	*timout,
524	int	dontblockflg,
525	long	*rval)
526{
527	int 		error;
528	aio_t		*aiop;
529	aio_req_t	*reqp;
530	clock_t		status;
531	int		blocking;
532	int		timecheck;
533	timestruc_t	rqtime;
534	timestruc_t	*rqtp;
535
536	aiop = curproc->p_aio;
537	if (aiop == NULL)
538		return (EINVAL);
539
540	/*
541	 * Establish the absolute future time for the timeout.
542	 */
543	error = timeval2reltime(timout, &rqtime, &rqtp, &blocking);
544	if (error)
545		return (error);
546	if (rqtp) {
547		timestruc_t now;
548		timecheck = timechanged;
549		gethrestime(&now);
550		timespecadd(rqtp, &now);
551	}
552
553	mutex_enter(&aiop->aio_mutex);
554	for (;;) {
555		/* process requests on poll queue */
556		if (aiop->aio_pollq) {
557			mutex_exit(&aiop->aio_mutex);
558			aio_cleanup(0);
559			mutex_enter(&aiop->aio_mutex);
560		}
561		if ((reqp = aio_req_remove(NULL)) != NULL) {
562			*rval = (long)reqp->aio_req_resultp;
563			break;
564		}
565		/* user-level done queue might not be empty */
566		if (aiop->aio_notifycnt > 0) {
567			aiop->aio_notifycnt--;
568			*rval = 1;
569			break;
570		}
571		/* don't block if no outstanding aio */
572		if (aiop->aio_outstanding == 0 && dontblockflg) {
573			error = EINVAL;
574			break;
575		}
576		if (blocking) {
577			status = cv_waituntil_sig(&aiop->aio_waitcv,
578			    &aiop->aio_mutex, rqtp, timecheck);
579
580			if (status > 0)		/* check done queue again */
581				continue;
582			if (status == 0) {	/* interrupted by a signal */
583				error = EINTR;
584				*rval = -1;
585			} else {		/* timer expired */
586				error = ETIME;
587			}
588		}
589		break;
590	}
591	mutex_exit(&aiop->aio_mutex);
592	if (reqp) {
593		aphysio_unlock(reqp);
594		aio_copyout_result(reqp);
595		mutex_enter(&aiop->aio_mutex);
596		aio_req_free(aiop, reqp);
597		mutex_exit(&aiop->aio_mutex);
598	}
599	return (error);
600}
601
602/*
603 * aiowaitn can be used to reap completed asynchronous requests submitted with
604 * lio_listio, aio_read or aio_write.
605 * This function only reaps asynchronous raw I/Os.
606 */
607
608/*ARGSUSED*/
609static int
610aiowaitn(void *uiocb, uint_t nent, uint_t *nwait, timespec_t *timout)
611{
612	int 		error = 0;
613	aio_t		*aiop;
614	aio_req_t	*reqlist = NULL;
615	caddr_t		iocblist = NULL;	/* array of iocb ptr's */
616	uint_t		waitcnt, cnt = 0;	/* iocb cnt */
617	size_t		iocbsz;			/* users iocb size */
618	size_t		riocbsz;		/* returned iocb size */
619	int		iocb_index = 0;
620	model_t		model = get_udatamodel();
621	int		blocking = 1;
622	int		timecheck;
623	timestruc_t	rqtime;
624	timestruc_t	*rqtp;
625
626	aiop = curproc->p_aio;
627	if (aiop == NULL)
628		return (EINVAL);
629
630	if (aiop->aio_outstanding == 0)
631		return (EAGAIN);
632
633	if (copyin(nwait, &waitcnt, sizeof (uint_t)))
634		return (EFAULT);
635
636	/* set *nwait to zero in case we must return prematurely */
637	if (copyout(&cnt, nwait, sizeof (uint_t)))
638		return (EFAULT);
639
640	if (waitcnt == 0) {
641		blocking = 0;
642		rqtp = NULL;
643		waitcnt = nent;
644	} else {
645		error = timespec2reltime(timout, &rqtime, &rqtp, &blocking);
646		if (error)
647			return (error);
648	}
649
650	if (model == DATAMODEL_NATIVE)
651		iocbsz = (sizeof (aiocb_t *) * nent);
652#ifdef	_SYSCALL32_IMPL
653	else
654		iocbsz = (sizeof (caddr32_t) * nent);
655#endif  /* _SYSCALL32_IMPL */
656
657	/*
658	 * Only one aio_waitn call is allowed at a time.
659	 * The active aio_waitn will collect all requests
660	 * out of the "done" list and if necessary it will wait
661	 * for some/all pending requests to fulfill the nwait
662	 * parameter.
663	 * Second and further aio_waitn calls will sleep here
664	 * until the active aio_waitn finishes and leaves the kernel.
665	 * If the second call does not block (poll), then it returns
666	 * immediately with the error code EAGAIN.
667	 * If the second call should block, then sleep here, but
668	 * do not touch the timeout. The timeout starts when this
669	 * aio_waitn-call becomes active.
670	 */
671
672	mutex_enter(&aiop->aio_mutex);
673
674	while (aiop->aio_flags & AIO_WAITN) {
675		if (blocking == 0) {
676			mutex_exit(&aiop->aio_mutex);
677			return (EAGAIN);
678		}
679
680		/* block, no timeout */
681		aiop->aio_flags |= AIO_WAITN_PENDING;
682		if (!cv_wait_sig(&aiop->aio_waitncv, &aiop->aio_mutex)) {
683			mutex_exit(&aiop->aio_mutex);
684			return (EINTR);
685		}
686	}
687
688	/*
689	 * Establish the absolute future time for the timeout.
690	 */
691	if (rqtp) {
692		timestruc_t now;
693		timecheck = timechanged;
694		gethrestime(&now);
695		timespecadd(rqtp, &now);
696	}
697
698	if (iocbsz > aiop->aio_iocbsz && aiop->aio_iocb != NULL) {
699		kmem_free(aiop->aio_iocb, aiop->aio_iocbsz);
700		aiop->aio_iocb = NULL;
701	}
702
703	if (aiop->aio_iocb == NULL) {
704		iocblist = kmem_zalloc(iocbsz, KM_NOSLEEP);
705		if (iocblist == NULL) {
706			mutex_exit(&aiop->aio_mutex);
707			return (ENOMEM);
708		}
709		aiop->aio_iocb = (aiocb_t **)iocblist;
710		aiop->aio_iocbsz = iocbsz;
711	} else {
712		iocblist = (char *)aiop->aio_iocb;
713	}
714
715	aiop->aio_waitncnt = waitcnt;
716	aiop->aio_flags |= AIO_WAITN;
717
718	for (;;) {
719		/* push requests on poll queue to done queue */
720		if (aiop->aio_pollq) {
721			mutex_exit(&aiop->aio_mutex);
722			aio_cleanup(0);
723			mutex_enter(&aiop->aio_mutex);
724		}
725
726		/* check for requests on done queue */
727		if (aiop->aio_doneq) {
728			cnt += aio_reqlist_concat(aiop, &reqlist, nent - cnt);
729			aiop->aio_waitncnt = waitcnt - cnt;
730		}
731
732		/* user-level done queue might not be empty */
733		if (aiop->aio_notifycnt > 0) {
734			aiop->aio_notifycnt--;
735			error = 0;
736			break;
737		}
738
739		/*
740		 * if we are here a second time as a result of timer
741		 * expiration, we reset the error if there are enough
742		 * aiocb's to satisfy the request.
743		 * We also return if all requests are already done
744		 * and we picked up the whole done queue.
745		 */
746
747		if ((cnt >= waitcnt) || (cnt > 0 && aiop->aio_pending == 0 &&
748		    aiop->aio_doneq == NULL)) {
749			error = 0;
750			break;
751		}
752
753		if ((cnt < waitcnt) && blocking) {
754			int rval = cv_waituntil_sig(&aiop->aio_waitcv,
755			    &aiop->aio_mutex, rqtp, timecheck);
756			if (rval > 0)
757				continue;
758			if (rval < 0) {
759				error = ETIME;
760				blocking = 0;
761				continue;
762			}
763			error = EINTR;
764		}
765		break;
766	}
767
768	mutex_exit(&aiop->aio_mutex);
769
770	if (cnt > 0) {
771
772		iocb_index = aio_unlock_requests(iocblist, iocb_index, reqlist,
773		    aiop, model);
774
775		if (model == DATAMODEL_NATIVE)
776			riocbsz = (sizeof (aiocb_t *) * cnt);
777#ifdef	_SYSCALL32_IMPL
778		else
779			riocbsz = (sizeof (caddr32_t) * cnt);
780#endif  /* _SYSCALL32_IMPL */
781
782		if (copyout(iocblist, uiocb, riocbsz) ||
783		    copyout(&cnt, nwait, sizeof (uint_t)))
784			error = EFAULT;
785	}
786
787	if (aiop->aio_iocbsz > AIO_IOCB_MAX) {
788		kmem_free(iocblist, aiop->aio_iocbsz);
789		aiop->aio_iocb = NULL;
790	}
791
792	/* check if there is another thread waiting for execution */
793	mutex_enter(&aiop->aio_mutex);
794	aiop->aio_flags &= ~AIO_WAITN;
795	if (aiop->aio_flags & AIO_WAITN_PENDING) {
796		aiop->aio_flags &= ~AIO_WAITN_PENDING;
797		cv_signal(&aiop->aio_waitncv);
798	}
799	mutex_exit(&aiop->aio_mutex);
800
801	return (error);
802}
803
804/*
805 * aio_unlock_requests
806 * copies out the result of the request as well as the return value.
807 * It builds the list of completed asynchronous requests,
808 * unlocks the allocated memory ranges and
809 * puts the aio request structure back into the free list.
810 */
811
812static int
813aio_unlock_requests(
814	caddr_t	iocblist,
815	int	iocb_index,
816	aio_req_t *reqlist,
817	aio_t	*aiop,
818	model_t	model)
819{
820	aio_req_t	*reqp, *nreqp;
821
822	if (model == DATAMODEL_NATIVE) {
823		for (reqp = reqlist; reqp != NULL;  reqp = nreqp) {
824			(((caddr_t *)iocblist)[iocb_index++]) =
825			    reqp->aio_req_iocb.iocb;
826			nreqp = reqp->aio_req_next;
827			aphysio_unlock(reqp);
828			aio_copyout_result(reqp);
829			mutex_enter(&aiop->aio_mutex);
830			aio_req_free(aiop, reqp);
831			mutex_exit(&aiop->aio_mutex);
832		}
833	}
834#ifdef	_SYSCALL32_IMPL
835	else {
836		for (reqp = reqlist; reqp != NULL;  reqp = nreqp) {
837			((caddr32_t *)iocblist)[iocb_index++] =
838			    reqp->aio_req_iocb.iocb32;
839			nreqp = reqp->aio_req_next;
840			aphysio_unlock(reqp);
841			aio_copyout_result(reqp);
842			mutex_enter(&aiop->aio_mutex);
843			aio_req_free(aiop, reqp);
844			mutex_exit(&aiop->aio_mutex);
845		}
846	}
847#endif	/* _SYSCALL32_IMPL */
848	return (iocb_index);
849}
850
851/*
852 * aio_reqlist_concat
853 * moves "max" elements from the done queue to the reqlist queue and removes
854 * the AIO_DONEQ flag.
855 * - reqlist queue is a simple linked list
856 * - done queue is a double linked list
857 */
858
859static int
860aio_reqlist_concat(aio_t *aiop, aio_req_t **reqlist, int max)
861{
862	aio_req_t *q2, *q2work, *list;
863	int count = 0;
864
865	list = *reqlist;
866	q2 = aiop->aio_doneq;
867	q2work = q2;
868	while (max-- > 0) {
869		q2work->aio_req_flags &= ~AIO_DONEQ;
870		q2work = q2work->aio_req_next;
871		count++;
872		if (q2work == q2)
873			break;
874	}
875
876	if (q2work == q2) {
877		/* the whole done queue was traversed */
878		q2->aio_req_prev->aio_req_next = list;
879		list = q2;
880		aiop->aio_doneq = NULL;
881	} else {
882		/*
883		 * max < number of elements in the doneq
884		 * detach only the required number of elements
885		 * from the doneq
886		 */
887		q2work->aio_req_prev->aio_req_next = list;
888		list = q2;
889
890		aiop->aio_doneq = q2work;
891		q2work->aio_req_prev = q2->aio_req_prev;
892		q2->aio_req_prev->aio_req_next = q2work;
893	}
894	*reqlist = list;
895	return (count);
896}
897
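/*
 * copy in the caller's list of aiocb pointers and wait until at least
 * one of those requests has completed (or a signal arrives or the
 * timeout expires), then reap every completed request that was found.
 */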
898/*ARGSUSED*/
899static int
900aiosuspend(
901	void	*aiocb,
902	int	nent,
903	struct	timespec	*timout,
904	int	flag,
905	long	*rval,
906	int	run_mode)
907{
908	int 		error;
909	aio_t		*aiop;
910	aio_req_t	*reqp, *found, *next;
911	caddr_t		cbplist = NULL;
912	aiocb_t		*cbp, **ucbp;
913#ifdef	_SYSCALL32_IMPL
914	aiocb32_t	*cbp32;
915	caddr32_t	*ucbp32;
916#endif  /* _SYSCALL32_IMPL */
917	aiocb64_32_t	*cbp64;
918	int		rv;
919	int		i;
920	size_t		ssize;
921	model_t		model = get_udatamodel();
922	int		blocking;
923	int		timecheck;
924	timestruc_t	rqtime;
925	timestruc_t	*rqtp;
926
927	aiop = curproc->p_aio;
928	if (aiop == NULL || nent <= 0)
929		return (EINVAL);
930
931	/*
932	 * Establish the absolute future time for the timeout.
933	 */
934	error = timespec2reltime(timout, &rqtime, &rqtp, &blocking);
935	if (error)
936		return (error);
937	if (rqtp) {
938		timestruc_t now;
939		timecheck = timechanged;
940		gethrestime(&now);
941		timespecadd(rqtp, &now);
942	}
943
944	/*
945	 * If we are not blocking and there is no completed IO,
946	 * skip the aiocb copyin.
947	 */
948	if (!blocking && (aiop->aio_pollq == NULL) &&
949	    (aiop->aio_doneq == NULL)) {
950		return (EAGAIN);
951	}
952
953	if (model == DATAMODEL_NATIVE)
954		ssize = (sizeof (aiocb_t *) * nent);
955#ifdef	_SYSCALL32_IMPL
956	else
957		ssize = (sizeof (caddr32_t) * nent);
958#endif  /* _SYSCALL32_IMPL */
959
960	cbplist = kmem_alloc(ssize, KM_NOSLEEP);
961	if (cbplist == NULL)
962		return (ENOMEM);
963
964	if (copyin(aiocb, cbplist, ssize)) {
965		error = EFAULT;
966		goto done;
967	}
968
969	found = NULL;
970	/*
971	 * we need to get the aio_cleanupq_mutex since we call
972	 * aio_req_done().
973	 */
974	mutex_enter(&aiop->aio_cleanupq_mutex);
975	mutex_enter(&aiop->aio_mutex);
976	for (;;) {
977		/* push requests on poll queue to done queue */
978		if (aiop->aio_pollq) {
979			mutex_exit(&aiop->aio_mutex);
980			mutex_exit(&aiop->aio_cleanupq_mutex);
981			aio_cleanup(0);
982			mutex_enter(&aiop->aio_cleanupq_mutex);
983			mutex_enter(&aiop->aio_mutex);
984		}
985		/* check for requests on done queue */
986		if (aiop->aio_doneq) {
987			if (model == DATAMODEL_NATIVE)
988				ucbp = (aiocb_t **)cbplist;
989#ifdef	_SYSCALL32_IMPL
990			else
991				ucbp32 = (caddr32_t *)cbplist;
992#endif  /* _SYSCALL32_IMPL */
993			for (i = 0; i < nent; i++) {
994				if (model == DATAMODEL_NATIVE) {
995					if ((cbp = *ucbp++) == NULL)
996						continue;
997					if (run_mode != AIO_LARGEFILE)
998						reqp = aio_req_done(
999						    &cbp->aio_resultp);
1000					else {
1001						cbp64 = (aiocb64_32_t *)cbp;
1002						reqp = aio_req_done(
1003						    &cbp64->aio_resultp);
1004					}
1005				}
1006#ifdef	_SYSCALL32_IMPL
1007				else {
1008					if (run_mode == AIO_32) {
1009						if ((cbp32 =
1010						    (aiocb32_t *)(uintptr_t)
1011						    *ucbp32++) == NULL)
1012							continue;
1013						reqp = aio_req_done(
1014						    &cbp32->aio_resultp);
1015					} else if (run_mode == AIO_LARGEFILE) {
1016						if ((cbp64 =
1017						    (aiocb64_32_t *)(uintptr_t)
1018						    *ucbp32++) == NULL)
1019							continue;
1020						reqp = aio_req_done(
1021						    &cbp64->aio_resultp);
1022					}
1023
1024				}
1025#endif  /* _SYSCALL32_IMPL */
1026				if (reqp) {
1027					reqp->aio_req_next = found;
1028					found = reqp;
1029				}
1030				if (aiop->aio_doneq == NULL)
1031					break;
1032			}
1033			if (found)
1034				break;
1035		}
1036		if (aiop->aio_notifycnt > 0) {
1037			/*
1038			 * nothing on the kernel's queue. the user
1039			 * has notified the kernel that it has items
1040			 * on a user-level queue.
1041			 */
1042			aiop->aio_notifycnt--;
1043			*rval = 1;
1044			error = 0;
1045			break;
1046		}
1047		/* don't block if nothing is outstanding */
1048		if (aiop->aio_outstanding == 0) {
1049			error = EAGAIN;
1050			break;
1051		}
1052		if (blocking) {
1053			/*
1054			 * drop the aio_cleanupq_mutex as we are
1055			 * going to block.
1056			 */
1057			mutex_exit(&aiop->aio_cleanupq_mutex);
1058			rv = cv_waituntil_sig(&aiop->aio_waitcv,
1059			    &aiop->aio_mutex, rqtp, timecheck);
1060			/*
1061			 * we have to drop aio_mutex and
1062			 * grab it in the right order.
1063			 */
1064			mutex_exit(&aiop->aio_mutex);
1065			mutex_enter(&aiop->aio_cleanupq_mutex);
1066			mutex_enter(&aiop->aio_mutex);
1067			if (rv > 0)	/* check done queue again */
1068				continue;
1069			if (rv == 0)	/* interrupted by a signal */
1070				error = EINTR;
1071			else		/* timer expired */
1072				error = ETIME;
1073		} else {
1074			error = EAGAIN;
1075		}
1076		break;
1077	}
1078	mutex_exit(&aiop->aio_mutex);
1079	mutex_exit(&aiop->aio_cleanupq_mutex);
1080	for (reqp = found; reqp != NULL; reqp = next) {
1081		next = reqp->aio_req_next;
1082		aphysio_unlock(reqp);
1083		aio_copyout_result(reqp);
1084		mutex_enter(&aiop->aio_mutex);
1085		aio_req_free(aiop, reqp);
1086		mutex_exit(&aiop->aio_mutex);
1087	}
1088done:
1089	kmem_free(cbplist, ssize);
1090	return (error);
1091}
1092
1093/*
1094 * initialize aio by allocating an aio_t struct for this
1095 * process.
1096 */
1097static int
1098aioinit(void)
1099{
1100	proc_t *p = curproc;
1101	aio_t *aiop;
1102	mutex_enter(&p->p_lock);
1103	if ((aiop = p->p_aio) == NULL) {
1104		aiop = aio_aiop_alloc();
1105		p->p_aio = aiop;
1106	}
1107	mutex_exit(&p->p_lock);
1108	if (aiop == NULL)
1109		return (ENOMEM);
1110	return (0);
1111}
1112
1113/*
1114 * start a special thread that will clean up after aio requests
1115 * that are preventing a segment from being unmapped. as_unmap()
1116 * blocks until all physio to this segment is completed. this
1117 * doesn't happen until no pages in this segment are
1118 * SOFTLOCKed. Some pages will be SOFTLOCKed when there are aio
1119 * requests still outstanding. this special thread will make sure
1120 * that these SOFTLOCKed pages will eventually be SOFTUNLOCKed.
1121 *
1122 * this function will return an error if the process has only
1123 * one LWP. the assumption is that the caller is a separate LWP
1124 * that remains blocked in the kernel for the life of this process.
1125 */
1126static int
1127aiostart(void)
1128{
1129	proc_t *p = curproc;
1130	aio_t *aiop;
1131	int first, error = 0;
1132
1133	if (p->p_lwpcnt == 1)
1134		return (EDEADLK);
1135	mutex_enter(&p->p_lock);
1136	if ((aiop = p->p_aio) == NULL)
1137		error = EINVAL;
1138	else {
1139		first = aiop->aio_ok;
1140		if (aiop->aio_ok == 0)
1141			aiop->aio_ok = 1;
1142	}
1143	mutex_exit(&p->p_lock);
1144	if (error == 0 && first == 0) {
1145		return (aio_cleanup_thread(aiop));
1146		/* should return only to exit */
1147	}
1148	return (error);
1149}
1150
1151/*
1152 * Associate an aiocb with a port.
1153 * This function is used by aiorw() to associate a transaction with a port.
1154 * Allocate an event port structure (port_alloc_event()) and store the
1155 * delivered user pointer (portnfy_user) in the portkev_user field of the
1156 * port_kevent_t structure.
1157 * The aio_req_portkev pointer in the aio_req_t structure was added to identify
1158 * the port association.
1159 */
1160
1161static int
1162aio_req_assoc_port_rw(port_notify_t *pntfy, aiocb_t *cbp,
1163	aio_req_t *reqp, int event)
1164{
1165	port_kevent_t	*pkevp = NULL;
1166	int		error;
1167
1168	error = port_alloc_event(pntfy->portnfy_port, PORT_ALLOC_DEFAULT,
1169	    PORT_SOURCE_AIO, &pkevp);
1170	if (error) {
1171		if ((error == ENOMEM) || (error == EAGAIN))
1172			error = EAGAIN;
1173		else
1174			error = EINVAL;
1175	} else {
1176		port_init_event(pkevp, (uintptr_t)cbp, pntfy->portnfy_user,
1177		    aio_port_callback, reqp);
1178		pkevp->portkev_events = event;
1179		reqp->aio_req_portkev = pkevp;
1180		reqp->aio_req_port = pntfy->portnfy_port;
1181	}
1182	return (error);
1183}
1184
1185#ifdef _LP64
1186
1187/*
1188 * Asynchronous list IO. A chain of aiocb's are copied in
1189 * one at a time. If the aiocb is invalid, it is skipped.
1190 * For each aiocb, the appropriate driver entry point is
1191 * called. Optimize for the common case where the list
1192 * of requests is to the same file descriptor.
1193 *
1194 * One possible optimization is to define a new driver entry
1195 * point that supports a list of IO requests. Whether this
1196 * improves performance depends somewhat on the driver's
1197 * locking strategy. Processing a list could adversely impact
1198 * the driver's interrupt latency.
1199 */
1200static int
1201alio(
1202	int		mode_arg,
1203	aiocb_t		**aiocb_arg,
1204	int		nent,
1205	struct sigevent	*sigev)
1206{
1207	file_t		*fp;
1208	file_t		*prev_fp = NULL;
1209	int		prev_mode = -1;
1210	struct vnode	*vp;
1211	aio_lio_t	*head;
1212	aio_req_t	*reqp;
1213	aio_t		*aiop;
1214	caddr_t		cbplist;
1215	aiocb_t		cb;
1216	aiocb_t		*aiocb = &cb;
1217	aiocb_t		*cbp;
1218	aiocb_t		**ucbp;
1219	struct sigevent sigevk;
1220	sigqueue_t	*sqp;
1221	int		(*aio_func)();
1222	int		mode;
1223	int		error = 0;
1224	int		aio_errors = 0;
1225	int		i;
1226	size_t		ssize;
1227	int		deadhead = 0;
1228	int		aio_notsupported = 0;
1229	int		lio_head_port;
1230	int		aio_port;
1231	int		aio_thread;
1232	port_kevent_t	*pkevtp = NULL;
1233	port_notify_t	pnotify;
1234	int		event;
1235
1236	aiop = curproc->p_aio;
1237	if (aiop == NULL || nent <= 0 || nent > _AIO_LISTIO_MAX)
1238		return (EINVAL);
1239
1240	ssize = (sizeof (aiocb_t *) * nent);
1241	cbplist = kmem_alloc(ssize, KM_SLEEP);
1242	ucbp = (aiocb_t **)cbplist;
1243
1244	if (copyin(aiocb_arg, cbplist, ssize) ||
1245	    (sigev && copyin(sigev, &sigevk, sizeof (struct sigevent)))) {
1246		kmem_free(cbplist, ssize);
1247		return (EFAULT);
1248	}
1249
1250	/* Event Ports  */
1251	if (sigev &&
1252	    (sigevk.sigev_notify == SIGEV_THREAD ||
1253	    sigevk.sigev_notify == SIGEV_PORT)) {
1254		if (sigevk.sigev_notify == SIGEV_THREAD) {
1255			pnotify.portnfy_port = sigevk.sigev_signo;
1256			pnotify.portnfy_user = sigevk.sigev_value.sival_ptr;
1257		} else if (copyin(sigevk.sigev_value.sival_ptr,
1258		    &pnotify, sizeof (pnotify))) {
1259			kmem_free(cbplist, ssize);
1260			return (EFAULT);
1261		}
1262		error = port_alloc_event(pnotify.portnfy_port,
1263		    PORT_ALLOC_DEFAULT, PORT_SOURCE_AIO, &pkevtp);
1264		if (error) {
1265			if (error == ENOMEM || error == EAGAIN)
1266				error = EAGAIN;
1267			else
1268				error = EINVAL;
1269			kmem_free(cbplist, ssize);
1270			return (error);
1271		}
1272		lio_head_port = pnotify.portnfy_port;
1273	}
1274
1275	/*
1276	 * a list head should be allocated if notification is
1277	 * enabled for this list.
1278	 */
1279	head = NULL;
1280
1281	if (mode_arg == LIO_WAIT || sigev) {
1282		mutex_enter(&aiop->aio_mutex);
1283		error = aio_lio_alloc(&head);
1284		mutex_exit(&aiop->aio_mutex);
1285		if (error)
1286			goto done;
1287		deadhead = 1;
1288		head->lio_nent = nent;
1289		head->lio_refcnt = nent;
1290		head->lio_port = -1;
1291		head->lio_portkev = NULL;
1292		if (sigev && sigevk.sigev_notify == SIGEV_SIGNAL &&
1293		    sigevk.sigev_signo > 0 && sigevk.sigev_signo < NSIG) {
1294			sqp = kmem_zalloc(sizeof (sigqueue_t), KM_NOSLEEP);
1295			if (sqp == NULL) {
1296				error = EAGAIN;
1297				goto done;
1298			}
1299			sqp->sq_func = NULL;
1300			sqp->sq_next = NULL;
1301			sqp->sq_info.si_code = SI_ASYNCIO;
1302			sqp->sq_info.si_pid = curproc->p_pid;
1303			sqp->sq_info.si_ctid = PRCTID(curproc);
1304			sqp->sq_info.si_zoneid = getzoneid();
1305			sqp->sq_info.si_uid = crgetuid(curproc->p_cred);
1306			sqp->sq_info.si_signo = sigevk.sigev_signo;
1307			sqp->sq_info.si_value = sigevk.sigev_value;
1308			head->lio_sigqp = sqp;
1309		} else {
1310			head->lio_sigqp = NULL;
1311		}
1312		if (pkevtp) {
1313			/*
1314			 * Prepare data to send when the list of aiocb's
1315			 * has completed.
1316			 */
1317			port_init_event(pkevtp, (uintptr_t)sigev,
1318			    (void *)(uintptr_t)pnotify.portnfy_user,
1319			    NULL, head);
1320			pkevtp->portkev_events = AIOLIO;
1321			head->lio_portkev = pkevtp;
1322			head->lio_port = pnotify.portnfy_port;
1323		}
1324	}
1325
1326	for (i = 0; i < nent; i++, ucbp++) {
1327
1328		cbp = *ucbp;
1329		/* skip entry if it can't be copied. */
1330		if (cbp == NULL || copyin(cbp, aiocb, sizeof (*aiocb))) {
1331			if (head) {
1332				mutex_enter(&aiop->aio_mutex);
1333				head->lio_nent--;
1334				head->lio_refcnt--;
1335				mutex_exit(&aiop->aio_mutex);
1336			}
1337			continue;
1338		}
1339
1340		/* skip if opcode for aiocb is LIO_NOP */
1341		mode = aiocb->aio_lio_opcode;
1342		if (mode == LIO_NOP) {
1343			cbp = NULL;
1344			if (head) {
1345				mutex_enter(&aiop->aio_mutex);
1346				head->lio_nent--;
1347				head->lio_refcnt--;
1348				mutex_exit(&aiop->aio_mutex);
1349			}
1350			continue;
1351		}
1352
1353		/* increment file descriptor's ref count. */
1354		if ((fp = getf(aiocb->aio_fildes)) == NULL) {
1355			lio_set_uerror(&cbp->aio_resultp, EBADF);
1356			if (head) {
1357				mutex_enter(&aiop->aio_mutex);
1358				head->lio_nent--;
1359				head->lio_refcnt--;
1360				mutex_exit(&aiop->aio_mutex);
1361			}
1362			aio_errors++;
1363			continue;
1364		}
1365
1366		/*
1367		 * check that the partition was opened with the requested access mode
1368		 */
1369		if ((fp->f_flag & mode) == 0) {
1370			releasef(aiocb->aio_fildes);
1371			lio_set_uerror(&cbp->aio_resultp, EBADF);
1372			if (head) {
1373				mutex_enter(&aiop->aio_mutex);
1374				head->lio_nent--;
1375				head->lio_refcnt--;
1376				mutex_exit(&aiop->aio_mutex);
1377			}
1378			aio_errors++;
1379			continue;
1380		}
1381
1382		/*
1383		 * common case where requests are to the same fd
1384		 * for the same r/w operation.
1385		 * for UFS, need to set EBADFD
1386		 */
1387		vp = fp->f_vnode;
1388		if (fp != prev_fp || mode != prev_mode) {
1389			aio_func = check_vp(vp, mode);
1390			if (aio_func == NULL) {
1391				prev_fp = NULL;
1392				releasef(aiocb->aio_fildes);
1393				lio_set_uerror(&cbp->aio_resultp, EBADFD);
1394				aio_notsupported++;
1395				if (head) {
1396					mutex_enter(&aiop->aio_mutex);
1397					head->lio_nent--;
1398					head->lio_refcnt--;
1399					mutex_exit(&aiop->aio_mutex);
1400				}
1401				continue;
1402			} else {
1403				prev_fp = fp;
1404				prev_mode = mode;
1405			}
1406		}
1407
1408		error = aio_req_setup(&reqp, aiop, aiocb,
1409		    &cbp->aio_resultp, vp);
1410		if (error) {
1411			releasef(aiocb->aio_fildes);
1412			lio_set_uerror(&cbp->aio_resultp, error);
1413			if (head) {
1414				mutex_enter(&aiop->aio_mutex);
1415				head->lio_nent--;
1416				head->lio_refcnt--;
1417				mutex_exit(&aiop->aio_mutex);
1418			}
1419			aio_errors++;
1420			continue;
1421		}
1422
1423		reqp->aio_req_lio = head;
1424		deadhead = 0;
1425
1426		/*
1427		 * Set the errno field now before sending the request to
1428		 * the driver to avoid a race condition
1429		 */
1430		(void) suword32(&cbp->aio_resultp.aio_errno,
1431		    EINPROGRESS);
1432
1433		reqp->aio_req_iocb.iocb = (caddr_t)cbp;
1434
1435		event = (mode == LIO_READ)? AIOAREAD : AIOAWRITE;
1436		aio_port = (aiocb->aio_sigevent.sigev_notify == SIGEV_PORT);
1437		aio_thread = (aiocb->aio_sigevent.sigev_notify == SIGEV_THREAD);
1438		if (aio_port | aio_thread) {
1439			port_kevent_t *lpkevp;
1440			/*
1441			 * Prepare data to send with each aiocb completed.
1442			 */
1443			if (aio_port) {
1444				void *paddr =
1445				    aiocb->aio_sigevent.sigev_value.sival_ptr;
1446				if (copyin(paddr, &pnotify, sizeof (pnotify)))
1447					error = EFAULT;
1448			} else {	/* aio_thread */
1449				pnotify.portnfy_port =
1450				    aiocb->aio_sigevent.sigev_signo;
1451				pnotify.portnfy_user =
1452				    aiocb->aio_sigevent.sigev_value.sival_ptr;
1453			}
1454			if (error)
1455				/* EMPTY */;
1456			else if (pkevtp != NULL &&
1457			    pnotify.portnfy_port == lio_head_port)
1458				error = port_dup_event(pkevtp, &lpkevp,
1459				    PORT_ALLOC_DEFAULT);
1460			else
1461				error = port_alloc_event(pnotify.portnfy_port,
1462				    PORT_ALLOC_DEFAULT, PORT_SOURCE_AIO,
1463				    &lpkevp);
1464			if (error == 0) {
1465				port_init_event(lpkevp, (uintptr_t)cbp,
1466				    (void *)(uintptr_t)pnotify.portnfy_user,
1467				    aio_port_callback, reqp);
1468				lpkevp->portkev_events = event;
1469				reqp->aio_req_portkev = lpkevp;
1470				reqp->aio_req_port = pnotify.portnfy_port;
1471			}
1472		}
1473
1474		/*
1475		 * send the request to driver.
1476		 */
1477		if (error == 0) {
1478			if (aiocb->aio_nbytes == 0) {
1479				clear_active_fd(aiocb->aio_fildes);
1480				aio_zerolen(reqp);
1481				continue;
1482			}
1483			error = (*aio_func)(vp, (aio_req_t *)&reqp->aio_req,
1484			    CRED());
1485		}
1486
1487		/*
1488		 * the fd's ref count is not decremented until the IO has
1489		 * completed unless there was an error.
1490		 */
1491		if (error) {
1492			releasef(aiocb->aio_fildes);
1493			lio_set_uerror(&cbp->aio_resultp, error);
1494			if (head) {
1495				mutex_enter(&aiop->aio_mutex);
1496				head->lio_nent--;
1497				head->lio_refcnt--;
1498				mutex_exit(&aiop->aio_mutex);
1499			}
1500			if (error == ENOTSUP)
1501				aio_notsupported++;
1502			else
1503				aio_errors++;
1504			lio_set_error(reqp);
1505		} else {
1506			clear_active_fd(aiocb->aio_fildes);
1507		}
1508	}
1509
1510	if (aio_notsupported) {
1511		error = ENOTSUP;
1512	} else if (aio_errors) {
1513		/*
1514		 * return EIO if any request failed
1515		 */
1516		error = EIO;
1517	}
1518
1519	if (mode_arg == LIO_WAIT) {
1520		mutex_enter(&aiop->aio_mutex);
1521		while (head->lio_refcnt > 0) {
1522			if (!cv_wait_sig(&head->lio_notify, &aiop->aio_mutex)) {
1523				mutex_exit(&aiop->aio_mutex);
1524				error = EINTR;
1525				goto done;
1526			}
1527		}
1528		mutex_exit(&aiop->aio_mutex);
1529		alio_cleanup(aiop, (aiocb_t **)cbplist, nent, AIO_64);
1530	}
1531
1532done:
1533	kmem_free(cbplist, ssize);
1534	if (deadhead) {
1535		if (head->lio_sigqp)
1536			kmem_free(head->lio_sigqp, sizeof (sigqueue_t));
1537		if (head->lio_portkev)
1538			port_free_event(head->lio_portkev);
1539		kmem_free(head, sizeof (aio_lio_t));
1540	}
1541	return (error);
1542}
1543
1544#endif /* _LP64 */
1545
1546/*
1547 * Asynchronous list IO.
1548 * If list I/O is called with LIO_WAIT it can still return
1549 * before all the I/O's are completed if a signal is caught
1550 * or if the list includes UFS I/O requests. If this happens,
1551 * libaio will call aliowait() to wait for the I/O's to
1552 * complete
1553 */
1554/*ARGSUSED*/
1555static int
1556aliowait(
1557	int	mode,
1558	void	*aiocb,
1559	int	nent,
1560	void	*sigev,
1561	int	run_mode)
1562{
1563	aio_lio_t	*head;
1564	aio_t		*aiop;
1565	caddr_t		cbplist;
1566	aiocb_t		*cbp, **ucbp;
1567#ifdef	_SYSCALL32_IMPL
1568	aiocb32_t	*cbp32;
1569	caddr32_t	*ucbp32;
1570	aiocb64_32_t	*cbp64;
1571#endif
1572	int		error = 0;
1573	int		i;
1574	size_t		ssize = 0;
1575	model_t		model = get_udatamodel();
1576
1577	aiop = curproc->p_aio;
1578	if (aiop == NULL || nent <= 0 || nent > _AIO_LISTIO_MAX)
1579		return (EINVAL);
1580
1581	if (model == DATAMODEL_NATIVE)
1582		ssize = (sizeof (aiocb_t *) * nent);
1583#ifdef	_SYSCALL32_IMPL
1584	else
1585		ssize = (sizeof (caddr32_t) * nent);
1586#endif  /* _SYSCALL32_IMPL */
1587
1588	if (ssize == 0)
1589		return (EINVAL);
1590
1591	cbplist = kmem_alloc(ssize, KM_SLEEP);
1592
1593	if (model == DATAMODEL_NATIVE)
1594		ucbp = (aiocb_t **)cbplist;
1595#ifdef	_SYSCALL32_IMPL
1596	else
1597		ucbp32 = (caddr32_t *)cbplist;
1598#endif  /* _SYSCALL32_IMPL */
1599
1600	if (copyin(aiocb, cbplist, ssize)) {
1601		error = EFAULT;
1602		goto done;
1603	}
1604
1605	/*
1606	 * To find the list head, we go through the
1607	 * list of aiocb structs, find the request
1608	 * it is for, then get the list head that reqp
1609	 * points to.
1610	 */
1611	head = NULL;
1612
1613	for (i = 0; i < nent; i++) {
1614		if (model == DATAMODEL_NATIVE) {
1615			/*
1616			 * Since we are only checking for a NULL pointer,
1617			 * the following should work on both native data sizes
1618			 * as well as for largefile aiocb.
1619			 */
1620			if ((cbp = *ucbp++) == NULL)
1621				continue;
1622			if (run_mode != AIO_LARGEFILE) {
1623				if (head = aio_list_get(&cbp->aio_resultp))
1624					break;
1625			} else {
1626				/*
1627				 * This is the case when a largefile call is
1628				 * made on a 32 bit kernel.
1629				 * Treat each pointer as a pointer to an
1630				 * aiocb64_32.
1631				 */
1632				if (head = aio_list_get((aio_result_t *)
1633				    &(((aiocb64_32_t *)cbp)->aio_resultp)))
1634					break;
1635			}
1636		}
1637#ifdef	_SYSCALL32_IMPL
1638		else {
1639			if (run_mode == AIO_LARGEFILE) {
1640				if ((cbp64 = (aiocb64_32_t *)
1641				    (uintptr_t)*ucbp32++) == NULL)
1642					continue;
1643				if (head = aio_list_get((aio_result_t *)
1644				    &cbp64->aio_resultp))
1645					break;
1646			} else if (run_mode == AIO_32) {
1647				if ((cbp32 = (aiocb32_t *)
1648				    (uintptr_t)*ucbp32++) == NULL)
1649					continue;
1650				if (head = aio_list_get((aio_result_t *)
1651				    &cbp32->aio_resultp))
1652					break;
1653			}
1654		}
1655#endif	/* _SYSCALL32_IMPL */
1656	}
1657
1658	if (head == NULL) {
1659		error = EINVAL;
1660		goto done;
1661	}
1662
1663	mutex_enter(&aiop->aio_mutex);
1664	while (head->lio_refcnt > 0) {
1665		if (!cv_wait_sig(&head->lio_notify, &aiop->aio_mutex)) {
1666			mutex_exit(&aiop->aio_mutex);
1667			error = EINTR;
1668			goto done;
1669		}
1670	}
1671	mutex_exit(&aiop->aio_mutex);
1672	alio_cleanup(aiop, (aiocb_t **)cbplist, nent, run_mode);
1673done:
1674	kmem_free(cbplist, ssize);
1675	return (error);
1676}
1677
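/*
 * look up the list IO head (aio_lio_t), if any, that the request
 * identified by resultp belongs to, using the per-process aio hash.
 */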
1678aio_lio_t *
1679aio_list_get(aio_result_t *resultp)
1680{
1681	aio_lio_t	*head = NULL;
1682	aio_t		*aiop;
1683	aio_req_t 	**bucket;
1684	aio_req_t 	*reqp;
1685	long		index;
1686
1687	aiop = curproc->p_aio;
1688	if (aiop == NULL)
1689		return (NULL);
1690
1691	if (resultp) {
1692		index = AIO_HASH(resultp);
1693		bucket = &aiop->aio_hash[index];
1694		for (reqp = *bucket; reqp != NULL;
1695		    reqp = reqp->aio_hash_next) {
1696			if (reqp->aio_req_resultp == resultp) {
1697				head = reqp->aio_req_lio;
1698				return (head);
1699			}
1700		}
1701	}
1702	return (NULL);
1703}
1704
1705
1706static void
1707lio_set_uerror(void *resultp, int error)
1708{
1709	/*
1710	 * the resultp field is a pointer to where the
1711	 * error should be written out to the user's
1712	 * aiocb.
1714	 */
1715	if (get_udatamodel() == DATAMODEL_NATIVE) {
1716		(void) sulword(&((aio_result_t *)resultp)->aio_return,
1717		    (ssize_t)-1);
1718		(void) suword32(&((aio_result_t *)resultp)->aio_errno, error);
1719	}
1720#ifdef	_SYSCALL32_IMPL
1721	else {
1722		(void) suword32(&((aio_result32_t *)resultp)->aio_return,
1723		    (uint_t)-1);
1724		(void) suword32(&((aio_result32_t *)resultp)->aio_errno, error);
1725	}
1726#endif  /* _SYSCALL32_IMPL */
1727}
1728
1729/*
1730 * do cleanup completion for all requests in the list. memory for
1731 * each request is also freed.
1732 */
1733static void
1734alio_cleanup(aio_t *aiop, aiocb_t **cbp, int nent, int run_mode)
1735{
1736	int i;
1737	aio_req_t *reqp;
1738	aio_result_t *resultp;
1739	aiocb64_32_t *aiocb_64;
1740
1741	for (i = 0; i < nent; i++) {
1742		if (get_udatamodel() == DATAMODEL_NATIVE) {
1743			if (cbp[i] == NULL)
1744				continue;
1745			if (run_mode == AIO_LARGEFILE) {
1746				aiocb_64 = (aiocb64_32_t *)cbp[i];
1747				resultp = (aio_result_t *)
1748				    &aiocb_64->aio_resultp;
1749			} else
1750				resultp = &cbp[i]->aio_resultp;
1751		}
1752#ifdef	_SYSCALL32_IMPL
1753		else {
1754			aiocb32_t *aiocb_32;
1755			caddr32_t *cbp32;
1756
1757			cbp32 = (caddr32_t *)cbp;
1758			if (cbp32[i] == NULL)
1759				continue;
1760			if (run_mode == AIO_32) {
1761				aiocb_32 = (aiocb32_t *)(uintptr_t)cbp32[i];
1762				resultp = (aio_result_t *)&aiocb_32->
1763				    aio_resultp;
1764			} else if (run_mode == AIO_LARGEFILE) {
1765				aiocb_64 = (aiocb64_32_t *)(uintptr_t)cbp32[i];
1766				resultp = (aio_result_t *)&aiocb_64->
1767				    aio_resultp;
1768			}
1769		}
1770#endif  /* _SYSCALL32_IMPL */
1771		/*
1772		 * we need to get the aio_cleanupq_mutex since we call
1773		 * aio_req_done().
1774		 */
1775		mutex_enter(&aiop->aio_cleanupq_mutex);
1776		mutex_enter(&aiop->aio_mutex);
1777		reqp = aio_req_done(resultp);
1778		mutex_exit(&aiop->aio_mutex);
1779		mutex_exit(&aiop->aio_cleanupq_mutex);
1780		if (reqp != NULL) {
1781			aphysio_unlock(reqp);
1782			aio_copyout_result(reqp);
1783			mutex_enter(&aiop->aio_mutex);
1784			aio_req_free(aiop, reqp);
1785			mutex_exit(&aiop->aio_mutex);
1786		}
1787	}
1788}
1789
1790/*
1791 * Write out the results for an aio request that is done.
1792 */
1793static int
1794aioerror(void *cb, int run_mode)
1795{
1796	aio_result_t *resultp;
1797	aio_t *aiop;
1798	aio_req_t *reqp;
1799	int retval;
1800
1801	aiop = curproc->p_aio;
1802	if (aiop == NULL || cb == NULL)
1803		return (EINVAL);
1804
1805	if (get_udatamodel() == DATAMODEL_NATIVE) {
1806		if (run_mode == AIO_LARGEFILE)
1807			resultp = (aio_result_t *)&((aiocb64_32_t *)cb)->
1808			    aio_resultp;
1809		else
1810			resultp = &((aiocb_t *)cb)->aio_resultp;
1811	}
1812#ifdef	_SYSCALL32_IMPL
1813	else {
1814		if (run_mode == AIO_LARGEFILE)
1815			resultp = (aio_result_t *)&((aiocb64_32_t *)cb)->
1816			    aio_resultp;
1817		else if (run_mode == AIO_32)
1818			resultp = (aio_result_t *)&((aiocb32_t *)cb)->
1819			    aio_resultp;
1820	}
1821#endif  /* _SYSCALL32_IMPL */
1822	/*
1823	 * we need to get the aio_cleanupq_mutex since we call
1824	 * aio_req_find().
1825	 */
1826	mutex_enter(&aiop->aio_cleanupq_mutex);
1827	mutex_enter(&aiop->aio_mutex);
1828	retval = aio_req_find(resultp, &reqp);
1829	mutex_exit(&aiop->aio_mutex);
1830	mutex_exit(&aiop->aio_cleanupq_mutex);
1831	if (retval == 0) {
1832		aphysio_unlock(reqp);
1833		aio_copyout_result(reqp);
1834		mutex_enter(&aiop->aio_mutex);
1835		aio_req_free(aiop, reqp);
1836		mutex_exit(&aiop->aio_mutex);
1837		return (0);
1838	} else if (retval == 1)
1839		return (EINPROGRESS);
1840	else if (retval == 2)
1841		return (EINVAL);
1842	return (0);
1843}
1844
1845/*
1846 * 	aio_cancel - if no requests outstanding,
1847 *			return AIO_ALLDONE
1848 *			else
1849 *			return AIO_NOTCANCELED
1850 */
1851static int
1852aio_cancel(
1853	int	fildes,
1854	void 	*cb,
1855	long	*rval,
1856	int	run_mode)
1857{
1858	aio_t *aiop;
1859	void *resultp;
1860	int index;
1861	aio_req_t **bucket;
1862	aio_req_t *ent;
1863
1864
1865	/*
1866	 * Verify valid file descriptor
1867	 */
1868	if ((getf(fildes)) == NULL) {
1869		return (EBADF);
1870	}
1871	releasef(fildes);
1872
1873	aiop = curproc->p_aio;
1874	if (aiop == NULL)
1875		return (EINVAL);
1876
1877	if (aiop->aio_outstanding == 0) {
1878		*rval = AIO_ALLDONE;
1879		return (0);
1880	}
1881
1882	mutex_enter(&aiop->aio_mutex);
1883	if (cb != NULL) {
1884		if (get_udatamodel() == DATAMODEL_NATIVE) {
1885			if (run_mode == AIO_LARGEFILE)
1886				resultp = (aio_result_t *)&((aiocb64_32_t *)cb)
1887				    ->aio_resultp;
1888			else
1889				resultp = &((aiocb_t *)cb)->aio_resultp;
1890		}
1891#ifdef	_SYSCALL32_IMPL
1892		else {
1893			if (run_mode == AIO_LARGEFILE)
1894				resultp = (aio_result_t *)&((aiocb64_32_t *)cb)
1895				    ->aio_resultp;
1896			else if (run_mode == AIO_32)
1897				resultp = (aio_result_t *)&((aiocb32_t *)cb)
1898				    ->aio_resultp;
1899		}
1900#endif  /* _SYSCALL32_IMPL */
1901		index = AIO_HASH(resultp);
1902		bucket = &aiop->aio_hash[index];
1903		for (ent = *bucket; ent != NULL; ent = ent->aio_hash_next) {
1904			if (ent->aio_req_resultp == resultp) {
1905				if ((ent->aio_req_flags & AIO_PENDING) == 0) {
1906					mutex_exit(&aiop->aio_mutex);
1907					*rval = AIO_ALLDONE;
1908					return (0);
1909				}
1910				mutex_exit(&aiop->aio_mutex);
1911				*rval = AIO_NOTCANCELED;
1912				return (0);
1913			}
1914		}
1915		mutex_exit(&aiop->aio_mutex);
1916		*rval = AIO_ALLDONE;
1917		return (0);
1918	}
1919
1920	for (index = 0; index < AIO_HASHSZ; index++) {
1921		bucket = &aiop->aio_hash[index];
1922		for (ent = *bucket; ent != NULL; ent = ent->aio_hash_next) {
1923			if (ent->aio_req_fd == fildes) {
1924				if ((ent->aio_req_flags & AIO_PENDING) != 0) {
1925					mutex_exit(&aiop->aio_mutex);
1926					*rval = AIO_NOTCANCELED;
1927					return (0);
1928				}
1929			}
1930		}
1931	}
1932	mutex_exit(&aiop->aio_mutex);
1933	*rval = AIO_ALLDONE;
1934	return (0);
1935}
1936
1937/*
1938 * solaris version of asynchronous read and write
1939 */
1940static int
1941arw(
1942	int	opcode,
1943	int	fdes,
1944	char	*bufp,
1945	int	bufsize,
1946	offset_t	offset,
1947	aio_result_t	*resultp,
1948	int		mode)
1949{
1950	file_t		*fp;
1951	int		error;
1952	struct vnode	*vp;
1953	aio_req_t	*reqp;
1954	aio_t		*aiop;
1955	int		(*aio_func)();
1956#ifdef _LP64
1957	aiocb_t		aiocb;
1958#else
1959	aiocb64_32_t	aiocb64;
1960#endif
1961
1962	aiop = curproc->p_aio;
1963	if (aiop == NULL)
1964		return (EINVAL);
1965
1966	if ((fp = getf(fdes)) == NULL) {
1967		return (EBADF);
1968	}
1969
1970	/*
1971	 * check that the partition was opened with the requested access mode
1972	 */
1973	if ((fp->f_flag & mode) == 0) {
1974		releasef(fdes);
1975		return (EBADF);
1976	}
1977
1978	vp = fp->f_vnode;
1979	aio_func = check_vp(vp, mode);
1980	if (aio_func == NULL) {
1981		releasef(fdes);
1982		return (EBADFD);
1983	}
1984#ifdef _LP64
1985	aiocb.aio_fildes = fdes;
1986	aiocb.aio_buf = bufp;
1987	aiocb.aio_nbytes = bufsize;
1988	aiocb.aio_offset = offset;
1989	aiocb.aio_sigevent.sigev_notify = 0;
1990	error = aio_req_setup(&reqp, aiop, &aiocb, resultp, vp);
1991#else
1992	aiocb64.aio_fildes = fdes;
1993	aiocb64.aio_buf = (caddr32_t)bufp;
1994	aiocb64.aio_nbytes = bufsize;
1995	aiocb64.aio_offset = offset;
1996	aiocb64.aio_sigevent.sigev_notify = 0;
1997	error = aio_req_setupLF(&reqp, aiop, &aiocb64, resultp, vp);
1998#endif
1999	if (error) {
2000		releasef(fdes);
2001		return (error);
2002	}
2003
2004	/*
2005	 * enable polling on this request if the opcode has
2006	 * the AIO poll bit set
2007	 */
2008	if (opcode & AIO_POLL_BIT)
2009		reqp->aio_req_flags |= AIO_POLL;
2010
2011	if (bufsize == 0) {
2012		clear_active_fd(fdes);
2013		aio_zerolen(reqp);
2014		return (0);
2015	}
2016	/*
2017	 * send the request to driver.
2018	 */
2019	error = (*aio_func)(vp, (aio_req_t *)&reqp->aio_req, CRED());
2020	/*
2021	 * the fd is stored in the aio_req_t by aio_req_setup(), and
2022	 * is released by the aio_cleanup_thread() when the IO has
2023	 * completed.
2024	 */
2025	if (error) {
2026		releasef(fdes);
2027		mutex_enter(&aiop->aio_mutex);
2028		aio_req_free(aiop, reqp);
2029		aiop->aio_pending--;
2030		if (aiop->aio_flags & AIO_REQ_BLOCK)
2031			cv_signal(&aiop->aio_cleanupcv);
2032		mutex_exit(&aiop->aio_mutex);
2033		return (error);
2034	}
2035	clear_active_fd(fdes);
2036	return (0);
2037}
2038
2039/*
2040 * posix version of asynchronous read and write
2041 */
2042static int
2043aiorw(
2044	int		opcode,
2045	void		*aiocb_arg,
2046	int		mode,
2047	int		run_mode)
2048{
2049#ifdef _SYSCALL32_IMPL
2050	aiocb32_t	aiocb32;
2051	struct	sigevent32 *sigev32;
2052	port_notify32_t	pntfy32;
2053#endif
2054	aiocb64_32_t	aiocb64;
2055	aiocb_t		aiocb;
2056	file_t		*fp;
2057	int		error, fd;
2058	size_t		bufsize;
2059	struct vnode	*vp;
2060	aio_req_t	*reqp;
2061	aio_t		*aiop;
2062	int		(*aio_func)();
2063	aio_result_t	*resultp;
2064	struct	sigevent *sigev;
2065	model_t		model;
2066	int		aio_use_port = 0;
2067	port_notify_t	pntfy;
2068
2069	model = get_udatamodel();
2070	aiop = curproc->p_aio;
2071	if (aiop == NULL)
2072		return (EINVAL);
2073
2074	if (model == DATAMODEL_NATIVE) {
2075		if (run_mode != AIO_LARGEFILE) {
2076			if (copyin(aiocb_arg, &aiocb, sizeof (aiocb_t)))
2077				return (EFAULT);
2078			bufsize = aiocb.aio_nbytes;
2079			resultp = &(((aiocb_t *)aiocb_arg)->aio_resultp);
2080			if ((fp = getf(fd = aiocb.aio_fildes)) == NULL) {
2081				return (EBADF);
2082			}
2083			sigev = &aiocb.aio_sigevent;
2084		} else {
2085			/*
2086			 * We come here only when we make a largefile
2087			 * call on a 32 bit kernel using the 32 bit library.
2088			 */
2089			if (copyin(aiocb_arg, &aiocb64, sizeof (aiocb64_32_t)))
2090				return (EFAULT);
2091			bufsize = aiocb64.aio_nbytes;
2092			resultp = (aio_result_t *)&(((aiocb64_32_t *)aiocb_arg)
2093			    ->aio_resultp);
2094			if ((fp = getf(fd = aiocb64.aio_fildes)) == NULL)
2095				return (EBADF);
2096			sigev = (struct sigevent *)&aiocb64.aio_sigevent;
2097		}
2098
2099		if (sigev->sigev_notify == SIGEV_PORT) {
2100			if (copyin((void *)sigev->sigev_value.sival_ptr,
2101			    &pntfy, sizeof (port_notify_t))) {
2102				releasef(fd);
2103				return (EFAULT);
2104			}
2105			aio_use_port = 1;
2106		} else if (sigev->sigev_notify == SIGEV_THREAD) {
2107			pntfy.portnfy_port = aiocb.aio_sigevent.sigev_signo;
2108			pntfy.portnfy_user =
2109			    aiocb.aio_sigevent.sigev_value.sival_ptr;
2110			aio_use_port = 1;
2111		}
2112	}
2113#ifdef	_SYSCALL32_IMPL
2114	else {
2115		if (run_mode == AIO_32) {
2116			/* 32 bit system call is being made on 64 bit kernel */
2117			if (copyin(aiocb_arg, &aiocb32, sizeof (aiocb32_t)))
2118				return (EFAULT);
2119
2120			bufsize = aiocb32.aio_nbytes;
2121			aiocb_32ton(&aiocb32, &aiocb);
2122			resultp = (aio_result_t *)&(((aiocb32_t *)aiocb_arg)->
2123			    aio_resultp);
2124			if ((fp = getf(fd = aiocb32.aio_fildes)) == NULL) {
2125				return (EBADF);
2126			}
2127			sigev32 = &aiocb32.aio_sigevent;
2128		} else if (run_mode == AIO_LARGEFILE) {
2129			/*
2130			 * We come here only when we make a largefile
2131			 * call on a 64 bit kernel using the 32 bit library.
2132			 */
2133			if (copyin(aiocb_arg, &aiocb64, sizeof (aiocb64_32_t)))
2134				return (EFAULT);
2135			bufsize = aiocb64.aio_nbytes;
2136			aiocb_LFton(&aiocb64, &aiocb);
2137			resultp = (aio_result_t *)&(((aiocb64_32_t *)aiocb_arg)
2138			    ->aio_resultp);
2139			if ((fp = getf(fd = aiocb64.aio_fildes)) == NULL)
2140				return (EBADF);
2141			sigev32 = &aiocb64.aio_sigevent;
2142		}
2143
2144		if (sigev32->sigev_notify == SIGEV_PORT) {
2145			if (copyin(
2146			    (void *)(uintptr_t)sigev32->sigev_value.sival_ptr,
2147			    &pntfy32, sizeof (port_notify32_t))) {
2148				releasef(fd);
2149				return (EFAULT);
2150			}
2151			pntfy.portnfy_port = pntfy32.portnfy_port;
2152			pntfy.portnfy_user = (void *)(uintptr_t)
2153			    pntfy32.portnfy_user;
2154			aio_use_port = 1;
2155		} else if (sigev32->sigev_notify == SIGEV_THREAD) {
2156			pntfy.portnfy_port = sigev32->sigev_signo;
2157			pntfy.portnfy_user = (void *)(uintptr_t)
2158			    sigev32->sigev_value.sival_ptr;
2159			aio_use_port = 1;
2160		}
2161	}
2162#endif  /* _SYSCALL32_IMPL */
2163
2164	/*
2165	 * verify that the file was opened with the requested access mode
2166	 */
2167
2168	if ((fp->f_flag & mode) == 0) {
2169		releasef(fd);
2170		return (EBADF);
2171	}
2172
2173	vp = fp->f_vnode;
2174	aio_func = check_vp(vp, mode);
2175	if (aio_func == NULL) {
2176		releasef(fd);
2177		return (EBADFD);
2178	}
2179	if (run_mode == AIO_LARGEFILE)
2180		error = aio_req_setupLF(&reqp, aiop, &aiocb64, resultp, vp);
2181	else
2182		error = aio_req_setup(&reqp, aiop, &aiocb, resultp, vp);
2183
2184	if (error) {
2185		releasef(fd);
2186		return (error);
2187	}
2188	/*
2189	 * enable polling on this request if the opcode has
2190	 * the AIO poll bit set
2191	 */
2192	if (opcode & AIO_POLL_BIT)
2193		reqp->aio_req_flags |= AIO_POLL;
2194
2195	if (model == DATAMODEL_NATIVE)
2196		reqp->aio_req_iocb.iocb = aiocb_arg;
2197#ifdef  _SYSCALL32_IMPL
2198	else
2199		reqp->aio_req_iocb.iocb32 = (caddr32_t)(uintptr_t)aiocb_arg;
2200#endif
2201
2202	if (aio_use_port) {
2203		int event = (run_mode == AIO_LARGEFILE)?
2204		    ((mode == FREAD)? AIOAREAD64 : AIOAWRITE64) :
2205		    ((mode == FREAD)? AIOAREAD : AIOAWRITE);
2206		error = aio_req_assoc_port_rw(&pntfy, aiocb_arg, reqp, event);
2207	}
2208
2209	/*
2210	 * send the request to the driver.
2211	 */
2212	if (error == 0) {
2213		if (bufsize == 0) {
2214			clear_active_fd(fd);
2215			aio_zerolen(reqp);
2216			return (0);
2217		}
2218		error = (*aio_func)(vp, (aio_req_t *)&reqp->aio_req, CRED());
2219	}
2220
2221	/*
2222	 * the fd is stored in the aio_req_t by aio_req_setup(), and
2223	 * is released by the aio_cleanup_thread() when the IO has
2224	 * completed.
2225	 */
2226	if (error) {
2227		releasef(fd);
2228		mutex_enter(&aiop->aio_mutex);
2229		aio_deq(&aiop->aio_portpending, reqp);
2230		aio_req_free(aiop, reqp);
2231		aiop->aio_pending--;
2232		if (aiop->aio_flags & AIO_REQ_BLOCK)
2233			cv_signal(&aiop->aio_cleanupcv);
2234		mutex_exit(&aiop->aio_mutex);
2235		return (error);
2236	}
2237	clear_active_fd(fd);
2238	return (0);
2239}
2240
2241
2242/*
2243 * set error for a list IO entry that failed.
2244 */
2245static void
2246lio_set_error(aio_req_t *reqp)
2247{
2248	aio_t *aiop = curproc->p_aio;
2249
2250	if (aiop == NULL)
2251		return;
2252
2253	mutex_enter(&aiop->aio_mutex);
2254	aio_deq(&aiop->aio_portpending, reqp);
2255	aiop->aio_pending--;
2256	/* request failed; AIO_PHYSIODONE set to avoid physio cleanup. */
2257	reqp->aio_req_flags |= AIO_PHYSIODONE;
2258	/*
2259	 * Need to free the request now, as it's never
2260	 * going to get on the done queue.
2261	 *
2262	 * Note: aio_outstanding is decremented in
2263	 *	 aio_req_free()
2264	 */
2265	aio_req_free(aiop, reqp);
2266	if (aiop->aio_flags & AIO_REQ_BLOCK)
2267		cv_signal(&aiop->aio_cleanupcv);
2268	mutex_exit(&aiop->aio_mutex);
2269}
2270
2271/*
2272 * Check whether a specified request is done, and remove it from
2273 * the done queue; when NULL is specified, remove any request from
2274 * the done queue instead.
2275 */
2276static aio_req_t *
2277aio_req_done(void *resultp)
2278{
2279	aio_req_t **bucket;
2280	aio_req_t *ent;
2281	aio_t *aiop = curproc->p_aio;
2282	long index;
2283
2284	ASSERT(MUTEX_HELD(&aiop->aio_cleanupq_mutex));
2285	ASSERT(MUTEX_HELD(&aiop->aio_mutex));
2286
2287	if (resultp) {
2288		index = AIO_HASH(resultp);
2289		bucket = &aiop->aio_hash[index];
2290		for (ent = *bucket; ent != NULL; ent = ent->aio_hash_next) {
2291			if (ent->aio_req_resultp == (aio_result_t *)resultp) {
2292				if (ent->aio_req_flags & AIO_DONEQ) {
2293					return (aio_req_remove(ent));
2294				}
2295				return (NULL);
2296			}
2297		}
2298		/* no match, resultp is invalid */
2299		return (NULL);
2300	}
2301	return (aio_req_remove(NULL));
2302}
2303
2304/*
2305 * Determine whether a user-level resultp pointer is associated with an
2306 * active IO request. Zero is returned when the request is done, in
2307 * which case the request is removed from the done queue; the "reqp"
2308 * pointer is valid only when zero is returned. One is returned when
2309 * the request is still in progress. Two is returned when the request
2310 * is invalid (unknown resultp).
2311 */
2312static int
2313aio_req_find(aio_result_t *resultp, aio_req_t **reqp)
2314{
2315	aio_req_t **bucket;
2316	aio_req_t *ent;
2317	aio_t *aiop = curproc->p_aio;
2318	long index;
2319
2320	ASSERT(MUTEX_HELD(&aiop->aio_cleanupq_mutex));
2321	ASSERT(MUTEX_HELD(&aiop->aio_mutex));
2322
2323	index = AIO_HASH(resultp);
2324	bucket = &aiop->aio_hash[index];
2325	for (ent = *bucket; ent != NULL; ent = ent->aio_hash_next) {
2326		if (ent->aio_req_resultp == resultp) {
2327			if (ent->aio_req_flags & AIO_DONEQ) {
2328				*reqp = aio_req_remove(ent);
2329				return (0);
2330			}
2331			return (1);
2332		}
2333	}
2334	/* no match, resultp is invalid */
2335	return (2);
2336}
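
/*
 * Illustrative sketch (not copied from elsewhere in this file): a caller
 * such as aioerror() is expected to use the three return values above
 * roughly as follows, with aio_cleanupq_mutex and aio_mutex held as the
 * ASSERTs require:
 *
 *	switch (aio_req_find(resultp, &reqp)) {
 *	case 0:
 *		(request done; reqp points at the dequeued request)
 *		break;
 *	case 1:
 *		return (EINPROGRESS);
 *	case 2:
 *		return (EINVAL);
 *	}
 */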
2337
2338/*
2339 * remove a request from the done queue.
2340 */
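/*
 * Both aio_doneq and aio_cleanupq are kept as circular, doubly linked
 * lists, so an entry whose aio_req_next points back at itself is the
 * only entry on its queue; that is the property the "only one request
 * on queue" checks below rely on.
 */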
2341static aio_req_t *
2342aio_req_remove(aio_req_t *reqp)
2343{
2344	aio_t *aiop = curproc->p_aio;
2345
2346	ASSERT(MUTEX_HELD(&aiop->aio_mutex));
2347
2348	if (reqp != NULL) {
2349		ASSERT(reqp->aio_req_flags & AIO_DONEQ);
2350		if (reqp->aio_req_next == reqp) {
2351			/* only one request on queue */
2352			if (reqp == aiop->aio_doneq) {
2353				aiop->aio_doneq = NULL;
2354			} else {
2355				ASSERT(reqp == aiop->aio_cleanupq);
2356				aiop->aio_cleanupq = NULL;
2357			}
2358		} else {
2359			reqp->aio_req_next->aio_req_prev = reqp->aio_req_prev;
2360			reqp->aio_req_prev->aio_req_next = reqp->aio_req_next;
2361			/*
2362			 * The request can be either on the aio_doneq or the
2363			 * aio_cleanupq
2364			 */
2365			if (reqp == aiop->aio_doneq)
2366				aiop->aio_doneq = reqp->aio_req_next;
2367
2368			if (reqp == aiop->aio_cleanupq)
2369				aiop->aio_cleanupq = reqp->aio_req_next;
2370		}
2371		reqp->aio_req_flags &= ~AIO_DONEQ;
2372		reqp->aio_req_next = NULL;
2373		reqp->aio_req_prev = NULL;
2374	} else if ((reqp = aiop->aio_doneq) != NULL) {
2375		ASSERT(reqp->aio_req_flags & AIO_DONEQ);
2376		if (reqp == reqp->aio_req_next) {
2377			/* only one request on queue */
2378			aiop->aio_doneq = NULL;
2379		} else {
2380			reqp->aio_req_prev->aio_req_next = reqp->aio_req_next;
2381			reqp->aio_req_next->aio_req_prev = reqp->aio_req_prev;
2382			aiop->aio_doneq = reqp->aio_req_next;
2383		}
2384		reqp->aio_req_flags &= ~AIO_DONEQ;
2385		reqp->aio_req_next = NULL;
2386		reqp->aio_req_prev = NULL;
2387	}
2388	if (aiop->aio_doneq == NULL && (aiop->aio_flags & AIO_WAITN))
2389		cv_broadcast(&aiop->aio_waitcv);
2390	return (reqp);
2391}
2392
2393static int
2394aio_req_setup(
2395	aio_req_t	**reqpp,
2396	aio_t 		*aiop,
2397	aiocb_t 	*arg,
2398	aio_result_t 	*resultp,
2399	vnode_t		*vp)
2400{
2401	sigqueue_t	*sqp = NULL;
2402	aio_req_t 	*reqp;
2403	struct uio 	*uio;
2404	struct sigevent *sigev;
2405	int		error;
2406
2407	sigev = &arg->aio_sigevent;
2408	if (sigev->sigev_notify == SIGEV_SIGNAL &&
2409	    sigev->sigev_signo > 0 && sigev->sigev_signo < NSIG) {
2410		sqp = kmem_zalloc(sizeof (sigqueue_t), KM_NOSLEEP);
2411		if (sqp == NULL)
2412			return (EAGAIN);
2413		sqp->sq_func = NULL;
2414		sqp->sq_next = NULL;
2415		sqp->sq_info.si_code = SI_ASYNCIO;
2416		sqp->sq_info.si_pid = curproc->p_pid;
2417		sqp->sq_info.si_ctid = PRCTID(curproc);
2418		sqp->sq_info.si_zoneid = getzoneid();
2419		sqp->sq_info.si_uid = crgetuid(curproc->p_cred);
2420		sqp->sq_info.si_signo = sigev->sigev_signo;
2421		sqp->sq_info.si_value = sigev->sigev_value;
2422	}
2423
2424	mutex_enter(&aiop->aio_mutex);
2425
2426	if (aiop->aio_flags & AIO_REQ_BLOCK) {
2427		mutex_exit(&aiop->aio_mutex);
2428		if (sqp)
2429			kmem_free(sqp, sizeof (sigqueue_t));
2430		return (EIO);
2431	}
2432	/*
2433	 * get an aio_reqp from the free list or allocate one
2434	 * from dynamic memory.
2435	 */
2436	if (error = aio_req_alloc(&reqp, resultp)) {
2437		mutex_exit(&aiop->aio_mutex);
2438		if (sqp)
2439			kmem_free(sqp, sizeof (sigqueue_t));
2440		return (error);
2441	}
2442	aiop->aio_pending++;
2443	aiop->aio_outstanding++;
2444	reqp->aio_req_flags = AIO_PENDING;
2445	if (sigev->sigev_notify == SIGEV_THREAD ||
2446	    sigev->sigev_notify == SIGEV_PORT)
2447		aio_enq(&aiop->aio_portpending, reqp, 0);
2448	mutex_exit(&aiop->aio_mutex);
2449	/*
2450	 * initialize aio request.
2451	 */
2452	reqp->aio_req_fd = arg->aio_fildes;
2453	reqp->aio_req_sigqp = sqp;
2454	reqp->aio_req_iocb.iocb = NULL;
2455	reqp->aio_req_lio = NULL;
2456	reqp->aio_req_buf.b_file = vp;
2457	uio = reqp->aio_req.aio_uio;
2458	uio->uio_iovcnt = 1;
2459	uio->uio_iov->iov_base = (caddr_t)arg->aio_buf;
2460	uio->uio_iov->iov_len = arg->aio_nbytes;
2461	uio->uio_loffset = arg->aio_offset;
2462	*reqpp = reqp;
2463	return (0);
2464}
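
/*
 * Note: the uio/iovec initialized by aio_req_setup() describes the
 * user buffer for the transfer.  As the aio_cleanup_thread() comment
 * below explains, the pages backing that buffer remain locked by
 * aphysio() until the request completes, which is why an unmap of the
 * region must be able to force their release.
 */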
2465
2466/*
2467 * Allocate p_aio struct.
2468 */
2469static aio_t *
2470aio_aiop_alloc(void)
2471{
2472	aio_t	*aiop;
2473
2474	ASSERT(MUTEX_HELD(&curproc->p_lock));
2475
2476	aiop = kmem_zalloc(sizeof (struct aio), KM_NOSLEEP);
2477	if (aiop) {
2478		mutex_init(&aiop->aio_mutex, NULL, MUTEX_DEFAULT, NULL);
2479		mutex_init(&aiop->aio_cleanupq_mutex, NULL, MUTEX_DEFAULT,
2480		    NULL);
2481		mutex_init(&aiop->aio_portq_mutex, NULL, MUTEX_DEFAULT, NULL);
2482	}
2483	return (aiop);
2484}
2485
2486/*
2487 * Allocate an aio_req struct.
2488 */
2489static int
2490aio_req_alloc(aio_req_t **nreqp, aio_result_t *resultp)
2491{
2492	aio_req_t *reqp;
2493	aio_t *aiop = curproc->p_aio;
2494
2495	ASSERT(MUTEX_HELD(&aiop->aio_mutex));
2496
2497	if ((reqp = aiop->aio_free) != NULL) {
2498		aiop->aio_free = reqp->aio_req_next;
2499		bzero(reqp, sizeof (*reqp));
2500	} else {
2501		/*
2502		 * Check whether memory is getting tight.
2503		 * This is a temporary mechanism to avoid memory
2504		 * exhaustion by a single process until we come up
2505		 * with a per-process solution such as setrlimit().
2506		 */
2507		if (freemem < desfree)
2508			return (EAGAIN);
2509		reqp = kmem_zalloc(sizeof (struct aio_req_t), KM_NOSLEEP);
2510		if (reqp == NULL)
2511			return (EAGAIN);
2512	}
2513	reqp->aio_req.aio_uio = &reqp->aio_req_uio;
2514	reqp->aio_req.aio_uio->uio_iov = &reqp->aio_req_iov;
2515	reqp->aio_req.aio_private = reqp;
2516	reqp->aio_req_buf.b_offset = -1;
2517	reqp->aio_req_resultp = resultp;
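	/*
	 * aio_hash_insert() fails only when another outstanding request
	 * already uses the same resultp; in that case the entry is put
	 * back on the free list and the caller sees EINVAL.
	 */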
2518	if (aio_hash_insert(reqp, aiop)) {
2519		reqp->aio_req_next = aiop->aio_free;
2520		aiop->aio_free = reqp;
2521		return (EINVAL);
2522	}
2523	*nreqp = reqp;
2524	return (0);
2525}
2526
2527/*
2528 * Allocate an aio_lio_t struct.
2529 */
2530static int
2531aio_lio_alloc(aio_lio_t **head)
2532{
2533	aio_lio_t *liop;
2534	aio_t *aiop = curproc->p_aio;
2535
2536	ASSERT(MUTEX_HELD(&aiop->aio_mutex));
2537
2538	if ((liop = aiop->aio_lio_free) != NULL) {
2539		aiop->aio_lio_free = liop->lio_next;
2540	} else {
2541		/*
2542		 * Check whether memory is getting tight.
2543		 * This is a temporary mechanism to avoid memory
2544		 * exhaustion by a single process until we come up
2545		 * with a per-process solution such as setrlimit().
2546		 */
2547		if (freemem < desfree)
2548			return (EAGAIN);
2549
2550		liop = kmem_zalloc(sizeof (aio_lio_t), KM_NOSLEEP);
2551		if (liop == NULL)
2552			return (EAGAIN);
2553	}
2554	*head = liop;
2555	return (0);
2556}
2557
2558/*
2559 * this is a special per-process thread that is only activated if
2560 * the process is unmapping a segment with outstanding aio. normally,
2561 * the process will have completed the aio before unmapping the
2562 * segment. If the process does unmap a segment with outstanding aio,
2563 * this special thread will guarantee that the locked pages due to
2564 * aphysio() are released, thereby permitting the segment to be
2565 * unmapped. In addition to this, the cleanup thread is woken up
2566 * during DR operations to release the locked pages.
2567 */
2568
2569static int
2570aio_cleanup_thread(aio_t *aiop)
2571{
2572	proc_t *p = curproc;
2573	struct as *as = p->p_as;
2574	int poked = 0;
2575	kcondvar_t *cvp;
2576	int exit_flag = 0;
2577	int rqclnup = 0;
2578
2579	sigfillset(&curthread->t_hold);
2580	sigdiffset(&curthread->t_hold, &cantmask);
2581	for (;;) {
2582		/*
2583		 * if a segment is being unmapped, and the current
2584		 * process's done queue is not empty, then every request
2585		 * on the doneq with locked resources should be forced
2586		 * to release its locks. By moving the doneq request
2587		 * to the cleanupq, aio_cleanup() will process the cleanupq,
2588		 * and place requests back onto the doneq. All requests
2589		 * processed by aio_cleanup() will have their physical
2590		 * resources unlocked.
2591		 */
2592		mutex_enter(&aiop->aio_mutex);
2593		if ((aiop->aio_flags & AIO_CLEANUP) == 0) {
2594			aiop->aio_flags |= AIO_CLEANUP;
2595			mutex_enter(&as->a_contents);
2596			if (aiop->aio_rqclnup) {
2597				aiop->aio_rqclnup = 0;
2598				rqclnup = 1;
2599			}
2600
2601			if ((rqclnup || AS_ISUNMAPWAIT(as)) &&
2602			    aiop->aio_doneq) {
2603				aio_req_t *doneqhead = aiop->aio_doneq;
2604				mutex_exit(&as->a_contents);
2605				aiop->aio_doneq = NULL;
2606				aio_cleanupq_concat(aiop, doneqhead, AIO_DONEQ);
2607			} else {
2608				mutex_exit(&as->a_contents);
2609			}
2610		}
2611		mutex_exit(&aiop->aio_mutex);
2612		aio_cleanup(AIO_CLEANUP_THREAD);
2613		/*
2614		 * thread should block on the cleanupcv while
2615		 * AIO_CLEANUP is set.
2616		 */
2617		cvp = &aiop->aio_cleanupcv;
2618		mutex_enter(&aiop->aio_mutex);
2619
2620		if (aiop->aio_pollq != NULL || aiop->aio_cleanupq != NULL ||
2621		    aiop->aio_notifyq != NULL ||
2622		    aiop->aio_portcleanupq != NULL) {
2623			mutex_exit(&aiop->aio_mutex);
2624			continue;
2625		}
2626		mutex_enter(&as->a_contents);
2627
2628		/*
2629		 * AIO_CLEANUP determines when the cleanup thread
2630		 * should be active. This flag is set when
2631		 * the cleanup thread is awakened by as_unmap() or
2632		 * due to DR operations.
2633		 * The flag is cleared when the blocking as_unmap()
2634		 * that originally awakened us is allowed to
2635		 * complete. as_unmap() blocks when trying to
2636		 * unmap a segment that has SOFTLOCKed pages. When
2637		 * the segment's pages are all SOFTUNLOCKed,
2638		 * as->a_flags & AS_UNMAPWAIT should be zero.
2639		 *
2640		 * In case of cleanup request by DR, the flag is cleared
2641		 * once all the pending aio requests have been processed.
2642		 *
2643		 * The flag shouldn't be cleared right away if the
2644		 * cleanup thread was interrupted because the process
2645		 * is doing forkall(). This happens when cv_wait_sig()
2646		 * returns zero, because it was awakened by a pokelwps().
2647		 * If the process is not exiting, it must be doing forkall().
2648		 */
2649		if ((poked == 0) &&
2650		    ((!rqclnup && (AS_ISUNMAPWAIT(as) == 0)) ||
2651		    (aiop->aio_pending == 0))) {
2652			aiop->aio_flags &= ~(AIO_CLEANUP | AIO_CLEANUP_PORT);
2653			cvp = &as->a_cv;
2654			rqclnup = 0;
2655		}
2656		mutex_exit(&aiop->aio_mutex);
2657		if (poked) {
2658			/*
2659			 * If the process is exiting/killed, don't return
2660			 * immediately; first wait for pending I/O's to
2661			 * complete and release the page locks.
2662			 */
2663			if (p->p_flag & (SEXITLWPS|SKILLED)) {
2664				/*
2665				 * If exit_flag is set, then it is
2666				 * safe to exit because we have released
2667				 * page locks of completed I/O's.
2668				 */
2669				if (exit_flag)
2670					break;
2671
2672				mutex_exit(&as->a_contents);
2673
2674				/*
2675				 * Wait for all the pending aio to complete.
2676				 */
2677				mutex_enter(&aiop->aio_mutex);
2678				aiop->aio_flags |= AIO_REQ_BLOCK;
2679				while (aiop->aio_pending != 0)
2680					cv_wait(&aiop->aio_cleanupcv,
2681					    &aiop->aio_mutex);
2682				mutex_exit(&aiop->aio_mutex);
2683				exit_flag = 1;
2684				continue;
2685			} else if (p->p_flag &
2686			    (SHOLDFORK|SHOLDFORK1|SHOLDWATCH)) {
2687				/*
2688				 * hold LWP until it
2689				 * is continued.
2690				 */
2691				mutex_exit(&as->a_contents);
2692				mutex_enter(&p->p_lock);
2693				stop(PR_SUSPENDED, SUSPEND_NORMAL);
2694				mutex_exit(&p->p_lock);
2695				poked = 0;
2696				continue;
2697			}
2698		} else {
2699			/*
2700			 * When started, this thread will sleep on as->a_cv.
2701			 * as_unmap will awaken this thread if the
2702			 * segment has SOFTLOCKed pages (poked = 0).
2703			 * 1. pokelwps() awakens this thread =>
2704			 *    break the loop to check SEXITLWPS, SHOLDFORK, etc
2705			 * 2. as_unmap awakes this thread =>
2706			 *    to break the loop it is necessary that
2707			 *    - AS_UNMAPWAIT is set (as_unmap is waiting for
2708			 *	memory to be unlocked)
2709			 *    - AIO_CLEANUP is not set
2710			 *	(if AIO_CLEANUP is set we have to wait for
2711			 *	pending requests. aio_done will send a signal
2712			 *	for every request which completes to continue
2713			 *	unmapping the corresponding address range)
2714			 * 3. A cleanup request will wake this thread up, ex.
2715			 *    by the DR operations. The aio_rqclnup flag will
2716			 *    be set.
2717			 */
2718			while (poked == 0) {
2719				/*
2720				 * we need to handle cleanup requests
2721				 * that come in after we have just cleaned
2722				 * up, so that we clean up any newly
2723				 * completed aio requests that hold
2724				 * locked resources.
2725				 */
2726				if ((aiop->aio_rqclnup ||
2727				    (AS_ISUNMAPWAIT(as) != 0)) &&
2728				    (aiop->aio_flags & AIO_CLEANUP) == 0)
2729					break;
2730				poked = !cv_wait_sig(cvp, &as->a_contents);
2731				if (AS_ISUNMAPWAIT(as) == 0)
2732					cv_signal(cvp);
2733				if (aiop->aio_outstanding != 0)
2734					break;
2735			}
2736		}
2737		mutex_exit(&as->a_contents);
2738	}
2739exit:
2740	mutex_exit(&as->a_contents);
2741	ASSERT((curproc->p_flag & (SEXITLWPS|SKILLED)));
2742	aston(curthread);	/* make thread do post_syscall */
2743	return (0);
2744}
2745
2746/*
2747 * save a reference to a user's outstanding aio in a hash list.
2748 */
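/*
 * AIO_HASH(resultp) turns the user-level aio_result_t address into an
 * index into aiop->aio_hash[]; entries that land in the same bucket are
 * chained through aio_hash_next.
 */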
2749static int
2750aio_hash_insert(
2751	aio_req_t *aio_reqp,
2752	aio_t *aiop)
2753{
2754	long index;
2755	aio_result_t *resultp = aio_reqp->aio_req_resultp;
2756	aio_req_t *current;
2757	aio_req_t **nextp;
2758
2759	index = AIO_HASH(resultp);
2760	nextp = &aiop->aio_hash[index];
2761	while ((current = *nextp) != NULL) {
2762		if (current->aio_req_resultp == resultp)
2763			return (DUPLICATE);
2764		nextp = &current->aio_hash_next;
2765	}
2766	*nextp = aio_reqp;
2767	aio_reqp->aio_hash_next = NULL;
2768	return (0);
2769}
2770
2771static int
2772(*check_vp(struct vnode *vp, int mode))(vnode_t *, struct aio_req *,
2773    cred_t *)
2774{
2775	struct snode *sp;
2776	dev_t		dev;
2777	struct cb_ops  	*cb;
2778	major_t		major;
2779	int		(*aio_func)();
2780
2781	dev = vp->v_rdev;
2782	major = getmajor(dev);
2783
2784	/*
2785	 * return NULL for requests to files and STREAMs so
2786	 * that libaio takes care of them.
2787	 */
2788	if (vp->v_type == VCHR) {
2789		/* no stream device for kaio */
2790		if (STREAMSTAB(major)) {
2791			return (NULL);
2792		}
2793	} else {
2794		return (NULL);
2795	}
2796
2797	/*
2798	 * Check old drivers which do not have async I/O entry points.
2799	 */
2800	if (devopsp[major]->devo_rev < 3)
2801		return (NULL);
2802
2803	cb = devopsp[major]->devo_cb_ops;
2804
2805	if (cb->cb_rev < 1)
2806		return (NULL);
2807
2808	/*
2809	 * Check whether this device has a strategy routine, i.e. is
2810	 * disk-like; kaio is not supported for devices like ttys.
2811	 */
2812	if (cb->cb_strategy == nodev || cb->cb_strategy == NULL)
2813		return (NULL);
2814
2815	/*
2816	 * Clustering: If vnode is a PXFS vnode, then the device may be remote.
2817	 * We cannot call the driver directly. Instead return the
2818	 * PXFS functions.
2819	 */
2820
2821	if (IS_PXFSVP(vp)) {
2822		if (mode & FREAD)
2823			return (clpxfs_aio_read);
2824		else
2825			return (clpxfs_aio_write);
2826	}
2827	if (mode & FREAD)
2828		aio_func = (cb->cb_aread == nodev) ? NULL : driver_aio_read;
2829	else
2830		aio_func = (cb->cb_awrite == nodev) ? NULL : driver_aio_write;
2831
2832	/*
2833	 * Do we need this?
2834	 * nodev returns ENXIO anyway.
2835	 */
2836	if (aio_func == nodev)
2837		return (NULL);
2838
2839	sp = VTOS(vp);
2840	smark(sp, SACC);
2841	return (aio_func);
2842}
2843
2844/*
2845 * Clustering: We want check_vp to return a function prototyped
2846 * correctly that will be common to both the PXFS and regular cases.
2847 * We define this intermediate function that will do the right
2848 * thing for driver cases.
2849 */
2850
2851static int
2852driver_aio_write(vnode_t *vp, struct aio_req *aio, cred_t *cred_p)
2853{
2854	dev_t dev;
2855	struct cb_ops  	*cb;
2856
2857	ASSERT(vp->v_type == VCHR);
2858	ASSERT(!IS_PXFSVP(vp));
2859	dev = VTOS(vp)->s_dev;
2860	ASSERT(STREAMSTAB(getmajor(dev)) == NULL);
2861
2862	cb = devopsp[getmajor(dev)]->devo_cb_ops;
2863
2864	ASSERT(cb->cb_awrite != nodev);
2865	return ((*cb->cb_awrite)(dev, aio, cred_p));
2866}
2867
2868/*
2869 * Clustering: We want check_vp to return a function prototyped
2870 * correctly that will be common to both the PXFS and regular cases.
2871 * We define this intermediate function that will do the right
2872 * thing for driver cases.
2873 */
2874
2875static int
2876driver_aio_read(vnode_t *vp, struct aio_req *aio, cred_t *cred_p)
2877{
2878	dev_t dev;
2879	struct cb_ops  	*cb;
2880
2881	ASSERT(vp->v_type == VCHR);
2882	ASSERT(!IS_PXFSVP(vp));
2883	dev = VTOS(vp)->s_dev;
2884	ASSERT(!STREAMSTAB(getmajor(dev)));
2885
2886	cb = devopsp[getmajor(dev)]->devo_cb_ops;
2887
2888	ASSERT(cb->cb_aread != nodev);
2889	return ((*cb->cb_aread)(dev, aio, cred_p));
2890}
2891
2892/*
2893 * This routine is called when a largefile call is made by a 32-bit
2894 * process on an ILP32 or LP64 kernel. All 64-bit processes are
2895 * largefile by definition and will call alio() instead.
2896 */
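/*
 * For orientation only: a 32-bit application is assumed to reach this
 * code through the largefile list-IO interface, roughly
 *
 *	aiocb64_t *list[N];
 *
 *	(fill in each list[i]: aio_fildes, aio_buf, aio_nbytes,
 *	aio_offset, aio_lio_opcode, and optionally aio_sigevent)
 *	(void) lio_listio64(LIO_WAIT, list, N, NULL);
 *
 * The user-level wrapper lives in libaio/libc, not in this file, so the
 * sketch is illustrative only.
 */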
2897static int
2898alioLF(
2899	int		mode_arg,
2900	void		*aiocb_arg,
2901	int		nent,
2902	void		*sigev)
2903{
2904	file_t		*fp;
2905	file_t		*prev_fp = NULL;
2906	int		prev_mode = -1;
2907	struct vnode	*vp;
2908	aio_lio_t	*head;
2909	aio_req_t	*reqp;
2910	aio_t		*aiop;
2911	caddr_t		cbplist;
2912	aiocb64_32_t	cb64;
2913	aiocb64_32_t	*aiocb = &cb64;
2914	aiocb64_32_t	*cbp;
2915	caddr32_t	*ucbp;
2916#ifdef _LP64
2917	aiocb_t		aiocb_n;
2918#endif
2919	struct sigevent32	sigevk;
2920	sigqueue_t	*sqp;
2921	int		(*aio_func)();
2922	int		mode;
2923	int		error = 0;
2924	int		aio_errors = 0;
2925	int		i;
2926	size_t		ssize;
2927	int		deadhead = 0;
2928	int		aio_notsupported = 0;
2929	int		lio_head_port;
2930	int		aio_port;
2931	int		aio_thread;
2932	port_kevent_t	*pkevtp = NULL;
2933	port_notify32_t	pnotify;
2934	int		event;
2935
2936	aiop = curproc->p_aio;
2937	if (aiop == NULL || nent <= 0 || nent > _AIO_LISTIO_MAX)
2938		return (EINVAL);
2939
2940	ASSERT(get_udatamodel() == DATAMODEL_ILP32);
2941
2942	ssize = (sizeof (caddr32_t) * nent);
2943	cbplist = kmem_alloc(ssize, KM_SLEEP);
2944	ucbp = (caddr32_t *)cbplist;
2945
2946	if (copyin(aiocb_arg, cbplist, ssize) ||
2947	    (sigev && copyin(sigev, &sigevk, sizeof (sigevk)))) {
2948		kmem_free(cbplist, ssize);
2949		return (EFAULT);
2950	}
2951
2952	/* Event Ports  */
2953	if (sigev &&
2954	    (sigevk.sigev_notify == SIGEV_THREAD ||
2955	    sigevk.sigev_notify == SIGEV_PORT)) {
2956		if (sigevk.sigev_notify == SIGEV_THREAD) {
2957			pnotify.portnfy_port = sigevk.sigev_signo;
2958			pnotify.portnfy_user = sigevk.sigev_value.sival_ptr;
2959		} else if (copyin(
2960		    (void *)(uintptr_t)sigevk.sigev_value.sival_ptr,
2961		    &pnotify, sizeof (pnotify))) {
2962			kmem_free(cbplist, ssize);
2963			return (EFAULT);
2964		}
2965		error = port_alloc_event(pnotify.portnfy_port,
2966		    PORT_ALLOC_DEFAULT, PORT_SOURCE_AIO, &pkevtp);
2967		if (error) {
2968			if (error == ENOMEM || error == EAGAIN)
2969				error = EAGAIN;
2970			else
2971				error = EINVAL;
2972			kmem_free(cbplist, ssize);
2973			return (error);
2974		}
2975		lio_head_port = pnotify.portnfy_port;
2976	}
2977
2978	/*
2979	 * a list head should be allocated if notification is
2980	 * enabled for this list.
2981	 */
2982	head = NULL;
2983
2984	if (mode_arg == LIO_WAIT || sigev) {
2985		mutex_enter(&aiop->aio_mutex);
2986		error = aio_lio_alloc(&head);
2987		mutex_exit(&aiop->aio_mutex);
2988		if (error)
2989			goto done;
2990		deadhead = 1;
2991		head->lio_nent = nent;
2992		head->lio_refcnt = nent;
2993		head->lio_port = -1;
2994		head->lio_portkev = NULL;
2995		if (sigev && sigevk.sigev_notify == SIGEV_SIGNAL &&
2996		    sigevk.sigev_signo > 0 && sigevk.sigev_signo < NSIG) {
2997			sqp = kmem_zalloc(sizeof (sigqueue_t), KM_NOSLEEP);
2998			if (sqp == NULL) {
2999				error = EAGAIN;
3000				goto done;
3001			}
3002			sqp->sq_func = NULL;
3003			sqp->sq_next = NULL;
3004			sqp->sq_info.si_code = SI_ASYNCIO;
3005			sqp->sq_info.si_pid = curproc->p_pid;
3006			sqp->sq_info.si_ctid = PRCTID(curproc);
3007			sqp->sq_info.si_zoneid = getzoneid();
3008			sqp->sq_info.si_uid = crgetuid(curproc->p_cred);
3009			sqp->sq_info.si_signo = sigevk.sigev_signo;
3010			sqp->sq_info.si_value.sival_int =
3011			    sigevk.sigev_value.sival_int;
3012			head->lio_sigqp = sqp;
3013		} else {
3014			head->lio_sigqp = NULL;
3015		}
3016		if (pkevtp) {
3017			/*
3018			 * Prepare data to send when the list of
3019			 * aiocb's has completed.
3020			 */
3021			port_init_event(pkevtp, (uintptr_t)sigev,
3022			    (void *)(uintptr_t)pnotify.portnfy_user,
3023			    NULL, head);
3024			pkevtp->portkev_events = AIOLIO64;
3025			head->lio_portkev = pkevtp;
3026			head->lio_port = pnotify.portnfy_port;
3027		}
3028	}
3029
3030	for (i = 0; i < nent; i++, ucbp++) {
3031
3032		cbp = (aiocb64_32_t *)(uintptr_t)*ucbp;
3033		/* skip entry if it can't be copied. */
3034		if (cbp == NULL || copyin(cbp, aiocb, sizeof (*aiocb))) {
3035			if (head) {
3036				mutex_enter(&aiop->aio_mutex);
3037				head->lio_nent--;
3038				head->lio_refcnt--;
3039				mutex_exit(&aiop->aio_mutex);
3040			}
3041			continue;
3042		}
3043
3044		/* skip if opcode for aiocb is LIO_NOP */
3045		mode = aiocb->aio_lio_opcode;
3046		if (mode == LIO_NOP) {
3047			cbp = NULL;
3048			if (head) {
3049				mutex_enter(&aiop->aio_mutex);
3050				head->lio_nent--;
3051				head->lio_refcnt--;
3052				mutex_exit(&aiop->aio_mutex);
3053			}
3054			continue;
3055		}
3056
3057		/* increment file descriptor's ref count. */
3058		if ((fp = getf(aiocb->aio_fildes)) == NULL) {
3059			lio_set_uerror(&cbp->aio_resultp, EBADF);
3060			if (head) {
3061				mutex_enter(&aiop->aio_mutex);
3062				head->lio_nent--;
3063				head->lio_refcnt--;
3064				mutex_exit(&aiop->aio_mutex);
3065			}
3066			aio_errors++;
3067			continue;
3068		}
3069
3070		/*
3071		 * verify that the file was opened with the requested access mode
3072		 */
3073		if ((fp->f_flag & mode) == 0) {
3074			releasef(aiocb->aio_fildes);
3075			lio_set_uerror(&cbp->aio_resultp, EBADF);
3076			if (head) {
3077				mutex_enter(&aiop->aio_mutex);
3078				head->lio_nent--;
3079				head->lio_refcnt--;
3080				mutex_exit(&aiop->aio_mutex);
3081			}
3082			aio_errors++;
3083			continue;
3084		}
3085
3086		/*
3087		 * Common case where requests are to the same fd
3088		 * for the same r/w operation.
3089		 * For UFS, we need to set EBADFD.
3090		 */
3091		vp = fp->f_vnode;
3092		if (fp != prev_fp || mode != prev_mode) {
3093			aio_func = check_vp(vp, mode);
3094			if (aio_func == NULL) {
3095				prev_fp = NULL;
3096				releasef(aiocb->aio_fildes);
3097				lio_set_uerror(&cbp->aio_resultp, EBADFD);
3098				aio_notsupported++;
3099				if (head) {
3100					mutex_enter(&aiop->aio_mutex);
3101					head->lio_nent--;
3102					head->lio_refcnt--;
3103					mutex_exit(&aiop->aio_mutex);
3104				}
3105				continue;
3106			} else {
3107				prev_fp = fp;
3108				prev_mode = mode;
3109			}
3110		}
3111
3112#ifdef	_LP64
3113		aiocb_LFton(aiocb, &aiocb_n);
3114		error = aio_req_setup(&reqp, aiop, &aiocb_n,
3115		    (aio_result_t *)&cbp->aio_resultp, vp);
3116#else
3117		error = aio_req_setupLF(&reqp, aiop, aiocb,
3118		    (aio_result_t *)&cbp->aio_resultp, vp);
3119#endif  /* _LP64 */
3120		if (error) {
3121			releasef(aiocb->aio_fildes);
3122			lio_set_uerror(&cbp->aio_resultp, error);
3123			if (head) {
3124				mutex_enter(&aiop->aio_mutex);
3125				head->lio_nent--;
3126				head->lio_refcnt--;
3127				mutex_exit(&aiop->aio_mutex);
3128			}
3129			aio_errors++;
3130			continue;
3131		}
3132
3133		reqp->aio_req_lio = head;
3134		deadhead = 0;
3135
3136		/*
3137		 * Set the errno field now before sending the request to
3138		 * the driver to avoid a race condition
3139		 */
3140		(void) suword32(&cbp->aio_resultp.aio_errno,
3141		    EINPROGRESS);
3142
3143		reqp->aio_req_iocb.iocb32 = *ucbp;
3144
3145		event = (mode == LIO_READ)? AIOAREAD64 : AIOAWRITE64;
3146		aio_port = (aiocb->aio_sigevent.sigev_notify == SIGEV_PORT);
3147		aio_thread = (aiocb->aio_sigevent.sigev_notify == SIGEV_THREAD);
3148		if (aio_port | aio_thread) {
3149			port_kevent_t *lpkevp;
3150			/*
3151			 * Prepare data to send with each aiocb completed.
3152			 */
3153			if (aio_port) {
3154				void *paddr = (void *)(uintptr_t)
3155				    aiocb->aio_sigevent.sigev_value.sival_ptr;
3156				if (copyin(paddr, &pnotify, sizeof (pnotify)))
3157					error = EFAULT;
3158			} else {	/* aio_thread */
3159				pnotify.portnfy_port =
3160				    aiocb->aio_sigevent.sigev_signo;
3161				pnotify.portnfy_user =
3162				    aiocb->aio_sigevent.sigev_value.sival_ptr;
3163			}
3164			if (error)
3165				/* EMPTY */;
3166			else if (pkevtp != NULL &&
3167			    pnotify.portnfy_port == lio_head_port)
3168				error = port_dup_event(pkevtp, &lpkevp,
3169				    PORT_ALLOC_DEFAULT);
3170			else
3171				error = port_alloc_event(pnotify.portnfy_port,
3172				    PORT_ALLOC_DEFAULT, PORT_SOURCE_AIO,
3173				    &lpkevp);
3174			if (error == 0) {
3175				port_init_event(lpkevp, (uintptr_t)*ucbp,
3176				    (void *)(uintptr_t)pnotify.portnfy_user,
3177				    aio_port_callback, reqp);
3178				lpkevp->portkev_events = event;
3179				reqp->aio_req_portkev = lpkevp;
3180				reqp->aio_req_port = pnotify.portnfy_port;
3181			}
3182		}
3183
3184		/*
3185		 * send the request to the driver.
3186		 */
3187		if (error == 0) {
3188			if (aiocb->aio_nbytes == 0) {
3189				clear_active_fd(aiocb->aio_fildes);
3190				aio_zerolen(reqp);
3191				continue;
3192			}
3193			error = (*aio_func)(vp, (aio_req_t *)&reqp->aio_req,
3194			    CRED());
3195		}
3196
3197		/*
3198		 * the fd's ref count is not decremented until the IO has
3199		 * completed unless there was an error.
3200		 */
3201		if (error) {
3202			releasef(aiocb->aio_fildes);
3203			lio_set_uerror(&cbp->aio_resultp, error);
3204			if (head) {
3205				mutex_enter(&aiop->aio_mutex);
3206				head->lio_nent--;
3207				head->lio_refcnt--;
3208				mutex_exit(&aiop->aio_mutex);
3209			}
3210			if (error == ENOTSUP)
3211				aio_notsupported++;
3212			else
3213				aio_errors++;
3214			lio_set_error(reqp);
3215		} else {
3216			clear_active_fd(aiocb->aio_fildes);
3217		}
3218	}
3219
3220	if (aio_notsupported) {
3221		error = ENOTSUP;
3222	} else if (aio_errors) {
3223		/*
3224		 * return EIO if any request failed
3225		 */
3226		error = EIO;
3227	}
3228
3229	if (mode_arg == LIO_WAIT) {
3230		mutex_enter(&aiop->aio_mutex);
3231		while (head->lio_refcnt > 0) {
3232			if (!cv_wait_sig(&head->lio_notify, &aiop->aio_mutex)) {
3233				mutex_exit(&aiop->aio_mutex);
3234				error = EINTR;
3235				goto done;
3236			}
3237		}
3238		mutex_exit(&aiop->aio_mutex);
3239		alio_cleanup(aiop, (aiocb_t **)cbplist, nent, AIO_LARGEFILE);
3240	}
3241
3242done:
3243	kmem_free(cbplist, ssize);
3244	if (deadhead) {
3245		if (head->lio_sigqp)
3246			kmem_free(head->lio_sigqp, sizeof (sigqueue_t));
3247		if (head->lio_portkev)
3248			port_free_event(head->lio_portkev);
3249		kmem_free(head, sizeof (aio_lio_t));
3250	}
3251	return (error);
3252}
3253
3254#ifdef  _SYSCALL32_IMPL
3255static void
3256aiocb_LFton(aiocb64_32_t *src, aiocb_t *dest)
3257{
3258	dest->aio_fildes = src->aio_fildes;
3259	dest->aio_buf = (void *)(uintptr_t)src->aio_buf;
3260	dest->aio_nbytes = (size_t)src->aio_nbytes;
3261	dest->aio_offset = (off_t)src->aio_offset;
3262	dest->aio_reqprio = src->aio_reqprio;
3263	dest->aio_sigevent.sigev_notify = src->aio_sigevent.sigev_notify;
3264	dest->aio_sigevent.sigev_signo = src->aio_sigevent.sigev_signo;
3265
3266	/*
3267	 * See comment in sigqueue32() on handling of 32-bit
3268	 * sigvals in a 64-bit kernel.
3269	 */
3270	dest->aio_sigevent.sigev_value.sival_int =
3271	    (int)src->aio_sigevent.sigev_value.sival_int;
3272	dest->aio_sigevent.sigev_notify_function = (void (*)(union sigval))
3273	    (uintptr_t)src->aio_sigevent.sigev_notify_function;
3274	dest->aio_sigevent.sigev_notify_attributes = (pthread_attr_t *)
3275	    (uintptr_t)src->aio_sigevent.sigev_notify_attributes;
3276	dest->aio_sigevent.__sigev_pad2 = src->aio_sigevent.__sigev_pad2;
3277	dest->aio_lio_opcode = src->aio_lio_opcode;
3278	dest->aio_state = src->aio_state;
3279	dest->aio__pad[0] = src->aio__pad[0];
3280}
3281#endif
3282
3283/*
3284 * This function is used only for largefile calls made by
3285 * 32 bit applications.
3286 */
3287static int
3288aio_req_setupLF(
3289	aio_req_t	**reqpp,
3290	aio_t		*aiop,
3291	aiocb64_32_t	*arg,
3292	aio_result_t	*resultp,
3293	vnode_t		*vp)
3294{
3295	sigqueue_t	*sqp = NULL;
3296	aio_req_t	*reqp;
3297	struct uio	*uio;
3298	struct sigevent32 *sigev;
3299	int 		error;
3300
3301	sigev = &arg->aio_sigevent;
3302	if (sigev->sigev_notify == SIGEV_SIGNAL &&
3303	    sigev->sigev_signo > 0 && sigev->sigev_signo < NSIG) {
3304		sqp = kmem_zalloc(sizeof (sigqueue_t), KM_NOSLEEP);
3305		if (sqp == NULL)
3306			return (EAGAIN);
3307		sqp->sq_func = NULL;
3308		sqp->sq_next = NULL;
3309		sqp->sq_info.si_code = SI_ASYNCIO;
3310		sqp->sq_info.si_pid = curproc->p_pid;
3311		sqp->sq_info.si_ctid = PRCTID(curproc);
3312		sqp->sq_info.si_zoneid = getzoneid();
3313		sqp->sq_info.si_uid = crgetuid(curproc->p_cred);
3314		sqp->sq_info.si_signo = sigev->sigev_signo;
3315		sqp->sq_info.si_value.sival_int = sigev->sigev_value.sival_int;
3316	}
3317
3318	mutex_enter(&aiop->aio_mutex);
3319
3320	if (aiop->aio_flags & AIO_REQ_BLOCK) {
3321		mutex_exit(&aiop->aio_mutex);
3322		if (sqp)
3323			kmem_free(sqp, sizeof (sigqueue_t));
3324		return (EIO);
3325	}
3326	/*
3327	 * get an aio_reqp from the free list or allocate one
3328	 * from dynamic memory.
3329	 */
3330	if (error = aio_req_alloc(&reqp, resultp)) {
3331		mutex_exit(&aiop->aio_mutex);
3332		if (sqp)
3333			kmem_free(sqp, sizeof (sigqueue_t));
3334		return (error);
3335	}
3336	aiop->aio_pending++;
3337	aiop->aio_outstanding++;
3338	reqp->aio_req_flags = AIO_PENDING;
3339	if (sigev->sigev_notify == SIGEV_THREAD ||
3340	    sigev->sigev_notify == SIGEV_PORT)
3341		aio_enq(&aiop->aio_portpending, reqp, 0);
3342	mutex_exit(&aiop->aio_mutex);
3343	/*
3344	 * initialize aio request.
3345	 */
3346	reqp->aio_req_fd = arg->aio_fildes;
3347	reqp->aio_req_sigqp = sqp;
3348	reqp->aio_req_iocb.iocb = NULL;
3349	reqp->aio_req_lio = NULL;
3350	reqp->aio_req_buf.b_file = vp;
3351	uio = reqp->aio_req.aio_uio;
3352	uio->uio_iovcnt = 1;
3353	uio->uio_iov->iov_base = (caddr_t)(uintptr_t)arg->aio_buf;
3354	uio->uio_iov->iov_len = arg->aio_nbytes;
3355	uio->uio_loffset = arg->aio_offset;
3356	*reqpp = reqp;
3357	return (0);
3358}
3359
3360/*
3361 * This routine is called when a non-largefile call is made by a 32-bit
3362 * process on an ILP32 or LP64 kernel.
3363 */
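/*
 * For orientation only: this is the path a 32-bit process's plain
 * lio_listio() call is assumed to take; the user-level wrapper lives in
 * libaio/libc, not in this file.  64-bit processes use alio() instead.
 */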
3364static int
3365alio32(
3366	int		mode_arg,
3367	void		*aiocb_arg,
3368	int		nent,
3369	void		*sigev)
3370{
3371	file_t		*fp;
3372	file_t		*prev_fp = NULL;
3373	int		prev_mode = -1;
3374	struct vnode	*vp;
3375	aio_lio_t	*head;
3376	aio_req_t	*reqp;
3377	aio_t		*aiop;
3378	caddr_t		cbplist;
3379	aiocb_t		cb;
3380	aiocb_t		*aiocb = &cb;
3381#ifdef	_LP64
3382	aiocb32_t	*cbp;
3383	caddr32_t	*ucbp;
3384	aiocb32_t	cb32;
3385	aiocb32_t	*aiocb32 = &cb32;
3386	struct sigevent32	sigevk;
3387#else
3388	aiocb_t		*cbp, **ucbp;
3389	struct sigevent	sigevk;
3390#endif
3391	sigqueue_t	*sqp;
3392	int		(*aio_func)();
3393	int		mode;
3394	int		error = 0;
3395	int		aio_errors = 0;
3396	int		i;
3397	size_t		ssize;
3398	int		deadhead = 0;
3399	int		aio_notsupported = 0;
3400	int		lio_head_port;
3401	int		aio_port;
3402	int		aio_thread;
3403	port_kevent_t	*pkevtp = NULL;
3404#ifdef	_LP64
3405	port_notify32_t	pnotify;
3406#else
3407	port_notify_t	pnotify;
3408#endif
3409	int		event;
3410
3411	aiop = curproc->p_aio;
3412	if (aiop == NULL || nent <= 0 || nent > _AIO_LISTIO_MAX)
3413		return (EINVAL);
3414
3415#ifdef	_LP64
3416	ssize = (sizeof (caddr32_t) * nent);
3417#else
3418	ssize = (sizeof (aiocb_t *) * nent);
3419#endif
3420	cbplist = kmem_alloc(ssize, KM_SLEEP);
3421	ucbp = (void *)cbplist;
3422
3423	if (copyin(aiocb_arg, cbplist, ssize) ||
3424	    (sigev && copyin(sigev, &sigevk, sizeof (struct sigevent32)))) {
3425		kmem_free(cbplist, ssize);
3426		return (EFAULT);
3427	}
3428
3429	/* Event Ports  */
3430	if (sigev &&
3431	    (sigevk.sigev_notify == SIGEV_THREAD ||
3432	    sigevk.sigev_notify == SIGEV_PORT)) {
3433		if (sigevk.sigev_notify == SIGEV_THREAD) {
3434			pnotify.portnfy_port = sigevk.sigev_signo;
3435			pnotify.portnfy_user = sigevk.sigev_value.sival_ptr;
3436		} else if (copyin(
3437		    (void *)(uintptr_t)sigevk.sigev_value.sival_ptr,
3438		    &pnotify, sizeof (pnotify))) {
3439			kmem_free(cbplist, ssize);
3440			return (EFAULT);
3441		}
3442		error = port_alloc_event(pnotify.portnfy_port,
3443		    PORT_ALLOC_DEFAULT, PORT_SOURCE_AIO, &pkevtp);
3444		if (error) {
3445			if (error == ENOMEM || error == EAGAIN)
3446				error = EAGAIN;
3447			else
3448				error = EINVAL;
3449			kmem_free(cbplist, ssize);
3450			return (error);
3451		}
3452		lio_head_port = pnotify.portnfy_port;
3453	}
3454
3455	/*
3456	 * a list head should be allocated if notification is
3457	 * enabled for this list.
3458	 */
3459	head = NULL;
3460
3461	if (mode_arg == LIO_WAIT || sigev) {
3462		mutex_enter(&aiop->aio_mutex);
3463		error = aio_lio_alloc(&head);
3464		mutex_exit(&aiop->aio_mutex);
3465		if (error)
3466			goto done;
3467		deadhead = 1;
3468		head->lio_nent = nent;
3469		head->lio_refcnt = nent;
3470		head->lio_port = -1;
3471		head->lio_portkev = NULL;
3472		if (sigev && sigevk.sigev_notify == SIGEV_SIGNAL &&
3473		    sigevk.sigev_signo > 0 && sigevk.sigev_signo < NSIG) {
3474			sqp = kmem_zalloc(sizeof (sigqueue_t), KM_NOSLEEP);
3475			if (sqp == NULL) {
3476				error = EAGAIN;
3477				goto done;
3478			}
3479			sqp->sq_func = NULL;
3480			sqp->sq_next = NULL;
3481			sqp->sq_info.si_code = SI_ASYNCIO;
3482			sqp->sq_info.si_pid = curproc->p_pid;
3483			sqp->sq_info.si_ctid = PRCTID(curproc);
3484			sqp->sq_info.si_zoneid = getzoneid();
3485			sqp->sq_info.si_uid = crgetuid(curproc->p_cred);
3486			sqp->sq_info.si_signo = sigevk.sigev_signo;
3487			sqp->sq_info.si_value.sival_int =
3488			    sigevk.sigev_value.sival_int;
3489			head->lio_sigqp = sqp;
3490		} else {
3491			head->lio_sigqp = NULL;
3492		}
3493		if (pkevtp) {
3494			/*
3495			 * Prepare data to send when the list of
3496			 * aiocb's has completed.
3497			 */
3498			port_init_event(pkevtp, (uintptr_t)sigev,
3499			    (void *)(uintptr_t)pnotify.portnfy_user,
3500			    NULL, head);
3501			pkevtp->portkev_events = AIOLIO;
3502			head->lio_portkev = pkevtp;
3503			head->lio_port = pnotify.portnfy_port;
3504		}
3505	}
3506
3507	for (i = 0; i < nent; i++, ucbp++) {
3508
3509		/* skip entry if it can't be copied. */
3510#ifdef	_LP64
3511		cbp = (aiocb32_t *)(uintptr_t)*ucbp;
3512		if (cbp == NULL || copyin(cbp, aiocb32, sizeof (*aiocb32)))
3513#else
3514		cbp = (aiocb_t *)*ucbp;
3515		if (cbp == NULL || copyin(cbp, aiocb, sizeof (*aiocb)))
3516#endif
3517		{
3518			if (head) {
3519				mutex_enter(&aiop->aio_mutex);
3520				head->lio_nent--;
3521				head->lio_refcnt--;
3522				mutex_exit(&aiop->aio_mutex);
3523			}
3524			continue;
3525		}
3526#ifdef	_LP64
3527		/*
3528		 * copy 32 bit structure into 64 bit structure
3529		 */
3530		aiocb_32ton(aiocb32, aiocb);
3531#endif /* _LP64 */
3532
3533		/* skip if opcode for aiocb is LIO_NOP */
3534		mode = aiocb->aio_lio_opcode;
3535		if (mode == LIO_NOP) {
3536			cbp = NULL;
3537			if (head) {
3538				mutex_enter(&aiop->aio_mutex);
3539				head->lio_nent--;
3540				head->lio_refcnt--;
3541				mutex_exit(&aiop->aio_mutex);
3542			}
3543			continue;
3544		}
3545
3546		/* increment file descriptor's ref count. */
3547		if ((fp = getf(aiocb->aio_fildes)) == NULL) {
3548			lio_set_uerror(&cbp->aio_resultp, EBADF);
3549			if (head) {
3550				mutex_enter(&aiop->aio_mutex);
3551				head->lio_nent--;
3552				head->lio_refcnt--;
3553				mutex_exit(&aiop->aio_mutex);
3554			}
3555			aio_errors++;
3556			continue;
3557		}
3558
3559		/*
3560		 * verify that the file was opened with the requested access mode
3561		 */
3562		if ((fp->f_flag & mode) == 0) {
3563			releasef(aiocb->aio_fildes);
3564			lio_set_uerror(&cbp->aio_resultp, EBADF);
3565			if (head) {
3566				mutex_enter(&aiop->aio_mutex);
3567				head->lio_nent--;
3568				head->lio_refcnt--;
3569				mutex_exit(&aiop->aio_mutex);
3570			}
3571			aio_errors++;
3572			continue;
3573		}
3574
3575		/*
3576		 * Common case where requests are to the same fd
3577		 * for the same r/w operation.
3578		 * For UFS, we need to set EBADFD.
3579		 */
3580		vp = fp->f_vnode;
3581		if (fp != prev_fp || mode != prev_mode) {
3582			aio_func = check_vp(vp, mode);
3583			if (aio_func == NULL) {
3584				prev_fp = NULL;
3585				releasef(aiocb->aio_fildes);
3586				lio_set_uerror(&cbp->aio_resultp, EBADFD);
3587				aio_notsupported++;
3588				if (head) {
3589					mutex_enter(&aiop->aio_mutex);
3590					head->lio_nent--;
3591					head->lio_refcnt--;
3592					mutex_exit(&aiop->aio_mutex);
3593				}
3594				continue;
3595			} else {
3596				prev_fp = fp;
3597				prev_mode = mode;
3598			}
3599		}
3600
3601		error = aio_req_setup(&reqp, aiop, aiocb,
3602		    (aio_result_t *)&cbp->aio_resultp, vp);
3603		if (error) {
3604			releasef(aiocb->aio_fildes);
3605			lio_set_uerror(&cbp->aio_resultp, error);
3606			if (head) {
3607				mutex_enter(&aiop->aio_mutex);
3608				head->lio_nent--;
3609				head->lio_refcnt--;
3610				mutex_exit(&aiop->aio_mutex);
3611			}
3612			aio_errors++;
3613			continue;
3614		}
3615
3616		reqp->aio_req_lio = head;
3617		deadhead = 0;
3618
3619		/*
3620		 * Set the errno field now before sending the request to
3621		 * the driver to avoid a race condition
3622		 */
3623		(void) suword32(&cbp->aio_resultp.aio_errno,
3624		    EINPROGRESS);
3625
3626		reqp->aio_req_iocb.iocb32 = (caddr32_t)(uintptr_t)cbp;
3627
3628		event = (mode == LIO_READ)? AIOAREAD : AIOAWRITE;
3629		aio_port = (aiocb->aio_sigevent.sigev_notify == SIGEV_PORT);
3630		aio_thread = (aiocb->aio_sigevent.sigev_notify == SIGEV_THREAD);
3631		if (aio_port | aio_thread) {
3632			port_kevent_t *lpkevp;
3633			/*
3634			 * Prepare data to send with each aiocb completed.
3635			 */
3636#ifdef _LP64
3637			if (aio_port) {
3638				void *paddr = (void  *)(uintptr_t)
3639				    aiocb32->aio_sigevent.sigev_value.sival_ptr;
3640				if (copyin(paddr, &pnotify, sizeof (pnotify)))
3641					error = EFAULT;
3642			} else {	/* aio_thread */
3643				pnotify.portnfy_port =
3644				    aiocb32->aio_sigevent.sigev_signo;
3645				pnotify.portnfy_user =
3646				    aiocb32->aio_sigevent.sigev_value.sival_ptr;
3647			}
3648#else
3649			if (aio_port) {
3650				void *paddr =
3651				    aiocb->aio_sigevent.sigev_value.sival_ptr;
3652				if (copyin(paddr, &pnotify, sizeof (pnotify)))
3653					error = EFAULT;
3654			} else {	/* aio_thread */
3655				pnotify.portnfy_port =
3656				    aiocb->aio_sigevent.sigev_signo;
3657				pnotify.portnfy_user =
3658				    aiocb->aio_sigevent.sigev_value.sival_ptr;
3659			}
3660#endif
3661			if (error)
3662				/* EMPTY */;
3663			else if (pkevtp != NULL &&
3664			    pnotify.portnfy_port == lio_head_port)
3665				error = port_dup_event(pkevtp, &lpkevp,
3666				    PORT_ALLOC_DEFAULT);
3667			else
3668				error = port_alloc_event(pnotify.portnfy_port,
3669				    PORT_ALLOC_DEFAULT, PORT_SOURCE_AIO,
3670				    &lpkevp);
3671			if (error == 0) {
3672				port_init_event(lpkevp, (uintptr_t)cbp,
3673				    (void *)(uintptr_t)pnotify.portnfy_user,
3674				    aio_port_callback, reqp);
3675				lpkevp->portkev_events = event;
3676				reqp->aio_req_portkev = lpkevp;
3677				reqp->aio_req_port = pnotify.portnfy_port;
3678			}
3679		}
3680
3681		/*
3682		 * send the request to the driver.
3683		 */
3684		if (error == 0) {
3685			if (aiocb->aio_nbytes == 0) {
3686				clear_active_fd(aiocb->aio_fildes);
3687				aio_zerolen(reqp);
3688				continue;
3689			}
3690			error = (*aio_func)(vp, (aio_req_t *)&reqp->aio_req,
3691			    CRED());
3692		}
3693
3694		/*
3695		 * the fd's ref count is not decremented until the IO has
3696		 * completed unless there was an error.
3697		 */
3698		if (error) {
3699			releasef(aiocb->aio_fildes);
3700			lio_set_uerror(&cbp->aio_resultp, error);
3701			if (head) {
3702				mutex_enter(&aiop->aio_mutex);
3703				head->lio_nent--;
3704				head->lio_refcnt--;
3705				mutex_exit(&aiop->aio_mutex);
3706			}
3707			if (error == ENOTSUP)
3708				aio_notsupported++;
3709			else
3710				aio_errors++;
3711			lio_set_error(reqp);
3712		} else {
3713			clear_active_fd(aiocb->aio_fildes);
3714		}
3715	}
3716
3717	if (aio_notsupported) {
3718		error = ENOTSUP;
3719	} else if (aio_errors) {
3720		/*
3721		 * return EIO if any request failed
3722		 */
3723		error = EIO;
3724	}
3725
3726	if (mode_arg == LIO_WAIT) {
3727		mutex_enter(&aiop->aio_mutex);
3728		while (head->lio_refcnt > 0) {
3729			if (!cv_wait_sig(&head->lio_notify, &aiop->aio_mutex)) {
3730				mutex_exit(&aiop->aio_mutex);
3731				error = EINTR;
3732				goto done;
3733			}
3734		}
3735		mutex_exit(&aiop->aio_mutex);
3736		alio_cleanup(aiop, (aiocb_t **)cbplist, nent, AIO_32);
3737	}
3738
3739done:
3740	kmem_free(cbplist, ssize);
3741	if (deadhead) {
3742		if (head->lio_sigqp)
3743			kmem_free(head->lio_sigqp, sizeof (sigqueue_t));
3744		if (head->lio_portkev)
3745			port_free_event(head->lio_portkev);
3746		kmem_free(head, sizeof (aio_lio_t));
3747	}
3748	return (error);
3749}
3750
3751
3752#ifdef  _SYSCALL32_IMPL
3753void
3754aiocb_32ton(aiocb32_t *src, aiocb_t *dest)
3755{
3756	dest->aio_fildes = src->aio_fildes;
3757	dest->aio_buf = (caddr_t)(uintptr_t)src->aio_buf;
3758	dest->aio_nbytes = (size_t)src->aio_nbytes;
3759	dest->aio_offset = (off_t)src->aio_offset;
3760	dest->aio_reqprio = src->aio_reqprio;
3761	dest->aio_sigevent.sigev_notify = src->aio_sigevent.sigev_notify;
3762	dest->aio_sigevent.sigev_signo = src->aio_sigevent.sigev_signo;
3763
3764	/*
3765	 * See comment in sigqueue32() on handling of 32-bit
3766	 * sigvals in a 64-bit kernel.
3767	 */
3768	dest->aio_sigevent.sigev_value.sival_int =
3769	    (int)src->aio_sigevent.sigev_value.sival_int;
3770	dest->aio_sigevent.sigev_notify_function = (void (*)(union sigval))
3771	    (uintptr_t)src->aio_sigevent.sigev_notify_function;
3772	dest->aio_sigevent.sigev_notify_attributes = (pthread_attr_t *)
3773	    (uintptr_t)src->aio_sigevent.sigev_notify_attributes;
3774	dest->aio_sigevent.__sigev_pad2 = src->aio_sigevent.__sigev_pad2;
3775	dest->aio_lio_opcode = src->aio_lio_opcode;
3776	dest->aio_state = src->aio_state;
3777	dest->aio__pad[0] = src->aio__pad[0];
3778}
3779#endif /* _SYSCALL32_IMPL */
3780
3781/*
3782 * aio_port_callback() is called just before the event is retrieved from the
3783 * port. The task of this callback function is to finish the work of the
3784 * transaction for the application; that means:
3785 * - copyout transaction data to the application
3786 *	(this thread is running in the right process context)
3787 * - keep track of the transaction (update of counters).
3788 * - free allocated buffers
3789 * The aiocb pointer is the object element of the port_kevent_t structure.
3790 *
3791 * flag :
3792 *	PORT_CALLBACK_DEFAULT : do copyout and free resources
3793 *	PORT_CALLBACK_CLOSE   : don't do copyout, free resources
3794 */
3795
3796/*ARGSUSED*/
3797int
3798aio_port_callback(void *arg, int *events, pid_t pid, int flag, void *evp)
3799{
3800	aio_t		*aiop = curproc->p_aio;
3801	aio_req_t	*reqp = arg;
3802	struct	iovec	*iov;
3803	struct	buf	*bp;
3804	void		*resultp;
3805
3806	if (pid != curproc->p_pid) {
3807		/* wrong process; cannot deliver data here. */
3808		return (EACCES);
3809	}
3810
3811	mutex_enter(&aiop->aio_portq_mutex);
3812	reqp->aio_req_portkev = NULL;
3813	aio_req_remove_portq(aiop, reqp); /* remove request from portq */
3814	mutex_exit(&aiop->aio_portq_mutex);
3815	aphysio_unlock(reqp);		/* unlock used pages */
3816	mutex_enter(&aiop->aio_mutex);
3817	if (reqp->aio_req_flags & AIO_COPYOUTDONE) {
3818		aio_req_free_port(aiop, reqp);	/* back to free list */
3819		mutex_exit(&aiop->aio_mutex);
3820		return (0);
3821	}
3822
3823	iov = reqp->aio_req_uio.uio_iov;
3824	bp = &reqp->aio_req_buf;
3825	resultp = (void *)reqp->aio_req_resultp;
3826	aio_req_free_port(aiop, reqp);	/* request struct back to free list */
3827	mutex_exit(&aiop->aio_mutex);
3828	if (flag == PORT_CALLBACK_DEFAULT)
3829		aio_copyout_result_port(iov, bp, resultp);
3830	return (0);
3831}
3832