sys_generic.c revision 92252
1/*
2 * Copyright (c) 1982, 1986, 1989, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 *    must display the following acknowledgement:
20 *	This product includes software developed by the University of
21 *	California, Berkeley and its contributors.
22 * 4. Neither the name of the University nor the names of its contributors
23 *    may be used to endorse or promote products derived from this software
24 *    without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * SUCH DAMAGE.
37 *
38 *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
39 * $FreeBSD: head/sys/kern/sys_generic.c 92252 2002-03-14 01:32:30Z alfred $
40 */
41
42#include "opt_ktrace.h"
43
44#include <sys/param.h>
45#include <sys/systm.h>
46#include <sys/sysproto.h>
47#include <sys/filedesc.h>
48#include <sys/filio.h>
49#include <sys/fcntl.h>
50#include <sys/file.h>
51#include <sys/proc.h>
52#include <sys/signalvar.h>
53#include <sys/socketvar.h>
54#include <sys/uio.h>
55#include <sys/kernel.h>
56#include <sys/malloc.h>
57#include <sys/poll.h>
58#include <sys/resourcevar.h>
59#include <sys/selinfo.h>
60#include <sys/sysctl.h>
61#include <sys/sysent.h>
62#include <sys/bio.h>
63#include <sys/buf.h>
64#include <sys/condvar.h>
65#ifdef KTRACE
66#include <sys/ktrace.h>
67#endif
68#include <vm/vm.h>
69#include <vm/vm_page.h>
70
71#include <machine/limits.h>
72
/* Malloc types for the dynamically sized buffers allocated in this file. */
static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
/* Not static: M_IOV is shared with other kernel files. */
MALLOC_DEFINE(M_IOV, "iov", "large iov's");

/* Forward declarations for the file-local helpers defined below. */
static int	pollscan(struct thread *, struct pollfd *, u_int);
static int	selscan(struct thread *, fd_mask **, fd_mask **, int);
static int	dofileread(struct thread *, struct file *, int, void *,
		    size_t, off_t, int);
static int	dofilewrite(struct thread *, struct file *, int,
		    const void *, size_t, off_t, int);
83
84/*
85 * Read system call.
86 */
87#ifndef _SYS_SYSPROTO_H_
88struct read_args {
89	int	fd;
90	void	*buf;
91	size_t	nbyte;
92};
93#endif
94/*
95 * MPSAFE
96 */
97int
98read(td, uap)
99	struct thread *td;
100	struct read_args *uap;
101{
102	struct file *fp;
103	int error;
104
105	mtx_lock(&Giant);
106	if ((error = fget_read(td, uap->fd, &fp)) == 0) {
107		error = dofileread(td, fp, uap->fd, uap->buf,
108			    uap->nbyte, (off_t)-1, 0);
109		fdrop(fp, td);
110	}
111	mtx_unlock(&Giant);
112	return(error);
113}
114
115/*
116 * Pread system call
117 */
118#ifndef _SYS_SYSPROTO_H_
119struct pread_args {
120	int	fd;
121	void	*buf;
122	size_t	nbyte;
123	int	pad;
124	off_t	offset;
125};
126#endif
127/*
128 * MPSAFE
129 */
130int
131pread(td, uap)
132	struct thread *td;
133	struct pread_args *uap;
134{
135	struct file *fp;
136	int error;
137
138	if ((error = fget_read(td, uap->fd, &fp)) != 0)
139		return (error);
140	mtx_lock(&Giant);
141	if (fp->f_type != DTYPE_VNODE) {
142		error = ESPIPE;
143	} else {
144		error = dofileread(td, fp, uap->fd, uap->buf, uap->nbyte,
145			    uap->offset, FOF_OFFSET);
146	}
147	fdrop(fp, td);
148	mtx_unlock(&Giant);
149	return(error);
150}
151
/*
 * Code common for read and pread
 *
 * Performs a single uio-based read of 'nbyte' bytes into the user
 * buffer 'buf'.  'offset' is -1 (use the file offset) for read() or an
 * absolute position combined with FOF_OFFSET in 'flags' for pread().
 * The byte count transferred is returned in td->td_retval[0].
 */
int
dofileread(td, fp, fd, buf, nbyte, offset, flags)
	struct thread *td;
	struct file *fp;
	int fd, flags;
	void *buf;
	size_t nbyte;
	off_t offset;
{
	struct uio auio;
	struct iovec aiov;
	long cnt, error = 0;
#ifdef KTRACE
	struct iovec ktriov;
	struct uio ktruio;
	int didktr = 0;
#endif

	/* Describe the user's buffer with a single-element uio. */
	aiov.iov_base = (caddr_t)buf;
	aiov.iov_len = nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = offset;
	/* A single transfer is limited to INT_MAX bytes. */
	if (nbyte > INT_MAX)
		return (EINVAL);
	auio.uio_resid = nbyte;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;
#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec
	 */
	if (KTRPOINT(td->td_proc, KTR_GENIO)) {
		ktriov = aiov;
		ktruio = auio;
		didktr = 1;
	}
#endif
	cnt = nbyte;

	if ((error = fo_read(fp, &auio, fp->f_cred, flags, td))) {
		/*
		 * If the transfer was interrupted after some data had
		 * already moved, report the partial read as success.
		 */
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	}
	cnt -= auio.uio_resid;	/* bytes actually transferred */
#ifdef KTRACE
	if (didktr && error == 0) {
		ktruio.uio_iov = &ktriov;
		ktruio.uio_resid = cnt;
		ktrgenio(td->td_proc->p_tracep, fd, UIO_READ, &ktruio, error);
	}
#endif
	td->td_retval[0] = cnt;
	return (error);
}
212
213/*
214 * Scatter read system call.
215 */
216#ifndef _SYS_SYSPROTO_H_
217struct readv_args {
218	int	fd;
219	struct	iovec *iovp;
220	u_int	iovcnt;
221};
222#endif
223/*
224 * MPSAFE
225 */
226int
227readv(td, uap)
228	struct thread *td;
229	struct readv_args *uap;
230{
231	struct file *fp;
232	struct uio auio;
233	struct iovec *iov;
234	struct iovec *needfree;
235	struct iovec aiov[UIO_SMALLIOV];
236	long i, cnt, error = 0;
237	u_int iovlen;
238#ifdef KTRACE
239	struct iovec *ktriov = NULL;
240	struct uio ktruio;
241#endif
242	mtx_lock(&Giant);
243
244	if ((error = fget_read(td, uap->fd, &fp)) != 0)
245		goto done2;
246	/* note: can't use iovlen until iovcnt is validated */
247	iovlen = uap->iovcnt * sizeof (struct iovec);
248	if (uap->iovcnt > UIO_SMALLIOV) {
249		if (uap->iovcnt > UIO_MAXIOV) {
250			error = EINVAL;
251			goto done2;
252		}
253		MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
254		needfree = iov;
255	} else {
256		iov = aiov;
257		needfree = NULL;
258	}
259	auio.uio_iov = iov;
260	auio.uio_iovcnt = uap->iovcnt;
261	auio.uio_rw = UIO_READ;
262	auio.uio_segflg = UIO_USERSPACE;
263	auio.uio_td = td;
264	auio.uio_offset = -1;
265	if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen)))
266		goto done;
267	auio.uio_resid = 0;
268	for (i = 0; i < uap->iovcnt; i++) {
269		if (iov->iov_len > INT_MAX - auio.uio_resid) {
270			error = EINVAL;
271			goto done;
272		}
273		auio.uio_resid += iov->iov_len;
274		iov++;
275	}
276#ifdef KTRACE
277	/*
278	 * if tracing, save a copy of iovec
279	 */
280	if (KTRPOINT(td->td_proc, KTR_GENIO))  {
281		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
282		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
283		ktruio = auio;
284	}
285#endif
286	cnt = auio.uio_resid;
287	if ((error = fo_read(fp, &auio, fp->f_cred, 0, td))) {
288		if (auio.uio_resid != cnt && (error == ERESTART ||
289		    error == EINTR || error == EWOULDBLOCK))
290			error = 0;
291	}
292	cnt -= auio.uio_resid;
293#ifdef KTRACE
294	if (ktriov != NULL) {
295		if (error == 0) {
296			ktruio.uio_iov = ktriov;
297			ktruio.uio_resid = cnt;
298			ktrgenio(td->td_proc->p_tracep, uap->fd, UIO_READ, &ktruio,
299			    error);
300		}
301		FREE(ktriov, M_TEMP);
302	}
303#endif
304	td->td_retval[0] = cnt;
305done:
306	fdrop(fp, td);
307	if (needfree)
308		FREE(needfree, M_IOV);
309done2:
310	mtx_unlock(&Giant);
311	return (error);
312}
313
314/*
315 * Write system call
316 */
317#ifndef _SYS_SYSPROTO_H_
318struct write_args {
319	int	fd;
320	const void *buf;
321	size_t	nbyte;
322};
323#endif
324/*
325 * MPSAFE
326 */
327int
328write(td, uap)
329	struct thread *td;
330	struct write_args *uap;
331{
332	struct file *fp;
333	int error;
334
335	mtx_lock(&Giant);
336	if ((error = fget_write(td, uap->fd, &fp)) == 0) {
337		error = dofilewrite(td, fp, uap->fd, uap->buf, uap->nbyte,
338			    (off_t)-1, 0);
339		fdrop(fp, td);
340	} else {
341		error = EBADF;	/* XXX this can't be right */
342	}
343	mtx_unlock(&Giant);
344	return(error);
345}
346
347/*
348 * Pwrite system call
349 */
350#ifndef _SYS_SYSPROTO_H_
351struct pwrite_args {
352	int	fd;
353	const void *buf;
354	size_t	nbyte;
355	int	pad;
356	off_t	offset;
357};
358#endif
359/*
360 * MPSAFE
361 */
362int
363pwrite(td, uap)
364	struct thread *td;
365	struct pwrite_args *uap;
366{
367	struct file *fp;
368	int error;
369
370	if ((error = fget_write(td, uap->fd, &fp)) == 0) {
371		mtx_lock(&Giant);
372		if (fp->f_type == DTYPE_VNODE) {
373			error = dofilewrite(td, fp, uap->fd, uap->buf,
374				    uap->nbyte, uap->offset, FOF_OFFSET);
375		} else {
376			error = ESPIPE;
377		}
378		fdrop(fp, td);
379		mtx_unlock(&Giant);
380	} else {
381		error = EBADF;	/* this can't be right */
382	}
383	return(error);
384}
385
/*
 * Code common for write and pwrite.
 *
 * Performs a single uio-based write of 'nbyte' bytes from the user
 * buffer 'buf'.  'offset' is -1 (use the file offset) for write() or
 * an absolute position combined with FOF_OFFSET in 'flags' for
 * pwrite().  The byte count transferred is returned in
 * td->td_retval[0].
 */
static int
dofilewrite(td, fp, fd, buf, nbyte, offset, flags)
	struct thread *td;
	struct file *fp;
	int fd, flags;
	const void *buf;
	size_t nbyte;
	off_t offset;
{
	struct uio auio;
	struct iovec aiov;
	long cnt, error = 0;
#ifdef KTRACE
	struct iovec ktriov;
	struct uio ktruio;
	int didktr = 0;
#endif

	/* Describe the user's buffer with a single-element uio. */
	aiov.iov_base = (void *)(uintptr_t)buf;
	aiov.iov_len = nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = offset;
	/* A single transfer is limited to INT_MAX bytes. */
	if (nbyte > INT_MAX)
		return (EINVAL);
	auio.uio_resid = nbyte;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;
#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec and uio
	 */
	if (KTRPOINT(td->td_proc, KTR_GENIO)) {
		ktriov = aiov;
		ktruio = auio;
		didktr = 1;
	}
#endif
	cnt = nbyte;
	/* bwillwrite() is called only for vnode-backed files. */
	if (fp->f_type == DTYPE_VNODE)
		bwillwrite();
	if ((error = fo_write(fp, &auio, fp->f_cred, flags, td))) {
		/*
		 * If the transfer was interrupted after some data had
		 * already moved, report the partial write as success.
		 */
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		/* A broken pipe additionally raises SIGPIPE. */
		if (error == EPIPE) {
			PROC_LOCK(td->td_proc);
			psignal(td->td_proc, SIGPIPE);
			PROC_UNLOCK(td->td_proc);
		}
	}
	cnt -= auio.uio_resid;	/* bytes actually transferred */
#ifdef KTRACE
	if (didktr && error == 0) {
		ktruio.uio_iov = &ktriov;
		ktruio.uio_resid = cnt;
		ktrgenio(td->td_proc->p_tracep, fd, UIO_WRITE, &ktruio, error);
	}
#endif
	td->td_retval[0] = cnt;
	return (error);
}
449
450/*
451 * Gather write system call
452 */
453#ifndef _SYS_SYSPROTO_H_
454struct writev_args {
455	int	fd;
456	struct	iovec *iovp;
457	u_int	iovcnt;
458};
459#endif
460/*
461 * MPSAFE
462 */
463int
464writev(td, uap)
465	struct thread *td;
466	register struct writev_args *uap;
467{
468	struct file *fp;
469	struct uio auio;
470	register struct iovec *iov;
471	struct iovec *needfree;
472	struct iovec aiov[UIO_SMALLIOV];
473	long i, cnt, error = 0;
474	u_int iovlen;
475#ifdef KTRACE
476	struct iovec *ktriov = NULL;
477	struct uio ktruio;
478#endif
479
480	mtx_lock(&Giant);
481	if ((error = fget_write(td, uap->fd, &fp)) != 0) {
482		error = EBADF;
483		goto done2;
484	}
485	/* note: can't use iovlen until iovcnt is validated */
486	iovlen = uap->iovcnt * sizeof (struct iovec);
487	if (uap->iovcnt > UIO_SMALLIOV) {
488		if (uap->iovcnt > UIO_MAXIOV) {
489			needfree = NULL;
490			error = EINVAL;
491			goto done;
492		}
493		MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
494		needfree = iov;
495	} else {
496		iov = aiov;
497		needfree = NULL;
498	}
499	auio.uio_iov = iov;
500	auio.uio_iovcnt = uap->iovcnt;
501	auio.uio_rw = UIO_WRITE;
502	auio.uio_segflg = UIO_USERSPACE;
503	auio.uio_td = td;
504	auio.uio_offset = -1;
505	if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen)))
506		goto done;
507	auio.uio_resid = 0;
508	for (i = 0; i < uap->iovcnt; i++) {
509		if (iov->iov_len > INT_MAX - auio.uio_resid) {
510			error = EINVAL;
511			goto done;
512		}
513		auio.uio_resid += iov->iov_len;
514		iov++;
515	}
516#ifdef KTRACE
517	/*
518	 * if tracing, save a copy of iovec and uio
519	 */
520	if (KTRPOINT(td->td_proc, KTR_GENIO))  {
521		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
522		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
523		ktruio = auio;
524	}
525#endif
526	cnt = auio.uio_resid;
527	if (fp->f_type == DTYPE_VNODE)
528		bwillwrite();
529	if ((error = fo_write(fp, &auio, fp->f_cred, 0, td))) {
530		if (auio.uio_resid != cnt && (error == ERESTART ||
531		    error == EINTR || error == EWOULDBLOCK))
532			error = 0;
533		if (error == EPIPE) {
534			PROC_LOCK(td->td_proc);
535			psignal(td->td_proc, SIGPIPE);
536			PROC_UNLOCK(td->td_proc);
537		}
538	}
539	cnt -= auio.uio_resid;
540#ifdef KTRACE
541	if (ktriov != NULL) {
542		if (error == 0) {
543			ktruio.uio_iov = ktriov;
544			ktruio.uio_resid = cnt;
545			ktrgenio(td->td_proc->p_tracep, uap->fd, UIO_WRITE, &ktruio,
546			    error);
547		}
548		FREE(ktriov, M_TEMP);
549	}
550#endif
551	td->td_retval[0] = cnt;
552done:
553	fdrop(fp, td);
554	if (needfree)
555		FREE(needfree, M_IOV);
556done2:
557	mtx_unlock(&Giant);
558	return (error);
559}
560
/*
 * Ioctl system call
 *
 * Decodes the command word, marshals the argument to/from user space
 * into a kernel buffer as directed by the IOC_IN/IOC_OUT/IOC_VOID
 * bits, and dispatches to the file's fo_ioctl routine.
 */
#ifndef _SYS_SYSPROTO_H_
struct ioctl_args {
	int	fd;
	u_long	com;
	caddr_t	data;
};
#endif
/*
 * MPSAFE
 */
/* ARGSUSED */
int
ioctl(td, uap)
	struct thread *td;
	register struct ioctl_args *uap;
{
	struct file *fp;
	register struct filedesc *fdp;
	register u_long com;
	int error = 0;
	register u_int size;
	caddr_t data, memp;
	int tmp;
#define STK_PARAMS	128
	/* Stack buffer for small arguments; 'align' forces alignment. */
	union {
	    char stkbuf[STK_PARAMS];
	    long align;
	} ubuf;

	if ((error = fget(td, uap->fd, &fp)) != 0)
		return (error);
	mtx_lock(&Giant);
	/* The descriptor must be open for reading or writing. */
	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
		fdrop(fp, td);
		mtx_unlock(&Giant);
		return (EBADF);
	}
	fdp = td->td_proc->p_fd;
	/*
	 * FIONCLEX/FIOCLEX only toggle the close-on-exec flag in the
	 * descriptor table; they never reach the file's ioctl routine.
	 */
	switch (com = uap->com) {
	case FIONCLEX:
		FILEDESC_LOCK(fdp);
		fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE;
		FILEDESC_UNLOCK(fdp);
		fdrop(fp, td);
		mtx_unlock(&Giant);
		return (0);
	case FIOCLEX:
		FILEDESC_LOCK(fdp);
		fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE;
		FILEDESC_UNLOCK(fdp);
		fdrop(fp, td);
		mtx_unlock(&Giant);
		return (0);
	}

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
	size = IOCPARM_LEN(com);
	if (size > IOCPARM_MAX) {
		fdrop(fp, td);
		mtx_unlock(&Giant);
		return (ENOTTY);
	}

	/* Use the stack buffer if the argument fits, else malloc one. */
	memp = NULL;
	if (size > sizeof (ubuf.stkbuf)) {
		memp = (caddr_t)malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
		data = memp;
	} else {
		data = ubuf.stkbuf;
	}
	if (com&IOC_IN) {
		if (size) {
			error = copyin(uap->data, data, (u_int)size);
			if (error) {
				if (memp)
					free(memp, M_IOCTLOPS);
				fdrop(fp, td);
				goto done;
			}
		} else {
			/* Zero-length IOC_IN: pass the pointer itself. */
			*(caddr_t *)data = uap->data;
		}
	} else if ((com&IOC_OUT) && size) {
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, size);
	} else if (com&IOC_VOID) {
		*(caddr_t *)data = uap->data;
	}

	switch (com) {

	case FIONBIO:
		/* Keep FNONBLOCK in f_flag in sync, then notify the file. */
		FILE_LOCK(fp);
		if ((tmp = *(int *)data))
			fp->f_flag |= FNONBLOCK;
		else
			fp->f_flag &= ~FNONBLOCK;
		FILE_UNLOCK(fp);
		error = fo_ioctl(fp, FIONBIO, (caddr_t)&tmp, td);
		break;

	case FIOASYNC:
		/* Keep FASYNC in f_flag in sync, then notify the file. */
		FILE_LOCK(fp);
		if ((tmp = *(int *)data))
			fp->f_flag |= FASYNC;
		else
			fp->f_flag &= ~FASYNC;
		FILE_UNLOCK(fp);
		error = fo_ioctl(fp, FIOASYNC, (caddr_t)&tmp, td);
		break;

	default:
		error = fo_ioctl(fp, com, data, td);
		/*
		 * Copy any data to user, size was
		 * already set and checked above.
		 */
		if (error == 0 && (com&IOC_OUT) && size)
			error = copyout(data, uap->data, (u_int)size);
		break;
	}
	if (memp)
		free(memp, M_IOCTLOPS);
	fdrop(fp, td);
done:
	mtx_unlock(&Giant);
	return (error);
}
698
/*
 * sellock and selwait are initialized in selectinit() via SYSINIT.
 */
struct mtx	sellock;	/* serializes select/poll scan and selinfo updates */
struct cv	selwait;	/* select/poll threads sleep here awaiting events */
int	nselcoll;	/* Select collisions since boot */
SYSCTL_INT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");
706
/*
 * Select system call.
 *
 * Copies in the three descriptor sets, scans them via selscan(), and
 * sleeps on selwait until an event, timeout, or signal.  The scan is
 * retried whenever TDF_SELECT was cleared or a collision was recorded
 * while sellock was dropped.
 */
#ifndef _SYS_SYSPROTO_H_
struct select_args {
	int	nd;
	fd_set	*in, *ou, *ex;
	struct	timeval *tv;
};
#endif
/*
 * MPSAFE
 */
int
select(td, uap)
	register struct thread *td;
	register struct select_args *uap;
{
	struct filedesc *fdp;
	/*
	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
	 * infds with the new FD_SETSIZE of 1024, and more than enough for
	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
	 * of 256.
	 */
	fd_mask s_selbits[howmany(2048, NFDBITS)];
	fd_mask *ibits[3], *obits[3], *selbits, *sbp;
	struct timeval atv, rtv, ttv;
	int ncoll, error, timo;
	u_int nbufbytes, ncpbytes, nfdbits;

	if (uap->nd < 0)
		return (EINVAL);
	fdp = td->td_proc->p_fd;
	mtx_lock(&Giant);
	FILEDESC_LOCK(fdp);

	/* Clamp nd to the current size of the descriptor table. */
	if (uap->nd > td->td_proc->p_fd->fd_nfiles)
		uap->nd = td->td_proc->p_fd->fd_nfiles;   /* forgiving; slightly wrong */
	FILEDESC_UNLOCK(fdp);

	/*
	 * Allocate just enough bits for the non-null fd_sets.  Use the
	 * preallocated auto buffer if possible.
	 */
	nfdbits = roundup(uap->nd, NFDBITS);
	ncpbytes = nfdbits / NBBY;
	nbufbytes = 0;
	if (uap->in != NULL)
		nbufbytes += 2 * ncpbytes;
	if (uap->ou != NULL)
		nbufbytes += 2 * ncpbytes;
	if (uap->ex != NULL)
		nbufbytes += 2 * ncpbytes;
	if (nbufbytes <= sizeof s_selbits)
		selbits = &s_selbits[0];
	else
		selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);

	/*
	 * Assign pointers into the bit buffers and fetch the input bits.
	 * Put the output buffers together so that they can be bzeroed
	 * together.
	 */
	sbp = selbits;
#define	getbits(name, x) \
	do {								\
		if (uap->name == NULL)					\
			ibits[x] = NULL;				\
		else {							\
			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;	\
			obits[x] = sbp;					\
			sbp += ncpbytes / sizeof *sbp;			\
			error = copyin(uap->name, ibits[x], ncpbytes);	\
			if (error != 0)					\
				goto done_nosellock;			\
		}							\
	} while (0)
	getbits(in, 0);
	getbits(ou, 1);
	getbits(ex, 2);
#undef	getbits
	/* The first half of the buffer holds the (zeroed) output sets. */
	if (nbufbytes != 0)
		bzero(selbits, nbufbytes / 2);

	/* Convert the timeout, if any, to an absolute uptime deadline. */
	if (uap->tv) {
		error = copyin((caddr_t)uap->tv, (caddr_t)&atv,
			sizeof (atv));
		if (error)
			goto done_nosellock;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done_nosellock;
		}
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
	mtx_lock(&sellock);
retry:
	ncoll = nselcoll;
	mtx_lock_spin(&sched_lock);
	td->td_flags |= TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

	/* XXX Is there a better place for this? */
	TAILQ_INIT(&td->td_selq);
	error = selscan(td, ibits, obits, uap->nd);
	mtx_lock(&sellock);
	/* Stop on error or if any descriptor is already ready. */
	if (error || td->td_retval[0])
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=))
			goto done;
		ttv = atv;
		timevalsub(&ttv, &rtv);
		/* Bound a single sleep to 24 hours worth of ticks. */
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz(&ttv);
	}

	/*
	 * An event of interest may occur while we do not hold
	 * sellock, so check TDF_SELECT and the number of
	 * collisions and rescan the file descriptors if
	 * necessary.
	 */
	mtx_lock_spin(&sched_lock);
	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
		mtx_unlock_spin(&sched_lock);
		goto retry;
	}
	mtx_unlock_spin(&sched_lock);

	if (timo > 0)
		error = cv_timedwait_sig(&selwait, &sellock, timo);
	else
		error = cv_wait_sig(&selwait, &sellock);

	if (error == 0)
		goto retry;

done:
	clear_selinfo_list(td);
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

done_nosellock:
	/* select is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
#define	putbits(name, x) \
	if (uap->name && (error2 = copyout(obits[x], uap->name, ncpbytes))) \
		error = error2;
	if (error == 0) {
		int error2;

		putbits(in, 0);
		putbits(ou, 1);
		putbits(ex, 2);
#undef putbits
	}
	if (selbits != &s_selbits[0])
		free(selbits, M_SELECT);

	mtx_unlock(&Giant);
	return (error);
}
883
/*
 * Poll every descriptor whose bit is set in one of the three input
 * sets, setting the corresponding output bit for each ready one.  The
 * number of ready descriptors is returned in td->td_retval[0].
 */
static int
selscan(td, ibits, obits, nfd)
	struct thread *td;
	fd_mask **ibits, **obits;
	int nfd;
{
	int msk, i, fd;
	fd_mask bits;
	struct file *fp;
	int n = 0;
	/* Note: backend also returns POLLHUP/POLLERR if appropriate. */
	static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
	struct filedesc *fdp = td->td_proc->p_fd;

	FILEDESC_LOCK(fdp);
	/* msk 0/1/2 correspond to the in/ou/ex sets respectively. */
	for (msk = 0; msk < 3; msk++) {
		if (ibits[msk] == NULL)
			continue;
		for (i = 0; i < nfd; i += NFDBITS) {
			bits = ibits[msk][i/NFDBITS];
			/* ffs(int mask) not portable, fd_mask is long */
			for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
				if (!(bits & 1))
					continue;
				/* A set bit on a closed descriptor fails. */
				if ((fp = fget_locked(fdp, fd)) == NULL) {
					FILEDESC_UNLOCK(fdp);
					return (EBADF);
				}
				if (fo_poll(fp, flag[msk], fp->f_cred, td)) {
					obits[msk][(fd)/NFDBITS] |=
					    ((fd_mask)1 << ((fd) % NFDBITS));
					n++;
				}
			}
		}
	}
	FILEDESC_UNLOCK(fdp);
	td->td_retval[0] = n;
	return (0);
}
924
/*
 * Poll system call.
 *
 * Copies in the pollfd array, scans it via pollscan(), and sleeps on
 * selwait until an event, timeout, or signal, using the same
 * retry-on-collision protocol as select().
 */
#ifndef _SYS_SYSPROTO_H_
struct poll_args {
	struct pollfd *fds;
	u_int	nfds;
	int	timeout;
};
#endif
/*
 * MPSAFE
 */
int
poll(td, uap)
	struct thread *td;
	struct poll_args *uap;
{
	caddr_t bits;
	char smallbits[32 * sizeof(struct pollfd)];
	struct timeval atv, rtv, ttv;
	int ncoll, error = 0, timo;
	u_int nfds;
	size_t ni;

	nfds = SCARG(uap, nfds);

	mtx_lock(&Giant);
	/*
	 * This is kinda bogus.  We have fd limits, but that is not
	 * really related to the size of the pollfd array.  Make sure
	 * we let the process use at least FD_SETSIZE entries and at
	 * least enough for the current limits.  We want to be reasonably
	 * safe, but not overly restrictive.
	 */
	if ((nfds > td->td_proc->p_rlimit[RLIMIT_NOFILE].rlim_cur) &&
	    (nfds > FD_SETSIZE)) {
		error = EINVAL;
		goto done2;
	}
	/* Use the stack buffer when the array fits, else malloc one. */
	ni = nfds * sizeof(struct pollfd);
	if (ni > sizeof(smallbits))
		bits = malloc(ni, M_TEMP, M_WAITOK);
	else
		bits = smallbits;
	error = copyin(SCARG(uap, fds), bits, ni);
	if (error)
		goto done_nosellock;
	/* Convert a finite timeout to an absolute uptime deadline. */
	if (SCARG(uap, timeout) != INFTIM) {
		atv.tv_sec = SCARG(uap, timeout) / 1000;
		atv.tv_usec = (SCARG(uap, timeout) % 1000) * 1000;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done_nosellock;
		}
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
	mtx_lock(&sellock);
retry:
	ncoll = nselcoll;
	mtx_lock_spin(&sched_lock);
	td->td_flags |= TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

	/* XXX Is there a better place for this? */
	TAILQ_INIT(&td->td_selq);
	error = pollscan(td, (struct pollfd *)bits, nfds);
	mtx_lock(&sellock);
	/* Stop on error or if any descriptor already has revents. */
	if (error || td->td_retval[0])
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=))
			goto done;
		ttv = atv;
		timevalsub(&ttv, &rtv);
		/* Bound a single sleep to 24 hours worth of ticks. */
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz(&ttv);
	}
	/*
	 * An event of interest may occur while we do not hold
	 * sellock, so check TDF_SELECT and the number of collisions
	 * and rescan the file descriptors if necessary.
	 */
	mtx_lock_spin(&sched_lock);
	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
		mtx_unlock_spin(&sched_lock);
		goto retry;
	}
	mtx_unlock_spin(&sched_lock);

	if (timo > 0)
		error = cv_timedwait_sig(&selwait, &sellock, timo);
	else
		error = cv_wait_sig(&selwait, &sellock);

	if (error == 0)
		goto retry;

done:
	clear_selinfo_list(td);
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

done_nosellock:
	/* poll is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	if (error == 0) {
		/* Copy the updated revents back to the user's array. */
		error = copyout(bits, SCARG(uap, fds), ni);
		if (error)
			goto out;
	}
out:
	if (ni > sizeof(smallbits))
		free(bits, M_TEMP);
done2:
	mtx_unlock(&Giant);
	return (error);
}
1055
/*
 * Poll each entry of the pollfd array, filling in its revents field.
 * The number of entries with nonzero revents is returned in
 * td->td_retval[0].
 */
static int
pollscan(td, fds, nfd)
	struct thread *td;
	struct pollfd *fds;
	u_int nfd;
{
	register struct filedesc *fdp = td->td_proc->p_fd;
	int i;
	struct file *fp;
	int n = 0;

	FILEDESC_LOCK(fdp);
	for (i = 0; i < nfd; i++, fds++) {
		/* Out-of-range descriptors report POLLNVAL, not an error. */
		if (fds->fd >= fdp->fd_nfiles) {
			fds->revents = POLLNVAL;
			n++;
		} else if (fds->fd < 0) {
			/* Negative fds are ignored by convention. */
			fds->revents = 0;
		} else {
			fp = fdp->fd_ofiles[fds->fd];
			if (fp == NULL) {
				fds->revents = POLLNVAL;
				n++;
			} else {
				/*
				 * Note: backend also returns POLLHUP and
				 * POLLERR if appropriate.
				 */
				fds->revents = fo_poll(fp, fds->events,
				    fp->f_cred, td);
				if (fds->revents != 0)
					n++;
			}
		}
	}
	FILEDESC_UNLOCK(fdp);
	td->td_retval[0] = n;
	return (0);
}
1095
/*
 * OpenBSD poll system call.
 * XXX this isn't quite a true representation..  OpenBSD uses select ops.
 */
#ifndef _SYS_SYSPROTO_H_
struct openbsd_poll_args {
	struct pollfd *fds;
	u_int	nfds;
	int	timeout;
};
#endif
/*
 * MPSAFE
 */
int
openbsd_poll(td, uap)
	register struct thread *td;
	register struct openbsd_poll_args *uap;
{
	/* The argument layouts match; hand straight off to poll(). */
	return (poll(td, (struct poll_args *)uap));
}
1117
1118/*
1119 * Remove the references to the thread from all of the objects
1120 * we were polling.
1121 *
1122 * This code assumes that the underlying owner of the selinfo
1123 * structure will hold sellock before it changes it, and that
1124 * it will unlink itself from our list if it goes away.
1125 */
1126void
1127clear_selinfo_list(td)
1128	struct thread *td;
1129{
1130	struct selinfo *si;
1131
1132	mtx_assert(&sellock, MA_OWNED);
1133	TAILQ_FOREACH(si, &td->td_selq, si_thrlist)
1134		si->si_thread = NULL;
1135	TAILQ_INIT(&td->td_selq);
1136}
1137
1138/*ARGSUSED*/
1139int
1140seltrue(dev, events, td)
1141	dev_t dev;
1142	int events;
1143	struct thread *td;
1144{
1145
1146	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
1147}
1148
/*
 * Record a select request.
 *
 * Called (via selrecord from fo_poll backends) to associate the
 * selecting thread with a selinfo so selwakeup() can find it later.
 */
void
selrecord(selector, sip)
	struct thread *selector;
	struct selinfo *sip;
{

	mtx_lock(&sellock);
	/*
	 * If the thread is NULL then take ownership of selinfo
	 * however if the thread is not NULL and the thread points to
	 * someone else, then we have a collision, otherwise leave it alone
	 * as we've owned it in a previous selrecord on this selinfo.
	 */
	if (sip->si_thread == NULL) {
		sip->si_thread = selector;
		TAILQ_INSERT_TAIL(&selector->td_selq, sip, si_thrlist);
	} else if (sip->si_thread != selector) {
		/* SI_COLL makes selwakeup() broadcast to all waiters. */
		sip->si_flags |= SI_COLL;
	}

	mtx_unlock(&sellock);
}
1174
/*
 * Do a wakeup when a selectable event occurs.
 */
void
selwakeup(sip)
	struct selinfo *sip;
{
	struct thread *td;

	mtx_lock(&sellock);
	td = sip->si_thread;
	/*
	 * A collision means more than one thread selected on this
	 * object: broadcast so every waiter rescans.
	 */
	if ((sip->si_flags & SI_COLL) != 0) {
		nselcoll++;
		sip->si_flags &= ~SI_COLL;
		cv_broadcast(&selwait);
	}
	if (td == NULL) {
		mtx_unlock(&sellock);
		return;
	}
	/* Detach the selinfo from the owning thread's list. */
	TAILQ_REMOVE(&td->td_selq, sip, si_thrlist);
	sip->si_thread = NULL;
	mtx_lock_spin(&sched_lock);
	/*
	 * If the thread is asleep on selwait, wake it; otherwise clear
	 * TDF_SELECT so its rescan check fires instead of sleeping.
	 */
	if (td->td_wchan == (caddr_t)&selwait) {
		if (td->td_proc->p_stat == SSLEEP)
			setrunnable(td);
		else
			cv_waitq_remove(td);
	} else
		td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);
}
1208
static void selectinit __P((void *));
SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, selectinit, NULL)

/* ARGSUSED*/
/* Boot-time initialization of the global select/poll lock and cv. */
static void
selectinit(dummy)
	void *dummy;
{
	cv_init(&selwait, "select");
	mtx_init(&sellock, "sellck", MTX_DEF);
}
1220