/* sys_generic.c revision 114216 */
1/*
2 * Copyright (c) 1982, 1986, 1989, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 *    must display the following acknowledgement:
20 *	This product includes software developed by the University of
21 *	California, Berkeley and its contributors.
22 * 4. Neither the name of the University nor the names of its contributors
23 *    may be used to endorse or promote products derived from this software
24 *    without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * SUCH DAMAGE.
37 *
38 *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
39 * $FreeBSD: head/sys/kern/sys_generic.c 114216 2003-04-29 13:36:06Z kan $
40 */
41
42#include "opt_ktrace.h"
43
44#include <sys/param.h>
45#include <sys/systm.h>
46#include <sys/sysproto.h>
47#include <sys/filedesc.h>
48#include <sys/filio.h>
49#include <sys/fcntl.h>
50#include <sys/file.h>
51#include <sys/proc.h>
52#include <sys/signalvar.h>
53#include <sys/socketvar.h>
54#include <sys/uio.h>
55#include <sys/kernel.h>
56#include <sys/limits.h>
57#include <sys/malloc.h>
58#include <sys/poll.h>
59#include <sys/resourcevar.h>
60#include <sys/selinfo.h>
61#include <sys/syscallsubr.h>
62#include <sys/sysctl.h>
63#include <sys/sysent.h>
64#include <sys/bio.h>
65#include <sys/buf.h>
66#include <sys/condvar.h>
67#ifdef KTRACE
68#include <sys/ktrace.h>
69#endif
70#include <vm/vm.h>
71#include <vm/vm_page.h>
72
73static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
74static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
75MALLOC_DEFINE(M_IOV, "iov", "large iov's");
76
77static int	pollscan(struct thread *, struct pollfd *, u_int);
78static int	selscan(struct thread *, fd_mask **, fd_mask **, int);
79static int	dofileread(struct thread *, struct file *, int, void *,
80		    size_t, off_t, int);
81static int	dofilewrite(struct thread *, struct file *, int,
82		    const void *, size_t, off_t, int);
83
84/*
85 * Read system call.
86 */
87#ifndef _SYS_SYSPROTO_H_
88struct read_args {
89	int	fd;
90	void	*buf;
91	size_t	nbyte;
92};
93#endif
94/*
95 * MPSAFE
96 */
97int
98read(td, uap)
99	struct thread *td;
100	struct read_args *uap;
101{
102	struct file *fp;
103	int error;
104
105	if ((error = fget_read(td, uap->fd, &fp)) == 0) {
106		error = dofileread(td, fp, uap->fd, uap->buf,
107			    uap->nbyte, (off_t)-1, 0);
108		fdrop(fp, td);
109	}
110	return(error);
111}
112
113/*
114 * Pread system call
115 */
116#ifndef _SYS_SYSPROTO_H_
117struct pread_args {
118	int	fd;
119	void	*buf;
120	size_t	nbyte;
121	int	pad;
122	off_t	offset;
123};
124#endif
125/*
126 * MPSAFE
127 */
128int
129pread(td, uap)
130	struct thread *td;
131	struct pread_args *uap;
132{
133	struct file *fp;
134	int error;
135
136	if ((error = fget_read(td, uap->fd, &fp)) != 0)
137		return (error);
138	if (fp->f_type != DTYPE_VNODE) {
139		error = ESPIPE;
140	} else {
141		error = dofileread(td, fp, uap->fd, uap->buf, uap->nbyte,
142			    uap->offset, FOF_OFFSET);
143	}
144	fdrop(fp, td);
145	return(error);
146}
147
/*
 * Code common for read and pread.
 *
 * Builds a one-segment uio over the user buffer and passes it to the
 * file's fo_read method.  With FOF_OFFSET in flags the explicit offset
 * is used; otherwise the descriptor's current offset applies.  The
 * byte count actually transferred is returned in td->td_retval[0].
 */
static int
dofileread(td, fp, fd, buf, nbyte, offset, flags)
	struct thread *td;
	struct file *fp;
	int fd, flags;
	void *buf;
	size_t nbyte;
	off_t offset;
{
	struct uio auio;
	struct iovec aiov;
	long cnt, error = 0;
#ifdef KTRACE
	struct iovec ktriov;
	struct uio ktruio;
	int didktr = 0;
#endif

	aiov.iov_base = buf;
	aiov.iov_len = nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = offset;
	/* A single transfer is limited to INT_MAX bytes. */
	if (nbyte > INT_MAX)
		return (EINVAL);
	auio.uio_resid = nbyte;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;
#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec (fo_read consumes auio/aiov)
	 */
	if (KTRPOINT(td, KTR_GENIO)) {
		ktriov = aiov;
		ktruio = auio;
		didktr = 1;
	}
#endif
	cnt = nbyte;

	if ((error = fo_read(fp, &auio, td->td_ucred, flags, td))) {
		/*
		 * If some bytes moved before the interruption, report
		 * success for the partial transfer.
		 */
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	}
	/* Bytes transferred = requested - residual. */
	cnt -= auio.uio_resid;
#ifdef KTRACE
	if (didktr && error == 0) {
		ktruio.uio_iov = &ktriov;
		ktruio.uio_resid = cnt;
		ktrgenio(fd, UIO_READ, &ktruio, error);
	}
#endif
	td->td_retval[0] = cnt;
	return (error);
}
208
209/*
210 * Scatter read system call.
211 */
212#ifndef _SYS_SYSPROTO_H_
213struct readv_args {
214	int	fd;
215	struct	iovec *iovp;
216	u_int	iovcnt;
217};
218#endif
/*
 * MPSAFE
 *
 * Scatter read: copies the iovec array in from userspace (heap
 * allocating when it exceeds UIO_SMALLIOV entries), validates that the
 * total length fits in INT_MAX, and issues one fo_read.  Bytes read
 * are returned in td->td_retval[0].
 */
int
readv(td, uap)
	struct thread *td;
	struct readv_args *uap;
{
	struct file *fp;
	struct uio auio;
	struct iovec *iov;
	struct iovec *needfree;
	struct iovec aiov[UIO_SMALLIOV];
	long i, cnt;
	int error;
	u_int iovlen;
#ifdef KTRACE
	struct iovec *ktriov = NULL;
	struct uio ktruio;
#endif

	if ((error = fget_read(td, uap->fd, &fp)) != 0)
		return (error);
	needfree = NULL;
	/* note: can't use iovlen until iovcnt is validated */
	iovlen = uap->iovcnt * sizeof (struct iovec);
	if (uap->iovcnt > UIO_SMALLIOV) {
		if (uap->iovcnt > UIO_MAXIOV) {
			error = EINVAL;
			goto done;
		}
		MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
		needfree = iov;
	} else
		iov = aiov;	/* small request: use the stack array */
	auio.uio_iov = iov;
	auio.uio_iovcnt = uap->iovcnt;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;
	auio.uio_offset = -1;
	if ((error = copyin(uap->iovp, iov, iovlen)))
		goto done;
	auio.uio_resid = 0;
	/* Sum segment lengths, rejecting totals that overflow INT_MAX. */
	for (i = 0; i < uap->iovcnt; i++) {
		if (iov->iov_len > INT_MAX - auio.uio_resid) {
			error = EINVAL;
			goto done;
		}
		auio.uio_resid += iov->iov_len;
		iov++;
	}
#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec
	 */
	if (KTRPOINT(td, KTR_GENIO))  {
		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
		bcopy(auio.uio_iov, ktriov, iovlen);
		ktruio = auio;
	}
#endif
	cnt = auio.uio_resid;
	if ((error = fo_read(fp, &auio, td->td_ucred, 0, td))) {
		/* Partial transfers interrupted by a signal succeed. */
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	}
	cnt -= auio.uio_resid;
#ifdef KTRACE
	if (ktriov != NULL) {
		if (error == 0) {
			ktruio.uio_iov = ktriov;
			ktruio.uio_resid = cnt;
			ktrgenio(uap->fd, UIO_READ, &ktruio, error);
		}
		FREE(ktriov, M_TEMP);
	}
#endif
	td->td_retval[0] = cnt;
done:
	fdrop(fp, td);
	if (needfree)
		FREE(needfree, M_IOV);
	return (error);
}
305
306/*
307 * Write system call
308 */
309#ifndef _SYS_SYSPROTO_H_
310struct write_args {
311	int	fd;
312	const void *buf;
313	size_t	nbyte;
314};
315#endif
316/*
317 * MPSAFE
318 */
319int
320write(td, uap)
321	struct thread *td;
322	struct write_args *uap;
323{
324	struct file *fp;
325	int error;
326
327	if ((error = fget_write(td, uap->fd, &fp)) == 0) {
328		error = dofilewrite(td, fp, uap->fd, uap->buf, uap->nbyte,
329			    (off_t)-1, 0);
330		fdrop(fp, td);
331	} else {
332		error = EBADF;	/* XXX this can't be right */
333	}
334	return(error);
335}
336
337/*
338 * Pwrite system call
339 */
340#ifndef _SYS_SYSPROTO_H_
341struct pwrite_args {
342	int	fd;
343	const void *buf;
344	size_t	nbyte;
345	int	pad;
346	off_t	offset;
347};
348#endif
349/*
350 * MPSAFE
351 */
352int
353pwrite(td, uap)
354	struct thread *td;
355	struct pwrite_args *uap;
356{
357	struct file *fp;
358	int error;
359
360	if ((error = fget_write(td, uap->fd, &fp)) == 0) {
361		if (fp->f_type == DTYPE_VNODE) {
362			error = dofilewrite(td, fp, uap->fd, uap->buf,
363				    uap->nbyte, uap->offset, FOF_OFFSET);
364		} else {
365			error = ESPIPE;
366		}
367		fdrop(fp, td);
368	} else {
369		error = EBADF;	/* this can't be right */
370	}
371	return(error);
372}
373
/*
 * Code common for write and pwrite.
 *
 * Builds a one-segment uio over the user buffer and passes it to the
 * file's fo_write method.  With FOF_OFFSET in flags the explicit
 * offset is used; otherwise the descriptor's current offset applies.
 * Delivers SIGPIPE on EPIPE for non-socket files (the socket layer
 * raises its own).  Bytes written are returned in td->td_retval[0].
 */
static int
dofilewrite(td, fp, fd, buf, nbyte, offset, flags)
	struct thread *td;
	struct file *fp;
	int fd, flags;
	const void *buf;
	size_t nbyte;
	off_t offset;
{
	struct uio auio;
	struct iovec aiov;
	long cnt, error = 0;
#ifdef KTRACE
	struct iovec ktriov;
	struct uio ktruio;
	int didktr = 0;
#endif

	/* Cast away const: iov_base is non-const but UIO_WRITE only reads. */
	aiov.iov_base = (void *)(uintptr_t)buf;
	aiov.iov_len = nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = offset;
	/* A single transfer is limited to INT_MAX bytes. */
	if (nbyte > INT_MAX)
		return (EINVAL);
	auio.uio_resid = nbyte;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;
#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec and uio
	 */
	if (KTRPOINT(td, KTR_GENIO)) {
		ktriov = aiov;
		ktruio = auio;
		didktr = 1;
	}
#endif
	cnt = nbyte;
	/* Hint the buffer cache that a vnode write is coming. */
	if (fp->f_type == DTYPE_VNODE)
		bwillwrite();
	if ((error = fo_write(fp, &auio, td->td_ucred, flags, td))) {
		/* Partial transfers interrupted by a signal succeed. */
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		/* Socket layer is responsible for issuing SIGPIPE. */
		if (error == EPIPE && fp->f_type != DTYPE_SOCKET) {
			PROC_LOCK(td->td_proc);
			psignal(td->td_proc, SIGPIPE);
			PROC_UNLOCK(td->td_proc);
		}
	}
	/* Bytes transferred = requested - residual. */
	cnt -= auio.uio_resid;
#ifdef KTRACE
	if (didktr && error == 0) {
		ktruio.uio_iov = &ktriov;
		ktruio.uio_resid = cnt;
		ktrgenio(fd, UIO_WRITE, &ktruio, error);
	}
#endif
	td->td_retval[0] = cnt;
	return (error);
}
438
439/*
440 * Gather write system call
441 */
442#ifndef _SYS_SYSPROTO_H_
443struct writev_args {
444	int	fd;
445	struct	iovec *iovp;
446	u_int	iovcnt;
447};
448#endif
449/*
450 * MPSAFE
451 */
452int
453writev(td, uap)
454	struct thread *td;
455	register struct writev_args *uap;
456{
457	struct file *fp;
458	struct uio auio;
459	register struct iovec *iov;
460	struct iovec *needfree;
461	struct iovec aiov[UIO_SMALLIOV];
462	long i, cnt, error = 0;
463	u_int iovlen;
464#ifdef KTRACE
465	struct iovec *ktriov = NULL;
466	struct uio ktruio;
467#endif
468
469	mtx_lock(&Giant);
470	if ((error = fget_write(td, uap->fd, &fp)) != 0) {
471		error = EBADF;
472		goto done2;
473	}
474	/* note: can't use iovlen until iovcnt is validated */
475	iovlen = uap->iovcnt * sizeof (struct iovec);
476	if (uap->iovcnt > UIO_SMALLIOV) {
477		if (uap->iovcnt > UIO_MAXIOV) {
478			needfree = NULL;
479			error = EINVAL;
480			goto done;
481		}
482		MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
483		needfree = iov;
484	} else {
485		iov = aiov;
486		needfree = NULL;
487	}
488	auio.uio_iov = iov;
489	auio.uio_iovcnt = uap->iovcnt;
490	auio.uio_rw = UIO_WRITE;
491	auio.uio_segflg = UIO_USERSPACE;
492	auio.uio_td = td;
493	auio.uio_offset = -1;
494	if ((error = copyin(uap->iovp, iov, iovlen)))
495		goto done;
496	auio.uio_resid = 0;
497	for (i = 0; i < uap->iovcnt; i++) {
498		if (iov->iov_len > INT_MAX - auio.uio_resid) {
499			error = EINVAL;
500			goto done;
501		}
502		auio.uio_resid += iov->iov_len;
503		iov++;
504	}
505#ifdef KTRACE
506	/*
507	 * if tracing, save a copy of iovec and uio
508	 */
509	if (KTRPOINT(td, KTR_GENIO))  {
510		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
511		bcopy(auio.uio_iov, ktriov, iovlen);
512		ktruio = auio;
513	}
514#endif
515	cnt = auio.uio_resid;
516	if (fp->f_type == DTYPE_VNODE)
517		bwillwrite();
518	if ((error = fo_write(fp, &auio, td->td_ucred, 0, td))) {
519		if (auio.uio_resid != cnt && (error == ERESTART ||
520		    error == EINTR || error == EWOULDBLOCK))
521			error = 0;
522		if (error == EPIPE) {
523			PROC_LOCK(td->td_proc);
524			psignal(td->td_proc, SIGPIPE);
525			PROC_UNLOCK(td->td_proc);
526		}
527	}
528	cnt -= auio.uio_resid;
529#ifdef KTRACE
530	if (ktriov != NULL) {
531		if (error == 0) {
532			ktruio.uio_iov = ktriov;
533			ktruio.uio_resid = cnt;
534			ktrgenio(uap->fd, UIO_WRITE, &ktruio, error);
535		}
536		FREE(ktriov, M_TEMP);
537	}
538#endif
539	td->td_retval[0] = cnt;
540done:
541	fdrop(fp, td);
542	if (needfree)
543		FREE(needfree, M_IOV);
544done2:
545	mtx_unlock(&Giant);
546	return (error);
547}
548
549/*
550 * Ioctl system call
551 */
552#ifndef _SYS_SYSPROTO_H_
553struct ioctl_args {
554	int	fd;
555	u_long	com;
556	caddr_t	data;
557};
558#endif
559/*
560 * MPSAFE
561 */
/*
 * Ioctl: decode the size/direction bits of the command word, stage the
 * argument in a stack (or heap, if > STK_PARAMS bytes) buffer, and
 * dispatch to the file's fo_ioctl method.  FIONCLEX/FIOCLEX and the
 * FIONBIO/FIOASYNC flag updates are handled here directly.
 */
/* ARGSUSED */
int
ioctl(td, uap)
	struct thread *td;
	register struct ioctl_args *uap;
{
	struct file *fp;
	register struct filedesc *fdp;
	register u_long com;
	int error = 0;
	register u_int size;
	caddr_t data, memp;
	int tmp;
#define STK_PARAMS	128
	union {
	    char stkbuf[STK_PARAMS];
	    long align;	/* force alignment of stkbuf */
	} ubuf;

	if ((error = fget(td, uap->fd, &fp)) != 0)
		return (error);
	mtx_lock(&Giant);
	/* The descriptor must be open for reading or writing. */
	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
		fdrop(fp, td);
		mtx_unlock(&Giant);
		return (EBADF);
	}
	fdp = td->td_proc->p_fd;
	/* Close-on-exec operations only touch per-descriptor flags. */
	switch (com = uap->com) {
	case FIONCLEX:
		FILEDESC_LOCK(fdp);
		fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE;
		FILEDESC_UNLOCK(fdp);
		fdrop(fp, td);
		mtx_unlock(&Giant);
		return (0);
	case FIOCLEX:
		FILEDESC_LOCK(fdp);
		fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE;
		FILEDESC_UNLOCK(fdp);
		fdrop(fp, td);
		mtx_unlock(&Giant);
		return (0);
	}

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
	size = IOCPARM_LEN(com);
	if (size > IOCPARM_MAX) {
		fdrop(fp, td);
		mtx_unlock(&Giant);
		return (ENOTTY);
	}

	memp = NULL;
	if (size > sizeof (ubuf.stkbuf)) {
		memp = malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
		data = memp;
	} else {
		data = ubuf.stkbuf;
	}
	if (com&IOC_IN) {
		if (size) {
			error = copyin(uap->data, data, (u_int)size);
			if (error) {
				if (memp)
					free(memp, M_IOCTLOPS);
				fdrop(fp, td);
				goto done;
			}
		} else {
			/* Zero-size IOC_IN: pass the pointer itself. */
			*(caddr_t *)data = uap->data;
		}
	} else if ((com&IOC_OUT) && size) {
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, size);
	} else if (com&IOC_VOID) {
		*(caddr_t *)data = uap->data;
	}

	switch (com) {

	case FIONBIO:
		/* Keep f_flag in sync, then notify the backend. */
		FILE_LOCK(fp);
		if ((tmp = *(int *)data))
			fp->f_flag |= FNONBLOCK;
		else
			fp->f_flag &= ~FNONBLOCK;
		FILE_UNLOCK(fp);
		error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
		break;

	case FIOASYNC:
		FILE_LOCK(fp);
		if ((tmp = *(int *)data))
			fp->f_flag |= FASYNC;
		else
			fp->f_flag &= ~FASYNC;
		FILE_UNLOCK(fp);
		error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td);
		break;

	default:
		error = fo_ioctl(fp, com, data, td->td_ucred, td);
		/*
		 * Copy any data to user, size was
		 * already set and checked above.
		 */
		if (error == 0 && (com&IOC_OUT) && size)
			error = copyout(data, uap->data, (u_int)size);
		break;
	}
	if (memp)
		free(memp, M_IOCTLOPS);
	fdrop(fp, td);
done:
	mtx_unlock(&Giant);
	return (error);
}
686
687/*
688 * sellock and selwait are initialized in selectinit() via SYSINIT.
689 */
690struct mtx	sellock;
691struct cv	selwait;
692u_int		nselcoll;	/* Select collisions since boot */
693SYSCTL_UINT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");
694
695/*
696 * Select system call.
697 */
698#ifndef _SYS_SYSPROTO_H_
699struct select_args {
700	int	nd;
701	fd_set	*in, *ou, *ex;
702	struct	timeval *tv;
703};
704#endif
705/*
706 * MPSAFE
707 */
708int
709select(td, uap)
710	register struct thread *td;
711	register struct select_args *uap;
712{
713	struct timeval tv, *tvp;
714	int error;
715
716	if (uap->tv != NULL) {
717		error = copyin(uap->tv, &tv, sizeof(tv));
718		if (error)
719			return (error);
720		tvp = &tv;
721	} else
722		tvp = NULL;
723
724	return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp));
725}
726
/*
 * Guts of select(): copy in the requested fd_sets, scan the
 * descriptors via selscan(), and if nothing is ready sleep on the
 * selwait condvar until an event, timeout, or signal.  A collision
 * counter (nselcoll) plus the TDF_SELECT flag detect events that fire
 * while sellock is dropped, forcing a rescan.  Ready bits are copied
 * back out; the ready count is in td->td_retval[0].
 */
int
kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou,
    fd_set *fd_ex, struct timeval *tvp)
{
	struct filedesc *fdp;
	/*
	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
	 * infds with the new FD_SETSIZE of 1024, and more than enough for
	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
	 * of 256.
	 */
	fd_mask s_selbits[howmany(2048, NFDBITS)];
	fd_mask *ibits[3], *obits[3], *selbits, *sbp;
	struct timeval atv, rtv, ttv;
	int error, timo;
	u_int ncoll, nbufbytes, ncpbytes, nfdbits;

	if (nd < 0)
		return (EINVAL);
	fdp = td->td_proc->p_fd;
	mtx_lock(&Giant);
	FILEDESC_LOCK(fdp);

	if (nd > td->td_proc->p_fd->fd_nfiles)
		nd = td->td_proc->p_fd->fd_nfiles;   /* forgiving; slightly wrong */
	FILEDESC_UNLOCK(fdp);

	/*
	 * Allocate just enough bits for the non-null fd_sets.  Use the
	 * preallocated auto buffer if possible.
	 */
	nfdbits = roundup(nd, NFDBITS);
	ncpbytes = nfdbits / NBBY;
	nbufbytes = 0;
	if (fd_in != NULL)
		nbufbytes += 2 * ncpbytes;
	if (fd_ou != NULL)
		nbufbytes += 2 * ncpbytes;
	if (fd_ex != NULL)
		nbufbytes += 2 * ncpbytes;
	if (nbufbytes <= sizeof s_selbits)
		selbits = &s_selbits[0];
	else
		selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);

	/*
	 * Assign pointers into the bit buffers and fetch the input bits.
	 * Put the output buffers together so that they can be bzeroed
	 * together.
	 */
	sbp = selbits;
#define	getbits(name, x) \
	do {								\
		if (name == NULL)					\
			ibits[x] = NULL;				\
		else {							\
			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;	\
			obits[x] = sbp;					\
			sbp += ncpbytes / sizeof *sbp;			\
			error = copyin(name, ibits[x], ncpbytes);	\
			if (error != 0)					\
				goto done_nosellock;			\
		}							\
	} while (0)
	getbits(fd_in, 0);
	getbits(fd_ou, 1);
	getbits(fd_ex, 2);
#undef	getbits
	/* Output halves live in the first nbufbytes/2; clear them. */
	if (nbufbytes != 0)
		bzero(selbits, nbufbytes / 2);

	if (tvp != NULL) {
		atv = *tvp;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done_nosellock;
		}
		/* Convert the relative timeout to an absolute deadline. */
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
	TAILQ_INIT(&td->td_selq);
	mtx_lock(&sellock);
retry:
	ncoll = nselcoll;
	mtx_lock_spin(&sched_lock);
	td->td_flags |= TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

	error = selscan(td, ibits, obits, nd);
	mtx_lock(&sellock);
	if (error || td->td_retval[0])
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=))
			goto done;
		ttv = atv;
		timevalsub(&ttv, &rtv);
		/* Clamp to one day to keep tvtohz() in range. */
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz(&ttv);
	}

	/*
	 * An event of interest may occur while we do not hold
	 * sellock, so check TDF_SELECT and the number of
	 * collisions and rescan the file descriptors if
	 * necessary.
	 */
	mtx_lock_spin(&sched_lock);
	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
		mtx_unlock_spin(&sched_lock);
		goto retry;
	}
	mtx_unlock_spin(&sched_lock);

	if (timo > 0)
		error = cv_timedwait_sig(&selwait, &sellock, timo);
	else
		error = cv_wait_sig(&selwait, &sellock);

	if (error == 0)
		goto retry;

done:
	clear_selinfo_list(td);
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

done_nosellock:
	/* select is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
#define	putbits(name, x) \
	if (name && (error2 = copyout(obits[x], name, ncpbytes))) \
		error = error2;
	if (error == 0) {
		int error2;

		putbits(fd_in, 0);
		putbits(fd_ou, 1);
		putbits(fd_ex, 2);
#undef putbits
	}
	if (selbits != &s_selbits[0])
		free(selbits, M_SELECT);

	mtx_unlock(&Giant);
	return (error);
}
885
/*
 * Scan every descriptor whose bit is set in the three input masks
 * (read/write/except) and set the corresponding output bit for each
 * descriptor that fo_poll() reports ready.  Runs with the filedesc
 * lock held; the ready count goes in td->td_retval[0].
 */
static int
selscan(td, ibits, obits, nfd)
	struct thread *td;
	fd_mask **ibits, **obits;
	int nfd;
{
	int msk, i, fd;
	fd_mask bits;
	struct file *fp;
	int n = 0;
	/* Note: backend also returns POLLHUP/POLLERR if appropriate. */
	static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
	struct filedesc *fdp = td->td_proc->p_fd;

	FILEDESC_LOCK(fdp);
	for (msk = 0; msk < 3; msk++) {
		if (ibits[msk] == NULL)
			continue;
		for (i = 0; i < nfd; i += NFDBITS) {
			bits = ibits[msk][i/NFDBITS];
			/* ffs(int mask) not portable, fd_mask is long */
			for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
				if (!(bits & 1))
					continue;
				/* A set bit for a closed descriptor is EBADF. */
				if ((fp = fget_locked(fdp, fd)) == NULL) {
					FILEDESC_UNLOCK(fdp);
					return (EBADF);
				}
				if (fo_poll(fp, flag[msk], td->td_ucred,
				    td)) {
					obits[msk][(fd)/NFDBITS] |=
					    ((fd_mask)1 << ((fd) % NFDBITS));
					n++;
				}
			}
		}
	}
	FILEDESC_UNLOCK(fdp);
	td->td_retval[0] = n;
	return (0);
}
927
928/*
929 * Poll system call.
930 */
931#ifndef _SYS_SYSPROTO_H_
932struct poll_args {
933	struct pollfd *fds;
934	u_int	nfds;
935	int	timeout;
936};
937#endif
/*
 * MPSAFE
 *
 * Poll system call: copy the pollfd array in, scan it with pollscan(),
 * and if nothing is ready sleep on selwait until an event, timeout, or
 * signal.  Uses the same nselcoll/TDF_SELECT collision-detection retry
 * protocol as kern_select().  revents are copied back to userspace and
 * the ready count is in td->td_retval[0].
 */
int
poll(td, uap)
	struct thread *td;
	struct poll_args *uap;
{
	caddr_t bits;
	char smallbits[32 * sizeof(struct pollfd)];
	struct timeval atv, rtv, ttv;
	int error = 0, timo;
	u_int ncoll, nfds;
	size_t ni;

	nfds = uap->nfds;

	mtx_lock(&Giant);
	/*
	 * This is kinda bogus.  We have fd limits, but that is not
	 * really related to the size of the pollfd array.  Make sure
	 * we let the process use at least FD_SETSIZE entries and at
	 * least enough for the current limits.  We want to be reasonably
	 * safe, but not overly restrictive.
	 */
	if ((nfds > td->td_proc->p_rlimit[RLIMIT_NOFILE].rlim_cur) &&
	    (nfds > FD_SETSIZE)) {
		error = EINVAL;
		goto done2;
	}
	ni = nfds * sizeof(struct pollfd);
	/* Small arrays use the on-stack buffer; larger ones malloc. */
	if (ni > sizeof(smallbits))
		bits = malloc(ni, M_TEMP, M_WAITOK);
	else
		bits = smallbits;
	error = copyin(uap->fds, bits, ni);
	if (error)
		goto done_nosellock;
	if (uap->timeout != INFTIM) {
		/* Convert the millisecond timeout to an absolute deadline. */
		atv.tv_sec = uap->timeout / 1000;
		atv.tv_usec = (uap->timeout % 1000) * 1000;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done_nosellock;
		}
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
	TAILQ_INIT(&td->td_selq);
	mtx_lock(&sellock);
retry:
	ncoll = nselcoll;
	mtx_lock_spin(&sched_lock);
	td->td_flags |= TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

	error = pollscan(td, (struct pollfd *)bits, nfds);
	mtx_lock(&sellock);
	if (error || td->td_retval[0])
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=))
			goto done;
		ttv = atv;
		timevalsub(&ttv, &rtv);
		/* Clamp to one day to keep tvtohz() in range. */
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz(&ttv);
	}
	/*
	 * An event of interest may occur while we do not hold
	 * sellock, so check TDF_SELECT and the number of collisions
	 * and rescan the file descriptors if necessary.
	 */
	mtx_lock_spin(&sched_lock);
	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
		mtx_unlock_spin(&sched_lock);
		goto retry;
	}
	mtx_unlock_spin(&sched_lock);

	if (timo > 0)
		error = cv_timedwait_sig(&selwait, &sellock, timo);
	else
		error = cv_wait_sig(&selwait, &sellock);

	if (error == 0)
		goto retry;

done:
	clear_selinfo_list(td);
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

done_nosellock:
	/* poll is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	if (error == 0) {
		/* Copy the updated revents back to the user array. */
		error = copyout(bits, uap->fds, ni);
		if (error)
			goto out;
	}
out:
	if (ni > sizeof(smallbits))
		free(bits, M_TEMP);
done2:
	mtx_unlock(&Giant);
	return (error);
}
1057
/*
 * Fill in revents for each pollfd entry: POLLNVAL for out-of-range or
 * closed descriptors, 0 for negative fds (ignored per POSIX), and
 * otherwise whatever fo_poll() reports.  Runs with the filedesc lock
 * held; the count of entries with nonzero revents goes in
 * td->td_retval[0].
 */
static int
pollscan(td, fds, nfd)
	struct thread *td;
	struct pollfd *fds;
	u_int nfd;
{
	register struct filedesc *fdp = td->td_proc->p_fd;
	int i;
	struct file *fp;
	int n = 0;

	FILEDESC_LOCK(fdp);
	for (i = 0; i < nfd; i++, fds++) {
		if (fds->fd >= fdp->fd_nfiles) {
			fds->revents = POLLNVAL;
			n++;
		} else if (fds->fd < 0) {
			/* Negative fds are skipped, not errors. */
			fds->revents = 0;
		} else {
			fp = fdp->fd_ofiles[fds->fd];
			if (fp == NULL) {
				fds->revents = POLLNVAL;
				n++;
			} else {
				/*
				 * Note: backend also returns POLLHUP and
				 * POLLERR if appropriate.
				 */
				fds->revents = fo_poll(fp, fds->events,
				    td->td_ucred, td);
				if (fds->revents != 0)
					n++;
			}
		}
	}
	FILEDESC_UNLOCK(fdp);
	td->td_retval[0] = n;
	return (0);
}
1097
1098/*
1099 * OpenBSD poll system call.
1100 * XXX this isn't quite a true representation..  OpenBSD uses select ops.
1101 */
1102#ifndef _SYS_SYSPROTO_H_
1103struct openbsd_poll_args {
1104	struct pollfd *fds;
1105	u_int	nfds;
1106	int	timeout;
1107};
1108#endif
/*
 * MPSAFE
 *
 * OpenBSD-compatibility entry point; the argument structs are
 * layout-identical, so simply forward to the native poll().
 */
int
openbsd_poll(td, uap)
	register struct thread *td;
	register struct openbsd_poll_args *uap;
{
	return (poll(td, (struct poll_args *)uap));
}
1119
1120/*
1121 * Remove the references to the thread from all of the objects
1122 * we were polling.
1123 *
1124 * This code assumes that the underlying owner of the selinfo
1125 * structure will hold sellock before it changes it, and that
1126 * it will unlink itself from our list if it goes away.
1127 */
1128void
1129clear_selinfo_list(td)
1130	struct thread *td;
1131{
1132	struct selinfo *si;
1133
1134	mtx_assert(&sellock, MA_OWNED);
1135	TAILQ_FOREACH(si, &td->td_selq, si_thrlist)
1136		si->si_thread = NULL;
1137	TAILQ_INIT(&td->td_selq);
1138}
1139
1140/*ARGSUSED*/
1141int
1142seltrue(dev, events, td)
1143	dev_t dev;
1144	int events;
1145	struct thread *td;
1146{
1147
1148	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
1149}
1150
/*
 * Record a select request: called from a driver's fo_poll when the
 * condition is not yet ready, so the thread can be woken via
 * selwakeup() later.
 */
void
selrecord(selector, sip)
	struct thread *selector;
	struct selinfo *sip;
{

	mtx_lock(&sellock);
	/*
	 * If the selinfo's thread pointer is NULL then take ownership of it.
	 *
	 * If the thread pointer is not NULL and it points to another
	 * thread, then we have a collision.
	 *
	 * If the thread pointer is not NULL and points back to us then
	 * leave it alone: it already points at us and is already on our
	 * list.
	 */
	if (sip->si_thread == NULL) {
		sip->si_thread = selector;
		TAILQ_INSERT_TAIL(&selector->td_selq, sip, si_thrlist);
	} else if (sip->si_thread != selector) {
		/* Collision: selwakeup() will broadcast to all waiters. */
		sip->si_flags |= SI_COLL;
	}

	mtx_unlock(&sellock);
}
1180
/*
 * Do a wakeup when a selectable event occurs.  On collision (several
 * threads selecting on the same object) everyone is broadcast awake;
 * otherwise only the recorded owner thread is made runnable or has
 * its TDF_SELECT flag cleared so it rescans.
 */
void
selwakeup(sip)
	struct selinfo *sip;
{
	struct thread *td;

	mtx_lock(&sellock);
	td = sip->si_thread;
	if ((sip->si_flags & SI_COLL) != 0) {
		/* Bump the collision count so in-flight scans retry. */
		nselcoll++;
		sip->si_flags &= ~SI_COLL;
		cv_broadcast(&selwait);
	}
	if (td == NULL) {
		mtx_unlock(&sellock);
		return;
	}
	/* Disassociate the selinfo from the owning thread. */
	TAILQ_REMOVE(&td->td_selq, sip, si_thrlist);
	sip->si_thread = NULL;
	mtx_lock_spin(&sched_lock);
	if (td->td_wchan == &selwait) {
		/* Thread is asleep on selwait: wake it directly. */
		cv_waitq_remove(td);
		TD_CLR_SLEEPING(td);
		setrunnable(td);
	} else
		td->td_flags &= ~TDF_SELECT;	/* force a rescan */
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);
}
1213
1214static void selectinit(void *);
1215SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, selectinit, NULL)
1216
/*
 * One-time initialization of the global select machinery (selwait
 * condvar and sellock mutex), run at SI_SUB_LOCK via SYSINIT.
 */
/* ARGSUSED*/
static void
selectinit(dummy)
	void *dummy;
{
	cv_init(&selwait, "select");
	mtx_init(&sellock, "sellck", NULL, MTX_DEF);
}
1225