sys_generic.c revision 103216
1/*
2 * Copyright (c) 1982, 1986, 1989, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 *    must display the following acknowledgement:
20 *	This product includes software developed by the University of
21 *	California, Berkeley and its contributors.
22 * 4. Neither the name of the University nor the names of its contributors
23 *    may be used to endorse or promote products derived from this software
24 *    without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * SUCH DAMAGE.
37 *
38 *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
39 * $FreeBSD: head/sys/kern/sys_generic.c 103216 2002-09-11 08:13:56Z julian $
40 */
41
42#include "opt_ktrace.h"
43
44#include <sys/param.h>
45#include <sys/systm.h>
46#include <sys/sysproto.h>
47#include <sys/filedesc.h>
48#include <sys/filio.h>
49#include <sys/fcntl.h>
50#include <sys/file.h>
51#include <sys/proc.h>
52#include <sys/signalvar.h>
53#include <sys/socketvar.h>
54#include <sys/uio.h>
55#include <sys/kernel.h>
56#include <sys/malloc.h>
57#include <sys/poll.h>
58#include <sys/resourcevar.h>
59#include <sys/selinfo.h>
60#include <sys/syscallsubr.h>
61#include <sys/sysctl.h>
62#include <sys/sysent.h>
63#include <sys/bio.h>
64#include <sys/buf.h>
65#include <sys/condvar.h>
66#ifdef __alpha__
67#include <sys/disklabel.h>
68#endif
69#ifdef KTRACE
70#include <sys/ktrace.h>
71#endif
72#include <vm/vm.h>
73#include <vm/vm_page.h>
74
75#include <machine/limits.h>
76
77static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
78static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
79MALLOC_DEFINE(M_IOV, "iov", "large iov's");
80
81static int	pollscan(struct thread *, struct pollfd *, u_int);
82static int	selscan(struct thread *, fd_mask **, fd_mask **, int);
83static int	dofileread(struct thread *, struct file *, int, void *,
84		    size_t, off_t, int);
85static int	dofilewrite(struct thread *, struct file *, int,
86		    const void *, size_t, off_t, int);
87
88/*
89 * Read system call.
90 */
91#ifndef _SYS_SYSPROTO_H_
92struct read_args {
93	int	fd;
94	void	*buf;
95	size_t	nbyte;
96};
97#endif
98/*
99 * MPSAFE
100 */
101int
102read(td, uap)
103	struct thread *td;
104	struct read_args *uap;
105{
106	struct file *fp;
107	int error;
108
109	if ((error = fget_read(td, uap->fd, &fp)) == 0) {
110		error = dofileread(td, fp, uap->fd, uap->buf,
111			    uap->nbyte, (off_t)-1, 0);
112		fdrop(fp, td);
113	}
114	return(error);
115}
116
117/*
118 * Pread system call
119 */
120#ifndef _SYS_SYSPROTO_H_
121struct pread_args {
122	int	fd;
123	void	*buf;
124	size_t	nbyte;
125	int	pad;
126	off_t	offset;
127};
128#endif
129/*
130 * MPSAFE
131 */
132int
133pread(td, uap)
134	struct thread *td;
135	struct pread_args *uap;
136{
137	struct file *fp;
138	int error;
139
140	if ((error = fget_read(td, uap->fd, &fp)) != 0)
141		return (error);
142	if (fp->f_type != DTYPE_VNODE) {
143		error = ESPIPE;
144	} else {
145		error = dofileread(td, fp, uap->fd, uap->buf, uap->nbyte,
146			    uap->offset, FOF_OFFSET);
147	}
148	fdrop(fp, td);
149	return(error);
150}
151
152/*
153 * Code common for read and pread
154 */
155int
156dofileread(td, fp, fd, buf, nbyte, offset, flags)
157	struct thread *td;
158	struct file *fp;
159	int fd, flags;
160	void *buf;
161	size_t nbyte;
162	off_t offset;
163{
164	struct uio auio;
165	struct iovec aiov;
166	long cnt, error = 0;
167#ifdef KTRACE
168	struct iovec ktriov;
169	struct uio ktruio;
170	int didktr = 0;
171#endif
172
173	aiov.iov_base = buf;
174	aiov.iov_len = nbyte;
175	auio.uio_iov = &aiov;
176	auio.uio_iovcnt = 1;
177	auio.uio_offset = offset;
178	if (nbyte > INT_MAX)
179		return (EINVAL);
180	auio.uio_resid = nbyte;
181	auio.uio_rw = UIO_READ;
182	auio.uio_segflg = UIO_USERSPACE;
183	auio.uio_td = td;
184#ifdef KTRACE
185	/*
186	 * if tracing, save a copy of iovec
187	 */
188	if (KTRPOINT(td, KTR_GENIO)) {
189		ktriov = aiov;
190		ktruio = auio;
191		didktr = 1;
192	}
193#endif
194	cnt = nbyte;
195
196	if ((error = fo_read(fp, &auio, td->td_ucred, flags, td))) {
197		if (auio.uio_resid != cnt && (error == ERESTART ||
198		    error == EINTR || error == EWOULDBLOCK))
199			error = 0;
200	}
201	cnt -= auio.uio_resid;
202#ifdef KTRACE
203	if (didktr && error == 0) {
204		ktruio.uio_iov = &ktriov;
205		ktruio.uio_resid = cnt;
206		ktrgenio(fd, UIO_READ, &ktruio, error);
207	}
208#endif
209	td->td_retval[0] = cnt;
210	return (error);
211}
212
213/*
214 * Scatter read system call.
215 */
216#ifndef _SYS_SYSPROTO_H_
217struct readv_args {
218	int	fd;
219	struct	iovec *iovp;
220	u_int	iovcnt;
221};
222#endif
223/*
224 * MPSAFE
225 */
/*
 * readv(2): scatter read into uap->iovcnt user iovecs at the file's
 * current offset.  Small iovec arrays live on the stack (aiov); larger
 * ones (up to UIO_MAXIOV) are malloc'ed and tracked via 'needfree'.
 */
int
readv(td, uap)
	struct thread *td;
	struct readv_args *uap;
{
	struct file *fp;
	struct uio auio;
	struct iovec *iov;
	struct iovec *needfree;	/* non-NULL iff iov was malloc'ed */
	struct iovec aiov[UIO_SMALLIOV];
	long i, cnt;
	int error;
	u_int iovlen;
#ifdef KTRACE
	struct iovec *ktriov = NULL;
	struct uio ktruio;
#endif

	if ((error = fget_read(td, uap->fd, &fp)) != 0)
		return (error);
	needfree = NULL;
	/* note: can't use iovlen until iovcnt is validated */
	iovlen = uap->iovcnt * sizeof (struct iovec);
	if (uap->iovcnt > UIO_SMALLIOV) {
		if (uap->iovcnt > UIO_MAXIOV) {
			error = EINVAL;
			goto done;
		}
		MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
		needfree = iov;
	} else
		iov = aiov;
	auio.uio_iov = iov;
	auio.uio_iovcnt = uap->iovcnt;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;
	auio.uio_offset = -1;	/* use (and advance) the file offset */
	if ((error = copyin(uap->iovp, iov, iovlen)))
		goto done;
	auio.uio_resid = 0;
	/* Sum the lengths, rejecting totals that would overflow an int. */
	for (i = 0; i < uap->iovcnt; i++) {
		if (iov->iov_len > INT_MAX - auio.uio_resid) {
			error = EINVAL;
			goto done;
		}
		auio.uio_resid += iov->iov_len;
		iov++;
	}
#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec
	 */
	if (KTRPOINT(td, KTR_GENIO))  {
		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
		bcopy(auio.uio_iov, ktriov, iovlen);
		ktruio = auio;
	}
#endif
	cnt = auio.uio_resid;
	if ((error = fo_read(fp, &auio, td->td_ucred, 0, td))) {
		/* An interrupted partial transfer is reported as success. */
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	}
	cnt -= auio.uio_resid;	/* bytes actually read */
#ifdef KTRACE
	if (ktriov != NULL) {
		if (error == 0) {
			ktruio.uio_iov = ktriov;
			ktruio.uio_resid = cnt;
			ktrgenio(uap->fd, UIO_READ, &ktruio, error);
		}
		FREE(ktriov, M_TEMP);
	}
#endif
	td->td_retval[0] = cnt;
done:
	fdrop(fp, td);
	if (needfree)
		FREE(needfree, M_IOV);
	return (error);
}
309
310/*
311 * Write system call
312 */
313#ifndef _SYS_SYSPROTO_H_
314struct write_args {
315	int	fd;
316	const void *buf;
317	size_t	nbyte;
318};
319#endif
320/*
321 * MPSAFE
322 */
323int
324write(td, uap)
325	struct thread *td;
326	struct write_args *uap;
327{
328	struct file *fp;
329	int error;
330
331	if ((error = fget_write(td, uap->fd, &fp)) == 0) {
332		error = dofilewrite(td, fp, uap->fd, uap->buf, uap->nbyte,
333			    (off_t)-1, 0);
334		fdrop(fp, td);
335	} else {
336		error = EBADF;	/* XXX this can't be right */
337	}
338	return(error);
339}
340
341/*
342 * Pwrite system call
343 */
344#ifndef _SYS_SYSPROTO_H_
345struct pwrite_args {
346	int	fd;
347	const void *buf;
348	size_t	nbyte;
349	int	pad;
350	off_t	offset;
351};
352#endif
353/*
354 * MPSAFE
355 */
356int
357pwrite(td, uap)
358	struct thread *td;
359	struct pwrite_args *uap;
360{
361	struct file *fp;
362	int error;
363
364	if ((error = fget_write(td, uap->fd, &fp)) == 0) {
365		if (fp->f_type == DTYPE_VNODE) {
366			error = dofilewrite(td, fp, uap->fd, uap->buf,
367				    uap->nbyte, uap->offset, FOF_OFFSET);
368		} else {
369			error = ESPIPE;
370		}
371		fdrop(fp, td);
372	} else {
373		error = EBADF;	/* this can't be right */
374	}
375	return(error);
376}
377
/*
 * Code common for write and pwrite: build a one-element uio describing
 * the user buffer and pass it to the file's fo_write method.
 *
 * fd is used only for ktrace logging.  offset is -1 with flags == 0 for
 * write(2), or an explicit offset with FOF_OFFSET for pwrite(2).  On
 * success td->td_retval[0] holds the number of bytes transferred.
 */
static int
dofilewrite(td, fp, fd, buf, nbyte, offset, flags)
	struct thread *td;
	struct file *fp;
	int fd, flags;
	const void *buf;
	size_t nbyte;
	off_t offset;
{
	struct uio auio;
	struct iovec aiov;
	long cnt, error = 0;
#ifdef KTRACE
	struct iovec ktriov;
	struct uio ktruio;
	int didktr = 0;
#endif

	/* Cast away const: uio is a read/write descriptor, data isn't touched. */
	aiov.iov_base = (void *)(uintptr_t)buf;
	aiov.iov_len = nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = offset;
	/* The transfer count must fit in the int-sized return value. */
	if (nbyte > INT_MAX)
		return (EINVAL);
	auio.uio_resid = nbyte;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;
#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec and uio
	 */
	if (KTRPOINT(td, KTR_GENIO)) {
		ktriov = aiov;
		ktruio = auio;
		didktr = 1;
	}
#endif
	cnt = nbyte;
	/* Give the buffer cache a chance to flush before dirtying more. */
	if (fp->f_type == DTYPE_VNODE)
		bwillwrite();
	if ((error = fo_write(fp, &auio, td->td_ucred, flags, td))) {
		/* An interrupted partial transfer is reported as success. */
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		/* Socket layer is responsible for issuing SIGPIPE. */
		if (error == EPIPE && fp->f_type != DTYPE_SOCKET) {
			PROC_LOCK(td->td_proc);
			psignal(td->td_proc, SIGPIPE);
			PROC_UNLOCK(td->td_proc);
		}
	}
	cnt -= auio.uio_resid;	/* bytes actually written */
#ifdef KTRACE
	if (didktr && error == 0) {
		ktruio.uio_iov = &ktriov;
		ktruio.uio_resid = cnt;
		ktrgenio(fd, UIO_WRITE, &ktruio, error);
	}
#endif
	td->td_retval[0] = cnt;
	return (error);
}
442
443/*
444 * Gather write system call
445 */
446#ifndef _SYS_SYSPROTO_H_
447struct writev_args {
448	int	fd;
449	struct	iovec *iovp;
450	u_int	iovcnt;
451};
452#endif
453/*
454 * MPSAFE
455 */
456int
457writev(td, uap)
458	struct thread *td;
459	register struct writev_args *uap;
460{
461	struct file *fp;
462	struct uio auio;
463	register struct iovec *iov;
464	struct iovec *needfree;
465	struct iovec aiov[UIO_SMALLIOV];
466	long i, cnt, error = 0;
467	u_int iovlen;
468#ifdef KTRACE
469	struct iovec *ktriov = NULL;
470	struct uio ktruio;
471#endif
472
473	mtx_lock(&Giant);
474	if ((error = fget_write(td, uap->fd, &fp)) != 0) {
475		error = EBADF;
476		goto done2;
477	}
478	/* note: can't use iovlen until iovcnt is validated */
479	iovlen = uap->iovcnt * sizeof (struct iovec);
480	if (uap->iovcnt > UIO_SMALLIOV) {
481		if (uap->iovcnt > UIO_MAXIOV) {
482			needfree = NULL;
483			error = EINVAL;
484			goto done;
485		}
486		MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
487		needfree = iov;
488	} else {
489		iov = aiov;
490		needfree = NULL;
491	}
492	auio.uio_iov = iov;
493	auio.uio_iovcnt = uap->iovcnt;
494	auio.uio_rw = UIO_WRITE;
495	auio.uio_segflg = UIO_USERSPACE;
496	auio.uio_td = td;
497	auio.uio_offset = -1;
498	if ((error = copyin(uap->iovp, iov, iovlen)))
499		goto done;
500	auio.uio_resid = 0;
501	for (i = 0; i < uap->iovcnt; i++) {
502		if (iov->iov_len > INT_MAX - auio.uio_resid) {
503			error = EINVAL;
504			goto done;
505		}
506		auio.uio_resid += iov->iov_len;
507		iov++;
508	}
509#ifdef KTRACE
510	/*
511	 * if tracing, save a copy of iovec and uio
512	 */
513	if (KTRPOINT(td, KTR_GENIO))  {
514		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
515		bcopy(auio.uio_iov, ktriov, iovlen);
516		ktruio = auio;
517	}
518#endif
519	cnt = auio.uio_resid;
520	if (fp->f_type == DTYPE_VNODE)
521		bwillwrite();
522	if ((error = fo_write(fp, &auio, td->td_ucred, 0, td))) {
523		if (auio.uio_resid != cnt && (error == ERESTART ||
524		    error == EINTR || error == EWOULDBLOCK))
525			error = 0;
526		if (error == EPIPE) {
527			PROC_LOCK(td->td_proc);
528			psignal(td->td_proc, SIGPIPE);
529			PROC_UNLOCK(td->td_proc);
530		}
531	}
532	cnt -= auio.uio_resid;
533#ifdef KTRACE
534	if (ktriov != NULL) {
535		if (error == 0) {
536			ktruio.uio_iov = ktriov;
537			ktruio.uio_resid = cnt;
538			ktrgenio(uap->fd, UIO_WRITE, &ktruio, error);
539		}
540		FREE(ktriov, M_TEMP);
541	}
542#endif
543	td->td_retval[0] = cnt;
544done:
545	fdrop(fp, td);
546	if (needfree)
547		FREE(needfree, M_IOV);
548done2:
549	mtx_unlock(&Giant);
550	return (error);
551}
552
553/*
554 * Ioctl system call
555 */
556#ifndef _SYS_SYSPROTO_H_
557struct ioctl_args {
558	int	fd;
559	u_long	com;
560	caddr_t	data;
561};
562#endif
563/*
564 * MPSAFE
565 */
566/* ARGSUSED */
/*
 * ioctl(2): decode the ioctl command word, shuttle the argument data
 * between user and kernel space as directed by the IOC_IN/IOC_OUT/
 * IOC_VOID encoding, and dispatch to the file's fo_ioctl method.
 * FIONCLEX/FIOCLEX/FIONBIO/FIOASYNC are handled generically here.
 */
/* ARGSUSED */
int
ioctl(td, uap)
	struct thread *td;
	register struct ioctl_args *uap;
{
	struct file *fp;
	register struct filedesc *fdp;
	register u_long com;
	int error = 0;
	register u_int size;
	caddr_t data, memp;	/* memp non-NULL iff arg buffer was malloc'ed */
	int tmp;
#define STK_PARAMS	128
	/* Small argument buffer on the stack; union forces long alignment. */
	union {
	    char stkbuf[STK_PARAMS];
	    long align;
	} ubuf;

	if ((error = fget(td, uap->fd, &fp)) != 0)
		return (error);
	mtx_lock(&Giant);
	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
		fdrop(fp, td);
		mtx_unlock(&Giant);
		return (EBADF);
	}
	fdp = td->td_proc->p_fd;
	/* Close-on-exec flags live in the descriptor table, not the file. */
	switch (com = uap->com) {
	case FIONCLEX:
		FILEDESC_LOCK(fdp);
		fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE;
		FILEDESC_UNLOCK(fdp);
		fdrop(fp, td);
		mtx_unlock(&Giant);
		return (0);
	case FIOCLEX:
		FILEDESC_LOCK(fdp);
		fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE;
		FILEDESC_UNLOCK(fdp);
		fdrop(fp, td);
		mtx_unlock(&Giant);
		return (0);
	}

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
	size = IOCPARM_LEN(com);
	if (size > IOCPARM_MAX) {
		fdrop(fp, td);
		mtx_unlock(&Giant);
		return (ENOTTY);
	}

	memp = NULL;
	if (size > sizeof (ubuf.stkbuf)) {
		memp = malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
		data = memp;
	} else {
		data = ubuf.stkbuf;
	}
	if (com&IOC_IN) {
		if (size) {
			error = copyin(uap->data, data, (u_int)size);
			if (error) {
				if (memp)
					free(memp, M_IOCTLOPS);
				fdrop(fp, td);
				goto done;
			}
		} else {
			/* No in-data: pass the user pointer itself. */
			*(caddr_t *)data = uap->data;
		}
	} else if ((com&IOC_OUT) && size) {
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, size);
	} else if (com&IOC_VOID) {
		*(caddr_t *)data = uap->data;
	}

	switch (com) {

	case FIONBIO:
		/* Keep f_flag in sync, then tell the backend too. */
		FILE_LOCK(fp);
		if ((tmp = *(int *)data))
			fp->f_flag |= FNONBLOCK;
		else
			fp->f_flag &= ~FNONBLOCK;
		FILE_UNLOCK(fp);
		error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
		break;

	case FIOASYNC:
		FILE_LOCK(fp);
		if ((tmp = *(int *)data))
			fp->f_flag |= FASYNC;
		else
			fp->f_flag &= ~FASYNC;
		FILE_UNLOCK(fp);
		error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td);
		break;

	default:
		error = fo_ioctl(fp, com, data, td->td_ucred, td);
		/*
		 * Copy any data to user, size was
		 * already set and checked above.
		 */
		if (error == 0 && (com&IOC_OUT) && size)
			error = copyout(data, uap->data, (u_int)size);
		break;
	}
	if (memp)
		free(memp, M_IOCTLOPS);
	fdrop(fp, td);
done:
	mtx_unlock(&Giant);
	return (error);
}
690
691/*
692 * sellock and selwait are initialized in selectinit() via SYSINIT.
693 */
694struct mtx	sellock;
695struct cv	selwait;
696u_int		nselcoll;	/* Select collisions since boot */
697SYSCTL_UINT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");
698
699/*
700 * Select system call.
701 */
702#ifndef _SYS_SYSPROTO_H_
703struct select_args {
704	int	nd;
705	fd_set	*in, *ou, *ex;
706	struct	timeval *tv;
707};
708#endif
709/*
710 * MPSAFE
711 */
712int
713select(td, uap)
714	register struct thread *td;
715	register struct select_args *uap;
716{
717	struct timeval tv, *tvp;
718	int error;
719
720	if (uap->tv != NULL) {
721		error = copyin(uap->tv, &tv, sizeof(tv));
722		if (error)
723			return (error);
724		tvp = &tv;
725	} else
726		tvp = NULL;
727
728	return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp));
729}
730
/*
 * Guts of select(2): copy in the fd_sets, scan the descriptors, and if
 * nothing is ready sleep on the global selwait cv until a selwakeup()
 * or the timeout.  The TDF_SELECT flag plus the nselcoll counter detect
 * events that fire in the window where sellock is dropped for selscan().
 */
int
kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou,
    fd_set *fd_ex, struct timeval *tvp)
{
	struct filedesc *fdp;
	/*
	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
	 * infds with the new FD_SETSIZE of 1024, and more than enough for
	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
	 * of 256.
	 */
	fd_mask s_selbits[howmany(2048, NFDBITS)];
	fd_mask *ibits[3], *obits[3], *selbits, *sbp;
	struct timeval atv, rtv, ttv;
	int error, timo;
	u_int ncoll, nbufbytes, ncpbytes, nfdbits;

	if (nd < 0)
		return (EINVAL);
	fdp = td->td_proc->p_fd;
	mtx_lock(&Giant);
	FILEDESC_LOCK(fdp);

	if (nd > td->td_proc->p_fd->fd_nfiles)
		nd = td->td_proc->p_fd->fd_nfiles;   /* forgiving; slightly wrong */
	FILEDESC_UNLOCK(fdp);

	/*
	 * Allocate just enough bits for the non-null fd_sets.  Use the
	 * preallocated auto buffer if possible.
	 */
	nfdbits = roundup(nd, NFDBITS);
	ncpbytes = nfdbits / NBBY;
	nbufbytes = 0;
	if (fd_in != NULL)
		nbufbytes += 2 * ncpbytes;
	if (fd_ou != NULL)
		nbufbytes += 2 * ncpbytes;
	if (fd_ex != NULL)
		nbufbytes += 2 * ncpbytes;
	if (nbufbytes <= sizeof s_selbits)
		selbits = &s_selbits[0];
	else
		selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);

	/*
	 * Assign pointers into the bit buffers and fetch the input bits.
	 * Put the output buffers together so that they can be bzeroed
	 * together.
	 */
	sbp = selbits;
#define	getbits(name, x) \
	do {								\
		if (name == NULL)					\
			ibits[x] = NULL;				\
		else {							\
			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;	\
			obits[x] = sbp;					\
			sbp += ncpbytes / sizeof *sbp;			\
			error = copyin(name, ibits[x], ncpbytes);	\
			if (error != 0)					\
				goto done_nosellock;			\
		}							\
	} while (0)
	getbits(fd_in, 0);
	getbits(fd_ou, 1);
	getbits(fd_ex, 2);
#undef	getbits
	if (nbufbytes != 0)
		bzero(selbits, nbufbytes / 2);

	if (tvp != NULL) {
		atv = *tvp;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done_nosellock;
		}
		/* Convert the relative timeout into an absolute deadline. */
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
	TAILQ_INIT(&td->td_selq);
	mtx_lock(&sellock);
retry:
	/* Snapshot the collision count before dropping sellock to scan. */
	ncoll = nselcoll;
	mtx_lock_spin(&sched_lock);
	td->td_flags |= TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

	error = selscan(td, ibits, obits, nd);
	mtx_lock(&sellock);
	if (error || td->td_retval[0])
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=))
			goto done;
		ttv = atv;
		timevalsub(&ttv, &rtv);
		/* Clamp the wait to one day's worth of ticks. */
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz(&ttv);
	}

	/*
	 * An event of interest may occur while we do not hold
	 * sellock, so check TDF_SELECT and the number of
	 * collisions and rescan the file descriptors if
	 * necessary.
	 */
	mtx_lock_spin(&sched_lock);
	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
		mtx_unlock_spin(&sched_lock);
		goto retry;
	}
	mtx_unlock_spin(&sched_lock);

	if (timo > 0)
		error = cv_timedwait_sig(&selwait, &sellock, timo);
	else
		error = cv_wait_sig(&selwait, &sellock);

	if (error == 0)
		goto retry;

done:
	clear_selinfo_list(td);
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

done_nosellock:
	/* select is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
#define	putbits(name, x) \
	if (name && (error2 = copyout(obits[x], name, ncpbytes))) \
		error = error2;
	if (error == 0) {
		int error2;

		putbits(fd_in, 0);
		putbits(fd_ou, 1);
		putbits(fd_ex, 2);
#undef putbits
	}
	if (selbits != &s_selbits[0])
		free(selbits, M_SELECT);

	mtx_unlock(&Giant);
	return (error);
}
889
/*
 * Scan the three input bit vectors (read/write/except), polling each set
 * descriptor via fo_poll, and mark ready descriptors in the output
 * vectors.  The ready count goes in td->td_retval[0]; EBADF is returned
 * for a set bit with no open file.  The filedesc lock is held across
 * the whole scan.
 */
static int
selscan(td, ibits, obits, nfd)
	struct thread *td;
	fd_mask **ibits, **obits;
	int nfd;
{
	int msk, i, fd;
	fd_mask bits;
	struct file *fp;
	int n = 0;	/* count of ready descriptors */
	/* Note: backend also returns POLLHUP/POLLERR if appropriate. */
	static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
	struct filedesc *fdp = td->td_proc->p_fd;

	FILEDESC_LOCK(fdp);
	for (msk = 0; msk < 3; msk++) {
		if (ibits[msk] == NULL)
			continue;
		for (i = 0; i < nfd; i += NFDBITS) {
			bits = ibits[msk][i/NFDBITS];
			/* ffs(int mask) not portable, fd_mask is long */
			for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
				if (!(bits & 1))
					continue;
				if ((fp = fget_locked(fdp, fd)) == NULL) {
					FILEDESC_UNLOCK(fdp);
					return (EBADF);
				}
				if (fo_poll(fp, flag[msk], td->td_ucred,
				    td)) {
					obits[msk][(fd)/NFDBITS] |=
					    ((fd_mask)1 << ((fd) % NFDBITS));
					n++;
				}
			}
		}
	}
	FILEDESC_UNLOCK(fdp);
	td->td_retval[0] = n;
	return (0);
}
931
932/*
933 * Poll system call.
934 */
935#ifndef _SYS_SYSPROTO_H_
936struct poll_args {
937	struct pollfd *fds;
938	u_int	nfds;
939	int	timeout;
940};
941#endif
942/*
943 * MPSAFE
944 */
/*
 * poll(2): copy in the pollfd array, scan it via pollscan(), and if
 * nothing is ready sleep on selwait until a selwakeup() or the timeout,
 * using the same TDF_SELECT/nselcoll retry protocol as kern_select().
 */
int
poll(td, uap)
	struct thread *td;
	struct poll_args *uap;
{
	caddr_t bits;
	char smallbits[32 * sizeof(struct pollfd)];	/* avoids malloc for small arrays */
	struct timeval atv, rtv, ttv;
	int error = 0, timo;
	u_int ncoll, nfds;
	size_t ni;

	nfds = SCARG(uap, nfds);

	mtx_lock(&Giant);
	/*
	 * This is kinda bogus.  We have fd limits, but that is not
	 * really related to the size of the pollfd array.  Make sure
	 * we let the process use at least FD_SETSIZE entries and at
	 * least enough for the current limits.  We want to be reasonably
	 * safe, but not overly restrictive.
	 */
	if ((nfds > td->td_proc->p_rlimit[RLIMIT_NOFILE].rlim_cur) &&
	    (nfds > FD_SETSIZE)) {
		error = EINVAL;
		goto done2;
	}
	ni = nfds * sizeof(struct pollfd);
	if (ni > sizeof(smallbits))
		bits = malloc(ni, M_TEMP, M_WAITOK);
	else
		bits = smallbits;
	error = copyin(SCARG(uap, fds), bits, ni);
	if (error)
		goto done_nosellock;
	if (SCARG(uap, timeout) != INFTIM) {
		/* Millisecond timeout -> absolute deadline. */
		atv.tv_sec = SCARG(uap, timeout) / 1000;
		atv.tv_usec = (SCARG(uap, timeout) % 1000) * 1000;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done_nosellock;
		}
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
	TAILQ_INIT(&td->td_selq);
	mtx_lock(&sellock);
retry:
	/* Snapshot the collision count before dropping sellock to scan. */
	ncoll = nselcoll;
	mtx_lock_spin(&sched_lock);
	td->td_flags |= TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

	error = pollscan(td, (struct pollfd *)bits, nfds);
	mtx_lock(&sellock);
	if (error || td->td_retval[0])
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=))
			goto done;
		ttv = atv;
		timevalsub(&ttv, &rtv);
		/* Clamp the wait to one day's worth of ticks. */
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz(&ttv);
	}
	/*
	 * An event of interest may occur while we do not hold
	 * sellock, so check TDF_SELECT and the number of collisions
	 * and rescan the file descriptors if necessary.
	 */
	mtx_lock_spin(&sched_lock);
	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
		mtx_unlock_spin(&sched_lock);
		goto retry;
	}
	mtx_unlock_spin(&sched_lock);

	if (timo > 0)
		error = cv_timedwait_sig(&selwait, &sellock, timo);
	else
		error = cv_wait_sig(&selwait, &sellock);

	if (error == 0)
		goto retry;

done:
	clear_selinfo_list(td);
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

done_nosellock:
	/* poll is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	if (error == 0) {
		/* Copy the revents back out to the user's array. */
		error = copyout(bits, SCARG(uap, fds), ni);
		if (error)
			goto out;
	}
out:
	if (ni > sizeof(smallbits))
		free(bits, M_TEMP);
done2:
	mtx_unlock(&Giant);
	return (error);
}
1061
/*
 * Scan an array of pollfds, filling in revents via each file's fo_poll
 * method.  Out-of-range or closed descriptors get POLLNVAL; negative
 * descriptors are quietly skipped (revents = 0).  The count of entries
 * with non-zero revents goes in td->td_retval[0].
 */
static int
pollscan(td, fds, nfd)
	struct thread *td;
	struct pollfd *fds;
	u_int nfd;
{
	register struct filedesc *fdp = td->td_proc->p_fd;
	int i;
	struct file *fp;
	int n = 0;	/* count of pollfds with events */

	FILEDESC_LOCK(fdp);
	for (i = 0; i < nfd; i++, fds++) {
		if (fds->fd >= fdp->fd_nfiles) {
			fds->revents = POLLNVAL;
			n++;
		} else if (fds->fd < 0) {
			fds->revents = 0;
		} else {
			fp = fdp->fd_ofiles[fds->fd];
			if (fp == NULL) {
				fds->revents = POLLNVAL;
				n++;
			} else {
				/*
				 * Note: backend also returns POLLHUP and
				 * POLLERR if appropriate.
				 */
				fds->revents = fo_poll(fp, fds->events,
				    td->td_ucred, td);
				if (fds->revents != 0)
					n++;
			}
		}
	}
	FILEDESC_UNLOCK(fdp);
	td->td_retval[0] = n;
	return (0);
}
1101
1102/*
1103 * OpenBSD poll system call.
1104 * XXX this isn't quite a true representation..  OpenBSD uses select ops.
1105 */
1106#ifndef _SYS_SYSPROTO_H_
1107struct openbsd_poll_args {
1108	struct pollfd *fds;
1109	u_int	nfds;
1110	int	timeout;
1111};
1112#endif
1113/*
1114 * MPSAFE
1115 */
/*
 * OpenBSD-compatible poll entry point.  The argument structures have
 * identical layout, so simply delegate to the native poll().
 */
int
openbsd_poll(struct thread *td, struct openbsd_poll_args *uap)
{
	return (poll(td, (struct poll_args *)uap));
}
1123
1124/*
1125 * Remove the references to the thread from all of the objects
1126 * we were polling.
1127 *
1128 * This code assumes that the underlying owner of the selinfo
1129 * structure will hold sellock before it changes it, and that
1130 * it will unlink itself from our list if it goes away.
1131 */
1132void
1133clear_selinfo_list(td)
1134	struct thread *td;
1135{
1136	struct selinfo *si;
1137
1138	mtx_assert(&sellock, MA_OWNED);
1139	TAILQ_FOREACH(si, &td->td_selq, si_thrlist)
1140		si->si_thread = NULL;
1141	TAILQ_INIT(&td->td_selq);
1142}
1143
1144/*ARGSUSED*/
1145int
1146seltrue(dev, events, td)
1147	dev_t dev;
1148	int events;
1149	struct thread *td;
1150{
1151
1152	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
1153}
1154
1155/*
1156 * Record a select request.
1157 */
1158void
1159selrecord(selector, sip)
1160	struct thread *selector;
1161	struct selinfo *sip;
1162{
1163
1164	mtx_lock(&sellock);
1165	/*
1166	 * If the selinfo's thread pointer is NULL then take ownership of it.
1167	 *
1168	 * If the thread pointer is not NULL and it points to another
1169	 * thread, then we have a collision.
1170	 *
1171	 * If the thread pointer is not NULL and points back to us then leave
1172	 * it alone as we've already added pointed it at us and added it to
1173	 * our list.
1174	 */
1175	if (sip->si_thread == NULL) {
1176		sip->si_thread = selector;
1177		TAILQ_INSERT_TAIL(&selector->td_selq, sip, si_thrlist);
1178	} else if (sip->si_thread != selector) {
1179		sip->si_flags |= SI_COLL;
1180	}
1181
1182	mtx_unlock(&sellock);
1183}
1184
1185/*
1186 * Do a wakeup when a selectable event occurs.
1187 */
/*
 * Do a wakeup when a selectable event occurs: wake the thread recorded
 * in the selinfo (if any) and broadcast to everyone if the selinfo saw
 * a collision (multiple selecting threads).
 */
void
selwakeup(sip)
	struct selinfo *sip;
{
	struct thread *td;

	mtx_lock(&sellock);
	td = sip->si_thread;
	if ((sip->si_flags & SI_COLL) != 0) {
		/* Several threads were interested; wake them all. */
		nselcoll++;
		sip->si_flags &= ~SI_COLL;
		cv_broadcast(&selwait);
	}
	if (td == NULL) {
		mtx_unlock(&sellock);
		return;
	}
	/* Detach the selinfo from the owning thread's select queue. */
	TAILQ_REMOVE(&td->td_selq, sip, si_thrlist);
	sip->si_thread = NULL;
	mtx_lock_spin(&sched_lock);
	if (td->td_wchan == &selwait) {
		/* Thread is asleep in select/poll: make it runnable. */
		cv_waitq_remove(td);
		TD_CLR_SLEEPING(td);
		setrunnable(td);
	} else
		/* Thread is mid-scan: clearing TDF_SELECT forces a rescan. */
		td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);
}
1217
1218static void selectinit(void *);
1219SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, selectinit, NULL)
1220
1221/* ARGSUSED*/
1222static void
1223selectinit(dummy)
1224	void *dummy;
1225{
1226	cv_init(&selwait, "select");
1227	mtx_init(&sellock, "sellck", NULL, MTX_DEF);
1228}
1229