sys_generic.c revision 50477
1/*
2 * Copyright (c) 1982, 1986, 1989, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 *    must display the following acknowledgement:
20 *	This product includes software developed by the University of
21 *	California, Berkeley and its contributors.
22 * 4. Neither the name of the University nor the names of its contributors
23 *    may be used to endorse or promote products derived from this software
24 *    without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * SUCH DAMAGE.
37 *
38 *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
39 * $FreeBSD: head/sys/kern/sys_generic.c 50477 1999-08-28 01:08:13Z peter $
40 */
41
42#include "opt_ktrace.h"
43
44#include <sys/param.h>
45#include <sys/systm.h>
46#include <sys/sysproto.h>
47#include <sys/filedesc.h>
48#include <sys/filio.h>
49#include <sys/ttycom.h>
50#include <sys/fcntl.h>
51#include <sys/file.h>
52#include <sys/proc.h>
53#include <sys/signalvar.h>
54#include <sys/socketvar.h>
55#include <sys/uio.h>
56#include <sys/kernel.h>
57#include <sys/malloc.h>
58#include <sys/poll.h>
59#include <sys/sysent.h>
60#ifdef KTRACE
61#include <sys/ktrace.h>
62#endif
63
64#include <machine/limits.h>
65
66static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
67static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
68MALLOC_DEFINE(M_IOV, "iov", "large iov's");
69
70static int	pollscan __P((struct proc *, struct pollfd *, int));
71static int	selscan __P((struct proc *, fd_mask **, fd_mask **, int));
72static struct file* getfp __P((struct filedesc *, int, int));
73static int	dofileread __P((struct proc *, struct file *, int, void *,
74		    size_t, off_t, int));
75static int	dofilewrite __P((struct proc *, struct file *, int,
76		    const void *, size_t, off_t, int));
77
78static struct file*
79getfp(fdp, fd, flag)
80	struct filedesc* fdp;
81	int fd, flag;
82{
83	struct file* fp;
84
85	if (((u_int)fd) >= fdp->fd_nfiles ||
86	    (fp = fdp->fd_ofiles[fd]) == NULL ||
87	    (fp->f_flag & flag) == 0)
88		return (NULL);
89	return (fp);
90}
91
92/*
93 * Read system call.
94 */
95#ifndef _SYS_SYSPROTO_H_
96struct read_args {
97	int	fd;
98	void	*buf;
99	size_t	nbyte;
100};
101#endif
102int
103read(p, uap)
104	struct proc *p;
105	register struct read_args *uap;
106{
107	register struct file *fp;
108
109	if ((fp = getfp(p->p_fd, uap->fd, FREAD)) == NULL)
110		return (EBADF);
111	return (dofileread(p, fp, uap->fd, uap->buf, uap->nbyte, (off_t)-1, 0));
112}
113
114/*
115 * Pread system call
116 */
117#ifndef _SYS_SYSPROTO_H_
118struct pread_args {
119	int	fd;
120	void	*buf;
121	size_t	nbyte;
122	int	pad;
123	off_t	offset;
124};
125#endif
126int
127pread(p, uap)
128	struct proc *p;
129	register struct pread_args *uap;
130{
131	register struct file *fp;
132
133	if ((fp = getfp(p->p_fd, uap->fd, FREAD)) == NULL)
134		return (EBADF);
135	if (fp->f_type != DTYPE_VNODE)
136		return (ESPIPE);
137	return (dofileread(p, fp, uap->fd, uap->buf, uap->nbyte, uap->offset,
138	    FOF_OFFSET));
139}
140
141/*
142 * Code common for read and pread
143 */
144int
145dofileread(p, fp, fd, buf, nbyte, offset, flags)
146	struct proc *p;
147	struct file *fp;
148	int fd, flags;
149	void *buf;
150	size_t nbyte;
151	off_t offset;
152{
153	struct uio auio;
154	struct iovec aiov;
155	long cnt, error = 0;
156#ifdef KTRACE
157	struct iovec ktriov;
158#endif
159
160	aiov.iov_base = (caddr_t)buf;
161	aiov.iov_len = nbyte;
162	auio.uio_iov = &aiov;
163	auio.uio_iovcnt = 1;
164	auio.uio_offset = offset;
165	if (nbyte > INT_MAX)
166		return (EINVAL);
167	auio.uio_resid = nbyte;
168	auio.uio_rw = UIO_READ;
169	auio.uio_segflg = UIO_USERSPACE;
170	auio.uio_procp = p;
171#ifdef KTRACE
172	/*
173	 * if tracing, save a copy of iovec
174	 */
175	if (KTRPOINT(p, KTR_GENIO))
176		ktriov = aiov;
177#endif
178	cnt = nbyte;
179	if ((error = (*fp->f_ops->fo_read)(fp, &auio, fp->f_cred, flags)))
180		if (auio.uio_resid != cnt && (error == ERESTART ||
181		    error == EINTR || error == EWOULDBLOCK))
182			error = 0;
183	cnt -= auio.uio_resid;
184#ifdef KTRACE
185	if (KTRPOINT(p, KTR_GENIO) && error == 0)
186		ktrgenio(p->p_tracep, fd, UIO_READ, &ktriov, cnt, error);
187#endif
188	p->p_retval[0] = cnt;
189	return (error);
190}
191
192/*
193 * Scatter read system call.
194 */
195#ifndef _SYS_SYSPROTO_H_
196struct readv_args {
197	int	fd;
198	struct	iovec *iovp;
199	u_int	iovcnt;
200};
201#endif
202int
203readv(p, uap)
204	struct proc *p;
205	register struct readv_args *uap;
206{
207	register struct file *fp;
208	register struct filedesc *fdp = p->p_fd;
209	struct uio auio;
210	register struct iovec *iov;
211	struct iovec *needfree;
212	struct iovec aiov[UIO_SMALLIOV];
213	long i, cnt, error = 0;
214	u_int iovlen;
215#ifdef KTRACE
216	struct iovec *ktriov = NULL;
217#endif
218
219	if ((fp = getfp(fdp, uap->fd, FREAD)) == NULL)
220		return (EBADF);
221	/* note: can't use iovlen until iovcnt is validated */
222	iovlen = uap->iovcnt * sizeof (struct iovec);
223	if (uap->iovcnt > UIO_SMALLIOV) {
224		if (uap->iovcnt > UIO_MAXIOV)
225			return (EINVAL);
226		MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
227		needfree = iov;
228	} else {
229		iov = aiov;
230		needfree = NULL;
231	}
232	auio.uio_iov = iov;
233	auio.uio_iovcnt = uap->iovcnt;
234	auio.uio_rw = UIO_READ;
235	auio.uio_segflg = UIO_USERSPACE;
236	auio.uio_procp = p;
237	auio.uio_offset = -1;
238	if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen)))
239		goto done;
240	auio.uio_resid = 0;
241	for (i = 0; i < uap->iovcnt; i++) {
242		if (iov->iov_len > INT_MAX - auio.uio_resid) {
243			error = EINVAL;
244			goto done;
245		}
246		auio.uio_resid += iov->iov_len;
247		iov++;
248	}
249#ifdef KTRACE
250	/*
251	 * if tracing, save a copy of iovec
252	 */
253	if (KTRPOINT(p, KTR_GENIO))  {
254		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
255		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
256	}
257#endif
258	cnt = auio.uio_resid;
259	if ((error = (*fp->f_ops->fo_read)(fp, &auio, fp->f_cred, 0)))
260		if (auio.uio_resid != cnt && (error == ERESTART ||
261		    error == EINTR || error == EWOULDBLOCK))
262			error = 0;
263	cnt -= auio.uio_resid;
264#ifdef KTRACE
265	if (ktriov != NULL) {
266		if (error == 0)
267			ktrgenio(p->p_tracep, uap->fd, UIO_READ, ktriov,
268			    cnt, error);
269		FREE(ktriov, M_TEMP);
270	}
271#endif
272	p->p_retval[0] = cnt;
273done:
274	if (needfree)
275		FREE(needfree, M_IOV);
276	return (error);
277}
278
279/*
280 * Write system call
281 */
282#ifndef _SYS_SYSPROTO_H_
283struct write_args {
284	int	fd;
285	const void *buf;
286	size_t	nbyte;
287};
288#endif
289int
290write(p, uap)
291	struct proc *p;
292	register struct write_args *uap;
293{
294	register struct file *fp;
295
296	if ((fp = getfp(p->p_fd, uap->fd, FWRITE)) == NULL)
297		return (EBADF);
298	return (dofilewrite(p, fp, uap->fd, uap->buf, uap->nbyte, (off_t)-1, 0));
299}
300
301/*
302 * Pwrite system call
303 */
304#ifndef _SYS_SYSPROTO_H_
305struct pwrite_args {
306	int	fd;
307	const void *buf;
308	size_t	nbyte;
309	int	pad;
310	off_t	offset;
311};
312#endif
313int
314pwrite(p, uap)
315	struct proc *p;
316	register struct pwrite_args *uap;
317{
318	register struct file *fp;
319
320	if ((fp = getfp(p->p_fd, uap->fd, FWRITE)) == NULL)
321		return (EBADF);
322	if (fp->f_type != DTYPE_VNODE)
323		return (ESPIPE);
324	return (dofilewrite(p, fp, uap->fd, uap->buf, uap->nbyte, uap->offset,
325	    FOF_OFFSET));
326}
327
328static int
329dofilewrite(p, fp, fd, buf, nbyte, offset, flags)
330	struct proc *p;
331	struct file *fp;
332	int fd, flags;
333	const void *buf;
334	size_t nbyte;
335	off_t offset;
336{
337	struct uio auio;
338	struct iovec aiov;
339	long cnt, error = 0;
340#ifdef KTRACE
341	struct iovec ktriov;
342#endif
343
344	aiov.iov_base = (void *)buf;
345	aiov.iov_len = nbyte;
346	auio.uio_iov = &aiov;
347	auio.uio_iovcnt = 1;
348	auio.uio_offset = offset;
349	if (nbyte > INT_MAX)
350		return (EINVAL);
351	auio.uio_resid = nbyte;
352	auio.uio_rw = UIO_WRITE;
353	auio.uio_segflg = UIO_USERSPACE;
354	auio.uio_procp = p;
355#ifdef KTRACE
356	/*
357	 * if tracing, save a copy of iovec
358	 */
359	if (KTRPOINT(p, KTR_GENIO))
360		ktriov = aiov;
361#endif
362	cnt = nbyte;
363	if ((error = (*fp->f_ops->fo_write)(fp, &auio, fp->f_cred, flags))) {
364		if (auio.uio_resid != cnt && (error == ERESTART ||
365		    error == EINTR || error == EWOULDBLOCK))
366			error = 0;
367		if (error == EPIPE)
368			psignal(p, SIGPIPE);
369	}
370	cnt -= auio.uio_resid;
371#ifdef KTRACE
372	if (KTRPOINT(p, KTR_GENIO) && error == 0)
373		ktrgenio(p->p_tracep, fd, UIO_WRITE,
374		    &ktriov, cnt, error);
375#endif
376	p->p_retval[0] = cnt;
377	return (error);
378}
379
380/*
381 * Gather write system call
382 */
383#ifndef _SYS_SYSPROTO_H_
384struct writev_args {
385	int	fd;
386	struct	iovec *iovp;
387	u_int	iovcnt;
388};
389#endif
390int
391writev(p, uap)
392	struct proc *p;
393	register struct writev_args *uap;
394{
395	register struct file *fp;
396	register struct filedesc *fdp = p->p_fd;
397	struct uio auio;
398	register struct iovec *iov;
399	struct iovec *needfree;
400	struct iovec aiov[UIO_SMALLIOV];
401	long i, cnt, error = 0;
402	u_int iovlen;
403#ifdef KTRACE
404	struct iovec *ktriov = NULL;
405#endif
406
407	if ((fp = getfp(fdp, uap->fd, FWRITE)) == NULL)
408		return (EBADF);
409	/* note: can't use iovlen until iovcnt is validated */
410	iovlen = uap->iovcnt * sizeof (struct iovec);
411	if (uap->iovcnt > UIO_SMALLIOV) {
412		if (uap->iovcnt > UIO_MAXIOV)
413			return (EINVAL);
414		MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
415		needfree = iov;
416	} else {
417		iov = aiov;
418		needfree = NULL;
419	}
420	auio.uio_iov = iov;
421	auio.uio_iovcnt = uap->iovcnt;
422	auio.uio_rw = UIO_WRITE;
423	auio.uio_segflg = UIO_USERSPACE;
424	auio.uio_procp = p;
425	auio.uio_offset = -1;
426	if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen)))
427		goto done;
428	auio.uio_resid = 0;
429	for (i = 0; i < uap->iovcnt; i++) {
430		if (iov->iov_len > INT_MAX - auio.uio_resid) {
431			error = EINVAL;
432			goto done;
433		}
434		auio.uio_resid += iov->iov_len;
435		iov++;
436	}
437#ifdef KTRACE
438	/*
439	 * if tracing, save a copy of iovec
440	 */
441	if (KTRPOINT(p, KTR_GENIO))  {
442		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
443		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
444	}
445#endif
446	cnt = auio.uio_resid;
447	if ((error = (*fp->f_ops->fo_write)(fp, &auio, fp->f_cred, 0))) {
448		if (auio.uio_resid != cnt && (error == ERESTART ||
449		    error == EINTR || error == EWOULDBLOCK))
450			error = 0;
451		if (error == EPIPE)
452			psignal(p, SIGPIPE);
453	}
454	cnt -= auio.uio_resid;
455#ifdef KTRACE
456	if (ktriov != NULL) {
457		if (error == 0)
458			ktrgenio(p->p_tracep, uap->fd, UIO_WRITE,
459				ktriov, cnt, error);
460		FREE(ktriov, M_TEMP);
461	}
462#endif
463	p->p_retval[0] = cnt;
464done:
465	if (needfree)
466		FREE(needfree, M_IOV);
467	return (error);
468}
469
470/*
471 * Ioctl system call
472 */
473#ifndef _SYS_SYSPROTO_H_
474struct ioctl_args {
475	int	fd;
476	u_long	com;
477	caddr_t	data;
478};
479#endif
480/* ARGSUSED */
481int
482ioctl(p, uap)
483	struct proc *p;
484	register struct ioctl_args *uap;
485{
486	register struct file *fp;
487	register struct filedesc *fdp;
488	register u_long com;
489	int error;
490	register u_int size;
491	caddr_t data, memp;
492	int tmp;
493#define STK_PARAMS	128
494	char stkbuf[STK_PARAMS];
495
496	fdp = p->p_fd;
497	if ((u_int)uap->fd >= fdp->fd_nfiles ||
498	    (fp = fdp->fd_ofiles[uap->fd]) == NULL)
499		return (EBADF);
500
501	if ((fp->f_flag & (FREAD | FWRITE)) == 0)
502		return (EBADF);
503
504	switch (com = uap->com) {
505	case FIONCLEX:
506		fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE;
507		return (0);
508	case FIOCLEX:
509		fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE;
510		return (0);
511	}
512
513	/*
514	 * Interpret high order word to find amount of data to be
515	 * copied to/from the user's address space.
516	 */
517	size = IOCPARM_LEN(com);
518	if (size > IOCPARM_MAX)
519		return (ENOTTY);
520	memp = NULL;
521	if (size > sizeof (stkbuf)) {
522		memp = (caddr_t)malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
523		data = memp;
524	} else
525		data = stkbuf;
526	if (com&IOC_IN) {
527		if (size) {
528			error = copyin(uap->data, data, (u_int)size);
529			if (error) {
530				if (memp)
531					free(memp, M_IOCTLOPS);
532				return (error);
533			}
534		} else
535			*(caddr_t *)data = uap->data;
536	} else if ((com&IOC_OUT) && size)
537		/*
538		 * Zero the buffer so the user always
539		 * gets back something deterministic.
540		 */
541		bzero(data, size);
542	else if (com&IOC_VOID)
543		*(caddr_t *)data = uap->data;
544
545	switch (com) {
546
547	case FIONBIO:
548		if ((tmp = *(int *)data))
549			fp->f_flag |= FNONBLOCK;
550		else
551			fp->f_flag &= ~FNONBLOCK;
552		error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, (caddr_t)&tmp, p);
553		break;
554
555	case FIOASYNC:
556		if ((tmp = *(int *)data))
557			fp->f_flag |= FASYNC;
558		else
559			fp->f_flag &= ~FASYNC;
560		error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, (caddr_t)&tmp, p);
561		break;
562
563	default:
564		error = (*fp->f_ops->fo_ioctl)(fp, com, data, p);
565		/*
566		 * Copy any data to user, size was
567		 * already set and checked above.
568		 */
569		if (error == 0 && (com&IOC_OUT) && size)
570			error = copyout(data, uap->data, (u_int)size);
571		break;
572	}
573	if (memp)
574		free(memp, M_IOCTLOPS);
575	return (error);
576}
577
578static int	nselcoll;
579int	selwait;
580
581/*
582 * Select system call.
583 */
584#ifndef _SYS_SYSPROTO_H_
585struct select_args {
586	int	nd;
587	fd_set	*in, *ou, *ex;
588	struct	timeval *tv;
589};
590#endif
591int
592select(p, uap)
593	register struct proc *p;
594	register struct select_args *uap;
595{
596	/*
597	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
598	 * infds with the new FD_SETSIZE of 1024, and more than enough for
599	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
600	 * of 256.
601	 */
602	fd_mask s_selbits[howmany(2048, NFDBITS)];
603	fd_mask *ibits[3], *obits[3], *selbits, *sbp;
604	struct timeval atv, rtv, ttv;
605	int s, ncoll, error, timo;
606	u_int nbufbytes, ncpbytes, nfdbits;
607
608	if (uap->nd < 0)
609		return (EINVAL);
610	if (uap->nd > p->p_fd->fd_nfiles)
611		uap->nd = p->p_fd->fd_nfiles;   /* forgiving; slightly wrong */
612
613	/*
614	 * Allocate just enough bits for the non-null fd_sets.  Use the
615	 * preallocated auto buffer if possible.
616	 */
617	nfdbits = roundup(uap->nd, NFDBITS);
618	ncpbytes = nfdbits / NBBY;
619	nbufbytes = 0;
620	if (uap->in != NULL)
621		nbufbytes += 2 * ncpbytes;
622	if (uap->ou != NULL)
623		nbufbytes += 2 * ncpbytes;
624	if (uap->ex != NULL)
625		nbufbytes += 2 * ncpbytes;
626	if (nbufbytes <= sizeof s_selbits)
627		selbits = &s_selbits[0];
628	else
629		selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);
630
631	/*
632	 * Assign pointers into the bit buffers and fetch the input bits.
633	 * Put the output buffers together so that they can be bzeroed
634	 * together.
635	 */
636	sbp = selbits;
637#define	getbits(name, x) \
638	do {								\
639		if (uap->name == NULL)					\
640			ibits[x] = NULL;				\
641		else {							\
642			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;	\
643			obits[x] = sbp;					\
644			sbp += ncpbytes / sizeof *sbp;			\
645			error = copyin(uap->name, ibits[x], ncpbytes);	\
646			if (error != 0)					\
647				goto done;				\
648		}							\
649	} while (0)
650	getbits(in, 0);
651	getbits(ou, 1);
652	getbits(ex, 2);
653#undef	getbits
654	if (nbufbytes != 0)
655		bzero(selbits, nbufbytes / 2);
656
657	if (uap->tv) {
658		error = copyin((caddr_t)uap->tv, (caddr_t)&atv,
659			sizeof (atv));
660		if (error)
661			goto done;
662		if (itimerfix(&atv)) {
663			error = EINVAL;
664			goto done;
665		}
666		getmicrouptime(&rtv);
667		timevaladd(&atv, &rtv);
668	} else
669		atv.tv_sec = 0;
670	timo = 0;
671retry:
672	ncoll = nselcoll;
673	p->p_flag |= P_SELECT;
674	error = selscan(p, ibits, obits, uap->nd);
675	if (error || p->p_retval[0])
676		goto done;
677	if (atv.tv_sec) {
678		getmicrouptime(&rtv);
679		if (timevalcmp(&rtv, &atv, >=))
680			goto done;
681		ttv = atv;
682		timevalsub(&ttv, &rtv);
683		timo = ttv.tv_sec > 24 * 60 * 60 ?
684		    24 * 60 * 60 * hz : tvtohz(&ttv);
685	}
686	s = splhigh();
687	if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
688		splx(s);
689		goto retry;
690	}
691	p->p_flag &= ~P_SELECT;
692	error = tsleep((caddr_t)&selwait, PSOCK | PCATCH, "select", timo);
693	splx(s);
694	if (error == 0)
695		goto retry;
696done:
697	p->p_flag &= ~P_SELECT;
698	/* select is not restarted after signals... */
699	if (error == ERESTART)
700		error = EINTR;
701	if (error == EWOULDBLOCK)
702		error = 0;
703#define	putbits(name, x) \
704	if (uap->name && (error2 = copyout(obits[x], uap->name, ncpbytes))) \
705		error = error2;
706	if (error == 0) {
707		int error2;
708
709		putbits(in, 0);
710		putbits(ou, 1);
711		putbits(ex, 2);
712#undef putbits
713	}
714	if (selbits != &s_selbits[0])
715		free(selbits, M_SELECT);
716	return (error);
717}
718
719static int
720selscan(p, ibits, obits, nfd)
721	struct proc *p;
722	fd_mask **ibits, **obits;
723	int nfd;
724{
725	register struct filedesc *fdp = p->p_fd;
726	register int msk, i, j, fd;
727	register fd_mask bits;
728	struct file *fp;
729	int n = 0;
730	/* Note: backend also returns POLLHUP/POLLERR if appropriate. */
731	static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
732
733	for (msk = 0; msk < 3; msk++) {
734		if (ibits[msk] == NULL)
735			continue;
736		for (i = 0; i < nfd; i += NFDBITS) {
737			bits = ibits[msk][i/NFDBITS];
738			while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
739				bits &= ~(1 << j);
740				fp = fdp->fd_ofiles[fd];
741				if (fp == NULL)
742					return (EBADF);
743				if ((*fp->f_ops->fo_poll)(fp, flag[msk],
744				    fp->f_cred, p)) {
745					obits[msk][(fd)/NFDBITS] |=
746						(1 << ((fd) % NFDBITS));
747					n++;
748				}
749			}
750		}
751	}
752	p->p_retval[0] = n;
753	return (0);
754}
755
756/*
757 * Poll system call.
758 */
759#ifndef _SYS_SYSPROTO_H_
760struct poll_args {
761	struct pollfd *fds;
762	u_int	nfds;
763	int	timeout;
764};
765#endif
766int
767poll(p, uap)
768	register struct proc *p;
769	register struct poll_args *uap;
770{
771	caddr_t bits;
772	char smallbits[32 * sizeof(struct pollfd)];
773	struct timeval atv, rtv, ttv;
774	int s, ncoll, error = 0, timo;
775	size_t ni;
776
777	if (SCARG(uap, nfds) > p->p_fd->fd_nfiles) {
778		/* forgiving; slightly wrong */
779		SCARG(uap, nfds) = p->p_fd->fd_nfiles;
780	}
781	ni = SCARG(uap, nfds) * sizeof(struct pollfd);
782	if (ni > sizeof(smallbits))
783		bits = malloc(ni, M_TEMP, M_WAITOK);
784	else
785		bits = smallbits;
786	error = copyin(SCARG(uap, fds), bits, ni);
787	if (error)
788		goto done;
789	if (SCARG(uap, timeout) != INFTIM) {
790		atv.tv_sec = SCARG(uap, timeout) / 1000;
791		atv.tv_usec = (SCARG(uap, timeout) % 1000) * 1000;
792		if (itimerfix(&atv)) {
793			error = EINVAL;
794			goto done;
795		}
796		getmicrouptime(&rtv);
797		timevaladd(&atv, &rtv);
798	} else
799		atv.tv_sec = 0;
800	timo = 0;
801retry:
802	ncoll = nselcoll;
803	p->p_flag |= P_SELECT;
804	error = pollscan(p, (struct pollfd *)bits, SCARG(uap, nfds));
805	if (error || p->p_retval[0])
806		goto done;
807	if (atv.tv_sec) {
808		getmicrouptime(&rtv);
809		if (timevalcmp(&rtv, &atv, >=))
810			goto done;
811		ttv = atv;
812		timevalsub(&ttv, &rtv);
813		timo = ttv.tv_sec > 24 * 60 * 60 ?
814		    24 * 60 * 60 * hz : tvtohz(&ttv);
815	}
816	s = splhigh();
817	if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
818		splx(s);
819		goto retry;
820	}
821	p->p_flag &= ~P_SELECT;
822	error = tsleep((caddr_t)&selwait, PSOCK | PCATCH, "poll", timo);
823	splx(s);
824	if (error == 0)
825		goto retry;
826done:
827	p->p_flag &= ~P_SELECT;
828	/* poll is not restarted after signals... */
829	if (error == ERESTART)
830		error = EINTR;
831	if (error == EWOULDBLOCK)
832		error = 0;
833	if (error == 0) {
834		error = copyout(bits, SCARG(uap, fds), ni);
835		if (error)
836			goto out;
837	}
838out:
839	if (ni > sizeof(smallbits))
840		free(bits, M_TEMP);
841	return (error);
842}
843
844static int
845pollscan(p, fds, nfd)
846	struct proc *p;
847	struct pollfd *fds;
848	int nfd;
849{
850	register struct filedesc *fdp = p->p_fd;
851	int i;
852	struct file *fp;
853	int n = 0;
854
855	for (i = 0; i < nfd; i++, fds++) {
856		if (fds->fd >= fdp->fd_nfiles) {
857			fds->revents = POLLNVAL;
858			n++;
859		} else if (fds->fd < 0) {
860			fds->revents = 0;
861		} else {
862			fp = fdp->fd_ofiles[fds->fd];
863			if (fp == 0) {
864				fds->revents = POLLNVAL;
865				n++;
866			} else {
867				/*
868				 * Note: backend also returns POLLHUP and
869				 * POLLERR if appropriate.
870				 */
871				fds->revents = (*fp->f_ops->fo_poll)(fp,
872				    fds->events, fp->f_cred, p);
873				if (fds->revents != 0)
874					n++;
875			}
876		}
877	}
878	p->p_retval[0] = n;
879	return (0);
880}
881
882/*
883 * OpenBSD poll system call.
884 * XXX this isn't quite a true representation..  OpenBSD uses select ops.
885 */
886#ifndef _SYS_SYSPROTO_H_
887struct openbsd_poll_args {
888	struct pollfd *fds;
889	u_int	nfds;
890	int	timeout;
891};
892#endif
893int
894openbsd_poll(p, uap)
895	register struct proc *p;
896	register struct openbsd_poll_args *uap;
897{
898	return (poll(p, (struct poll_args *)uap));
899}
900
/*
 * Poll routine that reports ordinary reads and writes as always ready:
 * returns the subset of `events' made up of POLLIN, POLLOUT, POLLRDNORM
 * and POLLWRNORM.  `dev' and `p' are not used.
 */
/*ARGSUSED*/
int
seltrue(dev, events, p)
	dev_t dev;
	int events;
	struct proc *p;
{
	int always_ready;

	always_ready = POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM;
	return (events & always_ready);
}
911
912/*
913 * Record a select request.
914 */
915void
916selrecord(selector, sip)
917	struct proc *selector;
918	struct selinfo *sip;
919{
920	struct proc *p;
921	pid_t mypid;
922
923	mypid = selector->p_pid;
924	if (sip->si_pid == mypid)
925		return;
926	if (sip->si_pid && (p = pfind(sip->si_pid)) &&
927	    p->p_wchan == (caddr_t)&selwait)
928		sip->si_flags |= SI_COLL;
929	else
930		sip->si_pid = mypid;
931}
932
933/*
934 * Do a wakeup when a selectable event occurs.
935 */
936void
937selwakeup(sip)
938	register struct selinfo *sip;
939{
940	register struct proc *p;
941	int s;
942
943	if (sip->si_pid == 0)
944		return;
945	if (sip->si_flags & SI_COLL) {
946		nselcoll++;
947		sip->si_flags &= ~SI_COLL;
948		wakeup((caddr_t)&selwait);
949	}
950	p = pfind(sip->si_pid);
951	sip->si_pid = 0;
952	if (p != NULL) {
953		s = splhigh();
954		if (p->p_wchan == (caddr_t)&selwait) {
955			if (p->p_stat == SSLEEP)
956				setrunnable(p);
957			else
958				unsleep(p);
959		} else if (p->p_flag & P_SELECT)
960			p->p_flag &= ~P_SELECT;
961		splx(s);
962	}
963}
964