sys_generic.c revision 72146
1/*
2 * Copyright (c) 1982, 1986, 1989, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 *    must display the following acknowledgement:
20 *	This product includes software developed by the University of
21 *	California, Berkeley and its contributors.
22 * 4. Neither the name of the University nor the names of its contributors
23 *    may be used to endorse or promote products derived from this software
24 *    without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * SUCH DAMAGE.
37 *
38 *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
39 * $FreeBSD: head/sys/kern/sys_generic.c 72146 2001-02-07 23:28:01Z peter $
40 */
41
42#include "opt_ktrace.h"
43
44#include <sys/param.h>
45#include <sys/systm.h>
46#include <sys/sysproto.h>
47#include <sys/filedesc.h>
48#include <sys/filio.h>
49#include <sys/fcntl.h>
50#include <sys/file.h>
51#include <sys/proc.h>
52#include <sys/signalvar.h>
53#include <sys/socketvar.h>
54#include <sys/uio.h>
55#include <sys/kernel.h>
56#include <sys/malloc.h>
57#include <sys/poll.h>
58#include <sys/resourcevar.h>
59#include <sys/selinfo.h>
60#include <sys/sysctl.h>
61#include <sys/sysent.h>
62#include <sys/bio.h>
63#include <sys/buf.h>
64#ifdef KTRACE
65#include <sys/ktrace.h>
66#endif
67#include <vm/vm.h>
68#include <vm/vm_page.h>
69
70#include <machine/limits.h>
71
72static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
73static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
74MALLOC_DEFINE(M_IOV, "iov", "large iov's");
75
76static int	pollscan __P((struct proc *, struct pollfd *, int));
77static int	selscan __P((struct proc *, fd_mask **, fd_mask **, int));
78static int	dofileread __P((struct proc *, struct file *, int, void *,
79		    size_t, off_t, int));
80static int	dofilewrite __P((struct proc *, struct file *, int,
81		    const void *, size_t, off_t, int));
82
83struct file*
84holdfp(fdp, fd, flag)
85	struct filedesc* fdp;
86	int fd, flag;
87{
88	struct file* fp;
89
90	if (((u_int)fd) >= fdp->fd_nfiles ||
91	    (fp = fdp->fd_ofiles[fd]) == NULL ||
92	    (fp->f_flag & flag) == 0) {
93		return (NULL);
94	}
95	fhold(fp);
96	return (fp);
97}
98
99/*
100 * Read system call.
101 */
102#ifndef _SYS_SYSPROTO_H_
103struct read_args {
104	int	fd;
105	void	*buf;
106	size_t	nbyte;
107};
108#endif
109int
110read(p, uap)
111	struct proc *p;
112	register struct read_args *uap;
113{
114	register struct file *fp;
115	int error;
116
117	if ((fp = holdfp(p->p_fd, uap->fd, FREAD)) == NULL)
118		return (EBADF);
119	error = dofileread(p, fp, uap->fd, uap->buf, uap->nbyte, (off_t)-1, 0);
120	fdrop(fp, p);
121	return(error);
122}
123
124/*
125 * Pread system call
126 */
127#ifndef _SYS_SYSPROTO_H_
128struct pread_args {
129	int	fd;
130	void	*buf;
131	size_t	nbyte;
132	int	pad;
133	off_t	offset;
134};
135#endif
136int
137pread(p, uap)
138	struct proc *p;
139	register struct pread_args *uap;
140{
141	register struct file *fp;
142	int error;
143
144	if ((fp = holdfp(p->p_fd, uap->fd, FREAD)) == NULL)
145		return (EBADF);
146	if (fp->f_type != DTYPE_VNODE) {
147		error = ESPIPE;
148	} else {
149	    error = dofileread(p, fp, uap->fd, uap->buf, uap->nbyte,
150		uap->offset, FOF_OFFSET);
151	}
152	fdrop(fp, p);
153	return(error);
154}
155
156/*
157 * Code common for read and pread
158 */
159int
160dofileread(p, fp, fd, buf, nbyte, offset, flags)
161	struct proc *p;
162	struct file *fp;
163	int fd, flags;
164	void *buf;
165	size_t nbyte;
166	off_t offset;
167{
168	struct uio auio;
169	struct iovec aiov;
170	long cnt, error = 0;
171#ifdef KTRACE
172	struct iovec ktriov;
173	struct uio ktruio;
174	int didktr = 0;
175#endif
176
177	aiov.iov_base = (caddr_t)buf;
178	aiov.iov_len = nbyte;
179	auio.uio_iov = &aiov;
180	auio.uio_iovcnt = 1;
181	auio.uio_offset = offset;
182	if (nbyte > INT_MAX)
183		return (EINVAL);
184	auio.uio_resid = nbyte;
185	auio.uio_rw = UIO_READ;
186	auio.uio_segflg = UIO_USERSPACE;
187	auio.uio_procp = p;
188#ifdef KTRACE
189	/*
190	 * if tracing, save a copy of iovec
191	 */
192	if (KTRPOINT(p, KTR_GENIO)) {
193		ktriov = aiov;
194		ktruio = auio;
195		didktr = 1;
196	}
197#endif
198	cnt = nbyte;
199
200	if ((error = fo_read(fp, &auio, fp->f_cred, flags, p))) {
201		if (auio.uio_resid != cnt && (error == ERESTART ||
202		    error == EINTR || error == EWOULDBLOCK))
203			error = 0;
204	}
205	cnt -= auio.uio_resid;
206#ifdef KTRACE
207	if (didktr && error == 0) {
208		ktruio.uio_iov = &ktriov;
209		ktruio.uio_resid = cnt;
210		ktrgenio(p->p_tracep, fd, UIO_READ, &ktruio, error);
211	}
212#endif
213	p->p_retval[0] = cnt;
214	return (error);
215}
216
217/*
218 * Scatter read system call.
219 */
220#ifndef _SYS_SYSPROTO_H_
221struct readv_args {
222	int	fd;
223	struct	iovec *iovp;
224	u_int	iovcnt;
225};
226#endif
227int
228readv(p, uap)
229	struct proc *p;
230	register struct readv_args *uap;
231{
232	register struct file *fp;
233	register struct filedesc *fdp = p->p_fd;
234	struct uio auio;
235	register struct iovec *iov;
236	struct iovec *needfree;
237	struct iovec aiov[UIO_SMALLIOV];
238	long i, cnt, error = 0;
239	u_int iovlen;
240#ifdef KTRACE
241	struct iovec *ktriov = NULL;
242	struct uio ktruio;
243#endif
244
245	if ((fp = holdfp(fdp, uap->fd, FREAD)) == NULL)
246		return (EBADF);
247	/* note: can't use iovlen until iovcnt is validated */
248	iovlen = uap->iovcnt * sizeof (struct iovec);
249	if (uap->iovcnt > UIO_SMALLIOV) {
250		if (uap->iovcnt > UIO_MAXIOV)
251			return (EINVAL);
252		MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
253		needfree = iov;
254	} else {
255		iov = aiov;
256		needfree = NULL;
257	}
258	auio.uio_iov = iov;
259	auio.uio_iovcnt = uap->iovcnt;
260	auio.uio_rw = UIO_READ;
261	auio.uio_segflg = UIO_USERSPACE;
262	auio.uio_procp = p;
263	auio.uio_offset = -1;
264	if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen)))
265		goto done;
266	auio.uio_resid = 0;
267	for (i = 0; i < uap->iovcnt; i++) {
268		if (iov->iov_len > INT_MAX - auio.uio_resid) {
269			error = EINVAL;
270			goto done;
271		}
272		auio.uio_resid += iov->iov_len;
273		iov++;
274	}
275#ifdef KTRACE
276	/*
277	 * if tracing, save a copy of iovec
278	 */
279	if (KTRPOINT(p, KTR_GENIO))  {
280		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
281		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
282		ktruio = auio;
283	}
284#endif
285	cnt = auio.uio_resid;
286	if ((error = fo_read(fp, &auio, fp->f_cred, 0, p))) {
287		if (auio.uio_resid != cnt && (error == ERESTART ||
288		    error == EINTR || error == EWOULDBLOCK))
289			error = 0;
290	}
291	cnt -= auio.uio_resid;
292#ifdef KTRACE
293	if (ktriov != NULL) {
294		if (error == 0) {
295			ktruio.uio_iov = ktriov;
296			ktruio.uio_resid = cnt;
297			ktrgenio(p->p_tracep, uap->fd, UIO_READ, &ktruio,
298			    error);
299		}
300		FREE(ktriov, M_TEMP);
301	}
302#endif
303	p->p_retval[0] = cnt;
304done:
305	fdrop(fp, p);
306	if (needfree)
307		FREE(needfree, M_IOV);
308	return (error);
309}
310
311/*
312 * Write system call
313 */
314#ifndef _SYS_SYSPROTO_H_
315struct write_args {
316	int	fd;
317	const void *buf;
318	size_t	nbyte;
319};
320#endif
321int
322write(p, uap)
323	struct proc *p;
324	register struct write_args *uap;
325{
326	register struct file *fp;
327	int error;
328
329	if ((fp = holdfp(p->p_fd, uap->fd, FWRITE)) == NULL)
330		return (EBADF);
331	error = dofilewrite(p, fp, uap->fd, uap->buf, uap->nbyte, (off_t)-1, 0);
332	fdrop(fp, p);
333	return(error);
334}
335
336/*
337 * Pwrite system call
338 */
339#ifndef _SYS_SYSPROTO_H_
340struct pwrite_args {
341	int	fd;
342	const void *buf;
343	size_t	nbyte;
344	int	pad;
345	off_t	offset;
346};
347#endif
348int
349pwrite(p, uap)
350	struct proc *p;
351	register struct pwrite_args *uap;
352{
353	register struct file *fp;
354	int error;
355
356	if ((fp = holdfp(p->p_fd, uap->fd, FWRITE)) == NULL)
357		return (EBADF);
358	if (fp->f_type != DTYPE_VNODE) {
359		error = ESPIPE;
360	} else {
361	    error = dofilewrite(p, fp, uap->fd, uap->buf, uap->nbyte,
362		uap->offset, FOF_OFFSET);
363	}
364	fdrop(fp, p);
365	return(error);
366}
367
368static int
369dofilewrite(p, fp, fd, buf, nbyte, offset, flags)
370	struct proc *p;
371	struct file *fp;
372	int fd, flags;
373	const void *buf;
374	size_t nbyte;
375	off_t offset;
376{
377	struct uio auio;
378	struct iovec aiov;
379	long cnt, error = 0;
380#ifdef KTRACE
381	struct iovec ktriov;
382	struct uio ktruio;
383	int didktr = 0;
384#endif
385
386	aiov.iov_base = (void *)(uintptr_t)buf;
387	aiov.iov_len = nbyte;
388	auio.uio_iov = &aiov;
389	auio.uio_iovcnt = 1;
390	auio.uio_offset = offset;
391	if (nbyte > INT_MAX)
392		return (EINVAL);
393	auio.uio_resid = nbyte;
394	auio.uio_rw = UIO_WRITE;
395	auio.uio_segflg = UIO_USERSPACE;
396	auio.uio_procp = p;
397#ifdef KTRACE
398	/*
399	 * if tracing, save a copy of iovec and uio
400	 */
401	if (KTRPOINT(p, KTR_GENIO)) {
402		ktriov = aiov;
403		ktruio = auio;
404		didktr = 1;
405	}
406#endif
407	cnt = nbyte;
408	if (fp->f_type == DTYPE_VNODE)
409		bwillwrite();
410	if ((error = fo_write(fp, &auio, fp->f_cred, flags, p))) {
411		if (auio.uio_resid != cnt && (error == ERESTART ||
412		    error == EINTR || error == EWOULDBLOCK))
413			error = 0;
414		if (error == EPIPE)
415			psignal(p, SIGPIPE);
416	}
417	cnt -= auio.uio_resid;
418#ifdef KTRACE
419	if (didktr && error == 0) {
420		ktruio.uio_iov = &ktriov;
421		ktruio.uio_resid = cnt;
422		ktrgenio(p->p_tracep, fd, UIO_WRITE, &ktruio, error);
423	}
424#endif
425	p->p_retval[0] = cnt;
426	return (error);
427}
428
429/*
430 * Gather write system call
431 */
432#ifndef _SYS_SYSPROTO_H_
433struct writev_args {
434	int	fd;
435	struct	iovec *iovp;
436	u_int	iovcnt;
437};
438#endif
439int
440writev(p, uap)
441	struct proc *p;
442	register struct writev_args *uap;
443{
444	register struct file *fp;
445	register struct filedesc *fdp = p->p_fd;
446	struct uio auio;
447	register struct iovec *iov;
448	struct iovec *needfree;
449	struct iovec aiov[UIO_SMALLIOV];
450	long i, cnt, error = 0;
451	u_int iovlen;
452#ifdef KTRACE
453	struct iovec *ktriov = NULL;
454	struct uio ktruio;
455#endif
456
457	if ((fp = holdfp(fdp, uap->fd, FWRITE)) == NULL)
458		return (EBADF);
459	/* note: can't use iovlen until iovcnt is validated */
460	iovlen = uap->iovcnt * sizeof (struct iovec);
461	if (uap->iovcnt > UIO_SMALLIOV) {
462		if (uap->iovcnt > UIO_MAXIOV) {
463			needfree = NULL;
464			error = EINVAL;
465			goto done;
466		}
467		MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
468		needfree = iov;
469	} else {
470		iov = aiov;
471		needfree = NULL;
472	}
473	auio.uio_iov = iov;
474	auio.uio_iovcnt = uap->iovcnt;
475	auio.uio_rw = UIO_WRITE;
476	auio.uio_segflg = UIO_USERSPACE;
477	auio.uio_procp = p;
478	auio.uio_offset = -1;
479	if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen)))
480		goto done;
481	auio.uio_resid = 0;
482	for (i = 0; i < uap->iovcnt; i++) {
483		if (iov->iov_len > INT_MAX - auio.uio_resid) {
484			error = EINVAL;
485			goto done;
486		}
487		auio.uio_resid += iov->iov_len;
488		iov++;
489	}
490#ifdef KTRACE
491	/*
492	 * if tracing, save a copy of iovec and uio
493	 */
494	if (KTRPOINT(p, KTR_GENIO))  {
495		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
496		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
497		ktruio = auio;
498	}
499#endif
500	cnt = auio.uio_resid;
501	if (fp->f_type == DTYPE_VNODE)
502		bwillwrite();
503	if ((error = fo_write(fp, &auio, fp->f_cred, 0, p))) {
504		if (auio.uio_resid != cnt && (error == ERESTART ||
505		    error == EINTR || error == EWOULDBLOCK))
506			error = 0;
507		if (error == EPIPE)
508			psignal(p, SIGPIPE);
509	}
510	cnt -= auio.uio_resid;
511#ifdef KTRACE
512	if (ktriov != NULL) {
513		if (error == 0) {
514			ktruio.uio_iov = ktriov;
515			ktruio.uio_resid = cnt;
516			ktrgenio(p->p_tracep, uap->fd, UIO_WRITE, &ktruio,
517			    error);
518		}
519		FREE(ktriov, M_TEMP);
520	}
521#endif
522	p->p_retval[0] = cnt;
523done:
524	fdrop(fp, p);
525	if (needfree)
526		FREE(needfree, M_IOV);
527	return (error);
528}
529
530/*
531 * Ioctl system call
532 */
533#ifndef _SYS_SYSPROTO_H_
534struct ioctl_args {
535	int	fd;
536	u_long	com;
537	caddr_t	data;
538};
539#endif
540/* ARGSUSED */
541int
542ioctl(p, uap)
543	struct proc *p;
544	register struct ioctl_args *uap;
545{
546	register struct file *fp;
547	register struct filedesc *fdp;
548	register u_long com;
549	int error;
550	register u_int size;
551	caddr_t data, memp;
552	int tmp;
553#define STK_PARAMS	128
554	union {
555	    char stkbuf[STK_PARAMS];
556	    long align;
557	} ubuf;
558
559	fdp = p->p_fd;
560	if ((u_int)uap->fd >= fdp->fd_nfiles ||
561	    (fp = fdp->fd_ofiles[uap->fd]) == NULL)
562		return (EBADF);
563
564	if ((fp->f_flag & (FREAD | FWRITE)) == 0)
565		return (EBADF);
566
567	switch (com = uap->com) {
568	case FIONCLEX:
569		fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE;
570		return (0);
571	case FIOCLEX:
572		fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE;
573		return (0);
574	}
575
576	/*
577	 * Interpret high order word to find amount of data to be
578	 * copied to/from the user's address space.
579	 */
580	size = IOCPARM_LEN(com);
581	if (size > IOCPARM_MAX)
582		return (ENOTTY);
583
584	fhold(fp);
585
586	memp = NULL;
587	if (size > sizeof (ubuf.stkbuf)) {
588		memp = (caddr_t)malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
589		data = memp;
590	} else {
591		data = ubuf.stkbuf;
592	}
593	if (com&IOC_IN) {
594		if (size) {
595			error = copyin(uap->data, data, (u_int)size);
596			if (error) {
597				if (memp)
598					free(memp, M_IOCTLOPS);
599				fdrop(fp, p);
600				return (error);
601			}
602		} else {
603			*(caddr_t *)data = uap->data;
604		}
605	} else if ((com&IOC_OUT) && size) {
606		/*
607		 * Zero the buffer so the user always
608		 * gets back something deterministic.
609		 */
610		bzero(data, size);
611	} else if (com&IOC_VOID) {
612		*(caddr_t *)data = uap->data;
613	}
614
615	switch (com) {
616
617	case FIONBIO:
618		if ((tmp = *(int *)data))
619			fp->f_flag |= FNONBLOCK;
620		else
621			fp->f_flag &= ~FNONBLOCK;
622		error = fo_ioctl(fp, FIONBIO, (caddr_t)&tmp, p);
623		break;
624
625	case FIOASYNC:
626		if ((tmp = *(int *)data))
627			fp->f_flag |= FASYNC;
628		else
629			fp->f_flag &= ~FASYNC;
630		error = fo_ioctl(fp, FIOASYNC, (caddr_t)&tmp, p);
631		break;
632
633	default:
634		error = fo_ioctl(fp, com, data, p);
635		/*
636		 * Copy any data to user, size was
637		 * already set and checked above.
638		 */
639		if (error == 0 && (com&IOC_OUT) && size)
640			error = copyout(data, uap->data, (u_int)size);
641		break;
642	}
643	if (memp)
644		free(memp, M_IOCTLOPS);
645	fdrop(fp, p);
646	return (error);
647}
648
649static int	nselcoll;	/* Select collisions since boot */
650int	selwait;
651SYSCTL_INT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");
652
653/*
654 * Select system call.
655 */
656#ifndef _SYS_SYSPROTO_H_
657struct select_args {
658	int	nd;
659	fd_set	*in, *ou, *ex;
660	struct	timeval *tv;
661};
662#endif
663int
664select(p, uap)
665	register struct proc *p;
666	register struct select_args *uap;
667{
668	/*
669	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
670	 * infds with the new FD_SETSIZE of 1024, and more than enough for
671	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
672	 * of 256.
673	 */
674	fd_mask s_selbits[howmany(2048, NFDBITS)];
675	fd_mask *ibits[3], *obits[3], *selbits, *sbp;
676	struct timeval atv, rtv, ttv;
677	int s, ncoll, error, timo;
678	u_int nbufbytes, ncpbytes, nfdbits;
679
680	if (uap->nd < 0)
681		return (EINVAL);
682	if (uap->nd > p->p_fd->fd_nfiles)
683		uap->nd = p->p_fd->fd_nfiles;   /* forgiving; slightly wrong */
684
685	/*
686	 * Allocate just enough bits for the non-null fd_sets.  Use the
687	 * preallocated auto buffer if possible.
688	 */
689	nfdbits = roundup(uap->nd, NFDBITS);
690	ncpbytes = nfdbits / NBBY;
691	nbufbytes = 0;
692	if (uap->in != NULL)
693		nbufbytes += 2 * ncpbytes;
694	if (uap->ou != NULL)
695		nbufbytes += 2 * ncpbytes;
696	if (uap->ex != NULL)
697		nbufbytes += 2 * ncpbytes;
698	if (nbufbytes <= sizeof s_selbits)
699		selbits = &s_selbits[0];
700	else
701		selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);
702
703	/*
704	 * Assign pointers into the bit buffers and fetch the input bits.
705	 * Put the output buffers together so that they can be bzeroed
706	 * together.
707	 */
708	sbp = selbits;
709#define	getbits(name, x) \
710	do {								\
711		if (uap->name == NULL)					\
712			ibits[x] = NULL;				\
713		else {							\
714			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;	\
715			obits[x] = sbp;					\
716			sbp += ncpbytes / sizeof *sbp;			\
717			error = copyin(uap->name, ibits[x], ncpbytes);	\
718			if (error != 0)	{				\
719				PROC_LOCK(p);				\
720				goto done;				\
721			}						\
722		}							\
723	} while (0)
724	getbits(in, 0);
725	getbits(ou, 1);
726	getbits(ex, 2);
727#undef	getbits
728	if (nbufbytes != 0)
729		bzero(selbits, nbufbytes / 2);
730
731	if (uap->tv) {
732		error = copyin((caddr_t)uap->tv, (caddr_t)&atv,
733			sizeof (atv));
734		if (error) {
735			PROC_LOCK(p);
736			goto done;
737		}
738		if (itimerfix(&atv)) {
739			error = EINVAL;
740			PROC_LOCK(p);
741			goto done;
742		}
743		getmicrouptime(&rtv);
744		timevaladd(&atv, &rtv);
745	} else {
746		atv.tv_sec = 0;
747		atv.tv_usec = 0;
748	}
749	timo = 0;
750	PROC_LOCK(p);
751retry:
752	ncoll = nselcoll;
753	p->p_flag |= P_SELECT;
754	PROC_UNLOCK(p);
755	error = selscan(p, ibits, obits, uap->nd);
756	PROC_LOCK(p);
757	if (error || p->p_retval[0])
758		goto done;
759	if (atv.tv_sec || atv.tv_usec) {
760		getmicrouptime(&rtv);
761		if (timevalcmp(&rtv, &atv, >=))
762			goto done;
763		ttv = atv;
764		timevalsub(&ttv, &rtv);
765		timo = ttv.tv_sec > 24 * 60 * 60 ?
766		    24 * 60 * 60 * hz : tvtohz(&ttv);
767	}
768	s = splhigh();
769	if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
770		splx(s);
771		goto retry;
772	}
773	p->p_flag &= ~P_SELECT;
774
775	error = msleep((caddr_t)&selwait, &p->p_mtx, PSOCK | PCATCH, "select",
776	    timo);
777
778	splx(s);
779	if (error == 0)
780		goto retry;
781done:
782	p->p_flag &= ~P_SELECT;
783	PROC_UNLOCK(p);
784	/* select is not restarted after signals... */
785	if (error == ERESTART)
786		error = EINTR;
787	if (error == EWOULDBLOCK)
788		error = 0;
789#define	putbits(name, x) \
790	if (uap->name && (error2 = copyout(obits[x], uap->name, ncpbytes))) \
791		error = error2;
792	if (error == 0) {
793		int error2;
794
795		putbits(in, 0);
796		putbits(ou, 1);
797		putbits(ex, 2);
798#undef putbits
799	}
800	if (selbits != &s_selbits[0])
801		free(selbits, M_SELECT);
802	return (error);
803}
804
805static int
806selscan(p, ibits, obits, nfd)
807	struct proc *p;
808	fd_mask **ibits, **obits;
809	int nfd;
810{
811	struct filedesc *fdp = p->p_fd;
812	int msk, i, fd;
813	fd_mask bits;
814	struct file *fp;
815	int n = 0;
816	/* Note: backend also returns POLLHUP/POLLERR if appropriate. */
817	static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
818
819	for (msk = 0; msk < 3; msk++) {
820		if (ibits[msk] == NULL)
821			continue;
822		for (i = 0; i < nfd; i += NFDBITS) {
823			bits = ibits[msk][i/NFDBITS];
824			/* ffs(int mask) not portable, fd_mask is long */
825			for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
826				if (!(bits & 1))
827					continue;
828				fp = fdp->fd_ofiles[fd];
829				if (fp == NULL)
830					return (EBADF);
831				if (fo_poll(fp, flag[msk], fp->f_cred, p)) {
832					obits[msk][(fd)/NFDBITS] |=
833					    ((fd_mask)1 << ((fd) % NFDBITS));
834					n++;
835				}
836			}
837		}
838	}
839	p->p_retval[0] = n;
840	return (0);
841}
842
843/*
844 * Poll system call.
845 */
846#ifndef _SYS_SYSPROTO_H_
847struct poll_args {
848	struct pollfd *fds;
849	u_int	nfds;
850	int	timeout;
851};
852#endif
853int
854poll(p, uap)
855	register struct proc *p;
856	register struct poll_args *uap;
857{
858	caddr_t bits;
859	char smallbits[32 * sizeof(struct pollfd)];
860	struct timeval atv, rtv, ttv;
861	int s, ncoll, error = 0, timo, lim, nfds;
862	size_t ni;
863
864	nfds = SCARG(uap, nfds);
865	/*
866	 * This is kinda bogus.  We have fd limits, but that doesn't
867	 * map too well to the size of the pfd[] array.  Make sure
868	 * we let the process use at least FD_SETSIZE entries.
869	 * The specs say we only have to support OPEN_MAX entries (64).
870	 */
871	lim = min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfilesperproc);
872	lim = min(lim, FD_SETSIZE);
873	if (nfds > lim)
874		return (EINVAL);
875	ni = nfds * sizeof(struct pollfd);
876	if (ni > sizeof(smallbits))
877		bits = malloc(ni, M_TEMP, M_WAITOK);
878	else
879		bits = smallbits;
880	error = copyin(SCARG(uap, fds), bits, ni);
881	PROC_LOCK(p);
882	if (error)
883		goto done;
884	if (SCARG(uap, timeout) != INFTIM) {
885		atv.tv_sec = SCARG(uap, timeout) / 1000;
886		atv.tv_usec = (SCARG(uap, timeout) % 1000) * 1000;
887		if (itimerfix(&atv)) {
888			error = EINVAL;
889			goto done;
890		}
891		getmicrouptime(&rtv);
892		timevaladd(&atv, &rtv);
893	} else {
894		atv.tv_sec = 0;
895		atv.tv_usec = 0;
896	}
897	timo = 0;
898retry:
899	ncoll = nselcoll;
900	p->p_flag |= P_SELECT;
901	PROC_UNLOCK(p);
902	error = pollscan(p, (struct pollfd *)bits, nfds);
903	PROC_LOCK(p);
904	if (error || p->p_retval[0])
905		goto done;
906	if (atv.tv_sec || atv.tv_usec) {
907		getmicrouptime(&rtv);
908		if (timevalcmp(&rtv, &atv, >=))
909			goto done;
910		ttv = atv;
911		timevalsub(&ttv, &rtv);
912		timo = ttv.tv_sec > 24 * 60 * 60 ?
913		    24 * 60 * 60 * hz : tvtohz(&ttv);
914	}
915	s = splhigh();
916	if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
917		splx(s);
918		goto retry;
919	}
920	p->p_flag &= ~P_SELECT;
921	error = msleep((caddr_t)&selwait, &p->p_mtx, PSOCK | PCATCH, "poll",
922	    timo);
923	splx(s);
924	if (error == 0)
925		goto retry;
926done:
927	p->p_flag &= ~P_SELECT;
928	PROC_UNLOCK(p);
929	/* poll is not restarted after signals... */
930	if (error == ERESTART)
931		error = EINTR;
932	if (error == EWOULDBLOCK)
933		error = 0;
934	if (error == 0) {
935		error = copyout(bits, SCARG(uap, fds), ni);
936		if (error)
937			goto out;
938	}
939out:
940	if (ni > sizeof(smallbits))
941		free(bits, M_TEMP);
942	return (error);
943}
944
945static int
946pollscan(p, fds, nfd)
947	struct proc *p;
948	struct pollfd *fds;
949	int nfd;
950{
951	register struct filedesc *fdp = p->p_fd;
952	int i;
953	struct file *fp;
954	int n = 0;
955
956	for (i = 0; i < nfd; i++, fds++) {
957		if (fds->fd >= fdp->fd_nfiles) {
958			fds->revents = POLLNVAL;
959			n++;
960		} else if (fds->fd < 0) {
961			fds->revents = 0;
962		} else {
963			fp = fdp->fd_ofiles[fds->fd];
964			if (fp == NULL) {
965				fds->revents = POLLNVAL;
966				n++;
967			} else {
968				/*
969				 * Note: backend also returns POLLHUP and
970				 * POLLERR if appropriate.
971				 */
972				fds->revents = fo_poll(fp, fds->events,
973				    fp->f_cred, p);
974				if (fds->revents != 0)
975					n++;
976			}
977		}
978	}
979	p->p_retval[0] = n;
980	return (0);
981}
982
983/*
984 * OpenBSD poll system call.
985 * XXX this isn't quite a true representation..  OpenBSD uses select ops.
986 */
987#ifndef _SYS_SYSPROTO_H_
988struct openbsd_poll_args {
989	struct pollfd *fds;
990	u_int	nfds;
991	int	timeout;
992};
993#endif
994int
995openbsd_poll(p, uap)
996	register struct proc *p;
997	register struct openbsd_poll_args *uap;
998{
999	return (poll(p, (struct poll_args *)uap));
1000}
1001
/*
 * Poll routine that reports "always ready": returns whichever of the
 * normal read/write event bits were requested in events.  dev and p are
 * not consulted.
 */
/*ARGSUSED*/
int
seltrue(dev, events, p)
	dev_t dev;
	int events;
	struct proc *p;
{
	int always_ready;

	always_ready = POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM;
	return (events & always_ready);
}
1012
1013/*
1014 * Record a select request.
1015 */
1016void
1017selrecord(selector, sip)
1018	struct proc *selector;
1019	struct selinfo *sip;
1020{
1021	struct proc *p;
1022	pid_t mypid;
1023
1024	mypid = selector->p_pid;
1025	if (sip->si_pid == mypid)
1026		return;
1027	if (sip->si_pid && (p = pfind(sip->si_pid))) {
1028		mtx_enter(&sched_lock, MTX_SPIN);
1029	    	if (p->p_wchan == (caddr_t)&selwait) {
1030			mtx_exit(&sched_lock, MTX_SPIN);
1031			sip->si_flags |= SI_COLL;
1032			return;
1033		}
1034		mtx_exit(&sched_lock, MTX_SPIN);
1035	}
1036	sip->si_pid = mypid;
1037}
1038
1039/*
1040 * Do a wakeup when a selectable event occurs.
1041 */
1042void
1043selwakeup(sip)
1044	register struct selinfo *sip;
1045{
1046	register struct proc *p;
1047
1048	if (sip->si_pid == 0)
1049		return;
1050	if (sip->si_flags & SI_COLL) {
1051		nselcoll++;
1052		sip->si_flags &= ~SI_COLL;
1053		wakeup((caddr_t)&selwait);
1054	}
1055	p = pfind(sip->si_pid);
1056	sip->si_pid = 0;
1057	if (p != NULL) {
1058		mtx_enter(&sched_lock, MTX_SPIN);
1059		if (p->p_wchan == (caddr_t)&selwait) {
1060			if (p->p_stat == SSLEEP)
1061				setrunnable(p);
1062			else
1063				unsleep(p);
1064			mtx_exit(&sched_lock, MTX_SPIN);
1065		} else {
1066			mtx_exit(&sched_lock, MTX_SPIN);
1067			PROC_LOCK(p);
1068			p->p_flag &= ~P_SELECT;
1069			PROC_UNLOCK(p);
1070		}
1071	}
1072}
1073