sys_generic.c revision 63974
1/*
2 * Copyright (c) 1982, 1986, 1989, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 *    must display the following acknowledgement:
20 *	This product includes software developed by the University of
21 *	California, Berkeley and its contributors.
22 * 4. Neither the name of the University nor the names of its contributors
23 *    may be used to endorse or promote products derived from this software
24 *    without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * SUCH DAMAGE.
37 *
38 *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
39 * $FreeBSD: head/sys/kern/sys_generic.c 63974 2000-07-28 22:17:42Z peter $
40 */
41
42#include "opt_ktrace.h"
43
44#include <sys/param.h>
45#include <sys/systm.h>
46#include <sys/sysproto.h>
47#include <sys/filedesc.h>
48#include <sys/filio.h>
49#include <sys/fcntl.h>
50#include <sys/file.h>
51#include <sys/proc.h>
52#include <sys/signalvar.h>
53#include <sys/socketvar.h>
54#include <sys/uio.h>
55#include <sys/kernel.h>
56#include <sys/malloc.h>
57#include <sys/poll.h>
58#include <sys/sysctl.h>
59#include <sys/sysent.h>
60#ifdef KTRACE
61#include <sys/ktrace.h>
62#endif
63
64#include <machine/limits.h>
65
66static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
67static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
68MALLOC_DEFINE(M_IOV, "iov", "large iov's");
69
70static int	pollscan __P((struct proc *, struct pollfd *, int));
71static int	selscan __P((struct proc *, fd_mask **, fd_mask **, int));
72static int	dofileread __P((struct proc *, struct file *, int, void *,
73		    size_t, off_t, int));
74static int	dofilewrite __P((struct proc *, struct file *, int,
75		    const void *, size_t, off_t, int));
76
77struct file*
78getfp(fdp, fd, flag)
79	struct filedesc* fdp;
80	int fd, flag;
81{
82	struct file* fp;
83
84	if (((u_int)fd) >= fdp->fd_nfiles ||
85	    (fp = fdp->fd_ofiles[fd]) == NULL ||
86	    (fp->f_flag & flag) == 0)
87		return (NULL);
88	return (fp);
89}
90
91/*
92 * Read system call.
93 */
94#ifndef _SYS_SYSPROTO_H_
95struct read_args {
96	int	fd;
97	void	*buf;
98	size_t	nbyte;
99};
100#endif
101int
102read(p, uap)
103	struct proc *p;
104	register struct read_args *uap;
105{
106	register struct file *fp;
107
108	if ((fp = getfp(p->p_fd, uap->fd, FREAD)) == NULL)
109		return (EBADF);
110	return (dofileread(p, fp, uap->fd, uap->buf, uap->nbyte, (off_t)-1, 0));
111}
112
113/*
114 * Pread system call
115 */
116#ifndef _SYS_SYSPROTO_H_
117struct pread_args {
118	int	fd;
119	void	*buf;
120	size_t	nbyte;
121	int	pad;
122	off_t	offset;
123};
124#endif
125int
126pread(p, uap)
127	struct proc *p;
128	register struct pread_args *uap;
129{
130	register struct file *fp;
131
132	if ((fp = getfp(p->p_fd, uap->fd, FREAD)) == NULL)
133		return (EBADF);
134	if (fp->f_type != DTYPE_VNODE)
135		return (ESPIPE);
136	return (dofileread(p, fp, uap->fd, uap->buf, uap->nbyte, uap->offset,
137	    FOF_OFFSET));
138}
139
140/*
141 * Code common for read and pread
142 */
143int
144dofileread(p, fp, fd, buf, nbyte, offset, flags)
145	struct proc *p;
146	struct file *fp;
147	int fd, flags;
148	void *buf;
149	size_t nbyte;
150	off_t offset;
151{
152	struct uio auio;
153	struct iovec aiov;
154	long cnt, error = 0;
155#ifdef KTRACE
156	struct iovec ktriov;
157	struct uio ktruio;
158	int didktr = 0;
159#endif
160
161	aiov.iov_base = (caddr_t)buf;
162	aiov.iov_len = nbyte;
163	auio.uio_iov = &aiov;
164	auio.uio_iovcnt = 1;
165	auio.uio_offset = offset;
166	if (nbyte > INT_MAX)
167		return (EINVAL);
168	auio.uio_resid = nbyte;
169	auio.uio_rw = UIO_READ;
170	auio.uio_segflg = UIO_USERSPACE;
171	auio.uio_procp = p;
172#ifdef KTRACE
173	/*
174	 * if tracing, save a copy of iovec
175	 */
176	if (KTRPOINT(p, KTR_GENIO)) {
177		ktriov = aiov;
178		ktruio = auio;
179		didktr = 1;
180	}
181#endif
182	cnt = nbyte;
183	if ((error = fo_read(fp, &auio, fp->f_cred, flags, p)))
184		if (auio.uio_resid != cnt && (error == ERESTART ||
185		    error == EINTR || error == EWOULDBLOCK))
186			error = 0;
187	cnt -= auio.uio_resid;
188#ifdef KTRACE
189	if (didktr && error == 0) {
190		ktruio.uio_iov = &ktriov;
191		ktruio.uio_resid = cnt;
192		ktrgenio(p->p_tracep, fd, UIO_READ, &ktruio, error);
193	}
194#endif
195	p->p_retval[0] = cnt;
196	return (error);
197}
198
199/*
200 * Scatter read system call.
201 */
202#ifndef _SYS_SYSPROTO_H_
203struct readv_args {
204	int	fd;
205	struct	iovec *iovp;
206	u_int	iovcnt;
207};
208#endif
209int
210readv(p, uap)
211	struct proc *p;
212	register struct readv_args *uap;
213{
214	register struct file *fp;
215	register struct filedesc *fdp = p->p_fd;
216	struct uio auio;
217	register struct iovec *iov;
218	struct iovec *needfree;
219	struct iovec aiov[UIO_SMALLIOV];
220	long i, cnt, error = 0;
221	u_int iovlen;
222#ifdef KTRACE
223	struct iovec *ktriov = NULL;
224	struct uio ktruio;
225#endif
226
227	if ((fp = getfp(fdp, uap->fd, FREAD)) == NULL)
228		return (EBADF);
229	/* note: can't use iovlen until iovcnt is validated */
230	iovlen = uap->iovcnt * sizeof (struct iovec);
231	if (uap->iovcnt > UIO_SMALLIOV) {
232		if (uap->iovcnt > UIO_MAXIOV)
233			return (EINVAL);
234		MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
235		needfree = iov;
236	} else {
237		iov = aiov;
238		needfree = NULL;
239	}
240	auio.uio_iov = iov;
241	auio.uio_iovcnt = uap->iovcnt;
242	auio.uio_rw = UIO_READ;
243	auio.uio_segflg = UIO_USERSPACE;
244	auio.uio_procp = p;
245	auio.uio_offset = -1;
246	if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen)))
247		goto done;
248	auio.uio_resid = 0;
249	for (i = 0; i < uap->iovcnt; i++) {
250		if (iov->iov_len > INT_MAX - auio.uio_resid) {
251			error = EINVAL;
252			goto done;
253		}
254		auio.uio_resid += iov->iov_len;
255		iov++;
256	}
257#ifdef KTRACE
258	/*
259	 * if tracing, save a copy of iovec
260	 */
261	if (KTRPOINT(p, KTR_GENIO))  {
262		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
263		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
264		ktruio = auio;
265	}
266#endif
267	cnt = auio.uio_resid;
268	if ((error = fo_read(fp, &auio, fp->f_cred, 0, p)))
269		if (auio.uio_resid != cnt && (error == ERESTART ||
270		    error == EINTR || error == EWOULDBLOCK))
271			error = 0;
272	cnt -= auio.uio_resid;
273#ifdef KTRACE
274	if (ktriov != NULL) {
275		if (error == 0) {
276			ktruio.uio_iov = ktriov;
277			ktruio.uio_resid = cnt;
278			ktrgenio(p->p_tracep, uap->fd, UIO_READ, &ktruio,
279			    error);
280		}
281		FREE(ktriov, M_TEMP);
282	}
283#endif
284	p->p_retval[0] = cnt;
285done:
286	if (needfree)
287		FREE(needfree, M_IOV);
288	return (error);
289}
290
291/*
292 * Write system call
293 */
294#ifndef _SYS_SYSPROTO_H_
295struct write_args {
296	int	fd;
297	const void *buf;
298	size_t	nbyte;
299};
300#endif
301int
302write(p, uap)
303	struct proc *p;
304	register struct write_args *uap;
305{
306	register struct file *fp;
307
308	if ((fp = getfp(p->p_fd, uap->fd, FWRITE)) == NULL)
309		return (EBADF);
310	return (dofilewrite(p, fp, uap->fd, uap->buf, uap->nbyte, (off_t)-1, 0));
311}
312
313/*
314 * Pwrite system call
315 */
316#ifndef _SYS_SYSPROTO_H_
317struct pwrite_args {
318	int	fd;
319	const void *buf;
320	size_t	nbyte;
321	int	pad;
322	off_t	offset;
323};
324#endif
325int
326pwrite(p, uap)
327	struct proc *p;
328	register struct pwrite_args *uap;
329{
330	register struct file *fp;
331
332	if ((fp = getfp(p->p_fd, uap->fd, FWRITE)) == NULL)
333		return (EBADF);
334	if (fp->f_type != DTYPE_VNODE)
335		return (ESPIPE);
336	return (dofilewrite(p, fp, uap->fd, uap->buf, uap->nbyte, uap->offset,
337	    FOF_OFFSET));
338}
339
340static int
341dofilewrite(p, fp, fd, buf, nbyte, offset, flags)
342	struct proc *p;
343	struct file *fp;
344	int fd, flags;
345	const void *buf;
346	size_t nbyte;
347	off_t offset;
348{
349	struct uio auio;
350	struct iovec aiov;
351	long cnt, error = 0;
352#ifdef KTRACE
353	struct iovec ktriov;
354	struct uio ktruio;
355	int didktr = 0;
356#endif
357
358	aiov.iov_base = (void *)(uintptr_t)buf;
359	aiov.iov_len = nbyte;
360	auio.uio_iov = &aiov;
361	auio.uio_iovcnt = 1;
362	auio.uio_offset = offset;
363	if (nbyte > INT_MAX)
364		return (EINVAL);
365	auio.uio_resid = nbyte;
366	auio.uio_rw = UIO_WRITE;
367	auio.uio_segflg = UIO_USERSPACE;
368	auio.uio_procp = p;
369#ifdef KTRACE
370	/*
371	 * if tracing, save a copy of iovec and uio
372	 */
373	if (KTRPOINT(p, KTR_GENIO)) {
374		ktriov = aiov;
375		ktruio = auio;
376		didktr = 1;
377	}
378#endif
379	cnt = nbyte;
380	if ((error = fo_write(fp, &auio, fp->f_cred, flags, p))) {
381		if (auio.uio_resid != cnt && (error == ERESTART ||
382		    error == EINTR || error == EWOULDBLOCK))
383			error = 0;
384		if (error == EPIPE)
385			psignal(p, SIGPIPE);
386	}
387	cnt -= auio.uio_resid;
388#ifdef KTRACE
389	if (didktr && error == 0) {
390		ktruio.uio_iov = &ktriov;
391		ktruio.uio_resid = cnt;
392		ktrgenio(p->p_tracep, fd, UIO_WRITE, &ktruio, error);
393	}
394#endif
395	p->p_retval[0] = cnt;
396	return (error);
397}
398
399/*
400 * Gather write system call
401 */
402#ifndef _SYS_SYSPROTO_H_
403struct writev_args {
404	int	fd;
405	struct	iovec *iovp;
406	u_int	iovcnt;
407};
408#endif
409int
410writev(p, uap)
411	struct proc *p;
412	register struct writev_args *uap;
413{
414	register struct file *fp;
415	register struct filedesc *fdp = p->p_fd;
416	struct uio auio;
417	register struct iovec *iov;
418	struct iovec *needfree;
419	struct iovec aiov[UIO_SMALLIOV];
420	long i, cnt, error = 0;
421	u_int iovlen;
422#ifdef KTRACE
423	struct iovec *ktriov = NULL;
424	struct uio ktruio;
425#endif
426
427	if ((fp = getfp(fdp, uap->fd, FWRITE)) == NULL)
428		return (EBADF);
429	fhold(fp);
430	/* note: can't use iovlen until iovcnt is validated */
431	iovlen = uap->iovcnt * sizeof (struct iovec);
432	if (uap->iovcnt > UIO_SMALLIOV) {
433		if (uap->iovcnt > UIO_MAXIOV) {
434			needfree = NULL;
435			error = EINVAL;
436			goto done;
437		}
438		MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
439		needfree = iov;
440	} else {
441		iov = aiov;
442		needfree = NULL;
443	}
444	auio.uio_iov = iov;
445	auio.uio_iovcnt = uap->iovcnt;
446	auio.uio_rw = UIO_WRITE;
447	auio.uio_segflg = UIO_USERSPACE;
448	auio.uio_procp = p;
449	auio.uio_offset = -1;
450	if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen)))
451		goto done;
452	auio.uio_resid = 0;
453	for (i = 0; i < uap->iovcnt; i++) {
454		if (iov->iov_len > INT_MAX - auio.uio_resid) {
455			error = EINVAL;
456			goto done;
457		}
458		auio.uio_resid += iov->iov_len;
459		iov++;
460	}
461#ifdef KTRACE
462	/*
463	 * if tracing, save a copy of iovec and uio
464	 */
465	if (KTRPOINT(p, KTR_GENIO))  {
466		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
467		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
468		ktruio = auio;
469	}
470#endif
471	cnt = auio.uio_resid;
472	if ((error = fo_write(fp, &auio, fp->f_cred, 0, p))) {
473		if (auio.uio_resid != cnt && (error == ERESTART ||
474		    error == EINTR || error == EWOULDBLOCK))
475			error = 0;
476		if (error == EPIPE)
477			psignal(p, SIGPIPE);
478	}
479	cnt -= auio.uio_resid;
480#ifdef KTRACE
481	if (ktriov != NULL) {
482		if (error == 0) {
483			ktruio.uio_iov = ktriov;
484			ktruio.uio_resid = cnt;
485			ktrgenio(p->p_tracep, uap->fd, UIO_WRITE, &ktruio,
486			    error);
487		}
488		FREE(ktriov, M_TEMP);
489	}
490#endif
491	p->p_retval[0] = cnt;
492done:
493	fdrop(fp, p);
494	if (needfree)
495		FREE(needfree, M_IOV);
496	return (error);
497}
498
499/*
500 * Ioctl system call
501 */
502#ifndef _SYS_SYSPROTO_H_
503struct ioctl_args {
504	int	fd;
505	u_long	com;
506	caddr_t	data;
507};
508#endif
509/* ARGSUSED */
510int
511ioctl(p, uap)
512	struct proc *p;
513	register struct ioctl_args *uap;
514{
515	register struct file *fp;
516	register struct filedesc *fdp;
517	register u_long com;
518	int error;
519	register u_int size;
520	caddr_t data, memp;
521	int tmp;
522#define STK_PARAMS	128
523	union {
524	    char stkbuf[STK_PARAMS];
525	    long align;
526	} ubuf;
527
528	fdp = p->p_fd;
529	if ((u_int)uap->fd >= fdp->fd_nfiles ||
530	    (fp = fdp->fd_ofiles[uap->fd]) == NULL)
531		return (EBADF);
532
533	if ((fp->f_flag & (FREAD | FWRITE)) == 0)
534		return (EBADF);
535
536	switch (com = uap->com) {
537	case FIONCLEX:
538		fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE;
539		return (0);
540	case FIOCLEX:
541		fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE;
542		return (0);
543	}
544
545	/*
546	 * Interpret high order word to find amount of data to be
547	 * copied to/from the user's address space.
548	 */
549	size = IOCPARM_LEN(com);
550	if (size > IOCPARM_MAX)
551		return (ENOTTY);
552	memp = NULL;
553	if (size > sizeof (ubuf.stkbuf)) {
554		memp = (caddr_t)malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
555		data = memp;
556	} else
557		data = ubuf.stkbuf;
558	if (com&IOC_IN) {
559		if (size) {
560			error = copyin(uap->data, data, (u_int)size);
561			if (error) {
562				if (memp)
563					free(memp, M_IOCTLOPS);
564				return (error);
565			}
566		} else
567			*(caddr_t *)data = uap->data;
568	} else if ((com&IOC_OUT) && size)
569		/*
570		 * Zero the buffer so the user always
571		 * gets back something deterministic.
572		 */
573		bzero(data, size);
574	else if (com&IOC_VOID)
575		*(caddr_t *)data = uap->data;
576
577	switch (com) {
578
579	case FIONBIO:
580		if ((tmp = *(int *)data))
581			fp->f_flag |= FNONBLOCK;
582		else
583			fp->f_flag &= ~FNONBLOCK;
584		error = fo_ioctl(fp, FIONBIO, (caddr_t)&tmp, p);
585		break;
586
587	case FIOASYNC:
588		if ((tmp = *(int *)data))
589			fp->f_flag |= FASYNC;
590		else
591			fp->f_flag &= ~FASYNC;
592		error = fo_ioctl(fp, FIOASYNC, (caddr_t)&tmp, p);
593		break;
594
595	default:
596		error = fo_ioctl(fp, com, data, p);
597		/*
598		 * Copy any data to user, size was
599		 * already set and checked above.
600		 */
601		if (error == 0 && (com&IOC_OUT) && size)
602			error = copyout(data, uap->data, (u_int)size);
603		break;
604	}
605	if (memp)
606		free(memp, M_IOCTLOPS);
607	return (error);
608}
609
610static int	nselcoll;	/* Select collisions since boot */
611int	selwait;
612SYSCTL_INT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");
613
614/*
615 * Select system call.
616 */
617#ifndef _SYS_SYSPROTO_H_
618struct select_args {
619	int	nd;
620	fd_set	*in, *ou, *ex;
621	struct	timeval *tv;
622};
623#endif
624int
625select(p, uap)
626	register struct proc *p;
627	register struct select_args *uap;
628{
629	/*
630	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
631	 * infds with the new FD_SETSIZE of 1024, and more than enough for
632	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
633	 * of 256.
634	 */
635	fd_mask s_selbits[howmany(2048, NFDBITS)];
636	fd_mask *ibits[3], *obits[3], *selbits, *sbp;
637	struct timeval atv, rtv, ttv;
638	int s, ncoll, error, timo;
639	u_int nbufbytes, ncpbytes, nfdbits;
640
641	if (uap->nd < 0)
642		return (EINVAL);
643	if (uap->nd > p->p_fd->fd_nfiles)
644		uap->nd = p->p_fd->fd_nfiles;   /* forgiving; slightly wrong */
645
646	/*
647	 * Allocate just enough bits for the non-null fd_sets.  Use the
648	 * preallocated auto buffer if possible.
649	 */
650	nfdbits = roundup(uap->nd, NFDBITS);
651	ncpbytes = nfdbits / NBBY;
652	nbufbytes = 0;
653	if (uap->in != NULL)
654		nbufbytes += 2 * ncpbytes;
655	if (uap->ou != NULL)
656		nbufbytes += 2 * ncpbytes;
657	if (uap->ex != NULL)
658		nbufbytes += 2 * ncpbytes;
659	if (nbufbytes <= sizeof s_selbits)
660		selbits = &s_selbits[0];
661	else
662		selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);
663
664	/*
665	 * Assign pointers into the bit buffers and fetch the input bits.
666	 * Put the output buffers together so that they can be bzeroed
667	 * together.
668	 */
669	sbp = selbits;
670#define	getbits(name, x) \
671	do {								\
672		if (uap->name == NULL)					\
673			ibits[x] = NULL;				\
674		else {							\
675			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;	\
676			obits[x] = sbp;					\
677			sbp += ncpbytes / sizeof *sbp;			\
678			error = copyin(uap->name, ibits[x], ncpbytes);	\
679			if (error != 0)					\
680				goto done;				\
681		}							\
682	} while (0)
683	getbits(in, 0);
684	getbits(ou, 1);
685	getbits(ex, 2);
686#undef	getbits
687	if (nbufbytes != 0)
688		bzero(selbits, nbufbytes / 2);
689
690	if (uap->tv) {
691		error = copyin((caddr_t)uap->tv, (caddr_t)&atv,
692			sizeof (atv));
693		if (error)
694			goto done;
695		if (itimerfix(&atv)) {
696			error = EINVAL;
697			goto done;
698		}
699		getmicrouptime(&rtv);
700		timevaladd(&atv, &rtv);
701	} else {
702		atv.tv_sec = 0;
703		atv.tv_usec = 0;
704	}
705	timo = 0;
706retry:
707	ncoll = nselcoll;
708	p->p_flag |= P_SELECT;
709	error = selscan(p, ibits, obits, uap->nd);
710	if (error || p->p_retval[0])
711		goto done;
712	if (atv.tv_sec || atv.tv_usec) {
713		getmicrouptime(&rtv);
714		if (timevalcmp(&rtv, &atv, >=))
715			goto done;
716		ttv = atv;
717		timevalsub(&ttv, &rtv);
718		timo = ttv.tv_sec > 24 * 60 * 60 ?
719		    24 * 60 * 60 * hz : tvtohz(&ttv);
720	}
721	s = splhigh();
722	if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
723		splx(s);
724		goto retry;
725	}
726	p->p_flag &= ~P_SELECT;
727
728	error = tsleep((caddr_t)&selwait, PSOCK | PCATCH, "select", timo);
729
730	splx(s);
731	if (error == 0)
732		goto retry;
733done:
734	p->p_flag &= ~P_SELECT;
735	/* select is not restarted after signals... */
736	if (error == ERESTART)
737		error = EINTR;
738	if (error == EWOULDBLOCK)
739		error = 0;
740#define	putbits(name, x) \
741	if (uap->name && (error2 = copyout(obits[x], uap->name, ncpbytes))) \
742		error = error2;
743	if (error == 0) {
744		int error2;
745
746		putbits(in, 0);
747		putbits(ou, 1);
748		putbits(ex, 2);
749#undef putbits
750	}
751	if (selbits != &s_selbits[0])
752		free(selbits, M_SELECT);
753	return (error);
754}
755
756static int
757selscan(p, ibits, obits, nfd)
758	struct proc *p;
759	fd_mask **ibits, **obits;
760	int nfd;
761{
762	struct filedesc *fdp = p->p_fd;
763	int msk, i, fd;
764	fd_mask bits;
765	struct file *fp;
766	int n = 0;
767	/* Note: backend also returns POLLHUP/POLLERR if appropriate. */
768	static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
769
770	for (msk = 0; msk < 3; msk++) {
771		if (ibits[msk] == NULL)
772			continue;
773		for (i = 0; i < nfd; i += NFDBITS) {
774			bits = ibits[msk][i/NFDBITS];
775			/* ffs(int mask) not portable, fd_mask is long */
776			for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
777				if (!(bits & 1))
778					continue;
779				fp = fdp->fd_ofiles[fd];
780				if (fp == NULL)
781					return (EBADF);
782				if (fo_poll(fp, flag[msk], fp->f_cred, p)) {
783					obits[msk][(fd)/NFDBITS] |=
784					    ((fd_mask)1 << ((fd) % NFDBITS));
785					n++;
786				}
787			}
788		}
789	}
790	p->p_retval[0] = n;
791	return (0);
792}
793
794/*
795 * Poll system call.
796 */
797#ifndef _SYS_SYSPROTO_H_
798struct poll_args {
799	struct pollfd *fds;
800	u_int	nfds;
801	int	timeout;
802};
803#endif
804int
805poll(p, uap)
806	register struct proc *p;
807	register struct poll_args *uap;
808{
809	caddr_t bits;
810	char smallbits[32 * sizeof(struct pollfd)];
811	struct timeval atv, rtv, ttv;
812	int s, ncoll, error = 0, timo;
813	size_t ni;
814
815	if (SCARG(uap, nfds) > p->p_fd->fd_nfiles) {
816		/* forgiving; slightly wrong */
817		SCARG(uap, nfds) = p->p_fd->fd_nfiles;
818	}
819	ni = SCARG(uap, nfds) * sizeof(struct pollfd);
820	if (ni > sizeof(smallbits))
821		bits = malloc(ni, M_TEMP, M_WAITOK);
822	else
823		bits = smallbits;
824	error = copyin(SCARG(uap, fds), bits, ni);
825	if (error)
826		goto done;
827	if (SCARG(uap, timeout) != INFTIM) {
828		atv.tv_sec = SCARG(uap, timeout) / 1000;
829		atv.tv_usec = (SCARG(uap, timeout) % 1000) * 1000;
830		if (itimerfix(&atv)) {
831			error = EINVAL;
832			goto done;
833		}
834		getmicrouptime(&rtv);
835		timevaladd(&atv, &rtv);
836	} else {
837		atv.tv_sec = 0;
838		atv.tv_usec = 0;
839	}
840	timo = 0;
841retry:
842	ncoll = nselcoll;
843	p->p_flag |= P_SELECT;
844	error = pollscan(p, (struct pollfd *)bits, SCARG(uap, nfds));
845	if (error || p->p_retval[0])
846		goto done;
847	if (atv.tv_sec || atv.tv_usec) {
848		getmicrouptime(&rtv);
849		if (timevalcmp(&rtv, &atv, >=))
850			goto done;
851		ttv = atv;
852		timevalsub(&ttv, &rtv);
853		timo = ttv.tv_sec > 24 * 60 * 60 ?
854		    24 * 60 * 60 * hz : tvtohz(&ttv);
855	}
856	s = splhigh();
857	if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
858		splx(s);
859		goto retry;
860	}
861	p->p_flag &= ~P_SELECT;
862	error = tsleep((caddr_t)&selwait, PSOCK | PCATCH, "poll", timo);
863	splx(s);
864	if (error == 0)
865		goto retry;
866done:
867	p->p_flag &= ~P_SELECT;
868	/* poll is not restarted after signals... */
869	if (error == ERESTART)
870		error = EINTR;
871	if (error == EWOULDBLOCK)
872		error = 0;
873	if (error == 0) {
874		error = copyout(bits, SCARG(uap, fds), ni);
875		if (error)
876			goto out;
877	}
878out:
879	if (ni > sizeof(smallbits))
880		free(bits, M_TEMP);
881	return (error);
882}
883
884static int
885pollscan(p, fds, nfd)
886	struct proc *p;
887	struct pollfd *fds;
888	int nfd;
889{
890	register struct filedesc *fdp = p->p_fd;
891	int i;
892	struct file *fp;
893	int n = 0;
894
895	for (i = 0; i < nfd; i++, fds++) {
896		if (fds->fd >= fdp->fd_nfiles) {
897			fds->revents = POLLNVAL;
898			n++;
899		} else if (fds->fd < 0) {
900			fds->revents = 0;
901		} else {
902			fp = fdp->fd_ofiles[fds->fd];
903			if (fp == 0) {
904				fds->revents = POLLNVAL;
905				n++;
906			} else {
907				/*
908				 * Note: backend also returns POLLHUP and
909				 * POLLERR if appropriate.
910				 */
911				fds->revents = fo_poll(fp, fds->events,
912				    fp->f_cred, p);
913				if (fds->revents != 0)
914					n++;
915			}
916		}
917	}
918	p->p_retval[0] = n;
919	return (0);
920}
921
922/*
923 * OpenBSD poll system call.
924 * XXX this isn't quite a true representation..  OpenBSD uses select ops.
925 */
926#ifndef _SYS_SYSPROTO_H_
927struct openbsd_poll_args {
928	struct pollfd *fds;
929	u_int	nfds;
930	int	timeout;
931};
932#endif
933int
934openbsd_poll(p, uap)
935	register struct proc *p;
936	register struct openbsd_poll_args *uap;
937{
938	return (poll(p, (struct poll_args *)uap));
939}
940
941/*ARGSUSED*/
942int
943seltrue(dev, events, p)
944	dev_t dev;
945	int events;
946	struct proc *p;
947{
948
949	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
950}
951
952/*
953 * Record a select request.
954 */
955void
956selrecord(selector, sip)
957	struct proc *selector;
958	struct selinfo *sip;
959{
960	struct proc *p;
961	pid_t mypid;
962
963	mypid = selector->p_pid;
964	if (sip->si_pid == mypid)
965		return;
966	if (sip->si_pid && (p = pfind(sip->si_pid)) &&
967	    p->p_wchan == (caddr_t)&selwait)
968		sip->si_flags |= SI_COLL;
969	else
970		sip->si_pid = mypid;
971}
972
973/*
974 * Do a wakeup when a selectable event occurs.
975 */
976void
977selwakeup(sip)
978	register struct selinfo *sip;
979{
980	register struct proc *p;
981	int s;
982
983	if (sip->si_pid == 0)
984		return;
985	if (sip->si_flags & SI_COLL) {
986		nselcoll++;
987		sip->si_flags &= ~SI_COLL;
988		wakeup((caddr_t)&selwait);
989	}
990	p = pfind(sip->si_pid);
991	sip->si_pid = 0;
992	if (p != NULL) {
993		s = splhigh();
994		if (p->p_wchan == (caddr_t)&selwait) {
995			if (p->p_stat == SSLEEP)
996				setrunnable(p);
997			else
998				unsleep(p);
999		} else if (p->p_flag & P_SELECT)
1000			p->p_flag &= ~P_SELECT;
1001		splx(s);
1002	}
1003}
1004