/* sys_generic.c revision 89306 */
1/*
2 * Copyright (c) 1982, 1986, 1989, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 *    must display the following acknowledgement:
20 *	This product includes software developed by the University of
21 *	California, Berkeley and its contributors.
22 * 4. Neither the name of the University nor the names of its contributors
23 *    may be used to endorse or promote products derived from this software
24 *    without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * SUCH DAMAGE.
37 *
38 *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
39 * $FreeBSD: head/sys/kern/sys_generic.c 89306 2002-01-13 11:58:06Z alfred $
40 */
41
42#include "opt_ktrace.h"
43
44#include <sys/param.h>
45#include <sys/systm.h>
46#include <sys/sysproto.h>
47#include <sys/filedesc.h>
48#include <sys/filio.h>
49#include <sys/fcntl.h>
50#include <sys/file.h>
51#include <sys/proc.h>
52#include <sys/signalvar.h>
53#include <sys/socketvar.h>
54#include <sys/uio.h>
55#include <sys/kernel.h>
56#include <sys/malloc.h>
57#include <sys/poll.h>
58#include <sys/resourcevar.h>
59#include <sys/selinfo.h>
60#include <sys/sysctl.h>
61#include <sys/sysent.h>
62#include <sys/bio.h>
63#include <sys/buf.h>
64#include <sys/condvar.h>
65#ifdef KTRACE
66#include <sys/ktrace.h>
67#endif
68#include <vm/vm.h>
69#include <vm/vm_page.h>
70
71#include <machine/limits.h>
72
73static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
74static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
75MALLOC_DEFINE(M_IOV, "iov", "large iov's");
76
77static int	pollscan __P((struct thread *, struct pollfd *, u_int));
78static int	pollholddrop __P((struct thread *, struct pollfd *, u_int, int));
79static int	selscan __P((struct thread *, fd_mask **, fd_mask **, int));
80static int	selholddrop __P((struct thread *, fd_mask *, fd_mask *, int, int));
81static int	dofileread __P((struct thread *, struct file *, int, void *,
82		    size_t, off_t, int));
83static int	dofilewrite __P((struct thread *, struct file *, int,
84		    const void *, size_t, off_t, int));
85
/*
 * Look up descriptor fd in fdp and return the associated struct file
 * with an extra reference held, or NULL if the descriptor is invalid
 * or lacks all of the access bits given in `flag' (FREAD/FWRITE).
 * The caller must release the reference with fdrop().
 */
struct file*
holdfp(fdp, fd, flag)
	struct filedesc* fdp;
	int fd, flag;
{
	struct file* fp;

	FILEDESC_LOCK(fdp);
	/* The u_int cast rejects negative descriptors in one comparison. */
	if (((u_int)fd) >= fdp->fd_nfiles ||
	    (fp = fdp->fd_ofiles[fd]) == NULL) {
		FILEDESC_UNLOCK(fdp);
		return (NULL);
	}
	/* Lock the file before dropping the table lock so it can't go away. */
	FILE_LOCK(fp);
	FILEDESC_UNLOCK(fdp);
	if ((fp->f_flag & flag) == 0) {
		FILE_UNLOCK(fp);
		return (NULL);
	}
	fp->f_count++;		/* reference handed to the caller */
	FILE_UNLOCK(fp);
	return (fp);
}
109
110/*
111 * Read system call.
112 */
113#ifndef _SYS_SYSPROTO_H_
114struct read_args {
115	int	fd;
116	void	*buf;
117	size_t	nbyte;
118};
119#endif
120/*
121 * MPSAFE
122 */
123int
124read(td, uap)
125	struct thread *td;
126	struct read_args *uap;
127{
128	struct file *fp;
129	int error;
130
131	mtx_lock(&Giant);
132	if ((error = fget_read(td, uap->fd, &fp)) == 0) {
133		error = dofileread(td, fp, uap->fd, uap->buf,
134			    uap->nbyte, (off_t)-1, 0);
135		fdrop(fp, td);
136	}
137	mtx_unlock(&Giant);
138	return(error);
139}
140
/*
 * Pread system call
 */
#ifndef _SYS_SYSPROTO_H_
struct pread_args {
	int	fd;
	void	*buf;
	size_t	nbyte;
	int	pad;
	off_t	offset;
};
#endif
/*
 * MPSAFE
 */
int
pread(td, uap)
	struct thread *td;
	struct pread_args *uap;
{
	struct file *fp;
	int error;

	/* Take a reference; also verifies the descriptor is open for read. */
	fp = holdfp(td->td_proc->p_fd, uap->fd, FREAD);
	if (fp == NULL)
		return (EBADF);
	if (fp->f_type != DTYPE_VNODE) {
		/* Only vnodes support reads at an explicit offset. */
		error = ESPIPE;
	} else {
		mtx_lock(&Giant);
		error = dofileread(td, fp, uap->fd, uap->buf, uap->nbyte,
			    uap->offset, FOF_OFFSET);
		mtx_unlock(&Giant);
	}
	fdrop(fp, td);
	return(error);
}
178
/*
 * Code common for read and pread: build a single-element uio describing
 * the user buffer and hand it to the file's fo_read method.  The byte
 * count transferred is returned in td->td_retval[0].
 */
int
dofileread(td, fp, fd, buf, nbyte, offset, flags)
	struct thread *td;
	struct file *fp;
	int fd, flags;
	void *buf;
	size_t nbyte;
	off_t offset;
{
	struct uio auio;
	struct iovec aiov;
	long cnt, error = 0;
#ifdef KTRACE
	struct iovec ktriov;
	struct uio ktruio;
	int didktr = 0;
#endif

	aiov.iov_base = (caddr_t)buf;
	aiov.iov_len = nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = offset;
	/* Reject requests that would not fit in the (int-sized) residual. */
	if (nbyte > INT_MAX)
		return (EINVAL);
	auio.uio_resid = nbyte;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;
#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec (fo_read consumes the original)
	 */
	if (KTRPOINT(td->td_proc, KTR_GENIO)) {
		ktriov = aiov;
		ktruio = auio;
		didktr = 1;
	}
#endif
	cnt = nbyte;

	if ((error = fo_read(fp, &auio, fp->f_cred, flags, td))) {
		/* A partial transfer interrupted by a signal reports success. */
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	}
	cnt -= auio.uio_resid;		/* bytes actually transferred */
#ifdef KTRACE
	if (didktr && error == 0) {
		ktruio.uio_iov = &ktriov;
		ktruio.uio_resid = cnt;
		ktrgenio(td->td_proc->p_tracep, fd, UIO_READ, &ktruio, error);
	}
#endif
	td->td_retval[0] = cnt;
	return (error);
}
239
240/*
241 * Scatter read system call.
242 */
243#ifndef _SYS_SYSPROTO_H_
244struct readv_args {
245	int	fd;
246	struct	iovec *iovp;
247	u_int	iovcnt;
248};
249#endif
250/*
251 * MPSAFE
252 */
253int
254readv(td, uap)
255	struct thread *td;
256	struct readv_args *uap;
257{
258	struct file *fp;
259	struct uio auio;
260	struct iovec *iov;
261	struct iovec *needfree;
262	struct iovec aiov[UIO_SMALLIOV];
263	long i, cnt, error = 0;
264	u_int iovlen;
265#ifdef KTRACE
266	struct iovec *ktriov = NULL;
267	struct uio ktruio;
268#endif
269	mtx_lock(&Giant);
270
271	if ((error = fget_read(td, uap->fd, &fp)) != 0)
272		goto done2;
273	/* note: can't use iovlen until iovcnt is validated */
274	iovlen = uap->iovcnt * sizeof (struct iovec);
275	if (uap->iovcnt > UIO_SMALLIOV) {
276		if (uap->iovcnt > UIO_MAXIOV) {
277			error = EINVAL;
278			goto done2;
279		}
280		MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
281		needfree = iov;
282	} else {
283		iov = aiov;
284		needfree = NULL;
285	}
286	auio.uio_iov = iov;
287	auio.uio_iovcnt = uap->iovcnt;
288	auio.uio_rw = UIO_READ;
289	auio.uio_segflg = UIO_USERSPACE;
290	auio.uio_td = td;
291	auio.uio_offset = -1;
292	if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen)))
293		goto done;
294	auio.uio_resid = 0;
295	for (i = 0; i < uap->iovcnt; i++) {
296		if (iov->iov_len > INT_MAX - auio.uio_resid) {
297			error = EINVAL;
298			goto done;
299		}
300		auio.uio_resid += iov->iov_len;
301		iov++;
302	}
303#ifdef KTRACE
304	/*
305	 * if tracing, save a copy of iovec
306	 */
307	if (KTRPOINT(td->td_proc, KTR_GENIO))  {
308		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
309		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
310		ktruio = auio;
311	}
312#endif
313	cnt = auio.uio_resid;
314	if ((error = fo_read(fp, &auio, fp->f_cred, 0, td))) {
315		if (auio.uio_resid != cnt && (error == ERESTART ||
316		    error == EINTR || error == EWOULDBLOCK))
317			error = 0;
318	}
319	cnt -= auio.uio_resid;
320#ifdef KTRACE
321	if (ktriov != NULL) {
322		if (error == 0) {
323			ktruio.uio_iov = ktriov;
324			ktruio.uio_resid = cnt;
325			ktrgenio(td->td_proc->p_tracep, uap->fd, UIO_READ, &ktruio,
326			    error);
327		}
328		FREE(ktriov, M_TEMP);
329	}
330#endif
331	td->td_retval[0] = cnt;
332done:
333	fdrop(fp, td);
334	if (needfree)
335		FREE(needfree, M_IOV);
336done2:
337	mtx_unlock(&Giant);
338	return (error);
339}
340
341/*
342 * Write system call
343 */
344#ifndef _SYS_SYSPROTO_H_
345struct write_args {
346	int	fd;
347	const void *buf;
348	size_t	nbyte;
349};
350#endif
351/*
352 * MPSAFE
353 */
354int
355write(td, uap)
356	struct thread *td;
357	struct write_args *uap;
358{
359	struct file *fp;
360	int error;
361
362	mtx_lock(&Giant);
363	if ((error = fget_write(td, uap->fd, &fp)) == 0) {
364		error = dofilewrite(td, fp, uap->fd, uap->buf, uap->nbyte,
365			    (off_t)-1, 0);
366		fdrop(fp, td);
367	} else {
368		error = EBADF;	/* XXX this can't be right */
369	}
370	mtx_unlock(&Giant);
371	return(error);
372}
373
374/*
375 * Pwrite system call
376 */
377#ifndef _SYS_SYSPROTO_H_
378struct pwrite_args {
379	int	fd;
380	const void *buf;
381	size_t	nbyte;
382	int	pad;
383	off_t	offset;
384};
385#endif
386/*
387 * MPSAFE
388 */
389int
390pwrite(td, uap)
391	struct thread *td;
392	struct pwrite_args *uap;
393{
394	struct file *fp;
395	int error;
396
397	mtx_lock(&Giant);
398	if ((error = fget_write(td, uap->fd, &fp)) == 0) {
399		if (fp->f_type == DTYPE_VNODE) {
400			error = dofilewrite(td, fp, uap->fd, uap->buf,
401				    uap->nbyte, uap->offset, FOF_OFFSET);
402		} else {
403			error = ESPIPE;
404		}
405		fdrop(fp, td);
406	} else {
407		error = EBADF;	/* this can't be right */
408	}
409	return(error);
410}
411
/*
 * Code common for write and pwrite: build a single-element uio for the
 * user buffer and hand it to the file's fo_write method.  Delivers
 * SIGPIPE on EPIPE; bytes transferred go in td->td_retval[0].
 */
static int
dofilewrite(td, fp, fd, buf, nbyte, offset, flags)
	struct thread *td;
	struct file *fp;
	int fd, flags;
	const void *buf;
	size_t nbyte;
	off_t offset;
{
	struct uio auio;
	struct iovec aiov;
	long cnt, error = 0;
#ifdef KTRACE
	struct iovec ktriov;
	struct uio ktruio;
	int didktr = 0;
#endif

	/* Cast through uintptr_t to shed the const qualifier. */
	aiov.iov_base = (void *)(uintptr_t)buf;
	aiov.iov_len = nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = offset;
	/* Reject requests that would not fit in the (int-sized) residual. */
	if (nbyte > INT_MAX)
		return (EINVAL);
	auio.uio_resid = nbyte;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;
#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec and uio
	 */
	if (KTRPOINT(td->td_proc, KTR_GENIO)) {
		ktriov = aiov;
		ktruio = auio;
		didktr = 1;
	}
#endif
	cnt = nbyte;
	/* Throttle vnode writes when the buffer cache is saturated. */
	if (fp->f_type == DTYPE_VNODE)
		bwillwrite();
	if ((error = fo_write(fp, &auio, fp->f_cred, flags, td))) {
		/* A partial transfer interrupted by a signal reports success. */
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		/* POSIX: broken pipe also raises SIGPIPE. */
		if (error == EPIPE) {
			PROC_LOCK(td->td_proc);
			psignal(td->td_proc, SIGPIPE);
			PROC_UNLOCK(td->td_proc);
		}
	}
	cnt -= auio.uio_resid;		/* bytes actually transferred */
#ifdef KTRACE
	if (didktr && error == 0) {
		ktruio.uio_iov = &ktriov;
		ktruio.uio_resid = cnt;
		ktrgenio(td->td_proc->p_tracep, fd, UIO_WRITE, &ktruio, error);
	}
#endif
	td->td_retval[0] = cnt;
	return (error);
}
475
476/*
477 * Gather write system call
478 */
479#ifndef _SYS_SYSPROTO_H_
480struct writev_args {
481	int	fd;
482	struct	iovec *iovp;
483	u_int	iovcnt;
484};
485#endif
486/*
487 * MPSAFE
488 */
489int
490writev(td, uap)
491	struct thread *td;
492	register struct writev_args *uap;
493{
494	struct file *fp;
495	struct uio auio;
496	register struct iovec *iov;
497	struct iovec *needfree;
498	struct iovec aiov[UIO_SMALLIOV];
499	long i, cnt, error = 0;
500	u_int iovlen;
501#ifdef KTRACE
502	struct iovec *ktriov = NULL;
503	struct uio ktruio;
504#endif
505
506	mtx_lock(&Giant);
507	if ((error = fget_write(td, uap->fd, &fp)) != 0) {
508		error = EBADF;
509		goto done2;
510	}
511	/* note: can't use iovlen until iovcnt is validated */
512	iovlen = uap->iovcnt * sizeof (struct iovec);
513	if (uap->iovcnt > UIO_SMALLIOV) {
514		if (uap->iovcnt > UIO_MAXIOV) {
515			needfree = NULL;
516			error = EINVAL;
517			goto done;
518		}
519		MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
520		needfree = iov;
521	} else {
522		iov = aiov;
523		needfree = NULL;
524	}
525	auio.uio_iov = iov;
526	auio.uio_iovcnt = uap->iovcnt;
527	auio.uio_rw = UIO_WRITE;
528	auio.uio_segflg = UIO_USERSPACE;
529	auio.uio_td = td;
530	auio.uio_offset = -1;
531	if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen)))
532		goto done;
533	auio.uio_resid = 0;
534	for (i = 0; i < uap->iovcnt; i++) {
535		if (iov->iov_len > INT_MAX - auio.uio_resid) {
536			error = EINVAL;
537			goto done;
538		}
539		auio.uio_resid += iov->iov_len;
540		iov++;
541	}
542#ifdef KTRACE
543	/*
544	 * if tracing, save a copy of iovec and uio
545	 */
546	if (KTRPOINT(td->td_proc, KTR_GENIO))  {
547		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
548		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
549		ktruio = auio;
550	}
551#endif
552	cnt = auio.uio_resid;
553	if (fp->f_type == DTYPE_VNODE)
554		bwillwrite();
555	if ((error = fo_write(fp, &auio, fp->f_cred, 0, td))) {
556		if (auio.uio_resid != cnt && (error == ERESTART ||
557		    error == EINTR || error == EWOULDBLOCK))
558			error = 0;
559		if (error == EPIPE) {
560			PROC_LOCK(td->td_proc);
561			psignal(td->td_proc, SIGPIPE);
562			PROC_UNLOCK(td->td_proc);
563		}
564	}
565	cnt -= auio.uio_resid;
566#ifdef KTRACE
567	if (ktriov != NULL) {
568		if (error == 0) {
569			ktruio.uio_iov = ktriov;
570			ktruio.uio_resid = cnt;
571			ktrgenio(td->td_proc->p_tracep, uap->fd, UIO_WRITE, &ktruio,
572			    error);
573		}
574		FREE(ktriov, M_TEMP);
575	}
576#endif
577	td->td_retval[0] = cnt;
578done:
579	fdrop(fp, td);
580	if (needfree)
581		FREE(needfree, M_IOV);
582done2:
583	mtx_unlock(&Giant);
584	return (error);
585}
586
/*
 * Ioctl system call: dispatch a device control request to the file's
 * fo_ioctl method, marshalling argument data between user and kernel
 * space as directed by the IOC_IN/IOC_OUT/IOC_VOID encoding in `com'.
 */
#ifndef _SYS_SYSPROTO_H_
struct ioctl_args {
	int	fd;
	u_long	com;
	caddr_t	data;
};
#endif
/*
 * MPSAFE
 */
/* ARGSUSED */
int
ioctl(td, uap)
	struct thread *td;
	register struct ioctl_args *uap;
{
	register struct file *fp;
	register struct filedesc *fdp;
	register u_long com;
	int error = 0;
	register u_int size;
	caddr_t data, memp;
	int tmp;
#define STK_PARAMS	128
	/* On-stack argument buffer; union with long forces alignment. */
	union {
	    char stkbuf[STK_PARAMS];
	    long align;
	} ubuf;

	fp = ffind_hold(td, uap->fd);
	if (fp == NULL)
		return (EBADF);
	/* The descriptor must be open for reading or writing. */
	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
		fdrop(fp, td);
		return (EBADF);
	}
	fdp = td->td_proc->p_fd;
	/* Close-on-exec flags live in the descriptor table, not the file. */
	switch (com = uap->com) {
	case FIONCLEX:
		FILEDESC_LOCK(fdp);
		fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE;
		FILEDESC_UNLOCK(fdp);
		fdrop(fp, td);
		return (0);
	case FIOCLEX:
		FILEDESC_LOCK(fdp);
		fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE;
		FILEDESC_UNLOCK(fdp);
		fdrop(fp, td);
		return (0);
	}

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
	size = IOCPARM_LEN(com);
	if (size > IOCPARM_MAX) {
		fdrop(fp, td);
		return (ENOTTY);
	}

	mtx_lock(&Giant);
	memp = NULL;
	/* Fall back to a heap buffer when the argument exceeds the stack one. */
	if (size > sizeof (ubuf.stkbuf)) {
		memp = (caddr_t)malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
		data = memp;
	} else {
		data = ubuf.stkbuf;
	}
	if (com&IOC_IN) {
		if (size) {
			error = copyin(uap->data, data, (u_int)size);
			if (error) {
				if (memp)
					free(memp, M_IOCTLOPS);
				fdrop(fp, td);
				goto done;
			}
		} else {
			/* Zero-size IOC_IN: the argument is the pointer itself. */
			*(caddr_t *)data = uap->data;
		}
	} else if ((com&IOC_OUT) && size) {
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, size);
	} else if (com&IOC_VOID) {
		*(caddr_t *)data = uap->data;
	}

	switch (com) {

	case FIONBIO:
		/* Keep f_flag in sync, then let the backend see the request. */
		FILE_LOCK(fp);
		if ((tmp = *(int *)data))
			fp->f_flag |= FNONBLOCK;
		else
			fp->f_flag &= ~FNONBLOCK;
		FILE_UNLOCK(fp);
		error = fo_ioctl(fp, FIONBIO, (caddr_t)&tmp, td);
		break;

	case FIOASYNC:
		FILE_LOCK(fp);
		if ((tmp = *(int *)data))
			fp->f_flag |= FASYNC;
		else
			fp->f_flag &= ~FASYNC;
		FILE_UNLOCK(fp);
		error = fo_ioctl(fp, FIOASYNC, (caddr_t)&tmp, td);
		break;

	default:
		error = fo_ioctl(fp, com, data, td);
		/*
		 * Copy any data to user, size was
		 * already set and checked above.
		 */
		if (error == 0 && (com&IOC_OUT) && size)
			error = copyout(data, uap->data, (u_int)size);
		break;
	}
	if (memp)
		free(memp, M_IOCTLOPS);
	fdrop(fp, td);
done:
	mtx_unlock(&Giant);
	return (error);
}
721
static int	nselcoll;	/* Select collisions since boot */
struct cv	selwait;	/* condition variable select/poll sleep on */
SYSCTL_INT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");
725
/*
 * Select system call.
 *
 * Copies the three user fd_sets in, holds references on every named
 * descriptor, then repeatedly scans them with selscan() until an event
 * is found, the timeout expires, or a signal arrives.  Output bits are
 * copied back to user space on success.
 */
#ifndef _SYS_SYSPROTO_H_
struct select_args {
	int	nd;
	fd_set	*in, *ou, *ex;
	struct	timeval *tv;
};
#endif
/*
 * MPSAFE
 */
int
select(td, uap)
	register struct thread *td;
	register struct select_args *uap;
{
	struct filedesc *fdp;
	/*
	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
	 * infds with the new FD_SETSIZE of 1024, and more than enough for
	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
	 * of 256.
	 */
	fd_mask s_selbits[howmany(2048, NFDBITS)];
	fd_mask s_heldbits[howmany(2048, NFDBITS)];
	fd_mask *ibits[3], *obits[3], *selbits, *sbp, *heldbits, *hibits, *hobits;
	struct timeval atv, rtv, ttv;
	int ncoll, error, timo, i;
	u_int nbufbytes, ncpbytes, nfdbits;

	if (uap->nd < 0)
		return (EINVAL);
	fdp = td->td_proc->p_fd;
	mtx_lock(&Giant);
	FILEDESC_LOCK(fdp);

	if (uap->nd > td->td_proc->p_fd->fd_nfiles)
		uap->nd = td->td_proc->p_fd->fd_nfiles;   /* forgiving; slightly wrong */
	FILEDESC_UNLOCK(fdp);

	/*
	 * Allocate just enough bits for the non-null fd_sets.  Use the
	 * preallocated auto buffer if possible.
	 */
	nfdbits = roundup(uap->nd, NFDBITS);
	ncpbytes = nfdbits / NBBY;
	nbufbytes = 0;
	if (uap->in != NULL)
		nbufbytes += 2 * ncpbytes;
	if (uap->ou != NULL)
		nbufbytes += 2 * ncpbytes;
	if (uap->ex != NULL)
		nbufbytes += 2 * ncpbytes;
	if (nbufbytes <= sizeof s_selbits)
		selbits = &s_selbits[0];
	else
		selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);
	if (2 * ncpbytes <= sizeof s_heldbits) {
		bzero(s_heldbits, sizeof(s_heldbits));
		heldbits = &s_heldbits[0];
	} else
		heldbits = malloc(2 * ncpbytes, M_SELECT, M_WAITOK | M_ZERO);

	/*
	 * Assign pointers into the bit buffers and fetch the input bits.
	 * Put the output buffers together so that they can be bzeroed
	 * together.
	 */
	sbp = selbits;
	/*
	 * heldbits is split in two: hibits (second half) accumulates the
	 * union of all requested fds; hobits (first half) records which
	 * fds selholddrop() actually managed to hold.
	 */
	hibits = heldbits + ncpbytes / sizeof *heldbits;
	hobits = heldbits;
#define	getbits(name, x) \
	do {								\
		if (uap->name == NULL)					\
			ibits[x] = NULL;				\
		else {							\
			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;	\
			obits[x] = sbp;					\
			sbp += ncpbytes / sizeof *sbp;			\
			error = copyin(uap->name, ibits[x], ncpbytes);	\
			if (error != 0)					\
				goto done_noproclock;			\
			for (i = 0;					\
			     i < ncpbytes / sizeof ibits[i][0];		\
			     i++)					\
				hibits[i] |= ibits[x][i];		\
		}							\
	} while (0)
	getbits(in, 0);
	getbits(ou, 1);
	getbits(ex, 2);
#undef	getbits
	if (nbufbytes != 0)
		bzero(selbits, nbufbytes / 2);

	if (uap->tv) {
		error = copyin((caddr_t)uap->tv, (caddr_t)&atv,
			sizeof (atv));
		if (error)
			goto done_noproclock;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done_noproclock;
		}
		/* Convert the relative timeout to an absolute deadline. */
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	/* Hold all named descriptors for the duration of the scan loop. */
	selholddrop(td, hibits, hobits, uap->nd, 1);
	timo = 0;
	PROC_LOCK(td->td_proc);
retry:
	ncoll = nselcoll;
	mtx_lock_spin(&sched_lock);
	td->td_flags |= TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	PROC_UNLOCK(td->td_proc);
	error = selscan(td, ibits, obits, uap->nd);
	PROC_LOCK(td->td_proc);
	if (error || td->td_retval[0])
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=)) {
			/*
			 * An event of our interest may occur during locking a process.
			 * In order to avoid missing the event that occured during locking
			 * the process, test TDF_SELECT and rescan file descriptors if
			 * necessary.
			 */
			mtx_lock_spin(&sched_lock);
			if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
				ncoll = nselcoll;
				td->td_flags |= TDF_SELECT;
				mtx_unlock_spin(&sched_lock);
				PROC_UNLOCK(td->td_proc);
				error = selscan(td, ibits, obits, uap->nd);
				PROC_LOCK(td->td_proc);
			} else
				mtx_unlock_spin(&sched_lock);
			goto done;
		}
		/* Sleep only until the deadline (clamped to ~1 day of ticks). */
		ttv = atv;
		timevalsub(&ttv, &rtv);
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz(&ttv);
	}
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);

	if (timo > 0)
		error = cv_timedwait_sig(&selwait, &td->td_proc->p_mtx, timo);
	else
		error = cv_wait_sig(&selwait, &td->td_proc->p_mtx);

	if (error == 0)
		goto retry;

done:
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	PROC_UNLOCK(td->td_proc);
	/* Drop the references taken before the retry loop. */
	selholddrop(td, hibits, hobits, uap->nd, 0);
done_noproclock:
	/* select is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
#define	putbits(name, x) \
	if (uap->name && (error2 = copyout(obits[x], uap->name, ncpbytes))) \
		error = error2;
	if (error == 0) {
		int error2;

		putbits(in, 0);
		putbits(ou, 1);
		putbits(ex, 2);
#undef putbits
	}
	if (selbits != &s_selbits[0])
		free(selbits, M_SELECT);
	if (heldbits != &s_heldbits[0])
		free(heldbits, M_SELECT);

	mtx_unlock(&Giant);
	return (error);
}
920
/*
 * Used to hold then release a group of fds for select(2).
 * Hold (hold == 1) or release (hold == 0) a group of filedescriptors.
 * if holding then use ibits setting the bits in obits, otherwise use obits.
 *
 * NOTE(review): on the hold pass an unexpected NULL file returns EBADF
 * immediately; fds held before that point remain recorded in obits so
 * the later drop pass still releases them.  select() currently ignores
 * this function's return value.
 */
static int
selholddrop(td, ibits, obits, nfd, hold)
	struct thread *td;
	fd_mask *ibits, *obits;
	int nfd, hold;
{
	struct filedesc *fdp = td->td_proc->p_fd;
	int i, fd;
	fd_mask bits;
	struct file *fp;

	FILEDESC_LOCK(fdp);
	for (i = 0; i < nfd; i += NFDBITS) {
		if (hold)
			bits = ibits[i/NFDBITS];
		else
			bits = obits[i/NFDBITS];
		/* ffs(int mask) not portable, fd_mask is long */
		for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
			if (!(bits & 1))
				continue;
			fp = fdp->fd_ofiles[fd];
			if (fp == NULL) {
				FILEDESC_UNLOCK(fdp);
				return (EBADF);
			}
			if (hold) {
				fhold(fp);
				/* Remember which fds we held for the drop pass. */
				obits[(fd)/NFDBITS] |=
				    ((fd_mask)1 << ((fd) % NFDBITS));
			} else {
				/* XXX: optimize by making a special
				 * version of fdrop that only unlocks
				 * the filedesc if needed?  This would
				 * redcuce the number of lock/unlock
				 * pairs by quite a bit.
				 */
				FILEDESC_UNLOCK(fdp);
				fdrop(fp, td);
				FILEDESC_LOCK(fdp);
			}
		}
	}
	FILEDESC_UNLOCK(fdp);
	return (0);
}
972
/*
 * Poll each descriptor named in the three input bit sets; set the
 * corresponding bit in the output sets for each ready descriptor and
 * leave the ready count in td->td_retval[0].
 */
static int
selscan(td, ibits, obits, nfd)
	struct thread *td;
	fd_mask **ibits, **obits;
	int nfd;
{
	int msk, i, fd;
	fd_mask bits;
	struct file *fp;
	int n = 0;
	/* Note: backend also returns POLLHUP/POLLERR if appropriate. */
	static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };

	/* msk indexes read (0), write (1), and exception (2) sets. */
	for (msk = 0; msk < 3; msk++) {
		if (ibits[msk] == NULL)
			continue;
		for (i = 0; i < nfd; i += NFDBITS) {
			bits = ibits[msk][i/NFDBITS];
			/* ffs(int mask) not portable, fd_mask is long */
			for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
				if (!(bits & 1))
					continue;
				fp = ffind_hold(td, fd);
				if (fp == NULL)
					return (EBADF);
				if (fo_poll(fp, flag[msk], fp->f_cred, td)) {
					obits[msk][(fd)/NFDBITS] |=
					    ((fd_mask)1 << ((fd) % NFDBITS));
					n++;
				}
				fdrop(fp, td);
			}
		}
	}
	td->td_retval[0] = n;
	return (0);
}
1010
/*
 * Poll system call.
 *
 * Copies the user pollfd array in, holds references on the named
 * descriptors, then repeatedly scans with pollscan() until an event is
 * found, the timeout expires, or a signal arrives.  revents values are
 * copied back to user space on success.
 */
#ifndef _SYS_SYSPROTO_H_
struct poll_args {
	struct pollfd *fds;
	u_int	nfds;
	int	timeout;
};
#endif
/*
 * MPSAFE
 */
int
poll(td, uap)
	struct thread *td;
	struct poll_args *uap;
{
	caddr_t bits;
	char smallbits[32 * sizeof(struct pollfd)];
	struct timeval atv, rtv, ttv;
	int ncoll, error = 0, timo;
	u_int nfds;
	size_t ni;
	struct pollfd p_heldbits[32];
	struct pollfd *heldbits;

	nfds = SCARG(uap, nfds);

	mtx_lock(&Giant);
	/*
	 * This is kinda bogus.  We have fd limits, but that is not
	 * really related to the size of the pollfd array.  Make sure
	 * we let the process use at least FD_SETSIZE entries and at
	 * least enough for the current limits.  We want to be reasonably
	 * safe, but not overly restrictive.
	 */
	if ((nfds > td->td_proc->p_rlimit[RLIMIT_NOFILE].rlim_cur) &&
	    (nfds > FD_SETSIZE)) {
		error = EINVAL;
		goto done2;
	}
	ni = nfds * sizeof(struct pollfd);
	/* Use the on-stack buffers for small arrays, the heap otherwise. */
	if (ni > sizeof(smallbits))
		bits = malloc(ni, M_TEMP, M_WAITOK);
	else
		bits = smallbits;
	if (ni > sizeof(p_heldbits))
		heldbits = malloc(ni, M_TEMP, M_WAITOK);
	else {
		bzero(p_heldbits, sizeof(p_heldbits));
		heldbits = p_heldbits;
	}
	error = copyin(SCARG(uap, fds), bits, ni);
	if (error)
		goto done_noproclock;
	/* heldbits is a private copy used only for hold/drop bookkeeping. */
	bcopy(bits, heldbits, ni);
	if (SCARG(uap, timeout) != INFTIM) {
		/* Convert the millisecond timeout to an absolute deadline. */
		atv.tv_sec = SCARG(uap, timeout) / 1000;
		atv.tv_usec = (SCARG(uap, timeout) % 1000) * 1000;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done_noproclock;
		}
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	/* Hold all named descriptors for the duration of the scan loop. */
	pollholddrop(td, heldbits, nfds, 1);
	timo = 0;
	PROC_LOCK(td->td_proc);
retry:
	ncoll = nselcoll;
	mtx_lock_spin(&sched_lock);
	td->td_flags |= TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	PROC_UNLOCK(td->td_proc);
	error = pollscan(td, (struct pollfd *)bits, nfds);
	PROC_LOCK(td->td_proc);
	if (error || td->td_retval[0])
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=)) {
			/*
			 * An event of our interest may occur during locking a process.
			 * In order to avoid missing the event that occured during locking
			 * the process, test TDF_SELECT and rescan file descriptors if
			 * necessary.
			 */
			mtx_lock_spin(&sched_lock);
			if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
				ncoll = nselcoll;
				td->td_flags |= TDF_SELECT;
				mtx_unlock_spin(&sched_lock);
				PROC_UNLOCK(td->td_proc);
				error = pollscan(td, (struct pollfd *)bits, nfds);
				PROC_LOCK(td->td_proc);
			} else
				mtx_unlock_spin(&sched_lock);
			goto done;
		}
		/* Sleep only until the deadline (clamped to ~1 day of ticks). */
		ttv = atv;
		timevalsub(&ttv, &rtv);
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz(&ttv);
	}
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	if (timo > 0)
		error = cv_timedwait_sig(&selwait, &td->td_proc->p_mtx, timo);
	else
		error = cv_wait_sig(&selwait, &td->td_proc->p_mtx);
	if (error == 0)
		goto retry;

done:
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	PROC_UNLOCK(td->td_proc);
	/* Drop the references taken before the retry loop. */
	pollholddrop(td, heldbits, nfds, 0);
done_noproclock:
	/* poll is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	if (error == 0) {
		error = copyout(bits, SCARG(uap, fds), ni);
		if (error)
			goto out;
	}
out:
	if (ni > sizeof(smallbits))
		free(bits, M_TEMP);
	if (ni > sizeof(p_heldbits))
		free(heldbits, M_TEMP);
done2:
	mtx_unlock(&Giant);
	return (error);
}
1156
/*
 * Hold (hold == 1) or release (hold == 0) references on the files
 * named by a private copy of the pollfd array.  On the hold pass the
 * copy's revents field is reused as a marker (1 = held, 0 = not) so
 * the drop pass knows exactly which references to release.
 */
static int
pollholddrop(td, fds, nfd, hold)
	struct thread *td;
	struct pollfd *fds;
	u_int nfd;
	int hold;
{
	register struct filedesc *fdp = td->td_proc->p_fd;
	int i;
	struct file *fp;

	FILEDESC_LOCK(fdp);
	for (i = 0; i < nfd; i++, fds++) {
		if (0 <= fds->fd && fds->fd < fdp->fd_nfiles) {
			fp = fdp->fd_ofiles[fds->fd];
			if (hold) {
				if (fp != NULL) {
					fhold(fp);
					fds->revents = 1;	/* mark as held */
				} else
					fds->revents = 0;
			} else if(fp != NULL && fds->revents) {
				FILE_LOCK(fp);
				FILEDESC_UNLOCK(fdp);
				fdrop_locked(fp, td);
				FILEDESC_LOCK(fdp);
			}
		}
	}
	FILEDESC_UNLOCK(fdp);
	return (0);
}
1189
/*
 * Poll each descriptor in the pollfd array, filling in revents and
 * leaving the count of ready/invalid entries in td->td_retval[0].
 * Invalid descriptors get POLLNVAL; negative fds are skipped per POSIX.
 */
static int
pollscan(td, fds, nfd)
	struct thread *td;
	struct pollfd *fds;
	u_int nfd;
{
	register struct filedesc *fdp = td->td_proc->p_fd;
	int i;
	struct file *fp;
	int n = 0;

	for (i = 0; i < nfd; i++, fds++) {
		FILEDESC_LOCK(fdp);
		if (fds->fd >= fdp->fd_nfiles) {
			fds->revents = POLLNVAL;
			n++;
			FILEDESC_UNLOCK(fdp);
		} else if (fds->fd < 0) {
			/* Negative fds are ignored, reporting no events. */
			fds->revents = 0;
			FILEDESC_UNLOCK(fdp);
		} else {
			fp = fdp->fd_ofiles[fds->fd];
			FILEDESC_UNLOCK(fdp);
			if (fp == NULL) {
				fds->revents = POLLNVAL;
				n++;
			} else {
				/*
				 * Note: backend also returns POLLHUP and
				 * POLLERR if appropriate.
				 */
				fds->revents = fo_poll(fp, fds->events,
				    fp->f_cred, td);
				if (fds->revents != 0)
					n++;
			}
		}
	}
	td->td_retval[0] = n;
	return (0);
}
1231
/*
 * OpenBSD poll system call.
 * XXX this isn't quite a true representation..  OpenBSD uses select ops.
 */
#ifndef _SYS_SYSPROTO_H_
struct openbsd_poll_args {
	struct pollfd *fds;
	u_int	nfds;
	int	timeout;
};
#endif
/*
 * MPSAFE
 */
int
openbsd_poll(td, uap)
	register struct thread *td;
	register struct openbsd_poll_args *uap;
{
	/* The argument structures are layout-compatible; just forward. */
	return (poll(td, (struct poll_args *)uap));
}
1253
1254/*ARGSUSED*/
1255int
1256seltrue(dev, events, td)
1257	dev_t dev;
1258	int events;
1259	struct thread *td;
1260{
1261
1262	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
1263}
1264
1265static int
1266find_thread_in_proc(struct proc *p, struct thread *td)
1267{
1268	struct thread *td2;
1269	FOREACH_THREAD_IN_PROC(p, td2) {
1270		if (td2 == td) {
1271			return (1);
1272		}
1273	}
1274	return (0);
1275}
1276
/*
 * Record a select request.
 *
 * A selinfo can remember only one waiter (pid + thread); if another
 * waiter is already asleep on this selinfo, mark it SI_COLL so that
 * selwakeup() broadcasts instead of waking a single thread.
 */
void
selrecord(selector, sip)
	struct thread *selector;
	struct selinfo *sip;
{
	struct proc *p;
	pid_t mypid;

	mypid = selector->td_proc->p_pid;
	/* Already recorded as the waiter: nothing to do. */
	if ((sip->si_pid == mypid) &&
	    (sip->si_thread == selector)) { /* XXXKSE should be an ID? */
		return;
	}
	/*
	 * Another thread is recorded; if it is still alive and asleep
	 * on selwait, flag a collision rather than stealing the slot.
	 * pfind() returns with the process locked.
	 */
	if (sip->si_pid &&
	    (p = pfind(sip->si_pid)) &&
	    (find_thread_in_proc(p, sip->si_thread))) {
		mtx_lock_spin(&sched_lock);
	    	if (sip->si_thread->td_wchan == (caddr_t)&selwait) {
			mtx_unlock_spin(&sched_lock);
			PROC_UNLOCK(p);
			sip->si_flags |= SI_COLL;
			return;
		}
		mtx_unlock_spin(&sched_lock);
		PROC_UNLOCK(p);
	}
	/* Take over the slot for ourselves. */
	sip->si_pid = mypid;
	sip->si_thread = selector;
}
1309
/*
 * Do a wakeup when a selectable event occurs.
 *
 * Wakes the single recorded waiter; on a recorded collision (SI_COLL)
 * broadcasts on selwait so every colliding selector rescans.
 */
void
selwakeup(sip)
	register struct selinfo *sip;
{
	struct thread *td;
	register struct proc *p;

	if (sip->si_pid == 0)
		return;
	if (sip->si_flags & SI_COLL) {
		nselcoll++;
		sip->si_flags &= ~SI_COLL;
		/* Multiple waiters: wake them all so each rescans. */
		cv_broadcast(&selwait);
	}
	p = pfind(sip->si_pid);
	sip->si_pid = 0;
	td = sip->si_thread;
	if (p != NULL) {
		/* The recorded thread may have exited; validate it first. */
		if (!find_thread_in_proc(p, td)) {
			PROC_UNLOCK(p); /* lock is in pfind() */;
			return;
		}
		mtx_lock_spin(&sched_lock);
		if (td->td_wchan == (caddr_t)&selwait) {
			if (td->td_proc->p_stat == SSLEEP)
				setrunnable(td);
			else
				cv_waitq_remove(td);
		} else
			/* Not asleep yet: make it rescan instead of sleeping. */
			td->td_flags &= ~TDF_SELECT;
		mtx_unlock_spin(&sched_lock);
		PROC_UNLOCK(p); /* Lock is in pfind() */
	}
}
1347
/* One-time boot initialization of the global select/poll condvar. */
static void selectinit __P((void *));
SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, selectinit, NULL)

/* ARGSUSED*/
static void
selectinit(dummy)
	void *dummy;
{
	cv_init(&selwait, "select");
}
1358