sys_generic.c revision 137806
1/*
2 * Copyright (c) 1982, 1986, 1989, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 * 4. Neither the name of the University nor the names of its contributors
19 *    may be used to endorse or promote products derived from this software
20 *    without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
35 */
36
37#include <sys/cdefs.h>
38__FBSDID("$FreeBSD: head/sys/kern/sys_generic.c 137806 2004-11-17 09:09:55Z phk $");
39
40#include "opt_ktrace.h"
41
42#include <sys/param.h>
43#include <sys/systm.h>
44#include <sys/sysproto.h>
45#include <sys/filedesc.h>
46#include <sys/filio.h>
47#include <sys/fcntl.h>
48#include <sys/file.h>
49#include <sys/proc.h>
50#include <sys/signalvar.h>
51#include <sys/socketvar.h>
52#include <sys/uio.h>
53#include <sys/kernel.h>
54#include <sys/limits.h>
55#include <sys/malloc.h>
56#include <sys/poll.h>
57#include <sys/resourcevar.h>
58#include <sys/selinfo.h>
59#include <sys/sleepqueue.h>
60#include <sys/syscallsubr.h>
61#include <sys/sysctl.h>
62#include <sys/sysent.h>
63#include <sys/vnode.h>
64#include <sys/bio.h>
65#include <sys/buf.h>
66#include <sys/condvar.h>
67#ifdef KTRACE
68#include <sys/ktrace.h>
69#endif
70#include <vm/vm.h>
71#include <vm/vm_page.h>
72
73static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
74static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
75MALLOC_DEFINE(M_IOV, "iov", "large iov's");
76
77static int	pollscan(struct thread *, struct pollfd *, u_int);
78static int	selscan(struct thread *, fd_mask **, fd_mask **, int);
79static int	dofileread(struct thread *, struct file *, int, void *,
80		    size_t, off_t, int);
81static int	dofilewrite(struct thread *, struct file *, int,
82		    const void *, size_t, off_t, int);
83static void	doselwakeup(struct selinfo *, int);
84
85/*
86 * Read system call.
87 */
88#ifndef _SYS_SYSPROTO_H_
89struct read_args {
90	int	fd;
91	void	*buf;
92	size_t	nbyte;
93};
94#endif
95/*
96 * MPSAFE
97 */
98int
99read(td, uap)
100	struct thread *td;
101	struct read_args *uap;
102{
103	struct file *fp;
104	int error;
105
106	if ((error = fget_read(td, uap->fd, &fp)) == 0) {
107		error = dofileread(td, fp, uap->fd, uap->buf,
108			    uap->nbyte, (off_t)-1, 0);
109		fdrop(fp, td);
110	}
111	return(error);
112}
113
114/*
115 * Pread system call
116 */
117#ifndef _SYS_SYSPROTO_H_
118struct pread_args {
119	int	fd;
120	void	*buf;
121	size_t	nbyte;
122	int	pad;
123	off_t	offset;
124};
125#endif
126/*
127 * MPSAFE
128 */
129int
130pread(td, uap)
131	struct thread *td;
132	struct pread_args *uap;
133{
134	struct file *fp;
135	int error;
136
137	if ((error = fget_read(td, uap->fd, &fp)) != 0)
138		return (error);
139	if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
140		error = ESPIPE;
141	else if (uap->offset < 0 && fp->f_vnode->v_type != VCHR)
142		error = EINVAL;
143	else {
144		error = dofileread(td, fp, uap->fd, uap->buf, uap->nbyte,
145			    uap->offset, FOF_OFFSET);
146	}
147	fdrop(fp, td);
148	return(error);
149}
150
151/*
152 * Code common for read and pread
153 */
154static int
155dofileread(td, fp, fd, buf, nbyte, offset, flags)
156	struct thread *td;
157	struct file *fp;
158	int fd, flags;
159	void *buf;
160	size_t nbyte;
161	off_t offset;
162{
163	struct uio auio;
164	struct iovec aiov;
165	long cnt, error = 0;
166#ifdef KTRACE
167	struct uio *ktruio = NULL;
168#endif
169
170	aiov.iov_base = buf;
171	aiov.iov_len = nbyte;
172	auio.uio_iov = &aiov;
173	auio.uio_iovcnt = 1;
174	auio.uio_offset = offset;
175	if (nbyte > INT_MAX)
176		return (EINVAL);
177	auio.uio_resid = nbyte;
178	auio.uio_rw = UIO_READ;
179	auio.uio_segflg = UIO_USERSPACE;
180	auio.uio_td = td;
181#ifdef KTRACE
182	if (KTRPOINT(td, KTR_GENIO))
183		ktruio = cloneuio(&auio);
184#endif
185	cnt = nbyte;
186
187	if ((error = fo_read(fp, &auio, td->td_ucred, flags, td))) {
188		if (auio.uio_resid != cnt && (error == ERESTART ||
189		    error == EINTR || error == EWOULDBLOCK))
190			error = 0;
191	}
192	cnt -= auio.uio_resid;
193#ifdef KTRACE
194	if (ktruio != NULL) {
195		ktruio->uio_resid = cnt;
196		ktrgenio(fd, UIO_READ, ktruio, error);
197	}
198#endif
199	td->td_retval[0] = cnt;
200	return (error);
201}
202
203/*
204 * Scatter read system call.
205 */
206#ifndef _SYS_SYSPROTO_H_
207struct readv_args {
208	int	fd;
209	struct	iovec *iovp;
210	u_int	iovcnt;
211};
212#endif
213/*
214 * MPSAFE
215 */
216int
217readv(struct thread *td, struct readv_args *uap)
218{
219	struct file *fp;
220	struct uio *auio = NULL;
221	long cnt;
222	int error;
223#ifdef KTRACE
224	struct uio *ktruio = NULL;
225#endif
226
227	error = fget_read(td, uap->fd, &fp);
228	if (error)
229		return (error);
230	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
231	if (error) {
232		fdrop(fp, td);
233		return (error);
234	}
235	auio->uio_rw = UIO_READ;
236	auio->uio_td = td;
237#ifdef KTRACE
238	if (KTRPOINT(td, KTR_GENIO))
239		ktruio = cloneuio(auio);
240#endif
241	cnt = auio->uio_resid;
242	if ((error = fo_read(fp, auio, td->td_ucred, 0, td))) {
243		if (auio->uio_resid != cnt && (error == ERESTART ||
244		    error == EINTR || error == EWOULDBLOCK))
245			error = 0;
246	}
247	cnt -= auio->uio_resid;
248#ifdef KTRACE
249	if (ktruio != NULL) {
250		ktruio->uio_resid = cnt;
251		ktrgenio(uap->fd, UIO_READ, ktruio, error);
252	}
253#endif
254	td->td_retval[0] = cnt;
255	free(auio, M_IOV);
256	fdrop(fp, td);
257	return (error);
258}
259
260/*
261 * Write system call
262 */
263#ifndef _SYS_SYSPROTO_H_
264struct write_args {
265	int	fd;
266	const void *buf;
267	size_t	nbyte;
268};
269#endif
270/*
271 * MPSAFE
272 */
273int
274write(td, uap)
275	struct thread *td;
276	struct write_args *uap;
277{
278	struct file *fp;
279	int error;
280
281	if ((error = fget_write(td, uap->fd, &fp)) == 0) {
282		error = dofilewrite(td, fp, uap->fd, uap->buf, uap->nbyte,
283			    (off_t)-1, 0);
284		fdrop(fp, td);
285	} else {
286		error = EBADF;	/* XXX this can't be right */
287	}
288	return(error);
289}
290
291/*
292 * Pwrite system call
293 */
294#ifndef _SYS_SYSPROTO_H_
295struct pwrite_args {
296	int	fd;
297	const void *buf;
298	size_t	nbyte;
299	int	pad;
300	off_t	offset;
301};
302#endif
303/*
304 * MPSAFE
305 */
306int
307pwrite(td, uap)
308	struct thread *td;
309	struct pwrite_args *uap;
310{
311	struct file *fp;
312	int error;
313
314	if ((error = fget_write(td, uap->fd, &fp)) == 0) {
315		if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
316			error = ESPIPE;
317		else if (uap->offset < 0 && fp->f_vnode->v_type != VCHR)
318			error = EINVAL;
319		else {
320			error = dofilewrite(td, fp, uap->fd, uap->buf,
321				    uap->nbyte, uap->offset, FOF_OFFSET);
322		}
323		fdrop(fp, td);
324	} else {
325		error = EBADF;	/* this can't be right */
326	}
327	return(error);
328}
329
330static int
331dofilewrite(td, fp, fd, buf, nbyte, offset, flags)
332	struct thread *td;
333	struct file *fp;
334	int fd, flags;
335	const void *buf;
336	size_t nbyte;
337	off_t offset;
338{
339	struct uio auio;
340	struct iovec aiov;
341	long cnt, error = 0;
342#ifdef KTRACE
343	struct uio *ktruio = NULL;
344#endif
345
346	aiov.iov_base = (void *)(uintptr_t)buf;
347	aiov.iov_len = nbyte;
348	auio.uio_iov = &aiov;
349	auio.uio_iovcnt = 1;
350	auio.uio_offset = offset;
351	if (nbyte > INT_MAX)
352		return (EINVAL);
353	auio.uio_resid = nbyte;
354	auio.uio_rw = UIO_WRITE;
355	auio.uio_segflg = UIO_USERSPACE;
356	auio.uio_td = td;
357#ifdef KTRACE
358	if (KTRPOINT(td, KTR_GENIO))
359		ktruio = cloneuio(&auio);
360#endif
361	cnt = nbyte;
362	if (fp->f_type == DTYPE_VNODE)
363		bwillwrite();
364	if ((error = fo_write(fp, &auio, td->td_ucred, flags, td))) {
365		if (auio.uio_resid != cnt && (error == ERESTART ||
366		    error == EINTR || error == EWOULDBLOCK))
367			error = 0;
368		/* Socket layer is responsible for issuing SIGPIPE. */
369		if (error == EPIPE && fp->f_type != DTYPE_SOCKET) {
370			PROC_LOCK(td->td_proc);
371			psignal(td->td_proc, SIGPIPE);
372			PROC_UNLOCK(td->td_proc);
373		}
374	}
375	cnt -= auio.uio_resid;
376#ifdef KTRACE
377	if (ktruio != NULL) {
378		ktruio->uio_resid = cnt;
379		ktrgenio(fd, UIO_WRITE, ktruio, error);
380	}
381#endif
382	td->td_retval[0] = cnt;
383	return (error);
384}
385
386/*
387 * Gather write system call
388 */
389#ifndef _SYS_SYSPROTO_H_
390struct writev_args {
391	int	fd;
392	struct	iovec *iovp;
393	u_int	iovcnt;
394};
395#endif
396/*
397 * MPSAFE
398 */
399int
400writev(struct thread *td, struct writev_args *uap)
401{
402	struct file *fp;
403	struct uio *auio = NULL;
404	long cnt;
405	int error;
406#ifdef KTRACE
407	struct uio *ktruio = NULL;
408#endif
409
410	error = fget_write(td, uap->fd, &fp);
411	if (error)
412		return (EBADF);
413	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
414	if (error) {
415		fdrop(fp, td);
416		return (error);
417	}
418	auio->uio_rw = UIO_WRITE;
419	auio->uio_td = td;
420#ifdef KTRACE
421	if (KTRPOINT(td, KTR_GENIO))
422		ktruio = cloneuio(auio);
423#endif
424	cnt = auio->uio_resid;
425	if (fp->f_type == DTYPE_VNODE)
426		bwillwrite();
427	if ((error = fo_write(fp, auio, td->td_ucred, 0, td))) {
428		if (auio->uio_resid != cnt && (error == ERESTART ||
429		    error == EINTR || error == EWOULDBLOCK))
430			error = 0;
431		if (error == EPIPE) {
432			PROC_LOCK(td->td_proc);
433			psignal(td->td_proc, SIGPIPE);
434			PROC_UNLOCK(td->td_proc);
435		}
436	}
437	cnt -= auio->uio_resid;
438#ifdef KTRACE
439	if (ktruio != NULL) {
440		ktruio->uio_resid = cnt;
441		ktrgenio(uap->fd, UIO_WRITE, ktruio, error);
442	}
443#endif
444	td->td_retval[0] = cnt;
445	fdrop(fp, td);
446	free(auio, M_IOV);
447	return (error);
448}
449
450/*
451 * Ioctl system call
452 */
453#ifndef _SYS_SYSPROTO_H_
454struct ioctl_args {
455	int	fd;
456	u_long	com;
457	caddr_t	data;
458};
459#endif
460/*
461 * MPSAFE
462 */
463/* ARGSUSED */
464int
465ioctl(struct thread *td, struct ioctl_args *uap)
466{
467	struct file *fp;
468	struct filedesc *fdp;
469	u_long com;
470	int error = 0;
471	u_int size;
472	caddr_t data, memp;
473	int tmp;
474
475	if ((error = fget(td, uap->fd, &fp)) != 0)
476		return (error);
477	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
478		fdrop(fp, td);
479		return (EBADF);
480	}
481	fdp = td->td_proc->p_fd;
482	switch (com = uap->com) {
483	case FIONCLEX:
484		FILEDESC_LOCK_FAST(fdp);
485		fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE;
486		FILEDESC_UNLOCK_FAST(fdp);
487		fdrop(fp, td);
488		return (0);
489	case FIOCLEX:
490		FILEDESC_LOCK_FAST(fdp);
491		fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE;
492		FILEDESC_UNLOCK_FAST(fdp);
493		fdrop(fp, td);
494		return (0);
495	}
496
497	/*
498	 * Interpret high order word to find amount of data to be
499	 * copied to/from the user's address space.
500	 */
501	size = IOCPARM_LEN(com);
502	if ((size > IOCPARM_MAX) ||
503	    ((com & (IOC_VOID  | IOC_IN | IOC_OUT)) == 0) ||
504	    ((com & IOC_VOID) && size > 0) ||
505	    ((com & (IOC_IN | IOC_OUT)) && size == 0)) {
506		fdrop(fp, td);
507		return (ENOTTY);
508	}
509
510	if (size > 0) {
511		memp = malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
512		data = memp;
513	} else {
514		memp = NULL;
515		data = (void *)&uap->data;
516	}
517	if (com & IOC_IN) {
518		error = copyin(uap->data, data, (u_int)size);
519		if (error) {
520			free(memp, M_IOCTLOPS);
521			fdrop(fp, td);
522			return (error);
523		}
524	} else if (com & IOC_OUT) {
525		/*
526		 * Zero the buffer so the user always
527		 * gets back something deterministic.
528		 */
529		bzero(data, size);
530	}
531
532	if (com == FIONBIO) {
533		FILE_LOCK(fp);
534		if ((tmp = *(int *)data))
535			fp->f_flag |= FNONBLOCK;
536		else
537			fp->f_flag &= ~FNONBLOCK;
538		FILE_UNLOCK(fp);
539		data = (void *)&tmp;
540	} else if (com == FIOASYNC) {
541		FILE_LOCK(fp);
542		if ((tmp = *(int *)data))
543			fp->f_flag |= FASYNC;
544		else
545			fp->f_flag &= ~FASYNC;
546		FILE_UNLOCK(fp);
547		data = (void *)&tmp;
548	}
549
550	error = fo_ioctl(fp, com, data, td->td_ucred, td);
551
552	if (error == 0 && (com & IOC_OUT))
553		error = copyout(data, uap->data, (u_int)size);
554
555	if (memp != NULL)
556		free(memp, M_IOCTLOPS);
557	fdrop(fp, td);
558	return (error);
559}
560
561/*
562 * sellock and selwait are initialized in selectinit() via SYSINIT.
563 */
564struct mtx	sellock;
565struct cv	selwait;
566u_int		nselcoll;	/* Select collisions since boot */
567SYSCTL_UINT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");
568
569/*
570 * Select system call.
571 */
572#ifndef _SYS_SYSPROTO_H_
573struct select_args {
574	int	nd;
575	fd_set	*in, *ou, *ex;
576	struct	timeval *tv;
577};
578#endif
579/*
580 * MPSAFE
581 */
582int
583select(td, uap)
584	register struct thread *td;
585	register struct select_args *uap;
586{
587	struct timeval tv, *tvp;
588	int error;
589
590	if (uap->tv != NULL) {
591		error = copyin(uap->tv, &tv, sizeof(tv));
592		if (error)
593			return (error);
594		tvp = &tv;
595	} else
596		tvp = NULL;
597
598	return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp));
599}
600
601int
602kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou,
603    fd_set *fd_ex, struct timeval *tvp)
604{
605	struct filedesc *fdp;
606	/*
607	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
608	 * infds with the new FD_SETSIZE of 1024, and more than enough for
609	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
610	 * of 256.
611	 */
612	fd_mask s_selbits[howmany(2048, NFDBITS)];
613	fd_mask *ibits[3], *obits[3], *selbits, *sbp;
614	struct timeval atv, rtv, ttv;
615	int error, timo;
616	u_int ncoll, nbufbytes, ncpbytes, nfdbits;
617
618	if (nd < 0)
619		return (EINVAL);
620	fdp = td->td_proc->p_fd;
621
622	FILEDESC_LOCK_FAST(fdp);
623
624	if (nd > td->td_proc->p_fd->fd_nfiles)
625		nd = td->td_proc->p_fd->fd_nfiles;   /* forgiving; slightly wrong */
626	FILEDESC_UNLOCK_FAST(fdp);
627
628	/*
629	 * Allocate just enough bits for the non-null fd_sets.  Use the
630	 * preallocated auto buffer if possible.
631	 */
632	nfdbits = roundup(nd, NFDBITS);
633	ncpbytes = nfdbits / NBBY;
634	nbufbytes = 0;
635	if (fd_in != NULL)
636		nbufbytes += 2 * ncpbytes;
637	if (fd_ou != NULL)
638		nbufbytes += 2 * ncpbytes;
639	if (fd_ex != NULL)
640		nbufbytes += 2 * ncpbytes;
641	if (nbufbytes <= sizeof s_selbits)
642		selbits = &s_selbits[0];
643	else
644		selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);
645
646	/*
647	 * Assign pointers into the bit buffers and fetch the input bits.
648	 * Put the output buffers together so that they can be bzeroed
649	 * together.
650	 */
651	sbp = selbits;
652#define	getbits(name, x) \
653	do {								\
654		if (name == NULL)					\
655			ibits[x] = NULL;				\
656		else {							\
657			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;	\
658			obits[x] = sbp;					\
659			sbp += ncpbytes / sizeof *sbp;			\
660			error = copyin(name, ibits[x], ncpbytes);	\
661			if (error != 0)					\
662				goto done_nosellock;			\
663		}							\
664	} while (0)
665	getbits(fd_in, 0);
666	getbits(fd_ou, 1);
667	getbits(fd_ex, 2);
668#undef	getbits
669	if (nbufbytes != 0)
670		bzero(selbits, nbufbytes / 2);
671
672	if (tvp != NULL) {
673		atv = *tvp;
674		if (itimerfix(&atv)) {
675			error = EINVAL;
676			goto done_nosellock;
677		}
678		getmicrouptime(&rtv);
679		timevaladd(&atv, &rtv);
680	} else {
681		atv.tv_sec = 0;
682		atv.tv_usec = 0;
683	}
684	timo = 0;
685	TAILQ_INIT(&td->td_selq);
686	mtx_lock(&sellock);
687retry:
688	ncoll = nselcoll;
689	mtx_lock_spin(&sched_lock);
690	td->td_flags |= TDF_SELECT;
691	mtx_unlock_spin(&sched_lock);
692	mtx_unlock(&sellock);
693
694	error = selscan(td, ibits, obits, nd);
695	mtx_lock(&sellock);
696	if (error || td->td_retval[0])
697		goto done;
698	if (atv.tv_sec || atv.tv_usec) {
699		getmicrouptime(&rtv);
700		if (timevalcmp(&rtv, &atv, >=))
701			goto done;
702		ttv = atv;
703		timevalsub(&ttv, &rtv);
704		timo = ttv.tv_sec > 24 * 60 * 60 ?
705		    24 * 60 * 60 * hz : tvtohz(&ttv);
706	}
707
708	/*
709	 * An event of interest may occur while we do not hold
710	 * sellock, so check TDF_SELECT and the number of
711	 * collisions and rescan the file descriptors if
712	 * necessary.
713	 */
714	mtx_lock_spin(&sched_lock);
715	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
716		mtx_unlock_spin(&sched_lock);
717		goto retry;
718	}
719	mtx_unlock_spin(&sched_lock);
720
721	if (timo > 0)
722		error = cv_timedwait_sig(&selwait, &sellock, timo);
723	else
724		error = cv_wait_sig(&selwait, &sellock);
725
726	if (error == 0)
727		goto retry;
728
729done:
730	clear_selinfo_list(td);
731	mtx_lock_spin(&sched_lock);
732	td->td_flags &= ~TDF_SELECT;
733	mtx_unlock_spin(&sched_lock);
734	mtx_unlock(&sellock);
735
736done_nosellock:
737	/* select is not restarted after signals... */
738	if (error == ERESTART)
739		error = EINTR;
740	if (error == EWOULDBLOCK)
741		error = 0;
742#define	putbits(name, x) \
743	if (name && (error2 = copyout(obits[x], name, ncpbytes))) \
744		error = error2;
745	if (error == 0) {
746		int error2;
747
748		putbits(fd_in, 0);
749		putbits(fd_ou, 1);
750		putbits(fd_ex, 2);
751#undef putbits
752	}
753	if (selbits != &s_selbits[0])
754		free(selbits, M_SELECT);
755
756	return (error);
757}
758
759static int
760selscan(td, ibits, obits, nfd)
761	struct thread *td;
762	fd_mask **ibits, **obits;
763	int nfd;
764{
765	int msk, i, fd;
766	fd_mask bits;
767	struct file *fp;
768	int n = 0;
769	/* Note: backend also returns POLLHUP/POLLERR if appropriate. */
770	static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
771	struct filedesc *fdp = td->td_proc->p_fd;
772
773	FILEDESC_LOCK(fdp);
774	for (msk = 0; msk < 3; msk++) {
775		if (ibits[msk] == NULL)
776			continue;
777		for (i = 0; i < nfd; i += NFDBITS) {
778			bits = ibits[msk][i/NFDBITS];
779			/* ffs(int mask) not portable, fd_mask is long */
780			for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
781				if (!(bits & 1))
782					continue;
783				if ((fp = fget_locked(fdp, fd)) == NULL) {
784					FILEDESC_UNLOCK(fdp);
785					return (EBADF);
786				}
787				if (fo_poll(fp, flag[msk], td->td_ucred,
788				    td)) {
789					obits[msk][(fd)/NFDBITS] |=
790					    ((fd_mask)1 << ((fd) % NFDBITS));
791					n++;
792				}
793			}
794		}
795	}
796	FILEDESC_UNLOCK(fdp);
797	td->td_retval[0] = n;
798	return (0);
799}
800
801/*
802 * Poll system call.
803 */
804#ifndef _SYS_SYSPROTO_H_
805struct poll_args {
806	struct pollfd *fds;
807	u_int	nfds;
808	int	timeout;
809};
810#endif
811/*
812 * MPSAFE
813 */
814int
815poll(td, uap)
816	struct thread *td;
817	struct poll_args *uap;
818{
819	struct pollfd *bits;
820	struct pollfd smallbits[32];
821	struct timeval atv, rtv, ttv;
822	int error = 0, timo;
823	u_int ncoll, nfds;
824	size_t ni;
825
826	nfds = uap->nfds;
827
828	/*
829	 * This is kinda bogus.  We have fd limits, but that is not
830	 * really related to the size of the pollfd array.  Make sure
831	 * we let the process use at least FD_SETSIZE entries and at
832	 * least enough for the current limits.  We want to be reasonably
833	 * safe, but not overly restrictive.
834	 */
835	PROC_LOCK(td->td_proc);
836	if ((nfds > lim_cur(td->td_proc, RLIMIT_NOFILE)) &&
837	    (nfds > FD_SETSIZE)) {
838		PROC_UNLOCK(td->td_proc);
839		error = EINVAL;
840		goto done2;
841	}
842	PROC_UNLOCK(td->td_proc);
843	ni = nfds * sizeof(struct pollfd);
844	if (ni > sizeof(smallbits))
845		bits = malloc(ni, M_TEMP, M_WAITOK);
846	else
847		bits = smallbits;
848	error = copyin(uap->fds, bits, ni);
849	if (error)
850		goto done_nosellock;
851	if (uap->timeout != INFTIM) {
852		atv.tv_sec = uap->timeout / 1000;
853		atv.tv_usec = (uap->timeout % 1000) * 1000;
854		if (itimerfix(&atv)) {
855			error = EINVAL;
856			goto done_nosellock;
857		}
858		getmicrouptime(&rtv);
859		timevaladd(&atv, &rtv);
860	} else {
861		atv.tv_sec = 0;
862		atv.tv_usec = 0;
863	}
864	timo = 0;
865	TAILQ_INIT(&td->td_selq);
866	mtx_lock(&sellock);
867retry:
868	ncoll = nselcoll;
869	mtx_lock_spin(&sched_lock);
870	td->td_flags |= TDF_SELECT;
871	mtx_unlock_spin(&sched_lock);
872	mtx_unlock(&sellock);
873
874	error = pollscan(td, bits, nfds);
875	mtx_lock(&sellock);
876	if (error || td->td_retval[0])
877		goto done;
878	if (atv.tv_sec || atv.tv_usec) {
879		getmicrouptime(&rtv);
880		if (timevalcmp(&rtv, &atv, >=))
881			goto done;
882		ttv = atv;
883		timevalsub(&ttv, &rtv);
884		timo = ttv.tv_sec > 24 * 60 * 60 ?
885		    24 * 60 * 60 * hz : tvtohz(&ttv);
886	}
887	/*
888	 * An event of interest may occur while we do not hold
889	 * sellock, so check TDF_SELECT and the number of collisions
890	 * and rescan the file descriptors if necessary.
891	 */
892	mtx_lock_spin(&sched_lock);
893	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
894		mtx_unlock_spin(&sched_lock);
895		goto retry;
896	}
897	mtx_unlock_spin(&sched_lock);
898
899	if (timo > 0)
900		error = cv_timedwait_sig(&selwait, &sellock, timo);
901	else
902		error = cv_wait_sig(&selwait, &sellock);
903
904	if (error == 0)
905		goto retry;
906
907done:
908	clear_selinfo_list(td);
909	mtx_lock_spin(&sched_lock);
910	td->td_flags &= ~TDF_SELECT;
911	mtx_unlock_spin(&sched_lock);
912	mtx_unlock(&sellock);
913
914done_nosellock:
915	/* poll is not restarted after signals... */
916	if (error == ERESTART)
917		error = EINTR;
918	if (error == EWOULDBLOCK)
919		error = 0;
920	if (error == 0) {
921		error = copyout(bits, uap->fds, ni);
922		if (error)
923			goto out;
924	}
925out:
926	if (ni > sizeof(smallbits))
927		free(bits, M_TEMP);
928done2:
929	return (error);
930}
931
932static int
933pollscan(td, fds, nfd)
934	struct thread *td;
935	struct pollfd *fds;
936	u_int nfd;
937{
938	register struct filedesc *fdp = td->td_proc->p_fd;
939	int i;
940	struct file *fp;
941	int n = 0;
942
943	FILEDESC_LOCK(fdp);
944	for (i = 0; i < nfd; i++, fds++) {
945		if (fds->fd >= fdp->fd_nfiles) {
946			fds->revents = POLLNVAL;
947			n++;
948		} else if (fds->fd < 0) {
949			fds->revents = 0;
950		} else {
951			fp = fdp->fd_ofiles[fds->fd];
952			if (fp == NULL) {
953				fds->revents = POLLNVAL;
954				n++;
955			} else {
956				/*
957				 * Note: backend also returns POLLHUP and
958				 * POLLERR if appropriate.
959				 */
960				fds->revents = fo_poll(fp, fds->events,
961				    td->td_ucred, td);
962				if (fds->revents != 0)
963					n++;
964			}
965		}
966	}
967	FILEDESC_UNLOCK(fdp);
968	td->td_retval[0] = n;
969	return (0);
970}
971
972/*
973 * OpenBSD poll system call.
974 * XXX this isn't quite a true representation..  OpenBSD uses select ops.
975 */
976#ifndef _SYS_SYSPROTO_H_
977struct openbsd_poll_args {
978	struct pollfd *fds;
979	u_int	nfds;
980	int	timeout;
981};
982#endif
983/*
984 * MPSAFE
985 */
986int
987openbsd_poll(td, uap)
988	register struct thread *td;
989	register struct openbsd_poll_args *uap;
990{
991	return (poll(td, (struct poll_args *)uap));
992}
993
994/*
995 * Remove the references to the thread from all of the objects
996 * we were polling.
997 *
998 * This code assumes that the underlying owner of the selinfo
999 * structure will hold sellock before it changes it, and that
1000 * it will unlink itself from our list if it goes away.
1001 */
1002void
1003clear_selinfo_list(td)
1004	struct thread *td;
1005{
1006	struct selinfo *si;
1007
1008	mtx_assert(&sellock, MA_OWNED);
1009	TAILQ_FOREACH(si, &td->td_selq, si_thrlist)
1010		si->si_thread = NULL;
1011	TAILQ_INIT(&td->td_selq);
1012}
1013
1014/*
1015 * Record a select request.
1016 */
1017void
1018selrecord(selector, sip)
1019	struct thread *selector;
1020	struct selinfo *sip;
1021{
1022
1023	mtx_lock(&sellock);
1024	/*
1025	 * If the selinfo's thread pointer is NULL then take ownership of it.
1026	 *
1027	 * If the thread pointer is not NULL and it points to another
1028	 * thread, then we have a collision.
1029	 *
1030	 * If the thread pointer is not NULL and points back to us then leave
1031	 * it alone as we've already added pointed it at us and added it to
1032	 * our list.
1033	 */
1034	if (sip->si_thread == NULL) {
1035		sip->si_thread = selector;
1036		TAILQ_INSERT_TAIL(&selector->td_selq, sip, si_thrlist);
1037	} else if (sip->si_thread != selector) {
1038		sip->si_flags |= SI_COLL;
1039	}
1040
1041	mtx_unlock(&sellock);
1042}
1043
/* Wake up a selecting thread; passes -1 as the priority to doselwakeup(). */
void
selwakeup(struct selinfo *sip)
{

	doselwakeup(sip, -1);
}
1051
/* Wake up a selecting thread, passing priority `pri' to doselwakeup(). */
void
selwakeuppri(struct selinfo *sip, int pri)
{

	doselwakeup(sip, pri);
}
1060
1061/*
1062 * Do a wakeup when a selectable event occurs.
1063 */
1064static void
1065doselwakeup(sip, pri)
1066	struct selinfo *sip;
1067	int pri;
1068{
1069	struct thread *td;
1070
1071	mtx_lock(&sellock);
1072	td = sip->si_thread;
1073	if ((sip->si_flags & SI_COLL) != 0) {
1074		nselcoll++;
1075		sip->si_flags &= ~SI_COLL;
1076		cv_broadcastpri(&selwait, pri);
1077	}
1078	if (td == NULL) {
1079		mtx_unlock(&sellock);
1080		return;
1081	}
1082	TAILQ_REMOVE(&td->td_selq, sip, si_thrlist);
1083	sip->si_thread = NULL;
1084	mtx_lock_spin(&sched_lock);
1085	td->td_flags &= ~TDF_SELECT;
1086	mtx_unlock_spin(&sched_lock);
1087	sleepq_remove(td, &selwait);
1088	mtx_unlock(&sellock);
1089}
1090
1091static void selectinit(void *);
1092SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, selectinit, NULL)
1093
1094/* ARGSUSED*/
1095static void
1096selectinit(dummy)
1097	void *dummy;
1098{
1099	cv_init(&selwait, "select");
1100	mtx_init(&sellock, "sellck", NULL, MTX_DEF);
1101}
1102