sys_generic.c revision 144445
/*-
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/kern/sys_generic.c 144445 2005-03-31 22:51:18Z jhb $");

#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/proc.h>
#include <sys/signalvar.h>
#include <sys/socketvar.h>
#include <sys/uio.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/poll.h>
#include <sys/resourcevar.h>
#include <sys/selinfo.h>
#include <sys/sleepqueue.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/vnode.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/condvar.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif
#include <vm/vm.h>
#include <vm/vm_page.h>

static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
MALLOC_DEFINE(M_IOV, "iov", "large iov's");

static int	pollscan(struct thread *, struct pollfd *, u_int);
static int	selscan(struct thread *, fd_mask **, fd_mask **, int);
static int	dofileread(struct thread *, struct file *, int, void *,
		    size_t, off_t, int);
static int	dofilewrite(struct thread *, struct file *, int,
		    const void *, size_t, off_t, int);
static void	doselwakeup(struct selinfo *, int);

/*
 * Read system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct read_args {
	int	fd;
	void	*buf;
	size_t	nbyte;
};
#endif
/*
 * MPSAFE
 */
int
read(td, uap)
	struct thread *td;
	struct read_args *uap;
{
	struct file *fp;
	int error;

	if ((error = fget_read(td, uap->fd, &fp)) == 0) {
		error = dofileread(td, fp, uap->fd, uap->buf,
			    uap->nbyte, (off_t)-1, 0);
		fdrop(fp, td);
	}
	return(error);
}

/*
 * Pread system call
 */
#ifndef _SYS_SYSPROTO_H_
struct pread_args {
	int	fd;
	void	*buf;
	size_t	nbyte;
	int	pad;
	off_t	offset;
};
#endif
/*
 * MPSAFE
 */
int
pread(td, uap)
	struct thread *td;
	struct pread_args *uap;
{
	struct file *fp;
	int error;

	if ((error = fget_read(td, uap->fd, &fp)) != 0)
		return (error);
	if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
		error = ESPIPE;
	else if (uap->offset < 0 && fp->f_vnode->v_type != VCHR)
		error = EINVAL;
	else {
		error = dofileread(td, fp, uap->fd, uap->buf, uap->nbyte,
			    uap->offset, FOF_OFFSET);
	}
	fdrop(fp, td);
	return(error);
}

/*
 * Code common for read and pread
 */
static int
dofileread(td, fp, fd, buf, nbyte, offset, flags)
	struct thread *td;
	struct file *fp;
	int fd, flags;
	void *buf;
	size_t nbyte;
	off_t offset;
{
	struct uio auio;
	struct iovec aiov;
	ssize_t cnt;
	long error = 0;
#ifdef KTRACE
	struct uio *ktruio = NULL;
#endif

	/* Finish zero length reads right here */
	if (nbyte == 0) {
		td->td_retval[0] = 0;
		return(0);
	}
	aiov.iov_base = buf;
	aiov.iov_len = nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = offset;
	if (nbyte > INT_MAX)
		return (EINVAL);
	auio.uio_resid = nbyte;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;
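	/*
	 * If ktrace wants a GENIO record, clone the uio now, before
	 * fo_read() consumes it, so the trace reflects the request as
	 * it was issued.
	 */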
#ifdef KTRACE
	if (KTRPOINT(td, KTR_GENIO))
		ktruio = cloneuio(&auio);
#endif
	cnt = nbyte;

	if ((error = fo_read(fp, &auio, td->td_ucred, flags, td))) {
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	}
	cnt -= auio.uio_resid;
#ifdef KTRACE
	if (ktruio != NULL) {
		ktruio->uio_resid = cnt;
		ktrgenio(fd, UIO_READ, ktruio, error);
	}
#endif
	td->td_retval[0] = cnt;
	return (error);
}

/*
 * Scatter read system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct readv_args {
	int	fd;
	struct	iovec *iovp;
	u_int	iovcnt;
};
#endif
/*
 * MPSAFE
 */
int
readv(struct thread *td, struct readv_args *uap)
{
	struct uio *auio;
	int error;

	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
	if (error)
		return (error);
	error = kern_readv(td, uap->fd, auio);
	free(auio, M_IOV);
	return (error);
}

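/*
 * Common backend for readv(); the caller supplies a uio built by
 * copyinuio() and remains responsible for freeing it.
 */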
int
kern_readv(struct thread *td, int fd, struct uio *auio)
{
	struct file *fp;
	long cnt;
	int error;
#ifdef KTRACE
	struct uio *ktruio = NULL;
#endif

	error = fget_read(td, fd, &fp);
	if (error)
		return (error);
	/* Finish zero length reads right here */
	if (auio->uio_resid == 0) {
		td->td_retval[0] = 0;
		fdrop(fp, td);
		return(0);
	}
	auio->uio_rw = UIO_READ;
	auio->uio_td = td;
#ifdef KTRACE
	if (KTRPOINT(td, KTR_GENIO))
		ktruio = cloneuio(auio);
#endif
	cnt = auio->uio_resid;
	if ((error = fo_read(fp, auio, td->td_ucred, 0, td))) {
		if (auio->uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	}
	cnt -= auio->uio_resid;
#ifdef KTRACE
	if (ktruio != NULL) {
		ktruio->uio_resid = cnt;
		ktrgenio(fd, UIO_READ, ktruio, error);
	}
#endif
	td->td_retval[0] = cnt;
	fdrop(fp, td);
	return (error);
}

/*
 * Write system call
 */
#ifndef _SYS_SYSPROTO_H_
struct write_args {
	int	fd;
	const void *buf;
	size_t	nbyte;
};
#endif
/*
 * MPSAFE
 */
int
write(td, uap)
	struct thread *td;
	struct write_args *uap;
{
	struct file *fp;
	int error;

	if ((error = fget_write(td, uap->fd, &fp)) == 0) {
		error = dofilewrite(td, fp, uap->fd, uap->buf, uap->nbyte,
			    (off_t)-1, 0);
		fdrop(fp, td);
	} else {
		error = EBADF;	/* XXX this can't be right */
	}
	return(error);
}

/*
 * Pwrite system call
 */
#ifndef _SYS_SYSPROTO_H_
struct pwrite_args {
	int	fd;
	const void *buf;
	size_t	nbyte;
	int	pad;
	off_t	offset;
};
#endif
/*
 * MPSAFE
 */
int
pwrite(td, uap)
	struct thread *td;
	struct pwrite_args *uap;
{
	struct file *fp;
	int error;

	if ((error = fget_write(td, uap->fd, &fp)) == 0) {
		if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
			error = ESPIPE;
		else if (uap->offset < 0 && fp->f_vnode->v_type != VCHR)
			error = EINVAL;
		else {
			error = dofilewrite(td, fp, uap->fd, uap->buf,
				    uap->nbyte, uap->offset, FOF_OFFSET);
		}
		fdrop(fp, td);
	} else {
		error = EBADF;	/* this can't be right */
	}
	return(error);
}

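/*
 * Code common for write and pwrite
 */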
static int
dofilewrite(td, fp, fd, buf, nbyte, offset, flags)
	struct thread *td;
	struct file *fp;
	int fd, flags;
	const void *buf;
	size_t nbyte;
	off_t offset;
{
	struct uio auio;
	struct iovec aiov;
	ssize_t cnt;
	long error = 0;
#ifdef KTRACE
	struct uio *ktruio = NULL;
#endif

	aiov.iov_base = (void *)(uintptr_t)buf;
	aiov.iov_len = nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = offset;
	if (nbyte > INT_MAX)
		return (EINVAL);
	auio.uio_resid = nbyte;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;
#ifdef KTRACE
	if (KTRPOINT(td, KTR_GENIO))
		ktruio = cloneuio(&auio);
#endif
	cnt = nbyte;
	if (fp->f_type == DTYPE_VNODE)
		bwillwrite();
	if ((error = fo_write(fp, &auio, td->td_ucred, flags, td))) {
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		/* Socket layer is responsible for issuing SIGPIPE. */
		if (error == EPIPE && fp->f_type != DTYPE_SOCKET) {
			PROC_LOCK(td->td_proc);
			psignal(td->td_proc, SIGPIPE);
			PROC_UNLOCK(td->td_proc);
		}
	}
	cnt -= auio.uio_resid;
#ifdef KTRACE
	if (ktruio != NULL) {
		ktruio->uio_resid = cnt;
		ktrgenio(fd, UIO_WRITE, ktruio, error);
	}
#endif
	td->td_retval[0] = cnt;
	return (error);
}

/*
 * Gather write system call
 */
#ifndef _SYS_SYSPROTO_H_
struct writev_args {
	int	fd;
	struct	iovec *iovp;
	u_int	iovcnt;
};
#endif
/*
 * MPSAFE
 */
int
writev(struct thread *td, struct writev_args *uap)
{
	struct uio *auio;
	int error;

	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
	if (error)
		return (error);
	error = kern_writev(td, uap->fd, auio);
	free(auio, M_IOV);
	return (error);
}

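/*
 * Common backend for writev(); as in kern_readv(), the caller builds the
 * uio with copyinuio() and frees it afterwards.
 */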
int
kern_writev(struct thread *td, int fd, struct uio *auio)
{
	struct file *fp;
	long cnt;
	int error;
#ifdef KTRACE
	struct uio *ktruio = NULL;
#endif

	error = fget_write(td, fd, &fp);
	if (error)
		return (EBADF);
	auio->uio_rw = UIO_WRITE;
	auio->uio_td = td;
#ifdef KTRACE
	if (KTRPOINT(td, KTR_GENIO))
		ktruio = cloneuio(auio);
#endif
	cnt = auio->uio_resid;
	if (fp->f_type == DTYPE_VNODE)
		bwillwrite();
	if ((error = fo_write(fp, auio, td->td_ucred, 0, td))) {
		if (auio->uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		if (error == EPIPE) {
			PROC_LOCK(td->td_proc);
			psignal(td->td_proc, SIGPIPE);
			PROC_UNLOCK(td->td_proc);
		}
	}
	cnt -= auio->uio_resid;
#ifdef KTRACE
	if (ktruio != NULL) {
		ktruio->uio_resid = cnt;
		ktrgenio(fd, UIO_WRITE, ktruio, error);
	}
#endif
	td->td_retval[0] = cnt;
	fdrop(fp, td);
	return (error);
}

/*
 * Ioctl system call
 */
#ifndef _SYS_SYSPROTO_H_
struct ioctl_args {
	int	fd;
	u_long	com;
	caddr_t	data;
};
#endif
/*
 * MPSAFE
 */
/* ARGSUSED */
int
ioctl(struct thread *td, struct ioctl_args *uap)
{
	struct file *fp;
	struct filedesc *fdp;
	u_long com;
	int error = 0;
	u_int size;
	caddr_t data, memp;
	int tmp;

	if (uap->com > 0xffffffff) {
		printf(
		    "WARNING pid %d (%s): ioctl sign-extension ioctl %lx\n",
		    td->td_proc->p_pid, td->td_proc->p_comm, uap->com);
		uap->com &= 0xffffffff;
	}
	if ((error = fget(td, uap->fd, &fp)) != 0)
		return (error);
	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
		fdrop(fp, td);
		return (EBADF);
	}
	fdp = td->td_proc->p_fd;
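	/*
	 * FIONCLEX and FIOCLEX only manipulate the close-on-exec flag in
	 * the descriptor table, so handle them here without calling
	 * fo_ioctl().
	 */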
	switch (com = uap->com) {
	case FIONCLEX:
		FILEDESC_LOCK_FAST(fdp);
		fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE;
		FILEDESC_UNLOCK_FAST(fdp);
		fdrop(fp, td);
		return (0);
	case FIOCLEX:
		FILEDESC_LOCK_FAST(fdp);
		fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE;
		FILEDESC_UNLOCK_FAST(fdp);
		fdrop(fp, td);
		return (0);
	}

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
	size = IOCPARM_LEN(com);
	if ((size > IOCPARM_MAX) ||
	    ((com & (IOC_VOID | IOC_IN | IOC_OUT)) == 0) ||
	    ((com & IOC_VOID) && size > 0) ||
	    ((com & (IOC_IN | IOC_OUT)) && size == 0)) {
		fdrop(fp, td);
		return (ENOTTY);
	}

	if (size > 0) {
		memp = malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
		data = memp;
	} else {
		memp = NULL;
		data = (void *)&uap->data;
	}
	if (com & IOC_IN) {
		error = copyin(uap->data, data, (u_int)size);
		if (error) {
			free(memp, M_IOCTLOPS);
			fdrop(fp, td);
			return (error);
		}
	} else if (com & IOC_OUT) {
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, size);
	}

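	/*
	 * FIONBIO and FIOASYNC update the flag word in the struct file
	 * here; the request is still passed down to fo_ioctl() with the
	 * boolean value in a private copy.
	 */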
	if (com == FIONBIO) {
		FILE_LOCK(fp);
		if ((tmp = *(int *)data))
			fp->f_flag |= FNONBLOCK;
		else
			fp->f_flag &= ~FNONBLOCK;
		FILE_UNLOCK(fp);
		data = (void *)&tmp;
	} else if (com == FIOASYNC) {
		FILE_LOCK(fp);
		if ((tmp = *(int *)data))
			fp->f_flag |= FASYNC;
		else
			fp->f_flag &= ~FASYNC;
		FILE_UNLOCK(fp);
		data = (void *)&tmp;
	}

	error = fo_ioctl(fp, com, data, td->td_ucred, td);

	if (error == 0 && (com & IOC_OUT))
		error = copyout(data, uap->data, (u_int)size);

	if (memp != NULL)
		free(memp, M_IOCTLOPS);
	fdrop(fp, td);
	return (error);
}

/*
 * sellock and selwait are initialized in selectinit() via SYSINIT.
 */
struct mtx	sellock;
struct cv	selwait;
u_int		nselcoll;	/* Select collisions since boot */
SYSCTL_UINT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");

/*
 * Select system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct select_args {
	int	nd;
	fd_set	*in, *ou, *ex;
	struct	timeval *tv;
};
#endif
/*
 * MPSAFE
 */
int
select(td, uap)
	register struct thread *td;
	register struct select_args *uap;
{
	struct timeval tv, *tvp;
	int error;

	if (uap->tv != NULL) {
		error = copyin(uap->tv, &tv, sizeof(tv));
		if (error)
			return (error);
		tvp = &tv;
	} else
		tvp = NULL;

	return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp));
}

int
kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou,
    fd_set *fd_ex, struct timeval *tvp)
{
	struct filedesc *fdp;
	/*
	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
	 * infds with the new FD_SETSIZE of 1024, and more than enough for
	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
	 * of 256.
	 */
	fd_mask s_selbits[howmany(2048, NFDBITS)];
	fd_mask *ibits[3], *obits[3], *selbits, *sbp;
	struct timeval atv, rtv, ttv;
	int error, timo;
	u_int ncoll, nbufbytes, ncpbytes, nfdbits;

	if (nd < 0)
		return (EINVAL);
	fdp = td->td_proc->p_fd;

	FILEDESC_LOCK_FAST(fdp);

	if (nd > td->td_proc->p_fd->fd_nfiles)
		nd = td->td_proc->p_fd->fd_nfiles;   /* forgiving; slightly wrong */
	FILEDESC_UNLOCK_FAST(fdp);

	/*
	 * Allocate just enough bits for the non-null fd_sets.  Use the
	 * preallocated auto buffer if possible.
	 */
	nfdbits = roundup(nd, NFDBITS);
	ncpbytes = nfdbits / NBBY;
	nbufbytes = 0;
	if (fd_in != NULL)
		nbufbytes += 2 * ncpbytes;
	if (fd_ou != NULL)
		nbufbytes += 2 * ncpbytes;
	if (fd_ex != NULL)
		nbufbytes += 2 * ncpbytes;
	if (nbufbytes <= sizeof s_selbits)
		selbits = &s_selbits[0];
	else
		selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);

	/*
	 * Assign pointers into the bit buffers and fetch the input bits.
	 * Put the output buffers together so that they can be bzeroed
	 * together.
	 */
	sbp = selbits;
#define	getbits(name, x) \
	do {								\
		if (name == NULL)					\
			ibits[x] = NULL;				\
		else {							\
			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;	\
			obits[x] = sbp;					\
			sbp += ncpbytes / sizeof *sbp;			\
			error = copyin(name, ibits[x], ncpbytes);	\
			if (error != 0)					\
				goto done_nosellock;			\
		}							\
	} while (0)
	getbits(fd_in, 0);
	getbits(fd_ou, 1);
	getbits(fd_ex, 2);
#undef	getbits
	if (nbufbytes != 0)
		bzero(selbits, nbufbytes / 2);

	if (tvp != NULL) {
		atv = *tvp;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done_nosellock;
		}
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
	TAILQ_INIT(&td->td_selq);
	mtx_lock(&sellock);
retry:
	ncoll = nselcoll;
	mtx_lock_spin(&sched_lock);
	td->td_flags |= TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

	error = selscan(td, ibits, obits, nd);
	mtx_lock(&sellock);
	if (error || td->td_retval[0])
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=))
			goto done;
		ttv = atv;
		timevalsub(&ttv, &rtv);
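		/* Convert the remaining time to ticks, clamped at 24 hours. */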
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz(&ttv);
	}

	/*
	 * An event of interest may occur while we do not hold
	 * sellock, so check TDF_SELECT and the number of
	 * collisions and rescan the file descriptors if
	 * necessary.
	 */
	mtx_lock_spin(&sched_lock);
	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
		mtx_unlock_spin(&sched_lock);
		goto retry;
	}
	mtx_unlock_spin(&sched_lock);

	if (timo > 0)
		error = cv_timedwait_sig(&selwait, &sellock, timo);
	else
		error = cv_wait_sig(&selwait, &sellock);

	if (error == 0)
		goto retry;

done:
	clear_selinfo_list(td);
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

done_nosellock:
	/* select is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
#define	putbits(name, x) \
	if (name && (error2 = copyout(obits[x], name, ncpbytes))) \
		error = error2;
	if (error == 0) {
		int error2;

		putbits(fd_in, 0);
		putbits(fd_ou, 1);
		putbits(fd_ex, 2);
#undef putbits
	}
	if (selbits != &s_selbits[0])
		free(selbits, M_SELECT);

	return (error);
}

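/*
 * Scan the descriptors named in the input bit masks and set the
 * corresponding bits in the output masks for those that fo_poll()
 * reports ready.
 */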
static int
selscan(td, ibits, obits, nfd)
	struct thread *td;
	fd_mask **ibits, **obits;
	int nfd;
{
	int msk, i, fd;
	fd_mask bits;
	struct file *fp;
	int n = 0;
	/* Note: backend also returns POLLHUP/POLLERR if appropriate. */
	static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
	struct filedesc *fdp = td->td_proc->p_fd;

	FILEDESC_LOCK(fdp);
	for (msk = 0; msk < 3; msk++) {
		if (ibits[msk] == NULL)
			continue;
		for (i = 0; i < nfd; i += NFDBITS) {
			bits = ibits[msk][i/NFDBITS];
			/* ffs(int mask) not portable, fd_mask is long */
			for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
				if (!(bits & 1))
					continue;
				if ((fp = fget_locked(fdp, fd)) == NULL) {
					FILEDESC_UNLOCK(fdp);
					return (EBADF);
				}
				if (fo_poll(fp, flag[msk], td->td_ucred,
				    td)) {
					obits[msk][(fd)/NFDBITS] |=
					    ((fd_mask)1 << ((fd) % NFDBITS));
					n++;
				}
			}
		}
	}
	FILEDESC_UNLOCK(fdp);
	td->td_retval[0] = n;
	return (0);
}

/*
 * Poll system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct poll_args {
	struct pollfd *fds;
	u_int	nfds;
	int	timeout;
};
#endif
/*
 * MPSAFE
 */
int
poll(td, uap)
	struct thread *td;
	struct poll_args *uap;
{
	struct pollfd *bits;
	struct pollfd smallbits[32];
	struct timeval atv, rtv, ttv;
	int error = 0, timo;
	u_int ncoll, nfds;
	size_t ni;

	nfds = uap->nfds;

	/*
	 * This is kinda bogus.  We have fd limits, but that is not
	 * really related to the size of the pollfd array.  Make sure
	 * we let the process use at least FD_SETSIZE entries and at
	 * least enough for the current limits.  We want to be reasonably
	 * safe, but not overly restrictive.
	 */
	PROC_LOCK(td->td_proc);
	if ((nfds > lim_cur(td->td_proc, RLIMIT_NOFILE)) &&
	    (nfds > FD_SETSIZE)) {
		PROC_UNLOCK(td->td_proc);
		error = EINVAL;
		goto done2;
	}
	PROC_UNLOCK(td->td_proc);
	ni = nfds * sizeof(struct pollfd);
	if (ni > sizeof(smallbits))
		bits = malloc(ni, M_TEMP, M_WAITOK);
	else
		bits = smallbits;
	error = copyin(uap->fds, bits, ni);
	if (error)
		goto done_nosellock;
	if (uap->timeout != INFTIM) {
		atv.tv_sec = uap->timeout / 1000;
		atv.tv_usec = (uap->timeout % 1000) * 1000;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done_nosellock;
		}
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
	TAILQ_INIT(&td->td_selq);
	mtx_lock(&sellock);
retry:
	ncoll = nselcoll;
	mtx_lock_spin(&sched_lock);
	td->td_flags |= TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

	error = pollscan(td, bits, nfds);
	mtx_lock(&sellock);
	if (error || td->td_retval[0])
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=))
			goto done;
		ttv = atv;
		timevalsub(&ttv, &rtv);
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz(&ttv);
	}
	/*
	 * An event of interest may occur while we do not hold
	 * sellock, so check TDF_SELECT and the number of collisions
	 * and rescan the file descriptors if necessary.
	 */
	mtx_lock_spin(&sched_lock);
	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
		mtx_unlock_spin(&sched_lock);
		goto retry;
	}
	mtx_unlock_spin(&sched_lock);

	if (timo > 0)
		error = cv_timedwait_sig(&selwait, &sellock, timo);
	else
		error = cv_wait_sig(&selwait, &sellock);

	if (error == 0)
		goto retry;

done:
	clear_selinfo_list(td);
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

done_nosellock:
	/* poll is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	if (error == 0) {
		error = copyout(bits, uap->fds, ni);
		if (error)
			goto out;
	}
out:
	if (ni > sizeof(smallbits))
		free(bits, M_TEMP);
done2:
	return (error);
}

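/*
 * Walk the pollfd array, asking each file's fo_poll() for its status and
 * filling in revents; the number of descriptors with events is returned
 * in td_retval[0].
 */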
static int
pollscan(td, fds, nfd)
	struct thread *td;
	struct pollfd *fds;
	u_int nfd;
{
	register struct filedesc *fdp = td->td_proc->p_fd;
	int i;
	struct file *fp;
	int n = 0;

	FILEDESC_LOCK(fdp);
	for (i = 0; i < nfd; i++, fds++) {
		if (fds->fd >= fdp->fd_nfiles) {
			fds->revents = POLLNVAL;
			n++;
		} else if (fds->fd < 0) {
			fds->revents = 0;
		} else {
			fp = fdp->fd_ofiles[fds->fd];
			if (fp == NULL) {
				fds->revents = POLLNVAL;
				n++;
			} else {
				/*
				 * Note: backend also returns POLLHUP and
				 * POLLERR if appropriate.
				 */
				fds->revents = fo_poll(fp, fds->events,
				    td->td_ucred, td);
				if (fds->revents != 0)
					n++;
			}
		}
	}
	FILEDESC_UNLOCK(fdp);
	td->td_retval[0] = n;
	return (0);
}

/*
 * OpenBSD poll system call.
 * XXX this isn't quite a true representation..  OpenBSD uses select ops.
 */
#ifndef _SYS_SYSPROTO_H_
struct openbsd_poll_args {
	struct pollfd *fds;
	u_int	nfds;
	int	timeout;
};
#endif
/*
 * MPSAFE
 */
int
openbsd_poll(td, uap)
	register struct thread *td;
	register struct openbsd_poll_args *uap;
{
	return (poll(td, (struct poll_args *)uap));
}

/*
 * Remove the references to the thread from all of the objects
 * we were polling.
 *
 * This code assumes that the underlying owner of the selinfo
 * structure will hold sellock before it changes it, and that
 * it will unlink itself from our list if it goes away.
 */
void
clear_selinfo_list(td)
	struct thread *td;
{
	struct selinfo *si;

	mtx_assert(&sellock, MA_OWNED);
	TAILQ_FOREACH(si, &td->td_selq, si_thrlist)
		si->si_thread = NULL;
	TAILQ_INIT(&td->td_selq);
}

/*
 * Record a select request.
 */
void
selrecord(selector, sip)
	struct thread *selector;
	struct selinfo *sip;
{

	mtx_lock(&sellock);
	/*
	 * If the selinfo's thread pointer is NULL then take ownership of it.
	 *
	 * If the thread pointer is not NULL and it points to another
	 * thread, then we have a collision.
	 *
	 * If the thread pointer is not NULL and points back to us then leave
	 * it alone, as we have already pointed it at us and added it to
	 * our list.
	 */
	if (sip->si_thread == NULL) {
		sip->si_thread = selector;
		TAILQ_INSERT_TAIL(&selector->td_selq, sip, si_thrlist);
	} else if (sip->si_thread != selector) {
		sip->si_flags |= SI_COLL;
	}

	mtx_unlock(&sellock);
}

/* Wake up a selecting thread. */
void
selwakeup(sip)
	struct selinfo *sip;
{
	doselwakeup(sip, -1);
}

/* Wake up a selecting thread, and set its priority. */
void
selwakeuppri(sip, pri)
	struct selinfo *sip;
	int pri;
{
	doselwakeup(sip, pri);
}

/*
 * Do a wakeup when a selectable event occurs.
 */
static void
doselwakeup(sip, pri)
	struct selinfo *sip;
	int pri;
{
	struct thread *td;

	mtx_lock(&sellock);
	td = sip->si_thread;
	if ((sip->si_flags & SI_COLL) != 0) {
		nselcoll++;
		sip->si_flags &= ~SI_COLL;
		cv_broadcastpri(&selwait, pri);
	}
	if (td == NULL) {
		mtx_unlock(&sellock);
		return;
	}
	TAILQ_REMOVE(&td->td_selq, sip, si_thrlist);
	sip->si_thread = NULL;
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	sleepq_remove(td, &selwait);
	mtx_unlock(&sellock);
}

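/*
 * Set up the global select() state (selwait and sellock) early in boot,
 * at SI_SUB_LOCK time, via SYSINIT.
 */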
static void selectinit(void *);
SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, selectinit, NULL)

/* ARGSUSED*/
static void
selectinit(dummy)
	void *dummy;
{
	cv_init(&selwait, "select");
	mtx_init(&sellock, "sellck", NULL, MTX_DEF);
}
