sys_generic.c revision 89306
11541Srgrimes/*
21541Srgrimes * Copyright (c) 1982, 1986, 1989, 1993
31541Srgrimes *	The Regents of the University of California.  All rights reserved.
41541Srgrimes * (c) UNIX System Laboratories, Inc.
51541Srgrimes * All or some portions of this file are derived from material licensed
61541Srgrimes * to the University of California by American Telephone and Telegraph
71541Srgrimes * Co. or Unix System Laboratories, Inc. and are reproduced herein with
81541Srgrimes * the permission of UNIX System Laboratories, Inc.
91541Srgrimes *
101541Srgrimes * Redistribution and use in source and binary forms, with or without
111541Srgrimes * modification, are permitted provided that the following conditions
121541Srgrimes * are met:
131541Srgrimes * 1. Redistributions of source code must retain the above copyright
141541Srgrimes *    notice, this list of conditions and the following disclaimer.
151541Srgrimes * 2. Redistributions in binary form must reproduce the above copyright
161541Srgrimes *    notice, this list of conditions and the following disclaimer in the
171541Srgrimes *    documentation and/or other materials provided with the distribution.
181541Srgrimes * 3. All advertising materials mentioning features or use of this software
191541Srgrimes *    must display the following acknowledgement:
201541Srgrimes *	This product includes software developed by the University of
211541Srgrimes *	California, Berkeley and its contributors.
221541Srgrimes * 4. Neither the name of the University nor the names of its contributors
231541Srgrimes *    may be used to endorse or promote products derived from this software
241541Srgrimes *    without specific prior written permission.
251541Srgrimes *
261541Srgrimes * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
271541Srgrimes * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
281541Srgrimes * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
291541Srgrimes * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
301541Srgrimes * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
311541Srgrimes * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
321541Srgrimes * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
331541Srgrimes * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
341541Srgrimes * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
351541Srgrimes * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
361541Srgrimes * SUCH DAMAGE.
371541Srgrimes *
381541Srgrimes *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
3950477Speter * $FreeBSD: head/sys/kern/sys_generic.c 89306 2002-01-13 11:58:06Z alfred $
401541Srgrimes */
411541Srgrimes
4213203Swollman#include "opt_ktrace.h"
4313203Swollman
441541Srgrimes#include <sys/param.h>
451541Srgrimes#include <sys/systm.h>
4612221Sbde#include <sys/sysproto.h>
471541Srgrimes#include <sys/filedesc.h>
4824206Sbde#include <sys/filio.h>
4924131Sbde#include <sys/fcntl.h>
501541Srgrimes#include <sys/file.h>
511541Srgrimes#include <sys/proc.h>
523308Sphk#include <sys/signalvar.h>
531541Srgrimes#include <sys/socketvar.h>
541541Srgrimes#include <sys/uio.h>
551541Srgrimes#include <sys/kernel.h>
561541Srgrimes#include <sys/malloc.h>
5729351Speter#include <sys/poll.h>
5872146Speter#include <sys/resourcevar.h>
5970834Swollman#include <sys/selinfo.h>
6055478Speter#include <sys/sysctl.h>
6129351Speter#include <sys/sysent.h>
6268883Sdillon#include <sys/bio.h>
6368883Sdillon#include <sys/buf.h>
6476564Stanimura#include <sys/condvar.h>
651541Srgrimes#ifdef KTRACE
661541Srgrimes#include <sys/ktrace.h>
671541Srgrimes#endif
6868883Sdillon#include <vm/vm.h>
6968883Sdillon#include <vm/vm_page.h>
701541Srgrimes
7138517Sdfr#include <machine/limits.h>
7238517Sdfr
7330354Sphkstatic MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
7430354Sphkstatic MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
7530354SphkMALLOC_DEFINE(M_IOV, "iov", "large iov's");
7630309Sphk
7783366Sjulianstatic int	pollscan __P((struct thread *, struct pollfd *, u_int));
7883366Sjulianstatic int	pollholddrop __P((struct thread *, struct pollfd *, u_int, int));
7983366Sjulianstatic int	selscan __P((struct thread *, fd_mask **, fd_mask **, int));
8083366Sjulianstatic int	selholddrop __P((struct thread *, fd_mask *, fd_mask *, int, int));
8183366Sjulianstatic int	dofileread __P((struct thread *, struct file *, int, void *,
8245311Sdt		    size_t, off_t, int));
8383366Sjulianstatic int	dofilewrite __P((struct thread *, struct file *, int,
8445311Sdt		    const void *, size_t, off_t, int));
853485Sphk
8689306Salfredstruct file*
8789306Salfredholdfp(fdp, fd, flag)
8889306Salfred	struct filedesc* fdp;
8989306Salfred	int fd, flag;
9089306Salfred{
9189306Salfred	struct file* fp;
9289306Salfred
9389306Salfred	FILEDESC_LOCK(fdp);
9489306Salfred	if (((u_int)fd) >= fdp->fd_nfiles ||
9589306Salfred	    (fp = fdp->fd_ofiles[fd]) == NULL) {
9689306Salfred		FILEDESC_UNLOCK(fdp);
9789306Salfred		return (NULL);
9889306Salfred	}
9989306Salfred	FILE_LOCK(fp);
10089306Salfred	FILEDESC_UNLOCK(fdp);
10189306Salfred	if ((fp->f_flag & flag) == 0) {
10289306Salfred		FILE_UNLOCK(fp);
10389306Salfred		return (NULL);
10489306Salfred	}
10589306Salfred	fp->f_count++;
10689306Salfred	FILE_UNLOCK(fp);
10789306Salfred	return (fp);
10889306Salfred}
10989306Salfred
1101541Srgrimes/*
1111541Srgrimes * Read system call.
1121541Srgrimes */
11312221Sbde#ifndef _SYS_SYSPROTO_H_
1141541Srgrimesstruct read_args {
1151541Srgrimes	int	fd;
11638864Sbde	void	*buf;
11738864Sbde	size_t	nbyte;
1181541Srgrimes};
11912221Sbde#endif
12082752Sdillon/*
12182752Sdillon * MPSAFE
12282752Sdillon */
1231549Srgrimesint
12483366Sjulianread(td, uap)
12583366Sjulian	struct thread *td;
12686341Sdillon	struct read_args *uap;
1271541Srgrimes{
12886341Sdillon	struct file *fp;
12968883Sdillon	int error;
1301541Srgrimes
13182752Sdillon	mtx_lock(&Giant);
13286341Sdillon	if ((error = fget_read(td, uap->fd, &fp)) == 0) {
13383366Sjulian		error = dofileread(td, fp, uap->fd, uap->buf,
13482752Sdillon			    uap->nbyte, (off_t)-1, 0);
13583366Sjulian		fdrop(fp, td);
13682752Sdillon	}
13782752Sdillon	mtx_unlock(&Giant);
13868883Sdillon	return(error);
1391541Srgrimes}
1401541Srgrimes
1411541Srgrimes/*
14245311Sdt * Pread system call
14345065Salc */
14445065Salc#ifndef _SYS_SYSPROTO_H_
14545065Salcstruct pread_args {
14645065Salc	int	fd;
14745065Salc	void	*buf;
14845065Salc	size_t	nbyte;
14945311Sdt	int	pad;
15045311Sdt	off_t	offset;
15145065Salc};
15245065Salc#endif
15382752Sdillon/*
15482752Sdillon * MPSAFE
15582752Sdillon */
15645065Salcint
15783366Sjulianpread(td, uap)
15883366Sjulian	struct thread *td;
15986341Sdillon	struct pread_args *uap;
16045065Salc{
16186341Sdillon	struct file *fp;
16268883Sdillon	int error;
16345311Sdt
16489306Salfred	fp = holdfp(td->td_proc->p_fd, uap->fd, FREAD);
16589306Salfred	if (fp == NULL)
16689306Salfred		return (EBADF);
16789306Salfred	if (fp->f_type != DTYPE_VNODE) {
16889306Salfred		error = ESPIPE;
16989306Salfred	} else {
17089306Salfred		mtx_lock(&Giant);
17189306Salfred		error = dofileread(td, fp, uap->fd, uap->buf, uap->nbyte,
17289306Salfred			    uap->offset, FOF_OFFSET);
17389306Salfred		mtx_unlock(&Giant);
17468883Sdillon	}
17589306Salfred	fdrop(fp, td);
17668883Sdillon	return(error);
17745311Sdt}
17845311Sdt
17945311Sdt/*
18045311Sdt * Code common for read and pread
18145311Sdt */
18245311Sdtint
18383366Sjuliandofileread(td, fp, fd, buf, nbyte, offset, flags)
18483366Sjulian	struct thread *td;
18545311Sdt	struct file *fp;
18645311Sdt	int fd, flags;
18745311Sdt	void *buf;
18845311Sdt	size_t nbyte;
18945311Sdt	off_t offset;
19045311Sdt{
19145065Salc	struct uio auio;
19245065Salc	struct iovec aiov;
19345065Salc	long cnt, error = 0;
19445065Salc#ifdef KTRACE
19545065Salc	struct iovec ktriov;
19662378Sgreen	struct uio ktruio;
19763905Sgreen	int didktr = 0;
19845065Salc#endif
19945065Salc
20045311Sdt	aiov.iov_base = (caddr_t)buf;
20145311Sdt	aiov.iov_len = nbyte;
20245065Salc	auio.uio_iov = &aiov;
20345065Salc	auio.uio_iovcnt = 1;
20445311Sdt	auio.uio_offset = offset;
20545311Sdt	if (nbyte > INT_MAX)
20645065Salc		return (EINVAL);
20745311Sdt	auio.uio_resid = nbyte;
20845065Salc	auio.uio_rw = UIO_READ;
20945065Salc	auio.uio_segflg = UIO_USERSPACE;
21083366Sjulian	auio.uio_td = td;
21145065Salc#ifdef KTRACE
21245065Salc	/*
21345065Salc	 * if tracing, save a copy of iovec
21445065Salc	 */
21583366Sjulian	if (KTRPOINT(td->td_proc, KTR_GENIO)) {
21645065Salc		ktriov = aiov;
21762378Sgreen		ktruio = auio;
21863905Sgreen		didktr = 1;
21962378Sgreen	}
22045065Salc#endif
22145311Sdt	cnt = nbyte;
22268883Sdillon
22383366Sjulian	if ((error = fo_read(fp, &auio, fp->f_cred, flags, td))) {
22445065Salc		if (auio.uio_resid != cnt && (error == ERESTART ||
22545065Salc		    error == EINTR || error == EWOULDBLOCK))
22645065Salc			error = 0;
22768883Sdillon	}
22845065Salc	cnt -= auio.uio_resid;
22945065Salc#ifdef KTRACE
23063905Sgreen	if (didktr && error == 0) {
23162378Sgreen		ktruio.uio_iov = &ktriov;
23262378Sgreen		ktruio.uio_resid = cnt;
23383366Sjulian		ktrgenio(td->td_proc->p_tracep, fd, UIO_READ, &ktruio, error);
23462378Sgreen	}
23545065Salc#endif
23683366Sjulian	td->td_retval[0] = cnt;
23745065Salc	return (error);
23845065Salc}
23945065Salc
24045065Salc/*
2411541Srgrimes * Scatter read system call.
2421541Srgrimes */
24312221Sbde#ifndef _SYS_SYSPROTO_H_
2441541Srgrimesstruct readv_args {
24512208Sbde	int	fd;
2461541Srgrimes	struct	iovec *iovp;
2471541Srgrimes	u_int	iovcnt;
2481541Srgrimes};
24912221Sbde#endif
25082752Sdillon/*
25182752Sdillon * MPSAFE
25282752Sdillon */
2531549Srgrimesint
25483366Sjulianreadv(td, uap)
25583366Sjulian	struct thread *td;
25686341Sdillon	struct readv_args *uap;
2571541Srgrimes{
25886341Sdillon	struct file *fp;
2591541Srgrimes	struct uio auio;
26086341Sdillon	struct iovec *iov;
2611541Srgrimes	struct iovec *needfree;
2621541Srgrimes	struct iovec aiov[UIO_SMALLIOV];
2631541Srgrimes	long i, cnt, error = 0;
2641541Srgrimes	u_int iovlen;
2651541Srgrimes#ifdef KTRACE
2661541Srgrimes	struct iovec *ktriov = NULL;
26762378Sgreen	struct uio ktruio;
2681541Srgrimes#endif
26982752Sdillon	mtx_lock(&Giant);
2701541Srgrimes
27186341Sdillon	if ((error = fget_read(td, uap->fd, &fp)) != 0)
27282752Sdillon		goto done2;
2731541Srgrimes	/* note: can't use iovlen until iovcnt is validated */
2741541Srgrimes	iovlen = uap->iovcnt * sizeof (struct iovec);
2751541Srgrimes	if (uap->iovcnt > UIO_SMALLIOV) {
27682752Sdillon		if (uap->iovcnt > UIO_MAXIOV) {
27782752Sdillon			error = EINVAL;
27882752Sdillon			goto done2;
27982752Sdillon		}
2801541Srgrimes		MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
2811541Srgrimes		needfree = iov;
2821541Srgrimes	} else {
2831541Srgrimes		iov = aiov;
2841541Srgrimes		needfree = NULL;
2851541Srgrimes	}
2861541Srgrimes	auio.uio_iov = iov;
2871541Srgrimes	auio.uio_iovcnt = uap->iovcnt;
2881541Srgrimes	auio.uio_rw = UIO_READ;
2891541Srgrimes	auio.uio_segflg = UIO_USERSPACE;
29083366Sjulian	auio.uio_td = td;
29126671Sdyson	auio.uio_offset = -1;
2923098Sphk	if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen)))
2931541Srgrimes		goto done;
2941541Srgrimes	auio.uio_resid = 0;
2951541Srgrimes	for (i = 0; i < uap->iovcnt; i++) {
29638517Sdfr		if (iov->iov_len > INT_MAX - auio.uio_resid) {
2971541Srgrimes			error = EINVAL;
2981541Srgrimes			goto done;
2991541Srgrimes		}
30038517Sdfr		auio.uio_resid += iov->iov_len;
3011541Srgrimes		iov++;
3021541Srgrimes	}
3031541Srgrimes#ifdef KTRACE
3041541Srgrimes	/*
3051541Srgrimes	 * if tracing, save a copy of iovec
3061541Srgrimes	 */
30783366Sjulian	if (KTRPOINT(td->td_proc, KTR_GENIO))  {
3081541Srgrimes		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
3091541Srgrimes		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
31062378Sgreen		ktruio = auio;
3111541Srgrimes	}
3121541Srgrimes#endif
3131541Srgrimes	cnt = auio.uio_resid;
31483366Sjulian	if ((error = fo_read(fp, &auio, fp->f_cred, 0, td))) {
3151541Srgrimes		if (auio.uio_resid != cnt && (error == ERESTART ||
3161541Srgrimes		    error == EINTR || error == EWOULDBLOCK))
3171541Srgrimes			error = 0;
31868883Sdillon	}
3191541Srgrimes	cnt -= auio.uio_resid;
3201541Srgrimes#ifdef KTRACE
3211541Srgrimes	if (ktriov != NULL) {
32262378Sgreen		if (error == 0) {
32362378Sgreen			ktruio.uio_iov = ktriov;
32462378Sgreen			ktruio.uio_resid = cnt;
32583366Sjulian			ktrgenio(td->td_proc->p_tracep, uap->fd, UIO_READ, &ktruio,
32662378Sgreen			    error);
32762378Sgreen		}
3281541Srgrimes		FREE(ktriov, M_TEMP);
3291541Srgrimes	}
3301541Srgrimes#endif
33183366Sjulian	td->td_retval[0] = cnt;
3321541Srgrimesdone:
33383366Sjulian	fdrop(fp, td);
3341541Srgrimes	if (needfree)
3351541Srgrimes		FREE(needfree, M_IOV);
33682752Sdillondone2:
33782752Sdillon	mtx_unlock(&Giant);
3381541Srgrimes	return (error);
3391541Srgrimes}
3401541Srgrimes
3411541Srgrimes/*
3421541Srgrimes * Write system call
3431541Srgrimes */
34412221Sbde#ifndef _SYS_SYSPROTO_H_
3451541Srgrimesstruct write_args {
3461541Srgrimes	int	fd;
34738864Sbde	const void *buf;
34838864Sbde	size_t	nbyte;
3491541Srgrimes};
35012221Sbde#endif
35182752Sdillon/*
35282752Sdillon * MPSAFE
35382752Sdillon */
3541549Srgrimesint
35583366Sjulianwrite(td, uap)
35683366Sjulian	struct thread *td;
35786341Sdillon	struct write_args *uap;
3581541Srgrimes{
35986341Sdillon	struct file *fp;
36068883Sdillon	int error;
3611541Srgrimes
36282752Sdillon	mtx_lock(&Giant);
36386341Sdillon	if ((error = fget_write(td, uap->fd, &fp)) == 0) {
36483366Sjulian		error = dofilewrite(td, fp, uap->fd, uap->buf, uap->nbyte,
36582752Sdillon			    (off_t)-1, 0);
36683366Sjulian		fdrop(fp, td);
36782752Sdillon	} else {
36886341Sdillon		error = EBADF;	/* XXX this can't be right */
36982752Sdillon	}
37082752Sdillon	mtx_unlock(&Giant);
37168883Sdillon	return(error);
3721541Srgrimes}
3731541Srgrimes
3741541Srgrimes/*
37545311Sdt * Pwrite system call
37645065Salc */
37745065Salc#ifndef _SYS_SYSPROTO_H_
37845065Salcstruct pwrite_args {
37945065Salc	int	fd;
38045065Salc	const void *buf;
38145065Salc	size_t	nbyte;
38245311Sdt	int	pad;
38345311Sdt	off_t	offset;
38445065Salc};
38545065Salc#endif
38682752Sdillon/*
38782752Sdillon * MPSAFE
38882752Sdillon */
38945065Salcint
39083366Sjulianpwrite(td, uap)
39183366Sjulian	struct thread *td;
39286341Sdillon	struct pwrite_args *uap;
39345065Salc{
39486341Sdillon	struct file *fp;
39568883Sdillon	int error;
39645311Sdt
39782752Sdillon	mtx_lock(&Giant);
39886341Sdillon	if ((error = fget_write(td, uap->fd, &fp)) == 0) {
39986341Sdillon		if (fp->f_type == DTYPE_VNODE) {
40086341Sdillon			error = dofilewrite(td, fp, uap->fd, uap->buf,
40186341Sdillon				    uap->nbyte, uap->offset, FOF_OFFSET);
40286341Sdillon		} else {
40386341Sdillon			error = ESPIPE;
40486341Sdillon		}
40583366Sjulian		fdrop(fp, td);
40668883Sdillon	} else {
40786341Sdillon		error = EBADF;	/* this can't be right */
40868883Sdillon	}
40968883Sdillon	return(error);
41045311Sdt}
41145311Sdt
41245311Sdtstatic int
41383366Sjuliandofilewrite(td, fp, fd, buf, nbyte, offset, flags)
41483366Sjulian	struct thread *td;
41545311Sdt	struct file *fp;
41645311Sdt	int fd, flags;
41745311Sdt	const void *buf;
41845311Sdt	size_t nbyte;
41945311Sdt	off_t offset;
42045311Sdt{
42145065Salc	struct uio auio;
42245065Salc	struct iovec aiov;
42345065Salc	long cnt, error = 0;
42445065Salc#ifdef KTRACE
42545065Salc	struct iovec ktriov;
42662378Sgreen	struct uio ktruio;
42763905Sgreen	int didktr = 0;
42845065Salc#endif
42945065Salc
43063974Speter	aiov.iov_base = (void *)(uintptr_t)buf;
43145311Sdt	aiov.iov_len = nbyte;
43245065Salc	auio.uio_iov = &aiov;
43345065Salc	auio.uio_iovcnt = 1;
43445311Sdt	auio.uio_offset = offset;
43545311Sdt	if (nbyte > INT_MAX)
43645065Salc		return (EINVAL);
43745311Sdt	auio.uio_resid = nbyte;
43845065Salc	auio.uio_rw = UIO_WRITE;
43945065Salc	auio.uio_segflg = UIO_USERSPACE;
44083366Sjulian	auio.uio_td = td;
44145065Salc#ifdef KTRACE
44245065Salc	/*
44362378Sgreen	 * if tracing, save a copy of iovec and uio
44445065Salc	 */
44583366Sjulian	if (KTRPOINT(td->td_proc, KTR_GENIO)) {
44645065Salc		ktriov = aiov;
44762378Sgreen		ktruio = auio;
44863905Sgreen		didktr = 1;
44962378Sgreen	}
45045065Salc#endif
45145311Sdt	cnt = nbyte;
45269407Salfred	if (fp->f_type == DTYPE_VNODE)
45369407Salfred		bwillwrite();
45483366Sjulian	if ((error = fo_write(fp, &auio, fp->f_cred, flags, td))) {
45545065Salc		if (auio.uio_resid != cnt && (error == ERESTART ||
45645065Salc		    error == EINTR || error == EWOULDBLOCK))
45745065Salc			error = 0;
45873929Sjhb		if (error == EPIPE) {
45983366Sjulian			PROC_LOCK(td->td_proc);
46083366Sjulian			psignal(td->td_proc, SIGPIPE);
46183366Sjulian			PROC_UNLOCK(td->td_proc);
46273929Sjhb		}
46345065Salc	}
46445065Salc	cnt -= auio.uio_resid;
46545065Salc#ifdef KTRACE
46663905Sgreen	if (didktr && error == 0) {
46762378Sgreen		ktruio.uio_iov = &ktriov;
46862378Sgreen		ktruio.uio_resid = cnt;
46983366Sjulian		ktrgenio(td->td_proc->p_tracep, fd, UIO_WRITE, &ktruio, error);
47062378Sgreen	}
47145065Salc#endif
47283366Sjulian	td->td_retval[0] = cnt;
47345065Salc	return (error);
47445065Salc}
47545065Salc
47645065Salc/*
4771541Srgrimes * Gather write system call
4781541Srgrimes */
47912221Sbde#ifndef _SYS_SYSPROTO_H_
4801541Srgrimesstruct writev_args {
4811541Srgrimes	int	fd;
4821541Srgrimes	struct	iovec *iovp;
4831541Srgrimes	u_int	iovcnt;
4841541Srgrimes};
48512221Sbde#endif
48682752Sdillon/*
48782752Sdillon * MPSAFE
48882752Sdillon */
4891549Srgrimesint
49083366Sjulianwritev(td, uap)
49183366Sjulian	struct thread *td;
4921541Srgrimes	register struct writev_args *uap;
4931541Srgrimes{
49486341Sdillon	struct file *fp;
4951541Srgrimes	struct uio auio;
4961541Srgrimes	register struct iovec *iov;
4971541Srgrimes	struct iovec *needfree;
4981541Srgrimes	struct iovec aiov[UIO_SMALLIOV];
4991541Srgrimes	long i, cnt, error = 0;
5001541Srgrimes	u_int iovlen;
5011541Srgrimes#ifdef KTRACE
5021541Srgrimes	struct iovec *ktriov = NULL;
50362378Sgreen	struct uio ktruio;
5041541Srgrimes#endif
5051541Srgrimes
50682752Sdillon	mtx_lock(&Giant);
50786341Sdillon	if ((error = fget_write(td, uap->fd, &fp)) != 0) {
50882752Sdillon		error = EBADF;
50982752Sdillon		goto done2;
51082752Sdillon	}
5111541Srgrimes	/* note: can't use iovlen until iovcnt is validated */
5121541Srgrimes	iovlen = uap->iovcnt * sizeof (struct iovec);
5131541Srgrimes	if (uap->iovcnt > UIO_SMALLIOV) {
51452227Sgreen		if (uap->iovcnt > UIO_MAXIOV) {
51552227Sgreen			needfree = NULL;
51652227Sgreen			error = EINVAL;
51752227Sgreen			goto done;
51852227Sgreen		}
5191541Srgrimes		MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
5201541Srgrimes		needfree = iov;
5211541Srgrimes	} else {
5221541Srgrimes		iov = aiov;
5231541Srgrimes		needfree = NULL;
5241541Srgrimes	}
5251541Srgrimes	auio.uio_iov = iov;
5261541Srgrimes	auio.uio_iovcnt = uap->iovcnt;
5271541Srgrimes	auio.uio_rw = UIO_WRITE;
5281541Srgrimes	auio.uio_segflg = UIO_USERSPACE;
52983366Sjulian	auio.uio_td = td;
53026671Sdyson	auio.uio_offset = -1;
5313098Sphk	if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen)))
5321541Srgrimes		goto done;
5331541Srgrimes	auio.uio_resid = 0;
5341541Srgrimes	for (i = 0; i < uap->iovcnt; i++) {
53538517Sdfr		if (iov->iov_len > INT_MAX - auio.uio_resid) {
5361541Srgrimes			error = EINVAL;
5371541Srgrimes			goto done;
5381541Srgrimes		}
53938517Sdfr		auio.uio_resid += iov->iov_len;
5401541Srgrimes		iov++;
5411541Srgrimes	}
5421541Srgrimes#ifdef KTRACE
5431541Srgrimes	/*
54462378Sgreen	 * if tracing, save a copy of iovec and uio
5451541Srgrimes	 */
54683366Sjulian	if (KTRPOINT(td->td_proc, KTR_GENIO))  {
5471541Srgrimes		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
5481541Srgrimes		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
54962378Sgreen		ktruio = auio;
5501541Srgrimes	}
5511541Srgrimes#endif
5521541Srgrimes	cnt = auio.uio_resid;
55369733Sdillon	if (fp->f_type == DTYPE_VNODE)
55469733Sdillon		bwillwrite();
55583366Sjulian	if ((error = fo_write(fp, &auio, fp->f_cred, 0, td))) {
5561541Srgrimes		if (auio.uio_resid != cnt && (error == ERESTART ||
5571541Srgrimes		    error == EINTR || error == EWOULDBLOCK))
5581541Srgrimes			error = 0;
55973929Sjhb		if (error == EPIPE) {
56083366Sjulian			PROC_LOCK(td->td_proc);
56183366Sjulian			psignal(td->td_proc, SIGPIPE);
56283366Sjulian			PROC_UNLOCK(td->td_proc);
56373929Sjhb		}
5641541Srgrimes	}
5651541Srgrimes	cnt -= auio.uio_resid;
5661541Srgrimes#ifdef KTRACE
5671541Srgrimes	if (ktriov != NULL) {
56862378Sgreen		if (error == 0) {
56962378Sgreen			ktruio.uio_iov = ktriov;
57062378Sgreen			ktruio.uio_resid = cnt;
57183366Sjulian			ktrgenio(td->td_proc->p_tracep, uap->fd, UIO_WRITE, &ktruio,
57262378Sgreen			    error);
57362378Sgreen		}
5741541Srgrimes		FREE(ktriov, M_TEMP);
5751541Srgrimes	}
5761541Srgrimes#endif
57783366Sjulian	td->td_retval[0] = cnt;
5781541Srgrimesdone:
57983366Sjulian	fdrop(fp, td);
5801541Srgrimes	if (needfree)
5811541Srgrimes		FREE(needfree, M_IOV);
58282752Sdillondone2:
58382752Sdillon	mtx_unlock(&Giant);
5841541Srgrimes	return (error);
5851541Srgrimes}
5861541Srgrimes
5871541Srgrimes/*
5881541Srgrimes * Ioctl system call
5891541Srgrimes */
59012221Sbde#ifndef _SYS_SYSPROTO_H_
5911541Srgrimesstruct ioctl_args {
5921541Srgrimes	int	fd;
59338517Sdfr	u_long	com;
5941541Srgrimes	caddr_t	data;
5951541Srgrimes};
59612221Sbde#endif
59782752Sdillon/*
59882752Sdillon * MPSAFE
59982752Sdillon */
6001541Srgrimes/* ARGSUSED */
6011549Srgrimesint
60283366Sjulianioctl(td, uap)
60383366Sjulian	struct thread *td;
6041541Srgrimes	register struct ioctl_args *uap;
6051541Srgrimes{
6061541Srgrimes	register struct file *fp;
6071541Srgrimes	register struct filedesc *fdp;
60836846Sdfr	register u_long com;
60982752Sdillon	int error = 0;
6101541Srgrimes	register u_int size;
6111541Srgrimes	caddr_t data, memp;
6121541Srgrimes	int tmp;
6131541Srgrimes#define STK_PARAMS	128
61460269Sdillon	union {
61560269Sdillon	    char stkbuf[STK_PARAMS];
61660269Sdillon	    long align;
61760269Sdillon	} ubuf;
6181541Srgrimes
61989306Salfred	fp = ffind_hold(td, uap->fd);
62089306Salfred	if (fp == NULL)
62189306Salfred		return (EBADF);
62282752Sdillon	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
62389306Salfred		fdrop(fp, td);
62489306Salfred		return (EBADF);
62582752Sdillon	}
62689306Salfred	fdp = td->td_proc->p_fd;
6271541Srgrimes	switch (com = uap->com) {
6281541Srgrimes	case FIONCLEX:
62989306Salfred		FILEDESC_LOCK(fdp);
6301541Srgrimes		fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE;
63189306Salfred		FILEDESC_UNLOCK(fdp);
63289306Salfred		fdrop(fp, td);
63389306Salfred		return (0);
6341541Srgrimes	case FIOCLEX:
63589306Salfred		FILEDESC_LOCK(fdp);
6361541Srgrimes		fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE;
63789306Salfred		FILEDESC_UNLOCK(fdp);
63889306Salfred		fdrop(fp, td);
63989306Salfred		return (0);
6401541Srgrimes	}
6411541Srgrimes
6421541Srgrimes	/*
6431541Srgrimes	 * Interpret high order word to find amount of data to be
6441541Srgrimes	 * copied to/from the user's address space.
6451541Srgrimes	 */
6461541Srgrimes	size = IOCPARM_LEN(com);
64782752Sdillon	if (size > IOCPARM_MAX) {
64889306Salfred		fdrop(fp, td);
64989306Salfred		return (ENOTTY);
65082752Sdillon	}
65168883Sdillon
65289306Salfred	mtx_lock(&Giant);
6531541Srgrimes	memp = NULL;
65460269Sdillon	if (size > sizeof (ubuf.stkbuf)) {
6551541Srgrimes		memp = (caddr_t)malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
6561541Srgrimes		data = memp;
65768883Sdillon	} else {
65860269Sdillon		data = ubuf.stkbuf;
65968883Sdillon	}
6601541Srgrimes	if (com&IOC_IN) {
6611541Srgrimes		if (size) {
6621541Srgrimes			error = copyin(uap->data, data, (u_int)size);
6631541Srgrimes			if (error) {
6641541Srgrimes				if (memp)
6651541Srgrimes					free(memp, M_IOCTLOPS);
66683366Sjulian				fdrop(fp, td);
66789306Salfred				goto done;
6681541Srgrimes			}
66968883Sdillon		} else {
6701541Srgrimes			*(caddr_t *)data = uap->data;
67168883Sdillon		}
67268883Sdillon	} else if ((com&IOC_OUT) && size) {
6731541Srgrimes		/*
6741541Srgrimes		 * Zero the buffer so the user always
6751541Srgrimes		 * gets back something deterministic.
6761541Srgrimes		 */
6771541Srgrimes		bzero(data, size);
67868883Sdillon	} else if (com&IOC_VOID) {
6791541Srgrimes		*(caddr_t *)data = uap->data;
68068883Sdillon	}
6811541Srgrimes
6821541Srgrimes	switch (com) {
6831541Srgrimes
6841541Srgrimes	case FIONBIO:
68589306Salfred		FILE_LOCK(fp);
6863098Sphk		if ((tmp = *(int *)data))
6871541Srgrimes			fp->f_flag |= FNONBLOCK;
6881541Srgrimes		else
6891541Srgrimes			fp->f_flag &= ~FNONBLOCK;
69089306Salfred		FILE_UNLOCK(fp);
69183366Sjulian		error = fo_ioctl(fp, FIONBIO, (caddr_t)&tmp, td);
6921541Srgrimes		break;
6931541Srgrimes
6941541Srgrimes	case FIOASYNC:
69589306Salfred		FILE_LOCK(fp);
6963098Sphk		if ((tmp = *(int *)data))
6971541Srgrimes			fp->f_flag |= FASYNC;
6981541Srgrimes		else
6991541Srgrimes			fp->f_flag &= ~FASYNC;
70089306Salfred		FILE_UNLOCK(fp);
70183366Sjulian		error = fo_ioctl(fp, FIOASYNC, (caddr_t)&tmp, td);
7021541Srgrimes		break;
7031541Srgrimes
7041541Srgrimes	default:
70583366Sjulian		error = fo_ioctl(fp, com, data, td);
7061541Srgrimes		/*
7071541Srgrimes		 * Copy any data to user, size was
7081541Srgrimes		 * already set and checked above.
7091541Srgrimes		 */
7101541Srgrimes		if (error == 0 && (com&IOC_OUT) && size)
7111541Srgrimes			error = copyout(data, uap->data, (u_int)size);
7121541Srgrimes		break;
7131541Srgrimes	}
7141541Srgrimes	if (memp)
7151541Srgrimes		free(memp, M_IOCTLOPS);
71683366Sjulian	fdrop(fp, td);
71789306Salfreddone:
71882752Sdillon	mtx_unlock(&Giant);
7191541Srgrimes	return (error);
7201541Srgrimes}
7211541Srgrimes
72255478Speterstatic int	nselcoll;	/* Select collisions since boot */
72376564Stanimurastruct cv	selwait;
72455478SpeterSYSCTL_INT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");
7251541Srgrimes
7261541Srgrimes/*
7271541Srgrimes * Select system call.
7281541Srgrimes */
72912221Sbde#ifndef _SYS_SYSPROTO_H_
7301541Srgrimesstruct select_args {
73117702Ssmpatel	int	nd;
7321541Srgrimes	fd_set	*in, *ou, *ex;
7331541Srgrimes	struct	timeval *tv;
7341541Srgrimes};
73512221Sbde#endif
73682752Sdillon/*
73782752Sdillon * MPSAFE
73882752Sdillon */
7391549Srgrimesint
74083366Sjulianselect(td, uap)
74183366Sjulian	register struct thread *td;
7421541Srgrimes	register struct select_args *uap;
7431541Srgrimes{
74489306Salfred	struct filedesc *fdp;
74522945Sbde	/*
74622945Sbde	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
74722945Sbde	 * infds with the new FD_SETSIZE of 1024, and more than enough for
74822945Sbde	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
74922945Sbde	 * of 256.
75022945Sbde	 */
75122945Sbde	fd_mask s_selbits[howmany(2048, NFDBITS)];
75276564Stanimura	fd_mask s_heldbits[howmany(2048, NFDBITS)];
75376564Stanimura	fd_mask *ibits[3], *obits[3], *selbits, *sbp, *heldbits, *hibits, *hobits;
75435029Sphk	struct timeval atv, rtv, ttv;
75576564Stanimura	int ncoll, error, timo, i;
75622945Sbde	u_int nbufbytes, ncpbytes, nfdbits;
7571541Srgrimes
75817702Ssmpatel	if (uap->nd < 0)
75917713Ssmpatel		return (EINVAL);
76089306Salfred	fdp = td->td_proc->p_fd;
76182752Sdillon	mtx_lock(&Giant);
76289306Salfred	FILEDESC_LOCK(fdp);
76382752Sdillon
76483366Sjulian	if (uap->nd > td->td_proc->p_fd->fd_nfiles)
76583366Sjulian		uap->nd = td->td_proc->p_fd->fd_nfiles;   /* forgiving; slightly wrong */
76689306Salfred	FILEDESC_UNLOCK(fdp);
76717702Ssmpatel
76822945Sbde	/*
76922945Sbde	 * Allocate just enough bits for the non-null fd_sets.  Use the
77022945Sbde	 * preallocated auto buffer if possible.
77122945Sbde	 */
77222945Sbde	nfdbits = roundup(uap->nd, NFDBITS);
77322945Sbde	ncpbytes = nfdbits / NBBY;
77422945Sbde	nbufbytes = 0;
77522945Sbde	if (uap->in != NULL)
77622945Sbde		nbufbytes += 2 * ncpbytes;
77722945Sbde	if (uap->ou != NULL)
77822945Sbde		nbufbytes += 2 * ncpbytes;
77922945Sbde	if (uap->ex != NULL)
78022945Sbde		nbufbytes += 2 * ncpbytes;
78122945Sbde	if (nbufbytes <= sizeof s_selbits)
78222945Sbde		selbits = &s_selbits[0];
78322945Sbde	else
78422945Sbde		selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);
78576564Stanimura	if (2 * ncpbytes <= sizeof s_heldbits) {
78676564Stanimura		bzero(s_heldbits, sizeof(s_heldbits));
78776564Stanimura		heldbits = &s_heldbits[0];
78876564Stanimura	} else
78976564Stanimura		heldbits = malloc(2 * ncpbytes, M_SELECT, M_WAITOK | M_ZERO);
79017702Ssmpatel
79117702Ssmpatel	/*
79222945Sbde	 * Assign pointers into the bit buffers and fetch the input bits.
79322945Sbde	 * Put the output buffers together so that they can be bzeroed
79422945Sbde	 * together.
79517702Ssmpatel	 */
79622945Sbde	sbp = selbits;
79776564Stanimura	hibits = heldbits + ncpbytes / sizeof *heldbits;
79876564Stanimura	hobits = heldbits;
7991541Srgrimes#define	getbits(name, x) \
80022945Sbde	do {								\
80122945Sbde		if (uap->name == NULL)					\
80222945Sbde			ibits[x] = NULL;				\
80322945Sbde		else {							\
80422945Sbde			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;	\
80522945Sbde			obits[x] = sbp;					\
80622945Sbde			sbp += ncpbytes / sizeof *sbp;			\
80722945Sbde			error = copyin(uap->name, ibits[x], ncpbytes);	\
80876564Stanimura			if (error != 0)					\
80976564Stanimura				goto done_noproclock;			\
81076564Stanimura			for (i = 0;					\
81176564Stanimura			     i < ncpbytes / sizeof ibits[i][0];		\
81276564Stanimura			     i++)					\
81376564Stanimura				hibits[i] |= ibits[x][i];		\
81422945Sbde		}							\
81522945Sbde	} while (0)
8161541Srgrimes	getbits(in, 0);
8171541Srgrimes	getbits(ou, 1);
8181541Srgrimes	getbits(ex, 2);
8191541Srgrimes#undef	getbits
82022945Sbde	if (nbufbytes != 0)
82122945Sbde		bzero(selbits, nbufbytes / 2);
8221541Srgrimes
8231541Srgrimes	if (uap->tv) {
8241541Srgrimes		error = copyin((caddr_t)uap->tv, (caddr_t)&atv,
8251541Srgrimes			sizeof (atv));
82676564Stanimura		if (error)
82776564Stanimura			goto done_noproclock;
8281541Srgrimes		if (itimerfix(&atv)) {
8291541Srgrimes			error = EINVAL;
83076564Stanimura			goto done_noproclock;
8311541Srgrimes		}
83236119Sphk		getmicrouptime(&rtv);
83335029Sphk		timevaladd(&atv, &rtv);
83463057Sjhb	} else {
83535029Sphk		atv.tv_sec = 0;
83663057Sjhb		atv.tv_usec = 0;
83763057Sjhb	}
83883366Sjulian	selholddrop(td, hibits, hobits, uap->nd, 1);
83935029Sphk	timo = 0;
84083366Sjulian	PROC_LOCK(td->td_proc);
8411541Srgrimesretry:
8421541Srgrimes	ncoll = nselcoll;
84383799Sjhb	mtx_lock_spin(&sched_lock);
84483366Sjulian	td->td_flags |= TDF_SELECT;
84583799Sjhb	mtx_unlock_spin(&sched_lock);
84683366Sjulian	PROC_UNLOCK(td->td_proc);
84783366Sjulian	error = selscan(td, ibits, obits, uap->nd);
84883366Sjulian	PROC_LOCK(td->td_proc);
84983366Sjulian	if (error || td->td_retval[0])
8501541Srgrimes		goto done;
85163049Sjhb	if (atv.tv_sec || atv.tv_usec) {
85236119Sphk		getmicrouptime(&rtv);
85376618Stanimura		if (timevalcmp(&rtv, &atv, >=)) {
85476618Stanimura			/*
85576618Stanimura			 * An event of our interest may occur during locking a process.
85676618Stanimura			 * In order to avoid missing the event that occured during locking
85783366Sjulian			 * the process, test TDF_SELECT and rescan file descriptors if
85876618Stanimura			 * necessary.
85976618Stanimura			 */
86083799Sjhb			mtx_lock_spin(&sched_lock);
86183366Sjulian			if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
86276618Stanimura				ncoll = nselcoll;
86383366Sjulian				td->td_flags |= TDF_SELECT;
86483799Sjhb				mtx_unlock_spin(&sched_lock);
86583366Sjulian				PROC_UNLOCK(td->td_proc);
86683366Sjulian				error = selscan(td, ibits, obits, uap->nd);
86783366Sjulian				PROC_LOCK(td->td_proc);
86883799Sjhb			} else
86983799Sjhb				mtx_unlock_spin(&sched_lock);
87035029Sphk			goto done;
87176618Stanimura		}
87235029Sphk		ttv = atv;
87335029Sphk		timevalsub(&ttv, &rtv);
87435029Sphk		timo = ttv.tv_sec > 24 * 60 * 60 ?
87535029Sphk		    24 * 60 * 60 * hz : tvtohz(&ttv);
87635029Sphk	}
87783799Sjhb	mtx_lock_spin(&sched_lock);
87883366Sjulian	td->td_flags &= ~TDF_SELECT;
87983799Sjhb	mtx_unlock_spin(&sched_lock);
88055943Sjasone
88176564Stanimura	if (timo > 0)
88283366Sjulian		error = cv_timedwait_sig(&selwait, &td->td_proc->p_mtx, timo);
88376564Stanimura	else
88483366Sjulian		error = cv_wait_sig(&selwait, &td->td_proc->p_mtx);
88555943Sjasone
8861541Srgrimes	if (error == 0)
8871541Srgrimes		goto retry;
88876564Stanimura
8891541Srgrimesdone:
89083799Sjhb	mtx_lock_spin(&sched_lock);
89183366Sjulian	td->td_flags &= ~TDF_SELECT;
89283799Sjhb	mtx_unlock_spin(&sched_lock);
89383366Sjulian	PROC_UNLOCK(td->td_proc);
89483366Sjulian	selholddrop(td, hibits, hobits, uap->nd, 0);
89576564Stanimuradone_noproclock:
8961541Srgrimes	/* select is not restarted after signals... */
8971541Srgrimes	if (error == ERESTART)
8981541Srgrimes		error = EINTR;
8991541Srgrimes	if (error == EWOULDBLOCK)
9001541Srgrimes		error = 0;
9011541Srgrimes#define	putbits(name, x) \
90222945Sbde	if (uap->name && (error2 = copyout(obits[x], uap->name, ncpbytes))) \
9031541Srgrimes		error = error2;
9041541Srgrimes	if (error == 0) {
9051541Srgrimes		int error2;
9061541Srgrimes
9071541Srgrimes		putbits(in, 0);
9081541Srgrimes		putbits(ou, 1);
9091541Srgrimes		putbits(ex, 2);
9101541Srgrimes#undef putbits
9111541Srgrimes	}
91222945Sbde	if (selbits != &s_selbits[0])
91322945Sbde		free(selbits, M_SELECT);
91476564Stanimura	if (heldbits != &s_heldbits[0])
91576564Stanimura		free(heldbits, M_SELECT);
91682752Sdillon
91782752Sdillon	mtx_unlock(&Giant);
9181541Srgrimes	return (error);
9191541Srgrimes}
9201541Srgrimes
92189306Salfred/*
92289306Salfred * Used to hold then release a group of fds for select(2).
92389306Salfred * Hold (hold == 1) or release (hold == 0) a group of filedescriptors.
92489306Salfred * if holding then use ibits setting the bits in obits, otherwise use obits.
92589306Salfred */
92612819Sphkstatic int
92783366Sjulianselholddrop(td, ibits, obits, nfd, hold)
92883366Sjulian	struct thread *td;
92976564Stanimura	fd_mask *ibits, *obits;
93076564Stanimura	int nfd, hold;
93176564Stanimura{
93283366Sjulian	struct filedesc *fdp = td->td_proc->p_fd;
93376564Stanimura	int i, fd;
93476564Stanimura	fd_mask bits;
93576564Stanimura	struct file *fp;
93676564Stanimura
93789306Salfred	FILEDESC_LOCK(fdp);
93876564Stanimura	for (i = 0; i < nfd; i += NFDBITS) {
93976564Stanimura		if (hold)
94076564Stanimura			bits = ibits[i/NFDBITS];
94176564Stanimura		else
94276564Stanimura			bits = obits[i/NFDBITS];
94376564Stanimura		/* ffs(int mask) not portable, fd_mask is long */
94476564Stanimura		for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
94576564Stanimura			if (!(bits & 1))
94676564Stanimura				continue;
94776564Stanimura			fp = fdp->fd_ofiles[fd];
94889306Salfred			if (fp == NULL) {
94989306Salfred				FILEDESC_UNLOCK(fdp);
95076564Stanimura				return (EBADF);
95189306Salfred			}
95276564Stanimura			if (hold) {
95376564Stanimura				fhold(fp);
95476564Stanimura				obits[(fd)/NFDBITS] |=
95576564Stanimura				    ((fd_mask)1 << ((fd) % NFDBITS));
95689306Salfred			} else {
95789306Salfred				/* XXX: optimize by making a special
95889306Salfred				 * version of fdrop that only unlocks
95989306Salfred				 * the filedesc if needed?  This would
96089306Salfred				 * redcuce the number of lock/unlock
96189306Salfred				 * pairs by quite a bit.
96289306Salfred				 */
96389306Salfred				FILEDESC_UNLOCK(fdp);
96483366Sjulian				fdrop(fp, td);
96589306Salfred				FILEDESC_LOCK(fdp);
96689306Salfred			}
96776564Stanimura		}
96876564Stanimura	}
96989306Salfred	FILEDESC_UNLOCK(fdp);
97076564Stanimura	return (0);
97176564Stanimura}
97276564Stanimura
97376564Stanimurastatic int
97483366Sjulianselscan(td, ibits, obits, nfd)
97583366Sjulian	struct thread *td;
97617702Ssmpatel	fd_mask **ibits, **obits;
97730994Sphk	int nfd;
9781541Srgrimes{
97957357Speter	int msk, i, fd;
98057357Speter	fd_mask bits;
9811541Srgrimes	struct file *fp;
9821541Srgrimes	int n = 0;
98331364Sbde	/* Note: backend also returns POLLHUP/POLLERR if appropriate. */
98431364Sbde	static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
9851541Srgrimes
9861541Srgrimes	for (msk = 0; msk < 3; msk++) {
98722945Sbde		if (ibits[msk] == NULL)
98822945Sbde			continue;
9891541Srgrimes		for (i = 0; i < nfd; i += NFDBITS) {
99017702Ssmpatel			bits = ibits[msk][i/NFDBITS];
99157357Speter			/* ffs(int mask) not portable, fd_mask is long */
99257357Speter			for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
99357357Speter				if (!(bits & 1))
99457357Speter					continue;
99589306Salfred				fp = ffind_hold(td, fd);
9961541Srgrimes				if (fp == NULL)
9971541Srgrimes					return (EBADF);
99883366Sjulian				if (fo_poll(fp, flag[msk], fp->f_cred, td)) {
99922945Sbde					obits[msk][(fd)/NFDBITS] |=
100057357Speter					    ((fd_mask)1 << ((fd) % NFDBITS));
10011541Srgrimes					n++;
10021541Srgrimes				}
100389306Salfred				fdrop(fp, td);
10041541Srgrimes			}
10051541Srgrimes		}
10061541Srgrimes	}
100783366Sjulian	td->td_retval[0] = n;
10081541Srgrimes	return (0);
10091541Srgrimes}
10101541Srgrimes
101129351Speter/*
101229351Speter * Poll system call.
101329351Speter */
101429351Speter#ifndef _SYS_SYSPROTO_H_
101529351Speterstruct poll_args {
101629351Speter	struct pollfd *fds;
101729351Speter	u_int	nfds;
101829351Speter	int	timeout;
101929351Speter};
102029351Speter#endif
102182752Sdillon/*
102282752Sdillon * MPSAFE
102382752Sdillon */
102429351Speterint
102583366Sjulianpoll(td, uap)
102683366Sjulian	struct thread *td;
102773159Sjlemon	struct poll_args *uap;
102829351Speter{
102929351Speter	caddr_t bits;
103029351Speter	char smallbits[32 * sizeof(struct pollfd)];
103135029Sphk	struct timeval atv, rtv, ttv;
103276564Stanimura	int ncoll, error = 0, timo;
103373159Sjlemon	u_int nfds;
103429351Speter	size_t ni;
103576564Stanimura	struct pollfd p_heldbits[32];
103676564Stanimura	struct pollfd *heldbits;
103729351Speter
103872146Speter	nfds = SCARG(uap, nfds);
103982752Sdillon
104082752Sdillon	mtx_lock(&Giant);
104172146Speter	/*
104272203Speter	 * This is kinda bogus.  We have fd limits, but that is not
104372203Speter	 * really related to the size of the pollfd array.  Make sure
104472203Speter	 * we let the process use at least FD_SETSIZE entries and at
104572203Speter	 * least enough for the current limits.  We want to be reasonably
104672203Speter	 * safe, but not overly restrictive.
104772146Speter	 */
104883366Sjulian	if ((nfds > td->td_proc->p_rlimit[RLIMIT_NOFILE].rlim_cur) &&
104983366Sjulian	    (nfds > FD_SETSIZE)) {
105082752Sdillon		error = EINVAL;
105182752Sdillon		goto done2;
105282752Sdillon	}
105372146Speter	ni = nfds * sizeof(struct pollfd);
105429351Speter	if (ni > sizeof(smallbits))
105529351Speter		bits = malloc(ni, M_TEMP, M_WAITOK);
105629351Speter	else
105729351Speter		bits = smallbits;
105876564Stanimura	if (ni > sizeof(p_heldbits))
105976564Stanimura		heldbits = malloc(ni, M_TEMP, M_WAITOK);
106076564Stanimura	else {
106176564Stanimura		bzero(p_heldbits, sizeof(p_heldbits));
106276564Stanimura		heldbits = p_heldbits;
106376564Stanimura	}
106429351Speter	error = copyin(SCARG(uap, fds), bits, ni);
106529351Speter	if (error)
106676564Stanimura		goto done_noproclock;
106776564Stanimura	bcopy(bits, heldbits, ni);
106829351Speter	if (SCARG(uap, timeout) != INFTIM) {
106929351Speter		atv.tv_sec = SCARG(uap, timeout) / 1000;
107029351Speter		atv.tv_usec = (SCARG(uap, timeout) % 1000) * 1000;
107129351Speter		if (itimerfix(&atv)) {
107229351Speter			error = EINVAL;
107376564Stanimura			goto done_noproclock;
107429351Speter		}
107536119Sphk		getmicrouptime(&rtv);
107635029Sphk		timevaladd(&atv, &rtv);
107763057Sjhb	} else {
107835029Sphk		atv.tv_sec = 0;
107963057Sjhb		atv.tv_usec = 0;
108063057Sjhb	}
108183366Sjulian	pollholddrop(td, heldbits, nfds, 1);
108235029Sphk	timo = 0;
108383366Sjulian	PROC_LOCK(td->td_proc);
108429351Speterretry:
108529351Speter	ncoll = nselcoll;
108683799Sjhb	mtx_lock_spin(&sched_lock);
108783366Sjulian	td->td_flags |= TDF_SELECT;
108883799Sjhb	mtx_unlock_spin(&sched_lock);
108983366Sjulian	PROC_UNLOCK(td->td_proc);
109083366Sjulian	error = pollscan(td, (struct pollfd *)bits, nfds);
109183366Sjulian	PROC_LOCK(td->td_proc);
109283366Sjulian	if (error || td->td_retval[0])
109329351Speter		goto done;
109463049Sjhb	if (atv.tv_sec || atv.tv_usec) {
109536119Sphk		getmicrouptime(&rtv);
109676618Stanimura		if (timevalcmp(&rtv, &atv, >=)) {
109776618Stanimura			/*
109876618Stanimura			 * An event of our interest may occur during locking a process.
109976618Stanimura			 * In order to avoid missing the event that occured during locking
110083366Sjulian			 * the process, test TDF_SELECT and rescan file descriptors if
110176618Stanimura			 * necessary.
110276618Stanimura			 */
110383799Sjhb			mtx_lock_spin(&sched_lock);
110483366Sjulian			if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
110576618Stanimura				ncoll = nselcoll;
110683366Sjulian				td->td_flags |= TDF_SELECT;
110783799Sjhb				mtx_unlock_spin(&sched_lock);
110883366Sjulian				PROC_UNLOCK(td->td_proc);
110983366Sjulian				error = pollscan(td, (struct pollfd *)bits, nfds);
111083366Sjulian				PROC_LOCK(td->td_proc);
111183799Sjhb			} else
111283799Sjhb				mtx_unlock_spin(&sched_lock);
111335029Sphk			goto done;
111476618Stanimura		}
111535029Sphk		ttv = atv;
111635029Sphk		timevalsub(&ttv, &rtv);
111735029Sphk		timo = ttv.tv_sec > 24 * 60 * 60 ?
111835029Sphk		    24 * 60 * 60 * hz : tvtohz(&ttv);
111929351Speter	}
112083799Sjhb	mtx_lock_spin(&sched_lock);
112183366Sjulian	td->td_flags &= ~TDF_SELECT;
112283799Sjhb	mtx_unlock_spin(&sched_lock);
112376564Stanimura	if (timo > 0)
112483366Sjulian		error = cv_timedwait_sig(&selwait, &td->td_proc->p_mtx, timo);
112576564Stanimura	else
112683366Sjulian		error = cv_wait_sig(&selwait, &td->td_proc->p_mtx);
112729351Speter	if (error == 0)
112829351Speter		goto retry;
112976564Stanimura
113029351Speterdone:
113183799Sjhb	mtx_lock_spin(&sched_lock);
113283366Sjulian	td->td_flags &= ~TDF_SELECT;
113383799Sjhb	mtx_unlock_spin(&sched_lock);
113483366Sjulian	PROC_UNLOCK(td->td_proc);
113583366Sjulian	pollholddrop(td, heldbits, nfds, 0);
113676564Stanimuradone_noproclock:
113729351Speter	/* poll is not restarted after signals... */
113829351Speter	if (error == ERESTART)
113929351Speter		error = EINTR;
114029351Speter	if (error == EWOULDBLOCK)
114129351Speter		error = 0;
114229351Speter	if (error == 0) {
114329351Speter		error = copyout(bits, SCARG(uap, fds), ni);
114429351Speter		if (error)
114529351Speter			goto out;
114629351Speter	}
114729351Speterout:
114829351Speter	if (ni > sizeof(smallbits))
114929351Speter		free(bits, M_TEMP);
115076564Stanimura	if (ni > sizeof(p_heldbits))
115176564Stanimura		free(heldbits, M_TEMP);
115282752Sdillondone2:
115382752Sdillon	mtx_unlock(&Giant);
115429351Speter	return (error);
115529351Speter}
115629351Speter
115729351Speterstatic int
115883366Sjulianpollholddrop(td, fds, nfd, hold)
115983366Sjulian	struct thread *td;
116076564Stanimura	struct pollfd *fds;
116176564Stanimura	u_int nfd;
116276564Stanimura	int hold;
116376564Stanimura{
116483366Sjulian	register struct filedesc *fdp = td->td_proc->p_fd;
116576564Stanimura	int i;
116676564Stanimura	struct file *fp;
116776564Stanimura
116889306Salfred	FILEDESC_LOCK(fdp);
116976564Stanimura	for (i = 0; i < nfd; i++, fds++) {
117076564Stanimura		if (0 <= fds->fd && fds->fd < fdp->fd_nfiles) {
117176564Stanimura			fp = fdp->fd_ofiles[fds->fd];
117276564Stanimura			if (hold) {
117376564Stanimura				if (fp != NULL) {
117476564Stanimura					fhold(fp);
117576564Stanimura					fds->revents = 1;
117676564Stanimura				} else
117776564Stanimura					fds->revents = 0;
117889306Salfred			} else if(fp != NULL && fds->revents) {
117989306Salfred				FILE_LOCK(fp);
118089306Salfred				FILEDESC_UNLOCK(fdp);
118189306Salfred				fdrop_locked(fp, td);
118289306Salfred				FILEDESC_LOCK(fdp);
118389306Salfred			}
118476564Stanimura		}
118576564Stanimura	}
118689306Salfred	FILEDESC_UNLOCK(fdp);
118776564Stanimura	return (0);
118876564Stanimura}
118976564Stanimura
119076564Stanimurastatic int
119183366Sjulianpollscan(td, fds, nfd)
119283366Sjulian	struct thread *td;
119329351Speter	struct pollfd *fds;
119473159Sjlemon	u_int nfd;
119529351Speter{
119683366Sjulian	register struct filedesc *fdp = td->td_proc->p_fd;
119729351Speter	int i;
119829351Speter	struct file *fp;
119929351Speter	int n = 0;
120029351Speter
120129351Speter	for (i = 0; i < nfd; i++, fds++) {
120289306Salfred		FILEDESC_LOCK(fdp);
120341632Sjkh		if (fds->fd >= fdp->fd_nfiles) {
120429351Speter			fds->revents = POLLNVAL;
120529351Speter			n++;
120689306Salfred			FILEDESC_UNLOCK(fdp);
120741632Sjkh		} else if (fds->fd < 0) {
120841632Sjkh			fds->revents = 0;
120989306Salfred			FILEDESC_UNLOCK(fdp);
121029351Speter		} else {
121129351Speter			fp = fdp->fd_ofiles[fds->fd];
121289306Salfred			FILEDESC_UNLOCK(fdp);
121368883Sdillon			if (fp == NULL) {
121429351Speter				fds->revents = POLLNVAL;
121529351Speter				n++;
121629351Speter			} else {
121731364Sbde				/*
121831364Sbde				 * Note: backend also returns POLLHUP and
121931364Sbde				 * POLLERR if appropriate.
122031364Sbde				 */
122151418Sgreen				fds->revents = fo_poll(fp, fds->events,
122283366Sjulian				    fp->f_cred, td);
122329351Speter				if (fds->revents != 0)
122429351Speter					n++;
122529351Speter			}
122629351Speter		}
122729351Speter	}
122883366Sjulian	td->td_retval[0] = n;
122929351Speter	return (0);
123029351Speter}
123129351Speter
123229351Speter/*
123329351Speter * OpenBSD poll system call.
123429351Speter * XXX this isn't quite a true representation..  OpenBSD uses select ops.
123529351Speter */
123629351Speter#ifndef _SYS_SYSPROTO_H_
123729351Speterstruct openbsd_poll_args {
123829351Speter	struct pollfd *fds;
123929351Speter	u_int	nfds;
124029351Speter	int	timeout;
124129351Speter};
124229351Speter#endif
124382752Sdillon/*
124482752Sdillon * MPSAFE
124582752Sdillon */
124629351Speterint
124783366Sjulianopenbsd_poll(td, uap)
124883366Sjulian	register struct thread *td;
124929351Speter	register struct openbsd_poll_args *uap;
125029351Speter{
125183366Sjulian	return (poll(td, (struct poll_args *)uap));
125229351Speter}
125329351Speter
12541541Srgrimes/*ARGSUSED*/
12551549Srgrimesint
125683366Sjulianseltrue(dev, events, td)
12571541Srgrimes	dev_t dev;
125829351Speter	int events;
125983366Sjulian	struct thread *td;
12601541Srgrimes{
12611541Srgrimes
126229351Speter	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
12631541Srgrimes}
12641541Srgrimes
126583366Sjulianstatic int
126683366Sjulianfind_thread_in_proc(struct proc *p, struct thread *td)
126783366Sjulian{
126883366Sjulian	struct thread *td2;
126983366Sjulian	FOREACH_THREAD_IN_PROC(p, td2) {
127083366Sjulian		if (td2 == td) {
127183366Sjulian			return (1);
127283366Sjulian		}
127383366Sjulian	}
127483366Sjulian	return (0);
127583366Sjulian}
127683366Sjulian
12771541Srgrimes/*
12781541Srgrimes * Record a select request.
12791541Srgrimes */
12801541Srgrimesvoid
12811541Srgrimesselrecord(selector, sip)
128283366Sjulian	struct thread *selector;
12831541Srgrimes	struct selinfo *sip;
12841541Srgrimes{
12851541Srgrimes	struct proc *p;
12861541Srgrimes	pid_t mypid;
12871541Srgrimes
128883366Sjulian	mypid = selector->td_proc->p_pid;
128983366Sjulian	if ((sip->si_pid == mypid) &&
129083366Sjulian	    (sip->si_thread == selector)) { /* XXXKSE should be an ID? */
12911541Srgrimes		return;
129283366Sjulian	}
129383366Sjulian	if (sip->si_pid &&
129483366Sjulian	    (p = pfind(sip->si_pid)) &&
129583366Sjulian	    (find_thread_in_proc(p, sip->si_thread))) {
129672200Sbmilekic		mtx_lock_spin(&sched_lock);
129783366Sjulian	    	if (sip->si_thread->td_wchan == (caddr_t)&selwait) {
129872200Sbmilekic			mtx_unlock_spin(&sched_lock);
129975893Sjhb			PROC_UNLOCK(p);
130071566Sjhb			sip->si_flags |= SI_COLL;
130171566Sjhb			return;
130271566Sjhb		}
130372200Sbmilekic		mtx_unlock_spin(&sched_lock);
130475893Sjhb		PROC_UNLOCK(p);
130571566Sjhb	}
130671566Sjhb	sip->si_pid = mypid;
130783366Sjulian	sip->si_thread = selector;
13081541Srgrimes}
13091541Srgrimes
13101541Srgrimes/*
13111541Srgrimes * Do a wakeup when a selectable event occurs.
13121541Srgrimes */
13131541Srgrimesvoid
13141541Srgrimesselwakeup(sip)
13151541Srgrimes	register struct selinfo *sip;
13161541Srgrimes{
131783366Sjulian	struct thread *td;
13181541Srgrimes	register struct proc *p;
13191541Srgrimes
13201541Srgrimes	if (sip->si_pid == 0)
13211541Srgrimes		return;
13221541Srgrimes	if (sip->si_flags & SI_COLL) {
13231541Srgrimes		nselcoll++;
13241541Srgrimes		sip->si_flags &= ~SI_COLL;
132576564Stanimura		cv_broadcast(&selwait);
13261541Srgrimes	}
13271541Srgrimes	p = pfind(sip->si_pid);
13281541Srgrimes	sip->si_pid = 0;
132983366Sjulian	td = sip->si_thread;
13301541Srgrimes	if (p != NULL) {
133183366Sjulian		if (!find_thread_in_proc(p, td)) {
133283366Sjulian			PROC_UNLOCK(p); /* lock is in pfind() */;
133383366Sjulian			return;
133483366Sjulian		}
133572200Sbmilekic		mtx_lock_spin(&sched_lock);
133683366Sjulian		if (td->td_wchan == (caddr_t)&selwait) {
133783366Sjulian			if (td->td_proc->p_stat == SSLEEP)
133883366Sjulian				setrunnable(td);
13391541Srgrimes			else
134083366Sjulian				cv_waitq_remove(td);
134175893Sjhb		} else
134283366Sjulian			td->td_flags &= ~TDF_SELECT;
134375893Sjhb		mtx_unlock_spin(&sched_lock);
134483366Sjulian		PROC_UNLOCK(p); /* Lock is in pfind() */
13451541Srgrimes	}
13461541Srgrimes}
134776564Stanimura
134876564Stanimurastatic void selectinit __P((void *));
134976564StanimuraSYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, selectinit, NULL)
135076564Stanimura
135176564Stanimura/* ARGSUSED*/
135276564Stanimurastatic void
135376564Stanimuraselectinit(dummy)
135476564Stanimura	void *dummy;
135576564Stanimura{
135676564Stanimura	cv_init(&selwait, "select");
135776564Stanimura}
1358